In [None]:
# Basic Libraries
import numpy as np
import pandas as pd
from warnings import filterwarnings
from collections import Counter

# Visualizations Libraries
import matplotlib.pyplot as plt
import seaborn as sns
import plotly
import plotly.offline as pyo
import plotly.express as px
import plotly.graph_objs as go
pyo.init_notebook_mode()
import plotly.figure_factory as ff
import missingno as msno

# Data Pre-processing Libraries
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.model_selection import train_test_split

# Modelling Libraries
from sklearn import metrics
from xgboost import XGBClassifier, plot_importance
from sklearn.linear_model import LogisticRegression,RidgeClassifier,SGDClassifier,PassiveAggressiveClassifier
from sklearn.linear_model import Perceptron
from sklearn.svm import SVC,LinearSVC,NuSVC
from sklearn.neighbors import KNeighborsClassifier,NearestCentroid
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier,GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB,BernoulliNB
from sklearn.ensemble import VotingClassifier

# Evaluation & CV Libraries
from sklearn.metrics import precision_score,accuracy_score
from sklearn.model_selection import RandomizedSearchCV,GridSearchCV,RepeatedStratifiedKFold


import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))



# Feature Description

1) ph: pH of water (0 to 14).

2) Hardness: Capacity of water to precipitate soap in mg/L.

3) Solids: Total dissolved solids in ppm.

4) Chloramines: Amount of Chloramines in ppm.

5) Sulfate: Amount of Sulfates dissolved in mg/L.

6) Conductivity: Electrical conductivity of water in μS/cm.

7) Organic_carbon: Amount of organic carbon in ppm.

8) Trihalomethanes: Amount of Trihalomethanes in μg/L.

9) Turbidity: Measure of the light-emitting property of water in NTU.

10) Potability: Indicates if water is safe for human consumption. Potable - 1 and Not potable - 0



# Import the Dataset

In [None]:
# Read the water-potability CSV from the Kaggle input folder into a DataFrame.
df = pd.read_csv(
    '../input/water-potability/water_potability.csv'
)

In [None]:
# Print a concise summary of the frame (columns, non-null counts, dtypes).
df.info()

Replacing missing values with the mean of all values in the respective column (this helper is defined and applied in a later cell, after the missing-value check):

```python
def replace_nan_by_mean(info):
    for col in info.columns:
        info[col].fillna(np.mean(info[col]),inplace=True)
    return info
df=replace_nan_by_mean(df)
```

In [None]:
# Per-class mean of every measurement, ordered by Potability and shaded with
# the "Wistia" colour map so differences between the classes stand out.
(
    df[[
        'ph', 'Hardness', 'Solids', 'Chloramines', 'Sulfate', 'Conductivity',
        'Organic_carbon', 'Trihalomethanes', 'Turbidity', "Potability",
    ]]
    .groupby(["Potability"], as_index = False)
    .mean()
    .sort_values(by = "Potability")
    .style
    .background_gradient("Wistia")
)


In [None]:
# Name of the label column.
target = 'Potability'

# Every remaining column is treated as an input feature.
features_list = df.columns.tolist()
features_list.remove(target)

# Features distributions
  * Univariate Analysis (features and target 'Potability')


In [None]:
# One histogram per feature (target excluded), drawn as a grid on one figure.
df[features_list].hist(
    bins=40,
    color='red',
    edgecolor='b',
    linewidth=1.0,
    grid=False,
    xlabelsize=8,
    ylabelsize=8,
    figsize=(16, 6),
)
# rect extends past 1.0 on both axes, so the layout box is larger than the
# figure; the title is positioned to match (y=1.25).
plt.tight_layout(rect=(0, 0, 1.2, 1.2))
plt.suptitle('Water Potability', x=0.65, y=1.25, fontsize=14);

# Correlation Matrix

In [None]:
# Pairwise correlations of all columns, shown as an annotated heatmap.
f, ax = plt.subplots(figsize=(10, 6))
corr = df.corr()
hm = sns.heatmap(
    corr.round(2),
    ax=ax,
    annot=True,
    fmt='.2f',
    cmap="Reds",
    linewidths=.05,
)
# Leave room above the axes for the figure title.
f.subplots_adjust(top=0.93)
t = f.suptitle('Water Attributes Correlation Heatmap', fontsize=14)

### Next, we check whether the dataset contains any missing values. If it does, we will fill them in.

In [None]:
# Show the names of the columns that contain at least one missing value.
print("Do we have data with null in columns?")
df.loc[:, df.isna().any()].columns

In [None]:
# Number of missing values per column.
df.isna().sum()

In [None]:
# Nullity matrix of the frame, drawn in teal.
fig = msno.matrix(df=df, color=(0, 0.5, 0.5))

# Replacing missing values with the mean of their respective column

In [None]:
def replace_nan_by_mean(info):
    """Fill the missing values of every column with that column's mean.

    The frame is modified in place and also returned, so both
    ``replace_nan_by_mean(df)`` and ``df = replace_nan_by_mean(df)`` work.

    Parameters
    ----------
    info : pandas.DataFrame
        Frame whose columns are imputed; every column must support ``mean()``.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with NaNs replaced.
    """
    for col in info.columns:
        # Assign the filled column back instead of calling
        # ``info[col].fillna(..., inplace=True)``: that form acts on a
        # temporary selection and is a silent no-op under pandas Copy-on-Write.
        info[col] = info[col].fillna(info[col].mean())
    return info
# Impute every column of df with its mean and rebind df to the returned frame.
df=replace_nan_by_mean(df)

In [None]:
# Redraw the nullity matrix to inspect the frame after imputation.
fig = msno.matrix(df=df, color=(0, 0.5, 0.5))

In [None]:
# Median-fill the three listed columns.
# The result is assigned back to the frame: calling fillna(inplace=True) on a
# column selection acts on a temporary object and is a silent no-op under
# pandas Copy-on-Write.
# NOTE(review): the mean imputation cell above already fills every column, so
# no NaNs should remain here and this cell changes nothing — confirm which
# strategy (mean or median) is intended and drop the other.
for column in ['ph', 'Sulfate', 'Trihalomethanes']:
    df[column] = df[column].fillna(value=df[column].median())

In [None]:
# Re-count missing values per column to confirm the imputation.
df.isna().sum()

In [None]:
# Feature matrix: every column except the label, as a NumPy array.
X = df.drop(columns='Potability').to_numpy()
# Label vector.
y = df['Potability'].to_numpy()

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Standardise the features; the scaler is fitted on the training split only
# and then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

# Logistic Regression

In [None]:
# Potability is a binary label, so the standard (binomial) logistic model is
# used. The former multi_class='multinomial' argument is deprecated in
# scikit-learn and is not meant for two-class problems.
lr = LogisticRegression(solver = 'lbfgs')
lr.fit(X_train, y_train)

In [None]:
# accuracy score

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Accuracy on the held-out split is kept in lr_acc for the final comparison.
lr_acc = accuracy_score(y_test, lr.predict(X_test))
print(f"Accuracy Score of Training Data is {accuracy_score(y_train, lr.predict(X_train))}")
# This line reports the test split (it was previously labelled "Training").
print(f"Accuracy Score of Test Data is {lr_acc}\n")

# KNN

In [None]:
# k-nearest-neighbours classifier with library defaults.
knn = KNeighborsClassifier()
knn.fit(X=X_train, y=y_train)

In [None]:
# accuracy score

# Accuracy on the held-out split is kept in knn_acc for the final comparison.
knn_acc = accuracy_score(y_test, knn.predict(X_test))
print(f"Accuracy Score of Training Data is {accuracy_score(y_train, knn.predict(X_train))}")
# This line reports the test split (it was previously labelled "Training").
print(f"Accuracy Score of Test Data is {knn_acc}\n")

# SVC

In [None]:
# Support-vector classifier with library defaults.
svc = SVC()
svc.fit(X=X_train, y=y_train)

In [None]:
# accuracy score

# Accuracy on the held-out split is kept in svc_acc for the final comparison.
svc_acc = accuracy_score(y_test, svc.predict(X_test))
print(f"Accuracy Score of Training Data is {accuracy_score(y_train, svc.predict(X_train))}")
# This line reports the test split (it was previously labelled "Training").
print(f"Accuracy Score of Test Data is {svc_acc}\n")

# SGD Classifier

In [None]:
from sklearn.linear_model import SGDClassifier

# Grid-search the regularisation strength, loss and penalty of a linear
# SGD classifier with 10-fold cross-validation on the training split.
sgd = SGDClassifier()
parameters = {
    'alpha' : [0.0001, 0.001, 0.01, 0.1, 1],
    # 'log_loss' is the current name of the logistic loss; the old alias
    # 'log' has been removed from scikit-learn and makes those fits fail.
    'loss' : ['hinge', 'log_loss'],
    'penalty' : ['l1', 'l2']
}

grid_search = GridSearchCV(sgd, parameters, cv = 10, n_jobs = -1)
grid_search.fit(X_train, y_train)

In [None]:
# best parameter and best score

# Winning parameter combination and its mean cross-validated score, one per line.
print(grid_search.best_params_, grid_search.best_score_, sep="\n")

In [None]:
# Refit SGD with fixed hyper-parameters.
# 'log_loss' replaces the removed alias 'log' for the logistic loss.
sgd = SGDClassifier(alpha = 0.001, loss = 'log_loss', penalty = 'l1')
sgd.fit(X_train, y_train)

y_pred = sgd.predict(X_test)

# Training accuracy.
print(accuracy_score(y_train, sgd.predict(X_train)))

# Test accuracy, computed from the predictions above instead of predicting twice.
sgd_acc = accuracy_score(y_test, y_pred)
print(sgd_acc)

# Decision Tree

In [None]:
# Single decision tree with library defaults.
dtc = DecisionTreeClassifier()
dtc.fit(X=X_train, y=y_train)

In [None]:
# accuracy score

# Accuracy on the held-out split is kept in dtc_acc for the final comparison.
dtc_acc = accuracy_score(y_test, dtc.predict(X_test))
print(f"Accuracy Score of Training Data is {accuracy_score(y_train, dtc.predict(X_train))}")
# This line reports the test split (it was previously labelled "Training").
print(f"Accuracy Score of Test Data is {dtc_acc}\n")

# Random Forest

In [None]:
# Random forest with library defaults.
rf = RandomForestClassifier()
rf.fit(X=X_train, y=y_train)

In [None]:
# accuracy score

# Accuracy on the held-out split is kept in rf_acc for the final comparison.
rf_acc = accuracy_score(y_test, rf.predict(X_test))
print(f"Accuracy Score of Training Data is {accuracy_score(y_train, rf.predict(X_train))}")
# This line reports the test split (it was previously labelled "Training").
print(f"Accuracy Score of Test Data is {rf_acc}\n")

# Ada Boost Classifier

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost over the decision tree defined above.
# The keyword is `estimator`; the older name `base_estimator` has been removed
# from scikit-learn.
ada = AdaBoostClassifier(estimator = dtc)
ada.fit(X_train, y_train)

In [None]:
# accuracy score

# Accuracy on the held-out split is kept in ada_acc for the final comparison.
ada_acc = accuracy_score(y_test, ada.predict(X_test))
print(f"Accuracy Score of Training Data is {accuracy_score(y_train, ada.predict(X_train))}")
# This line reports the test split (it was previously labelled "Training").
print(f"Accuracy Score of Test Data is {ada_acc}\n")

In [None]:
# hyper parameter tuning using grid search cv

# Grid-search AdaBoost's ensemble size and learning rate (5-fold CV).
# The boosting algorithm is left at the library default: 'SAMME.R' has been
# removed from scikit-learn, so listing it makes those fits fail.
grid_param = {
    'n_estimators' : [40, 50, 70, 80, 100],
    'learning_rate' : [0.01, 0.1, 0.05, 0.5, 1, 10]
}

grid_search = GridSearchCV(ada, grid_param, cv = 5, n_jobs = -1, verbose = 1)
grid_search.fit(X_train, y_train)

In [None]:
# best parameters and best score

# Winning parameter combination and its mean cross-validated score, one per line.
print(grid_search.best_params_, grid_search.best_score_, sep="\n")

In [None]:
# Refit AdaBoost with fixed hyper-parameters.
# The weak learner is the decision tree `dtc`. Passing `ada` itself nested an
# AdaBoost ensemble inside another one, and nested one level deeper each time
# the cell was re-run.
# `estimator` replaces the removed `base_estimator`, and the boosting algorithm
# is left at the library default because 'SAMME.R' has been removed.
ada = AdaBoostClassifier(estimator = dtc, learning_rate = 0.5, n_estimators = 50)
ada.fit(X_train, y_train)

In [None]:
# accuracy score

# Accuracy of the re-fitted AdaBoost model; overwrites the earlier ada_acc.
ada_acc = accuracy_score(y_test, ada.predict(X_test))
print(f"Accuracy Score of Training Data is {accuracy_score(y_train, ada.predict(X_train))}")
# This line reports the test split (it was previously labelled "Training").
print(f"Accuracy Score of Test Data is {ada_acc}\n")

# Gradient Boosting Classifier

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient-boosted trees with library defaults.
gb = GradientBoostingClassifier()
gb.fit(X=X_train, y=y_train)

In [None]:
# accuracy score

# Accuracy on the held-out split is kept in gb_acc for the final comparison.
gb_acc = accuracy_score(y_test, gb.predict(X_test))
print(f"Accuracy Score of Training Data is {accuracy_score(y_train, gb.predict(X_train))}")
# This line reports the test split (it was previously labelled "Training").
print(f"Accuracy Score of Test Data is {gb_acc}\n")

# Stochastic Gradient Boosting (SGB)

In [None]:
# Stochastic variant: each stage sees 90% of the rows and 80% of the features.
sgb = GradientBoostingClassifier(
    subsample = 0.9,
    max_features = 0.8,
)
sgb.fit(X=X_train, y=y_train)

In [None]:
# accuracy score

# Accuracy on the held-out split is kept in sgb_acc for the final comparison.
sgb_acc = accuracy_score(y_test, sgb.predict(X_test))
print(f"Accuracy Score of Training Data is {accuracy_score(y_train, sgb.predict(X_train))}")
# This line reports the test split (it was previously labelled "Training").
print(f"Accuracy Score of Test Data is {sgb_acc}\n")

# XgBoost

In [None]:
# XGBoost classifier with 100 trees and a 0.1 learning rate.
# The former loss='deviance' argument was dropped: it is a scikit-learn
# GradientBoosting option, not an XGBoost parameter.
xgb = XGBClassifier(learning_rate = 0.1, n_estimators = 100)
xgb.fit(X_train, y_train)

In [None]:
# accuracy score

# Accuracy on the held-out split is kept in xgb_acc for the final comparison.
xgb_acc = accuracy_score(y_test, xgb.predict(X_test))

print(f"Accuracy Score of Training Data is {accuracy_score(y_train, xgb.predict(X_train))}")
# This line reports the test split (it was previously labelled "Training").
print(f"Accuracy Score of Test Data is {xgb_acc}\n")

# Light Gradient Boosting Classifier

In [None]:
from lightgbm import LGBMClassifier

# LightGBM classifier with library defaults.
lgbm = LGBMClassifier()
lgbm.fit(X_train, y_train)

# Accuracy on the held-out split is kept in lgbm_acc for the final comparison.
lgbm_acc = accuracy_score(y_test, lgbm.predict(X_test))

# The labels now name the model being evaluated (they previously said
# "Decision Tree Classifier").
print(f"Training Accuracy of LightGBM Classifier is {accuracy_score(y_train, lgbm.predict(X_train))}")
print(f"Test Accuracy of LightGBM Classifier is {lgbm_acc} \n")


# Cat Boost Classifier

In [None]:
from catboost import CatBoostClassifier

# CatBoost classifier with library defaults.
cat = CatBoostClassifier()
cat.fit(X=X_train, y=y_train)

In [None]:
# Accuracy on the held-out split is kept in cat_acc for the final comparison.
cat_acc = accuracy_score(y_test, cat.predict(X_test))

# The labels now name the model being evaluated (they previously said
# "Decision Tree Classifier").
print(f"Training Accuracy of Cat Boost Classifier is {accuracy_score(y_train, cat.predict(X_train))}")
print(f"Test Accuracy of Cat Boost Classifier is {cat_acc} \n")

# Extra Trees Classifier

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

# Extremely-randomised trees ensemble with library defaults.
etc = ExtraTreesClassifier()
etc.fit(X=X_train, y=y_train)

In [None]:
# Accuracy on the held-out split is kept in etc_acc for the final comparison.
etc_acc = accuracy_score(y_test, etc.predict(X_test))

# The labels now name the model being evaluated (they previously said
# "Decision Tree Classifier").
print(f"Training Accuracy of Extra Trees Classifier is {accuracy_score(y_train, etc.predict(X_train))}")
print(f"Test Accuracy of Extra Trees Classifier is {etc_acc} \n")

In [None]:
# Collect each model's test accuracy into one table: a row per model.
models = pd.DataFrame(
    [
        ('Logistic Regression', lr_acc),
        ('KNN', knn_acc),
        ('SVC', svc_acc),
        ('SGD', sgd_acc),
        ('Decision Tree', dtc_acc),
        ('Random Forest', rf_acc),
        ('Ada Boost', ada_acc),
        ('Gradient Boosting', gb_acc),
        ('SGB', sgb_acc),
        ('XgBoost', xgb_acc),
        ('LGBM', lgbm_acc),
        ('Cat Boost', cat_acc),
        ('Extra Tree', etc_acc),
    ],
    columns = ['Model', 'Score'],
)

# Display the ranking, best score first (the stored frame keeps its order).
models.sort_values(by = 'Score', ascending = False)

In [None]:
# Bar chart of test accuracy per model on a wide canvas.
plt.figure(figsize = (20, 8))

score_axes = sns.barplot(data = models, x = 'Model', y = 'Score')
# Restrict the y-axis to the 0.45-0.75 band so differences between bars are visible.
score_axes.set_ylim(0.45, 0.75)
plt.show()

# If you like my kernel, please do upvote :)