In this notebook we will try to predict whether the water is safe for a human to drink.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

In [None]:
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries used below — confirm that is intended.
warnings.filterwarnings('ignore')

In [None]:
# Load the water-potability CSV into a DataFrame.
csv_path = '../input/water-potability/water_potability.csv'
data = pd.read_csv(csv_path)

In [None]:
# Print a concise summary of the frame (columns, non-null counts, dtypes).
data.info()

In [None]:
# Preview the first rows of the dataset.
data.head()

In [None]:
# Descriptive statistics of the columns.
data.describe()

Exploring the data, we see that 3 columns have missing values; we will have to fix them.

In [None]:
# Count the missing entries per column and show only the columns that have any.
missing_counts = data.isnull().sum()
missing_counts[missing_counts > 0]

In [None]:
# Bar chart of how many rows fall into each target class.
# The column is passed by keyword: recent seaborn releases accept only `data`
# positionally, so a positional Series is no longer treated as the x variable.
sns.countplot(x='Potability', data=data)

In [None]:
# Number of rows in each target class.
data['Potability'].value_counts()

We see that the target classes are imbalanced: class 0 has 720 more samples than class 1.

In [None]:
# Histogram of every feature column (the target column is excluded).
features = data.drop('Potability', axis=1)  # built once instead of on every iteration
fig = plt.figure(figsize=(18,16))
for index, col in enumerate(features.columns):
    plt.subplot(5, 2, index + 1)
    # histplot is the replacement for the deprecated distplot(..., kde=False);
    # missing values are dropped per column before plotting.
    sns.histplot(features[col].dropna())
fig.tight_layout(pad=1.0)

Most of the features appear to follow a roughly normal distribution; apart from Solids, nothing seems skewed.

In [None]:
# Box plot of every feature column, drawn from the rows that have no missing value.
complete_rows = data.drop('Potability', axis=1).dropna()
fig = plt.figure(figsize=(14,15))
for position, feature in enumerate(complete_rows.columns, start=1):
    plt.subplot(5, 2, position)
    sns.boxplot(y=feature, data=complete_rows)
fig.tight_layout(pad=1.0)

No extreme outliers can be seen in the boxplots.


In [None]:
# Annotated heat map of the pairwise correlations between the columns.
fig = plt.figure(figsize=(10,10))
correlations = data.corr()
sns.heatmap(correlations, annot=True, cmap='gray')

The features have very low correlation with the target variable, which suggests that no single feature has a strong linear effect on the target.

In [None]:
# Impute the three incomplete columns with their own column mean.
# The result is assigned back to the frame rather than calling
# fillna(inplace=True) on the selected column: that form works on an
# intermediate object, and pandas warns that it will stop updating `data`
# (under copy-on-write it already leaves `data` unchanged).
# NOTE(review): the means are computed on the full dataset, i.e. before the
# train/test split further down — confirm this is acceptable.
for col in ['ph', 'Sulfate', 'Trihalomethanes']:
    data[col] = data[col].fillna(data[col].mean())

I filled the missing values with the column means because these columns do not have a strong correlation with any other feature.

In [None]:
# Re-check for missing values: list only the columns that still contain any.
missing_counts = data.isnull().sum()
missing_counts[missing_counts > 0]

In [None]:
# Shuffle all rows. The fixed seed keeps the row order, and therefore every
# result derived from it, repeatable between runs.
data = data.sample(frac=1, random_state=42)

Shuffling the data.

In [None]:
# Separate the target column from the feature matrix.
target_column = 'Potability'
Y = data[target_column]
X = data.drop(columns=target_column)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows as the test split (seeded split).
split_parts = train_test_split(X, Y, random_state=42, test_size=0.25)
X_train, X_test, y_train, y_test = split_parts

Splitting the data.

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the features: the scaler is fitted on the training split only
# and the same fitted scaler is then applied to the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

Checking the results of different models.

In [None]:
# Gaussian naive Bayes: 5-fold cross-validation on the training split,
# printing the per-fold scores followed by their mean.
gnb = GaussianNB()
cv = cross_val_score(gnb, X_train, y_train, cv=5)
print(cv, cv.mean(), sep='\n')

In [None]:
# Logistic regression (iteration cap raised to 2000): 5-fold cross-validation
# on the training split, printing the per-fold scores followed by their mean.
lr = LogisticRegression(max_iter=2000)
cv = cross_val_score(lr, X_train, y_train, cv=5)
print(cv, cv.mean(), sep='\n')

In [None]:
# k-nearest neighbours with k=4: 5-fold cross-validation on the training
# split, printing the per-fold scores followed by their mean.
knn = KNeighborsClassifier(n_neighbors=4)
cv = cross_val_score(knn, X_train, y_train, cv=5)
print(cv, cv.mean(), sep='\n')

In [None]:
# Random forest with a fixed seed: 5-fold cross-validation on the training
# split, printing the per-fold scores followed by their mean.
rf = RandomForestClassifier(random_state=42)
cv = cross_val_score(rf, X_train, y_train, cv=5)
print(cv, cv.mean(), sep='\n')

In [None]:
# Support vector classifier with probability estimates enabled (the
# soft-voting ensemble built later needs them): 5-fold cross-validation on the
# training split, printing the per-fold scores followed by their mean.
svc = SVC(probability=True)
cv = cross_val_score(svc, X_train, y_train, cv=5)
print(cv, cv.mean(), sep='\n')

In [None]:
from xgboost import XGBClassifier
# XGBoost classifier with a fixed seed: 5-fold cross-validation on the
# training split, printing the per-fold scores followed by their mean.
xgb = XGBClassifier(random_state=1)
cv = cross_val_score(xgb, X_train, y_train, cv=5)
print(cv, cv.mean(), sep='\n')

In [None]:
from sklearn.ensemble import VotingClassifier
# Soft-voting ensemble over all six base models.
base_models = [
    ('lr', lr),
    ('knn', knn),
    ('rf', rf),
    ('gnb', gnb),
    ('svc', svc),
    ('xgb', xgb),
]
voting_clf = VotingClassifier(estimators=base_models, voting='soft')

In [None]:
# 5-fold cross-validation of the voting ensemble on the training split,
# printing the per-fold scores followed by their mean.
cv = cross_val_score(voting_clf, X_train, y_train, cv=5)
print(cv, cv.mean(), sep='\n')

In [None]:
from sklearn.metrics import accuracy_score
# Fit the voting ensemble on the training split and score it on the held-out
# test split.
voting_clf.fit(X_train, y_train)
y_pred_vc_soft = voting_clf.predict(X_test).astype(int)
# Arguments follow the documented (y_true, y_pred) order of accuracy_score.
accuracy_score(y_test, y_pred_vc_soft)

In [None]:
# Soft-voting ensemble restricted to the random forest, SVC and XGBoost models.
selected_models = [('rf', rf), ('svc', svc), ('xgb', xgb)]
voting_clf = VotingClassifier(estimators=selected_models, voting='soft')

In [None]:
# 5-fold cross-validation of the three-model voting ensemble on the training
# split, printing the per-fold scores followed by their mean.
cv = cross_val_score(voting_clf, X_train, y_train, cv=5)
print(cv, cv.mean(), sep='\n')

In [None]:
from sklearn.model_selection import RandomizedSearchCV

In [None]:
def performance(classifier, model_name):
    """Print a short report for a fitted hyper-parameter search.

    Writes three lines to stdout: `model_name`, the search's best score and
    its best parameter set.

    Args:
        classifier: object exposing `best_score_` and `best_params_`.
        model_name: label printed as the first line of the report.
    """
    print(model_name)
    print(f'Best Score :{classifier.best_score_!s}')
    print(f'Best Parameters :{classifier.best_params_!s}')

We cannot use GridSearchCV because it would take too long, so we use RandomizedSearchCV instead.

In [None]:
# Randomised hyper-parameter search for the support vector classifier.
svc = SVC(probability = True)
# One dictionary per kernel, so only the parameters listed for that kernel are
# sampled together with it.
param_grid = [{'kernel': ['rbf'], 'gamma': [.1,.5,1,2,5,10],
               'C': [.1, 1, 10, 100, 1000]},
              {'kernel': ['linear'], 'C': [.1, 1, 10, 100, 1000]},
              {'kernel': ['poly'], 'degree' : [2,3,4,5], 'C': [.1, 1, 10, 100, 1000]}]
tuned_parameters = param_grid  # second name for the same list, kept in case another cell uses it
# random_state pins which candidates are sampled, so reruns evaluate the same ones.
clf_svc = RandomizedSearchCV(svc, param_grid, cv = 5, verbose = True, n_jobs = -1,
                             random_state = 42)
best_clf_svc = clf_svc.fit(X_train, y_train)
performance(best_clf_svc,'SVC')

In [None]:
# The search was created with the default refit=True, so best_estimator_ has
# already been refitted on the whole training split; fitting it again would
# only repeat that work.
best_svc = best_clf_svc.best_estimator_

In [None]:
# Accuracy of the tuned SVC on the held-out test split.
y_pred = best_svc.predict(X_test)
# Arguments follow the documented (y_true, y_pred) order of accuracy_score.
accuracy_score(y_test, y_pred)

In [None]:
# Randomised hyper-parameter search for the XGBoost classifier.
xgb = XGBClassifier(random_state = 42)

# Candidate values per parameter; single-element lists pin that parameter.
param_grid = {
    'n_estimators': [450,500,550],
    'colsample_bytree': [0.75,0.8,0.85],
    'max_depth': [None],
    'reg_alpha': [1],
    'reg_lambda': [2, 5, 10],
    'subsample': [0.55, 0.6, .65],
    'learning_rate':[0.5],
    'gamma':[.5,1,2],
    'min_child_weight':[0.01],
    'sampling_method': ['uniform']
}

# random_state pins which candidates are sampled, so reruns evaluate the same ones.
clf_xgb = RandomizedSearchCV(xgb, param_grid, cv = 5, verbose = True, n_jobs = -1,
                             random_state = 42)
best_clf_xgb = clf_xgb.fit(X_train,y_train)
performance(best_clf_xgb,'XGB')

In [None]:
# The search was created with the default refit=True, so best_estimator_ has
# already been refitted on the whole training split; no second fit is needed.
best_xgb = best_clf_xgb.best_estimator_
y_pred = best_xgb.predict(X_test)
# Arguments follow the documented (y_true, y_pred) order of accuracy_score.
accuracy_score(y_test, y_pred)

In [None]:
# Randomised hyper-parameter search for the random forest.
rf = RandomForestClassifier(random_state = 42)
param_grid = {
    'n_estimators': [400,450,500,550],
    'criterion': ['gini','entropy'],
    'bootstrap': [True],
    'max_depth': [15, 20, 25],
    # 'auto' is left out: for classifiers it was an alias of 'sqrt', and
    # scikit-learn releases from 1.3 on reject it.
    # NOTE(review): the integer 10 assumes X has at least 10 feature columns — TODO confirm.
    'max_features': ['sqrt', 10],
    'min_samples_leaf': [2,3],
    'min_samples_split': [2,3],
}

# random_state pins which candidates are sampled, so reruns evaluate the same ones.
clf_rf = RandomizedSearchCV(rf, param_grid, cv = 5, verbose = True, n_jobs = -1,
                            random_state = 42)
best_clf_rf = clf_rf.fit(X_train,y_train)
performance(best_clf_rf,'Random Forest')

In [None]:
# The search was created with the default refit=True, so best_estimator_ has
# already been refitted on the whole training split; no second fit is needed.
best_rf = best_clf_rf.best_estimator_
y_pred = best_rf.predict(X_test)
# Arguments follow the documented (y_true, y_pred) order of accuracy_score.
accuracy_score(y_test, y_pred)

In [None]:
# Hard-voting ensemble of the three tuned models.
tuned_models = [('SVC', best_svc), ('XGB', best_xgb), ('RF', best_rf)]
model = VotingClassifier(estimators=tuned_models, voting='hard')


In [None]:
# Train the hard-voting ensemble on the training split.
model.fit(X=X_train, y=y_train)

In [None]:
# Accuracy of the hard-voting ensemble on the held-out test split.
y_pred = model.predict(X_test)
# Arguments follow the documented (y_true, y_pred) order of accuracy_score.
accuracy_score(y_test, y_pred)

The final output is a bit unexpected, as I got an accuracy of 70% on my local machine.

It is interesting to note that our untuned voting classifier with soft voting gives a slightly better result. Thanks for reading my notebook; leave an upvote if you find it helpful.