# **DRINKING WATER POTABILITY - EDA and PREDICTION**

# LOADING REQUIRED LIBRARIES

In [None]:
####################### Loading the required libraries ###############################

import numpy as np
import pandas as pd

# Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

# Avoid Warnings
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

#Common model helpers

from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, confusion_matrix, r2_score, accuracy_score
from sklearn.model_selection import (GridSearchCV, KFold, train_test_split, cross_val_score)

from imblearn.over_sampling import SMOTE
from collections import Counter

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn import svm
from xgboost.sklearn import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from catboost import CatBoostClassifier


In [None]:
# ---- Load the water-potability CSV into a DataFrame ----
csvPath = '../input/water-potability/water_potability.csv'
waterDf = pd.read_csv(csvPath)

# ---- Keep the raw frame untouched; the cells below operate on this copy ----
waterData = waterDf.copy()

# INFORMATION ON WATER-POTABILITY DATA

In [None]:
############################# About the data ###################################

# Report the size of the frame: number of rows and number of columns.
nRows, nCols = waterData.shape
print('The water-potability file has')
print('   Rows      Columns')
print('   {}         {}\n'.format(nRows, nCols))

# DataFrame.info() writes its summary to stdout itself and returns None, so it is
# called directly; wrapping it in print() would emit a stray "None" after the summary.
waterData.info()


All 10 variables of the data are numerical. The target variable takes binary values 0 and 1. 
The feature variables are real numbers.

In [None]:
# Summary statistics of the columns.
print('Information about features\n')
# Leave the frame as the cell's last expression so the notebook renders it as a
# rich table instead of the plain-text dump produced by print().
waterData.describe()

In [None]:
################################ How does the data look like? ############################
print('How does the water-potability data look like?\n')
# The first rows are left as the cell's last expression so the notebook renders
# them as a rich table rather than as print()'s plain-text output.
waterData.head()

In [None]:
# ---- Missing-value audit: count the NaN entries in each column ----
missingPerColumn = waterData.isna().sum()

print('There are missing values within the data.\n')
print('The nature of the missing values within the features are as follows:\n')
print(missingPerColumn)

There are 491, 781 and 162 missing values in 'ph', 'Sulfate' and 'Trihalomethanes' respectively. 
We shall impute the missing values of each feature with the mean of that feature, computed separately for each level of 'Potability'.

In [None]:
############ Imputing 'ph', 'Sulfate' and 'Trihalomethanes' with the class-wise mean ############

# NOTE(review): the fill value depends on the target ('Potability') and is computed on
# the whole data set, before the train/test split made later in the notebook. Label
# information from rows that end up in the test set therefore reaches the features.
# Consider a label-independent imputation fitted on the training part only.
colsToImpute = ['ph', 'Sulfate', 'Trihalomethanes']

for col in colsToImpute:
    # Iterate over whatever class labels are present instead of hard-coding 0 and 1.
    for label in waterData['Potability'].dropna().unique():
        inClass = waterData['Potability'] == label
        # Mean of the observed (non-missing) values of this feature within the class.
        classMean = waterData.loc[inClass, col].mean(skipna=True)
        # Fill only the rows of this class whose value is missing.
        waterData.loc[inClass & waterData[col].isna(), col] = classMean


In [None]:
# ---- Sanity check: per-column NaN counts after the imputation step ----
print('Checking to see any more missing data \n')
naAfterImpute = waterData.isna().sum()
naAfterImpute

In [None]:
# ---- Store the target column as a pandas categorical, then show the dtype summary ----
potabilityAsCategory = pd.Categorical(waterData['Potability'])
waterData['Potability'] = potabilityAsCategory
waterData.info()

# EXPLORING DATA THROUGH VISUALS

In [None]:
print('Distribution of Target Variable within the sample data')

# Compute the class counts once so that the bars, the pie wedges and the pie labels
# all share one ordering.
classCounts = waterData['Potability'].value_counts()
classColors = ['orange', 'steelblue']

fig, ax = plt.subplots(ncols=2, nrows=1, figsize=(16, 6))

# Left panel: bar chart of the class counts.
classCounts.plot(kind='bar', color=classColors, rot=0, ax=ax[0])
# Write each bar's count just below its top edge.
for bar in ax[0].patches:
    barHeight = bar.get_height()
    # int() guards against the height being reported as a float, which the 'd'
    # format code rejects.
    ax[0].annotate(format(int(barHeight), 'd'),
                   (bar.get_x() + bar.get_width() / 2, barHeight),
                   ha='center', va='center', size=15, xytext=(0, -10),
                   textcoords='offset points')
ax[0].tick_params(left=False, labelleft=False)
ax[0].xaxis.set_tick_params(labelsize=20)

# Right panel: pie chart of the same counts.
# The labels are taken from the index of the counts themselves: unique() lists the
# classes in order of first appearance, which need not match the descending-count
# order of value_counts() and could attach a label to the wrong wedge.
labels = list(classCounts.index)
## use the wedgeprops and textprops arguments to style the wedges and texts, respectively
ax[1].pie(classCounts, labels=labels, autopct='%1.1f%%',
          colors=classColors, explode=[0.005] * len(labels),
          textprops={'size': 'x-large'},
          wedgeprops={'linewidth': 3.0, 'edgecolor': 'white'})

plt.show()

There are 1998 data with Potability=1 and 1278 with Potability=0. 
Hence the data is imbalanced. We shall balance the data through SMOTE.

Let's look at the correlation matrix of the features.

In [None]:
######################################### Correlation Matrix #############################################

# Correlate the feature columns only. 'Potability' has been converted to a categorical
# earlier in the notebook, and whether DataFrame.corr() drops or converts a non-numeric
# column depends on the pandas version (the numeric_only default changed), so the target
# is excluded explicitly to get the same feature-by-feature matrix everywhere.
Corrmat = waterData.drop(columns='Potability').corr()

# Draw the heatmap on an explicitly created axes so the requested figure size applies to it.
corrFig, corrAx = plt.subplots(figsize=(7, 7))
sns.heatmap(Corrmat, cmap="YlGnBu", square=True, annot=True, fmt='.2f', ax=corrAx)
plt.show()

The Correlation graph shows absence of multicollinearity. 

In [None]:
print('Boxplot and density distribution of different features by Potability\n')

features = list(waterData.columns.drop('Potability'))
classPalette = ('indianred', 'steelblue')

# One row of axes per feature (density on the left, boxplot on the right). The row count
# follows the number of features instead of a hard-coded 9; squeeze=False keeps `ax`
# two-dimensional even if only one feature is present.
fig, ax = plt.subplots(ncols=2, nrows=len(features), figsize=(14, 28), squeeze=False)

for i, cols in enumerate(features):
    # Left column: stacked kernel density estimate, split by Potability.
    sns.kdeplot(waterData[cols], fill=True, alpha=0.4, hue=waterData.Potability,
                palette=classPalette, multiple='stack', ax=ax[i, 0])

    # Right column: boxplot of the same feature for each Potability level.
    sns.boxplot(data=waterData, y=cols, x='Potability', ax=ax[i, 1],
                palette=classPalette)

    # The feature name is shown once, as the y-label of the left panel; the remaining
    # axis labels are blanked out.
    ax[i, 0].set_xlabel(' ')
    ax[i, 1].set_xlabel(' ')
    ax[i, 1].set_ylabel(' ')
    ax[i, 1].xaxis.set_tick_params(labelsize=14)
    ax[i, 0].tick_params(left=False, labelleft=False)
    ax[i, 0].set_ylabel(cols, fontsize=16)

plt.show()

From the kdeplots there seems to be very little difference in the mean values of the features between the levels of Potability.

In [None]:
print('Correlation of Potability with feature variables:')
features = list(waterData.columns.drop('Potability'))

# 'Potability' is stored as a categorical by this point. Convert it to plain integers
# so the correlation is computed on numeric data, rather than relying on pandas to
# coerce a categorical operand inside Series.corr().
potabilityNumeric = waterData['Potability'].astype(int)

# Correlation of every feature with the numeric target.
Corr = [waterData[cols].corr(potabilityNumeric) for cols in features]

# Rank the features by the magnitude of their correlation with the target (weakest first).
corrDf = pd.DataFrame({'Features': features, 'Corr': Corr})
corrDf['Corr'] = corrDf['Corr'].abs()
corrDf.sort_values(by='Corr', ascending=True)


# PREPARING THE DATA FOR MODELLING

In [None]:
##################### Preparing the Data for Modelling ######################

# Fixed seed so that the split and the synthetic samples are identical on every run.
SEED = 42

X = waterData.drop('Potability', axis=1).copy()
y = waterData['Potability'].copy()

############################# Train-Test split ############################
# stratify=y keeps the class proportions the same in the training and test parts;
# random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=SEED)

########################## Synthetic OverSampling ###########################
# Only the training part is resampled; the test part is left as drawn by the split.
print('Balancing the data by SMOTE - Oversampling of Minority level\n')
smt = SMOTE(random_state=SEED)
counter = Counter(y_train)
print('Before SMOTE', counter)
X_train, y_train = smt.fit_resample(X_train, y_train)
counter = Counter(y_train)
print('\nAfter SMOTE', counter)

################################# Scaling #################################
# The scaler is fitted on the training part only and then applied to the test part.
ssc = StandardScaler()

X_train = ssc.fit_transform(X_train)
X_test = ssc.transform(X_test)

# Filled by the per-model report cells further down.
modelAccuracy = list()

# DATA MODELLING

We shall be using the following algorithms to model the data and then find the accuracy on test data.

'LogisticRegression', 'DecisionTreeClassifier', 'NaiveBayesClassifier', 'RandomForestClassifier',
'ExtraTreesClassifier', 'SupportVectorClassifier', 'xgbClassifier', 'CatBoostClassifier'.

In [None]:
# Seed shared by the stochastic estimators and the CV shuffling (7 is the value already
# used for KFold), so the comparison table is the same on every run.
modelSeed = 7

# Candidate classifiers. CatBoost is created with verbose=False, as in the dedicated
# CatBoost cell further down, so its per-iteration log does not flood the output
# during the cross-validation fits and the final fit.
model = [LogisticRegression(),
         DecisionTreeClassifier(random_state=modelSeed),
         GaussianNB(),
         RandomForestClassifier(random_state=modelSeed),
         ExtraTreesClassifier(random_state=modelSeed),
         svm.LinearSVC(random_state=modelSeed),
         XGBClassifier(random_state=modelSeed),
         CatBoostClassifier(verbose=False, random_seed=modelSeed)]
trainAccuracy = list()
testAccuracy = list()
kfold = KFold(n_splits=10, random_state=modelSeed, shuffle=True)

for mdl in model:
    # Mean 10-fold CV accuracy on the training data.
    # NOTE(review): X_train/y_train were SMOTE-resampled before this point, so a
    # validation fold can contain synthetic points derived from rows in the training
    # folds; treat this figure as optimistic.
    trainResult = cross_val_score(mdl, X_train, y_train, scoring='accuracy', cv=kfold)
    trainAccuracy.append(trainResult.mean())

    # Refit on the full training data and score once on the held-out test set.
    mdl.fit(X_train, y_train)
    y_pred = mdl.predict(X_test)
    testResult = metrics.accuracy_score(y_test, y_pred)
    testAccuracy.append(testResult)

# RESULTS

In [None]:
print('The comparision\n')
# Label each row with the estimator's class name. Storing the fitted estimator objects
# themselves in the column would render as long reprs / memory addresses in the table.
modelNames = [type(mdl).__name__ for mdl in model]
modelScore = pd.DataFrame({'Model': modelNames,
                           'Train_Accuracy': trainAccuracy,
                           'Test_Accuracy': testAccuracy})
modelScore

After applying 10-fold cross-validation, we get more than 78% accuracy with RandomForestClassifier, CatBoostClassifier and XGBClassifier.

Let's look at the confusion matrix for each one of them

In [None]:
########################################## RandomForestClassfier #############################
print('Random Forest Classifier\n')
Rfc = RandomForestClassifier()
Rfc.fit(X_train, y_train)

# Predict on the held-out test set and print the per-class precision/recall/F1 report.
y_Rfc = Rfc.predict(X_test)
print(metrics.classification_report(y_test, y_Rfc))

# list.append() returns None, so the score is computed first, recorded, and then the
# value itself is printed (printing the append call would only show "None").
rfcAccuracy = metrics.accuracy_score(y_test, y_Rfc)
modelAccuracy.append(rfcAccuracy)
print(rfcAccuracy)

# Confusion matrix on the test set, annotated with integer counts.
sns.heatmap(confusion_matrix(y_test, y_Rfc), annot=True, fmt='d')
plt.show()

In [None]:
#################################### XGB Classifier() #######################
print('XGB Classifier\n')
xgb = XGBClassifier()
xgb.fit(X_train, y_train)

# Predict on the held-out test set and print the per-class precision/recall/F1 report.
y_xgb = xgb.predict(X_test)
print(metrics.classification_report(y_test, y_xgb))

# list.append() returns None, so the score is computed first, recorded, and then the
# value itself is printed (printing the append call would only show "None").
xgbAccuracy = metrics.accuracy_score(y_test, y_xgb)
modelAccuracy.append(xgbAccuracy)
print(xgbAccuracy)

# Confusion matrix on the test set, annotated with integer counts.
sns.heatmap(confusion_matrix(y_test, y_xgb), annot=True, fmt='d')
plt.show()


In [None]:
#################################### CatBoostClassifier() #######################
print('CatBoostClassifier\n')
# verbose=False silences CatBoost's per-iteration training log.
cat = CatBoostClassifier(verbose=False)
cat.fit(X_train, y_train)

# Predict on the held-out test set and print the per-class precision/recall/F1 report.
y_cat = cat.predict(X_test)
print(metrics.classification_report(y_test, y_cat))

# list.append() returns None, so the score is computed first, recorded, and then the
# value itself is printed (printing the append call would only show "None").
catAccuracy = metrics.accuracy_score(y_test, y_cat)
modelAccuracy.append(catAccuracy)
print(catAccuracy)

# Confusion matrix on the test set, annotated with integer counts.
sns.heatmap(confusion_matrix(y_test, y_cat), annot=True, fmt='d')
plt.show()


# CONCLUSION

CatBoostClassifier is giving a good accuracy score of 81%.
The accuracy score can be further increased by fine tuning of model hyperparameters.
I leave that to you to proceed further from here after forking the kernel.
Please upvote if you find this kernel useful and leave a comment.
Thank You for scrolling