In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the water-potability CSV into `df`, the frame every later cell works on.
df=pd.read_csv('../input/water-potability/water_potability.csv')

In [None]:
# Preview the first rows of the loaded data.
df.head()

In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

In [None]:
# Column dtypes and non-null counts; a first look at where values are missing.
df.info()

In [None]:
# Class balance: number of samples for each Potability value.
d = df['Potability'].value_counts().to_frame()
d

In [None]:
# Visualizations Libraries
import matplotlib.pyplot as plt
import seaborn as sns
import plotly
import plotly.offline as pyo
import plotly.express as px
import plotly.graph_objs as  go
# Initialise plotly's offline notebook mode before any figure is shown.
pyo.init_notebook_mode()
import plotly.figure_factory as ff
import missingno as msno

#### Deal with Missing Values

In [None]:
# missingno matrix of `df`, used here to eyeball which columns contain gaps.
fig = msno.matrix(df)

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Summary statistics for the non-potable samples (Potability == 0).
df.loc[df['Potability'].eq(0)].describe()

In [None]:
# Summary statistics for the potable samples (Potability == 1).
df.loc[df['Potability'].eq(1)].describe()

We see that ph, Sulfate and Trihalomethanes have missing values; judging by their statistical distributions, we can impute them with their median values.

### Plotly Visualizations

In [None]:
# Shared colour palettes (dark -> light) reused by the plotly figures below.
colors_blue = ['#132C33', '#264D58', '#17869E', '#51C4D3', '#B4DBE9']
colors_dark = ['#1F1F1F', '#313131', '#636363', '#AEAEAE', '#DADADA']
colors_green = ['#01411C', '#4B6F44', '#4F7942', '#74C365', '#D0F0C0']

# Render a swatch strip for each palette.
for palette in (colors_blue, colors_green, colors_dark):
    sns.palplot(palette)

In [None]:
# Column names available for plotting.
df.columns

In [None]:
# Count samples per class under explicit, human-readable labels.
# The label and count columns are built by name so the chart does not depend on
# how value_counts() names its output (this differs between pandas versions)
# or on which class happens to be the more frequent one.
potability_labels = {0: 'Not Potable', 1: 'Potable'}
d = (
    df['Potability']
    .map(potability_labels)
    .value_counts()
    .reindex(list(potability_labels.values()), fill_value=0)
    .rename_axis('label')
    .reset_index(name='No. Of Samples')
)

# Donut chart of the class balance.
fig = px.pie(d, values='No. Of Samples', names='label', hole=0.4, opacity=0.7,
             color_discrete_sequence=[colors_green[2], colors_blue[2]],
             labels={'label': 'Potability'})

fig.add_annotation(text='**Resampling of data can be done to get a balanced dataset',
                   x=1.3, y=0.6, showarrow=False, font_size=10)
# Caption placed in the hole of the donut.
fig.add_annotation(text='Potability',
                   x=0.5, y=0.5, showarrow=False)

fig.update_layout(title=dict(text='How many samples of water are Potable?', x=0.47, y=0.95,
                             font=dict(color=colors_dark[2], size=20)))

fig.update_traces(textposition='outside', textinfo='percent+label')
fig.show()

### Hardness

In [None]:
from collections import Counter  # not used in this cell; kept so any cell that references Counter still runs

# Hardness distribution per Potability class, with a marginal box plot.
# histfunc='count' counts rows per x-bin, so no `y` column is needed. A
# Counter holds one entry per *distinct* value, so passing it as `y` only
# lines up with the rows of df while every value is unique.
fig = px.histogram(df, x='Hardness', color='Potability', template='plotly_white',
                   marginal='box', opacity=0.7, nbins=100,
                   color_discrete_sequence=[colors_green[2], colors_blue[3]],
                   barmode='group', histfunc='count')

# Dotted guides at the hardness class boundaries described by the annotations (mg/L).
for boundary in (76, 151, 301):
    fig.add_vline(x=boundary, line_width=1, line_color=colors_dark[1], line_dash='dot', opacity=0.7)

fig.add_annotation(text='<76 mg/L is<br> considered soft', x=40, y=130, showarrow=False, font_size=9)
fig.add_annotation(text='Between 76 and 150<br> (mg/L) is<br>moderately hard', x=113, y=130, showarrow=False, font_size=9)
fig.add_annotation(text='Between 151 and 300 (mg/L)<br> is considered hard', x=250, y=130, showarrow=False, font_size=9)
fig.add_annotation(text='>300 mg/L is<br> considered very hard', x=320, y=130, showarrow=False, font_size=9)

fig.update_layout(
    title=dict(text='Hardness Distribution', x=0.53, y=0.95,
               font=dict(color=colors_dark[2], size=20)),
    xaxis_title_text='Hardness (mg/L)',
    yaxis_title_text='Count',
    legend=dict(x=1, y=0.96, bordercolor=colors_dark[4], borderwidth=0, tracegroupgap=5),
    bargap=0.3,
)

fig.show()

### pH

In [None]:
# pH distribution per Potability class, with a marginal box plot.
# histfunc='count' counts rows per x-bin, so no `y` column is needed. A
# Counter holds one entry per *distinct* value, so passing it as `y` only
# lines up with the rows of df while every value is unique.
fig = px.histogram(df, x='ph', color='Potability', template='plotly_white',
                   marginal='box', opacity=0.7, nbins=100,
                   color_discrete_sequence=[colors_green[2], colors_blue[3]],
                   barmode='group', histfunc='count')

# Dotted guide at neutral pH.
fig.add_vline(x=7, line_width=1, line_color=colors_dark[1], line_dash='dot', opacity=0.7)

fig.add_annotation(text='<7 ph is acidic', x=5, y=70, showarrow=False, font_size=10)
fig.add_annotation(text='>7 ph is basic', x=9, y=70, showarrow=False, font_size=10)

fig.update_layout(
    title=dict(text='pH Distribution', x=0.53, y=0.95,
               font=dict(color=colors_dark[2], size=20)),
    xaxis_title_text='pH Level',
    yaxis_title_text='Count',
    legend=dict(x=1, y=0.96, bordercolor=colors_dark[4], borderwidth=0, tracegroupgap=5),
    bargap=0.3,
)

fig.show()

### TDS - Total Dissolved Solids

In [None]:
# Total dissolved solids distribution per Potability class, with a marginal box plot.
# histfunc='count' counts rows per x-bin, so no `y` column is needed. A
# Counter holds one entry per *distinct* value, so passing it as `y` only
# lines up with the rows of df while every value is unique.
fig = px.histogram(df, x='Solids', color='Potability', template='plotly_white',
                   marginal='box', opacity=0.7, nbins=100,
                   color_discrete_sequence=[colors_green[2], colors_blue[3]],
                   barmode='group', histfunc='count')

fig.update_layout(
    title=dict(text='TDS Distribution', x=0.53, y=0.95,
               font=dict(color=colors_dark[2], size=20)),
    xaxis_title_text='Dissolved solids in ppm',
    yaxis_title_text='Count',
    legend=dict(x=1, y=0.96, bordercolor=colors_dark[4], borderwidth=0, tracegroupgap=5),
    bargap=0.3,
)

fig.show()

In [None]:
# Re-display the first rows as a reminder of the remaining columns to plot.
df.head()

### Chloramines

In [None]:
# Chloramines distribution per Potability class, with a marginal box plot.
# histfunc='count' counts rows per x-bin, so no `y` column is needed. A
# Counter holds one entry per *distinct* value, so passing it as `y` only
# lines up with the rows of df while every value is unique.
fig = px.histogram(df, x='Chloramines', color='Potability', template='plotly_white',
                   marginal='box', opacity=0.7, nbins=100,
                   color_discrete_sequence=[colors_green[2], colors_blue[3]],
                   barmode='group', histfunc='count')

# Dotted guide at the threshold quoted in the annotation.
fig.add_vline(x=4, line_width=1, line_color=colors_dark[1], line_dash='dot', opacity=0.7)

fig.add_annotation(text='Anything <4 is considered <br>safe for drinking', x=2.9, y=70, showarrow=False, font_size=10)

fig.update_layout(
    title=dict(text='Chloramines Distribution', x=0.53, y=0.95,
               font=dict(color=colors_dark[2], size=20)),
    xaxis_title_text='Chloramines(ppm)',
    yaxis_title_text='Count',
    legend=dict(x=1, y=0.96, bordercolor=colors_dark[4], borderwidth=0, tracegroupgap=5),
    bargap=0.3,
)

fig.show()

### Sulfate

In [None]:
# Sulfate distribution per Potability class, with a marginal box plot.
# histfunc='count' counts rows per x-bin, so no `y` column is needed. A
# Counter holds one entry per *distinct* value, so passing it as `y` only
# lines up with the rows of df while every value is unique.
fig = px.histogram(df, x='Sulfate', color='Potability', template='plotly_white',
                   marginal='box', opacity=0.7, nbins=100,
                   color_discrete_sequence=[colors_green[2], colors_blue[3]],
                   barmode='group', histfunc='count')

# Dotted guide at the threshold quoted in the annotation.
fig.add_vline(x=250, line_width=1, line_color=colors_dark[1], line_dash='dot', opacity=0.7)

fig.add_annotation(text='<250 mg/L is considered <br>safe for drinking', x=220, y=70, showarrow=False, font_size=10)

fig.update_layout(
    title=dict(text='Sulfates Distribution', x=0.53, y=0.95,
               font=dict(color=colors_dark[2], size=20)),
    xaxis_title_text='Sulfates(mg/L)',
    yaxis_title_text='Count',
    legend=dict(x=1, y=0.96, bordercolor=colors_dark[4], borderwidth=0, tracegroupgap=5),
    bargap=0.3,
)

fig.show()

### Conductivity

In [None]:
# Conductivity distribution per Potability class, with a marginal box plot.
# histfunc='count' counts rows per x-bin, so no `y` column is needed. A
# Counter holds one entry per *distinct* value, so passing it as `y` only
# lines up with the rows of df while every value is unique.
fig = px.histogram(df, x='Conductivity', color='Potability', template='plotly_white',
                   marginal='box', opacity=0.7, nbins=100,
                   color_discrete_sequence=[colors_green[2], colors_blue[3]],
                   barmode='group', histfunc='count')

fig.add_annotation(text='The Conductivity range <br> is safe for both (200-800),<br> Potable and Non-Potable water',
                   x=220, y=70, showarrow=False, font_size=10)

fig.update_layout(
    title=dict(text='Conductivity Distribution', x=0.53, y=0.95,
               font=dict(color=colors_dark[2], size=20)),
    xaxis_title_text='Conductivity (μS/cm)',
    yaxis_title_text='Count',
    legend=dict(x=1, y=0.96, bordercolor=colors_dark[4], borderwidth=0, tracegroupgap=5),
    bargap=0.3,
)

fig.show()

### Organic Carbon

In [None]:
# Organic carbon distribution per Potability class, with a marginal box plot.
# histfunc='count' counts rows per x-bin, so no `y` column is needed. A
# Counter holds one entry per *distinct* value, so passing it as `y` only
# lines up with the rows of df while every value is unique.
fig = px.histogram(df, x='Organic_carbon', color='Potability', template='plotly_white',
                   marginal='box', opacity=0.7, nbins=100,
                   color_discrete_sequence=[colors_green[2], colors_blue[3]],
                   barmode='group', histfunc='count')

# Dotted guide at the level quoted in the annotation.
fig.add_vline(x=10, line_width=1, line_color=colors_dark[1], line_dash='dot', opacity=0.7)

fig.add_annotation(text='Typical Organic Carbon Level<br> is upto 10 ppm',
                   x=7, y=70, showarrow=False, font_size=10)

fig.update_layout(
    title=dict(text='Organic Carbon Distribution', x=0.53, y=0.95,
               font=dict(color=colors_dark[2], size=20)),
    xaxis_title_text='Organic Carbon(ppm)',
    yaxis_title_text='Count',
    legend=dict(x=1, y=0.96, bordercolor=colors_dark[4], borderwidth=0, tracegroupgap=5),
    bargap=0.3,
)

fig.show()

### Trihalomethanes

In [None]:
# Column names, re-displayed to look up the exact spelling of the remaining features.
df.columns

In [None]:
# Trihalomethanes distribution per Potability class, with a marginal box plot.
# histfunc='count' counts rows per x-bin, so no `y` column is needed. A
# Counter holds one entry per *distinct* value, so passing it as `y` only
# lines up with the rows of df while every value is unique.
fig = px.histogram(df, x='Trihalomethanes', color='Potability', template='plotly_white',
                   marginal='box', opacity=0.7, nbins=100,
                   color_discrete_sequence=[colors_green[2], colors_blue[3]],
                   barmode='group', histfunc='count')

# Dotted guide at the upper limit quoted in the annotation.
fig.add_vline(x=80, line_width=1, line_color=colors_dark[1], line_dash='dot', opacity=0.7)

fig.add_annotation(text='Upper limit of Trihalomethanes<br> is 80 μg/L',
                   x=93, y=70, showarrow=False, font_size=10)

fig.update_layout(
    title=dict(text='Trihalomethanes', x=0.53, y=0.95,
               font=dict(color=colors_dark[2], size=20)),
    # Axis label without the stray closing parenthesis.
    xaxis_title_text='Trihalomethanes (μg/L)',
    yaxis_title_text='Count',
    legend=dict(x=1, y=0.96, bordercolor=colors_dark[4], borderwidth=0, tracegroupgap=5),
    bargap=0.3,
)

fig.show()

### Turbidity

In [None]:
# Turbidity distribution per Potability class, with a marginal box plot.
# histfunc='count' counts rows per x-bin, so no `y` column is needed. A
# Counter holds one entry per *distinct* value, so passing it as `y` only
# lines up with the rows of df while every value is unique.
fig = px.histogram(df, x='Turbidity', color='Potability', template='plotly_white',
                   marginal='box', opacity=0.7, nbins=100,
                   color_discrete_sequence=[colors_green[2], colors_blue[3]],
                   barmode='group', histfunc='count')

# Dotted guide at the threshold quoted in the annotation.
fig.add_vline(x=5, line_width=1, line_color=colors_dark[1], line_dash='dot', opacity=0.7)

fig.add_annotation(text='<5 NTU Turbidity is considered safe',
                   x=5.8, y=70, showarrow=False, font_size=10)

fig.update_layout(
    title=dict(text='Turbidity Distribution', x=0.53, y=0.95,
               font=dict(color=colors_dark[2], size=20)),
    # Axis label without the stray closing parenthesis.
    xaxis_title_text='Turbidity (NTU)',
    yaxis_title_text='Count',
    legend=dict(x=1, y=0.96, bordercolor=colors_dark[4], borderwidth=0, tracegroupgap=5),
    bargap=0.3,
)

fig.show()

### Scatter Plot Matrix b/w Features

In [None]:
# Pairwise scatter plots of every feature, coloured and marked by Potability.
# `dimensions` is given as an explicit list of column names (every column
# except the target) rather than a whole DataFrame.
feature_columns = [column for column in df.columns if column != 'Potability']

fig = px.scatter_matrix(df, dimensions=feature_columns, height=1250, width=1250,
                        template='plotly_white', opacity=0.7,
                        color_discrete_sequence=[colors_blue[3], colors_green[3]], color='Potability',
                        symbol='Potability', color_continuous_scale=[colors_green[3], colors_blue[3]])

fig.update_layout(font_size=10,
                  coloraxis_showscale=False,
                  legend=dict(x=0.02, y=1.07, bgcolor=colors_dark[4]),
                  title=dict(text='Scatter Plot Matrix b/w Features', x=0.5, y=0.97,
                             font=dict(color=colors_dark[2], size=24)))
fig.show()

- There is not much correlation between features.
- We can plot a heatmap to confirm the correlation between features

In [None]:
# Pairwise correlation between the features (target column excluded).
feature_frame = df.drop(columns='Potability')
cor = feature_frame.corr()
cor

In [None]:
# Static heatmap of the feature correlations, with the coefficients annotated.
fig_corr, ax_corr = plt.subplots(figsize=(10, 6))
sns.heatmap(cor, annot=True, cmap='Greens', ax=ax_corr)
plt.show()

In [None]:
# Interactive version of the correlation heatmap, using the blue palette.
fig = px.imshow(
    cor,
    height=800,
    width=800,
    color_continuous_scale=colors_blue,
    template='plotly_white',
)

fig.update_layout(
    title={
        'text': 'Correlation Heatmap',
        'x': 0.5,
        'y': 0.93,
        'font': {'color': colors_dark[2], 'size': 24},
    },
    coloraxis_colorbar={'len': 0.85, 'x': 1.1},
)

fig.show()

### Data Preparation - Handling Missing Values

In [None]:
# Missing values per column, checked again right before imputation.
df.isna().sum()

In [None]:
# missingno matrix of `df`, shown again before the gaps are filled.
fig = msno.matrix(df)

In [None]:
# Summary statistics for every column before imputation.
df.describe()

In [None]:
# Summary statistics restricted to the three columns that contain gaps.
df.loc[:, ['ph', 'Sulfate', 'Trihalomethanes']].describe()

In [None]:
# Same three columns, non-potable samples only (Potability == 0).
df.loc[df['Potability'] == 0, ['ph', 'Sulfate', 'Trihalomethanes']].describe()

In [None]:
# Same three columns, potable samples only (Potability == 1).
df.loc[df['Potability'] == 1, ['ph', 'Sulfate', 'Trihalomethanes']].describe()

- We can clearly see that the mean and median are close in every case: for the entire dataset as well as for the Potable and Non-Potable subsets.
- Also the difference between mean/median for Potable vs Non potable water data is minimal.
- So we can replace the missing values here for 'ph','Sulfate','Trihalomethanes' with their medians for overall data.

In [None]:
# Fill the gaps in each incomplete column with that column's overall median.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on `df[col]`: that chained in-place form acts on an
# intermediate object and does not reliably update `df` on newer pandas.
for column in ('ph', 'Sulfate', 'Trihalomethanes'):
    df[column] = df[column].fillna(df[column].median())

In [None]:
# Summary statistics after imputation, for comparison with the earlier table.
df.describe()

In [None]:
# Missing values per column after imputation.
df.isna().sum()

### Data Standardization

In [None]:
# Data Pre-processing Libraries
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.model_selection import train_test_split

In [None]:
# Feature matrix (every column except the target) and target vector as NumPy arrays.
X = df.drop(columns='Potability').to_numpy()
y = df['Potability'].to_numpy()

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=101,
)

In [None]:
# Standardise the features: fit on the training split only, then apply the
# same transformation to both splits (overwriting X_train / X_test).
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

### Spotchecking for baselining top 5 classification models

In [None]:
#import modelling libraries
from sklearn.linear_model import LogisticRegression, RidgeClassifier, SGDClassifier, PassiveAggressiveClassifier, Perceptron
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier, VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC, NuSVC, SVC
from sklearn.neighbors import KNeighborsClassifier, NearestCentroid
from sklearn.naive_bayes import GaussianNB, BernoulliNB

#Evaluation and cross-validation libraries

from sklearn.metrics import precision_score, recall_score, accuracy_score, roc_auc_score
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, RepeatedStratifiedKFold

#warnings
from warnings import filterwarnings

In [None]:
# Candidate classifiers for the baseline spot check, as (label, estimator) pairs.
# Note on labels: "GB" is GaussianNB and "XGB" is scikit-learn's
# GradientBoostingClassifier.
models = [
    ("LR", LogisticRegression(max_iter=1000)),
    ("SVC", SVC()),
    ('KNN', KNeighborsClassifier(n_neighbors=10)),
    ('DTC', DecisionTreeClassifier()),
    ('GB', GaussianNB()),
    ("SGDC", SGDClassifier()),
    ("Perc", Perceptron()),
    ("NC", NearestCentroid()),
    ("Ridge", RidgeClassifier()),
    ("NuSVC", NuSVC()),
    ("BNB", BernoulliNB()),
    ('RF', RandomForestClassifier()),
    ('ADA', AdaBoostClassifier()),
    ('XGB', GradientBoostingClassifier()),
    ('PAC', PassiveAggressiveClassifier()),
]

In [None]:
# Fit every candidate on the training split and score it on the test split.
# NOTE: ROC AUC is computed here from hard class predictions (model.predict),
# not from probabilities or decision scores.
names = []
results = []

for name, model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    score = roc_auc_score(y_test, y_pred, average='macro')
    names.append(name)
    results.append(score)

# (label, score) pairs, best score first.
finalResults = sorted(zip(names, results), key=lambda pair: pair[1], reverse=True)

In [None]:
# (label, ROC AUC) pairs from the spot check, best first.
finalResults

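We can take Random Forest and Gradient Boosting (labelled 'XGB' above; it is scikit-learn's GradientBoostingClassifier, not XGBoost) forward and try to improve their scores with a cross-validated hyper-parameter search.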

In [None]:
# Hyper-parameter search spaces for the two shortlisted models.
# Each entry maps a label to the estimator to tune ('model') and the candidate
# values per hyper-parameter ('params'). The 'XGB' label refers to
# scikit-learn's GradientBoostingClassifier.
model_params = {
    'XGB':
    {
        'model': GradientBoostingClassifier(),
        'params':
        {
            'learning_rate': [0.0001, 0.001, 0.01, 0.1],
            'n_estimators': [100, 200, 500, 1000],
            'max_features': ['sqrt', 'log2'],
            'max_depth': list(range(1, 11))
        }
    },
    'Random Forest':
    {
        'model': RandomForestClassifier(),
        'params':
        {
            'n_estimators': [10, 50, 100, 200],
            # 'auto' is intentionally not listed: for classifiers it was an
            # alias of 'sqrt' and newer scikit-learn releases reject it.
            'max_features': ['sqrt', 'log2'],
            'max_depth': list(range(1, 11))
        }
    }
}

In [None]:
# Randomised hyper-parameter search for each shortlisted model.
# The CV splitter and the parameter sampler are both seeded (with the same
# seed as the train/test split) so folds and sampled candidates are repeatable.
# The estimators themselves are still unseeded, so scores can vary slightly.
# NOTE(review): the search is fitted on the full X, y, i.e. including rows that
# are later used for evaluation, so downstream scores are optimistic.
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=2, random_state=101)
scores = []
for model_name, params in model_params.items():
    rs = RandomizedSearchCV(params['model'], params['params'], cv=cv, n_iter=20,
                            random_state=101)
    rs.fit(X, y)
    # Keep the label, the best parameter set and its mean CV score.
    scores.append([model_name, dict(rs.best_params_), rs.best_score_])

# One row per tuned model; the final-model cell reads the 'Parameters' column.
data = pd.DataFrame(scores, columns=['Model', 'Parameters', 'Score'])
data

### Final Model

In [None]:
# Best parameter sets found by the search, in the order the models were tuned
# (row 0: gradient boosting, row 1: random forest).
param = data['Parameters']

# Hard-voting ensemble of the two tuned models.
model = VotingClassifier(
    estimators=[('XGB', GradientBoostingClassifier(**param.iloc[0])),
                ('RF', RandomForestClassifier(**param.iloc[1]))],
    voting='hard')

# Per-fold metrics, one entry per CV split.
accuracy = []
precision = []
recall = []
auc_roc = []

cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=2, random_state=101)

for train_idx, test_idx in cv.split(X, y):
    # Slice this fold out of the full (unscaled) data. Fold-local names are
    # used so the global X_train / X_test hold-out split is left untouched.
    X_fold_train, X_fold_test = X[train_idx], X[test_idx]
    y_fold_train, y_fold_test = y[train_idx], y[test_idx]

    # Fit a fresh scaler on this fold's training rows only, so no statistics
    # from the fold's test rows leak into the preprocessing.
    fold_scaler = StandardScaler().fit(X_fold_train)
    X_fold_train = fold_scaler.transform(X_fold_train)
    X_fold_test = fold_scaler.transform(X_fold_test)

    model.fit(X_fold_train, y_fold_train)
    y_pred = model.predict(X_fold_test)

    accuracy.append(accuracy_score(y_fold_test, y_pred))
    precision.append(precision_score(y_fold_test, y_pred))
    recall.append(recall_score(y_fold_test, y_pred))
    # Hard voting yields class labels only, so AUC is computed from labels.
    auc_roc.append(roc_auc_score(y_fold_test, y_pred))

In [None]:
# Mean accuracy over all iterations of the evaluation loop.
np.mean(accuracy)

In [None]:
# Mean precision over all iterations of the evaluation loop.
np.mean(precision)

In [None]:
# Mean recall over all iterations of the evaluation loop.
np.mean(recall)

In [None]:
# Individual recall values, one per loop iteration, to inspect their spread.
recall

In [None]:
# Mean ROC AUC over all iterations of the evaluation loop.
np.mean(auc_roc)

In [None]:
# Individual ROC AUC values, one per loop iteration, to inspect their spread.
auc_roc