![Header image](https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQehFJ44zrTaFbHED-bJWynmtC2ULo3Ukeekg&usqp=CAU)

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import missingno
import plotly.express as px
import plotly.graph_objs as go 
from plotly.offline import init_notebook_mode,iplot
from plotly.subplots import make_subplots
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,confusion_matrix
from sklearn.metrics import classification_report
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing  import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
%matplotlib inline
init_notebook_mode(connected=True) 
logmodel = LogisticRegression()

In [None]:
# Location of the water potability CSV inside the Kaggle input directory.
DATA_PATH = '../input/water-potability/water_potability.csv'
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Print column dtypes and non-null counts.
df.info()

In [None]:
# List the column names available in the frame.
df.columns

In [None]:
# Bar chart of per-column completeness.
# The highlight colour is derived from the data: a column is red when it has
# at least one missing value and grey otherwise. The bars are left in column
# order so each colour lines up with its own column (the previous fixed list
# had 11 entries for a 10-column frame and relied on a particular bar order).
color = ['red' if df[column].isna().any() else 'grey' for column in df.columns]
missingno.bar(df, fontsize=16, color=color, figsize=(16, 8))
plt.title("VISUALIZATION OF MISSING VALUES", fontsize=23)
plt.show()

In [None]:
# Replace missing values in ph, Sulfate and Trihalomethanes with the mean of
# the respective column, then print the remaining NaN count per column.
for column in ('ph', 'Sulfate', 'Trihalomethanes'):
    column_mean = df[column].mean()
    df[column] = df[column].fillna(column_mean)
print(df.isna().sum())

In [None]:
# Summary statistics with one row per feature; the count, median (50%) and
# standard deviation columns are shaded with a colour gradient.
summary_stats = df.describe().T
summary_stats.style.background_gradient(cmap='PuBu', subset=['std', '50%', 'count'])

In [None]:
# Donut chart of the class balance.
# value_counts() orders the classes by frequency rather than by class value,
# so the slice names are looked up from the class index instead of being
# taken from a fixed list (which would mislabel the slices whenever class 0
# is the more frequent one). Class 0 is "not potable", class 1 is "potable".
potability_counts = df['Potability'].value_counts()
class_names = {0: 'Non Potable', 1: 'Potable'}
label = [class_names[cls] for cls in potability_counts.index]
fig = px.pie(values=potability_counts.values, names=label, width=900, height=700)
fig.update_traces(textposition = 'inside', 
                  textinfo = 'percent + label', 
                  hole = 0.75, 
                  marker = dict(colors = ['#ff6361','#1e434c','#ffa600'], line = dict(color = 'white', width = 2)))

# Put the chart title in the hole of the donut.
fig.update_layout(annotations = [dict(text = 'Potability', 
                                      x = 0.5, y = 0.5, font_size = 48, showarrow = False, 
                                      font_family = 'monospace',
                                      font_color = 'black')],
                  showlegend = False)

In [None]:
# pH density for non-potable vs. potable samples, with annotations.
plt.figure(figsize=(16,8))
sns.set_style('white')
sns.set_context("poster", font_scale = .85)
# `color` is passed as a plain colour name: kdeplot documents a single
# matplotlib colour here, and a one-element list is not a valid colour spec.
ph_not_potable = df.loc[df['Potability'] == 0, 'ph']
ph_potable = df.loc[df['Potability'] == 1, 'ph']
a = sns.kdeplot(ph_not_potable, fill=True, color='cyan', alpha=0.7)
sns.kdeplot(ph_potable, fill=True, color='yellow', alpha=0.7)
a.set_xlim([0,14])
sns.despine(left=True)
# Legend labels are assigned positionally to the artists drawn so far, so the
# legend is created before any reference lines are added.
plt.legend(['Not potable', 'Potable'], bbox_to_anchor = (0.15, 1.07), ncol = 1, borderpad = 3, frameon = False, fontsize = 11)
# Vertical reference lines (ymin/ymax are fractions of the axes height).
a.axvline(x = 6.72, linestyle = '-', color = '#1995ad', ymin = 0, ymax = 0.72, lw = 3)
a.axvline(x = 7.05, linestyle = '-', color = '#1995ad', ymin = 0, ymax = 0.9, lw = 3)
# Arrows pointing towards the acidic (left) and alkaline (right) sides.
a.arrow(6.1, 0.2, -1.5, 0, width = 0.001, head_width = 0.007, head_length = 0.5, color = 'red')
a.arrow(8.05, 0.2, 1.5, 0, width = 0.001, head_width = 0.007, head_length = 0.5, color = 'red')
plt.title('PH level', size = 25, y = 1.03, fontname = 'monospace', color = '#283655')
plt.figtext(0.40, 0.51, 'acidic', fontsize = 13, fontname = 'monospace', color = '#a43820')
plt.figtext(0.403, 0.485, 'water', fontsize = 13, fontname = 'monospace', color = '#a43820')
plt.figtext(0.58, 0.51, 'alkaline', fontsize = 13, fontname = 'monospace', color = '#a43820')
plt.figtext(0.582, 0.485, 'water', fontsize = 13, fontname = 'monospace', color = '#a43820')
plt.figtext(0.90, 0.70, '''The acid-base balance of water. WHO has 
recommended maximum permissible limit 
of pH from 6.5 to 8.5. The current 
investigation ranges were 6.52–6.83 
which are in the range of WHO standards
.At pH=7, the water is neutral, 
at pH less than 7 — acidic,
at pH more than 7 — alkaline.''', fontsize = 13, fontname = 'monospace', color = 'purple', ha = 'right')
plt.figtext(0.505, 0.2, 'WHO maximum permissible limit', fontsize = 13, fontname = 'monospace', color = 'red', rotation = 90)
# Render the figure explicitly so the cell does not echo a Text repr.
plt.show()


In [None]:
# Hardness density for non-potable vs. potable samples, with the
# soft / moderate / hard / very hard bands marked at 60, 120 and 180 mg/L.
plt.figure(figsize=(16,8))
sns.set_style('white')
sns.set_context("poster", font_scale = .85)
plt.title('Hardness Level', size = 25, y = 1.03, fontname = 'monospace', color = '#283655')
# `color` is passed as a plain colour name: kdeplot documents a single
# matplotlib colour here, and a one-element list is not a valid colour spec.
hardness_not_potable = df.loc[df['Potability'] == 0, 'Hardness']
hardness_potable = df.loc[df['Potability'] == 1, 'Hardness']
a = sns.kdeplot(hardness_not_potable, fill=True, color='cyan', alpha=0.7)
sns.kdeplot(hardness_potable, fill=True, color='yellow', alpha=0.6)
# Legend labels are assigned positionally to the artists drawn so far, so the
# legend is created before any reference lines are added.
plt.legend(['Not potable', 'Potable'], bbox_to_anchor = (0.15, 1.07), ncol = 1, borderpad = 3, frameon = False, fontsize = 11)
# xmin/xmax of axhline are fractions of the axes width (0..1), not data
# units, so the full-width baseline is expressed as 0..1.
a.axhline(y = 0.001, linestyle = '-', color = '#1995ad', xmin = 0, xmax = 1, lw = 3)
for threshold in (60, 120, 180):
    a.axvline(x = threshold, linestyle = '-', color = '#1995ad', ymin = 0, ymax = 0.059, lw = 3)
plt.figtext(0.15, 0.14, 'Soft', fontsize = 13, fontname = 'monospace', color = 'red')
plt.figtext(0.28, 0.14, 'Moderate', fontsize = 13, fontname = 'monospace', color = 'red')
plt.figtext(0.42, 0.14, 'Hard', fontsize = 13, fontname = 'monospace', color = 'red')
plt.figtext(0.58, 0.14, 'Very Hard', fontsize = 13, fontname = 'monospace', color = 'red')
plt.figtext(0.85, 0.561, '''Hardness is caused by compounds of
calcium and magnesium, and by a 
variety of other metals. General 
guidelines for classification of 
waters are: 0 to 60 mg/L as calcium 
carbonate is classified as soft; 
61 to 120 mg/L as moderately hard; 
121 to 180 mg/L as hard; and more 
than 180 mg/L as very hard.''', fontsize = 13, fontname = 'monospace', color = 'purple', ha = 'right')
sns.despine(left=True)

In [None]:
# Sulfate density for non-potable vs. potable samples, with the 250 mg/L
# threshold marked.
plt.figure(figsize=(16,8))
sns.set_style('white')
sns.set_context("poster", font_scale = .85)
plt.title('Sulphate', size = 25, y = 1.03, fontname = 'monospace', color = '#283655')
# `color` is passed as a plain colour name: kdeplot documents a single
# matplotlib colour here, and a one-element list is not a valid colour spec.
sulfate_not_potable = df.loc[df['Potability'] == 0, 'Sulfate']
sulfate_potable = df.loc[df['Potability'] == 1, 'Sulfate']
a = sns.kdeplot(sulfate_not_potable, fill=True, color='cyan', alpha=0.7)
sns.kdeplot(sulfate_potable, fill=True, color='yellow', alpha=0.6)
# Legend labels are assigned positionally to the artists drawn so far, so the
# legend is created before any reference lines are added.
plt.legend(['Not potable', 'Potable'], bbox_to_anchor = (0.15, 1.07), ncol = 1, borderpad = 3, frameon = False, fontsize = 11)
a.set_xlim([100,500])
# xmin/xmax of axhline are fractions of the axes width (0..1), not data
# units, so the full-width baseline is expressed as 0..1.
a.axhline(y = 0.0017, linestyle = '-', color = '#1995ad', xmin = 0, xmax = 1, lw = 3)
a.axvline(x = 250, linestyle = '-', color = '#1995ad', ymin = 0, ymax = 0.07, lw = 3)
plt.figtext(0.25, 0.14, 'Safe', fontsize = 13, fontname = 'monospace', color = 'red')
plt.figtext(0.55, 0.14, 'Unsafe', fontsize = 13, fontname = 'monospace', color = 'red')
plt.figtext(0.89, 0.685, '''Sulphate level between 0-250 mg/L
is Considered Safe While Above 
250 mg/L is Unsafe''', fontsize = 13, fontname = 'monospace', color = 'purple', ha = 'right')
sns.despine(left=True)

In [None]:
# Trihalomethanes density for non-potable vs. potable samples, with the
# threshold at 100 marked.
plt.figure(figsize=(16,8))
sns.set_style('white')
sns.set_context("poster", font_scale = .85)
plt.title('Trihalomethanes', size = 25, y = 1.03, fontname = 'monospace', color = '#283655')
# `color` is passed as a plain colour name: kdeplot documents a single
# matplotlib colour here, and a one-element list is not a valid colour spec.
thm_not_potable = df.loc[df['Potability'] == 0, 'Trihalomethanes']
thm_potable = df.loc[df['Potability'] == 1, 'Trihalomethanes']
a = sns.kdeplot(thm_not_potable, fill=True, color='cyan', alpha=0.7)
sns.kdeplot(thm_potable, fill=True, color='yellow', alpha=0.6)
# Legend labels are assigned positionally to the artists drawn so far, so the
# legend is created before any reference lines are added.
plt.legend(['Not potable', 'Potable'], bbox_to_anchor = (0.15, 1.07), ncol = 1, borderpad = 3, frameon = False, fontsize = 11)
a.set_xlim([0,140])
# xmin/xmax of axhline are fractions of the axes width (0..1), not data
# units, so the full-width baseline is expressed as 0..1.
a.axhline(y = 0.0024, linestyle = '-', color = '#1995ad', xmin = 0, xmax = 1, lw = 3)
a.axvline(x = 100, linestyle = '-', color = '#1995ad', ymin = 0, ymax = 0.072, lw = 3)
plt.figtext(0.355, 0.145, 'safe', fontsize = 13, fontname = 'monospace', color = 'red')
plt.figtext(0.755, 0.145, 'unsafe', fontsize = 13, fontname = 'monospace', color = 'red')
plt.figtext(0.9, 0.67, '''Trihalomethanes (THMs) are the result of a reaction
between the chlorine used for disinfecting tap water
and natural organic matter in the water. The legal
limit of total THMs in drinking water in Europe is
100 ppm.''', fontsize = 12, fontname = 'monospace', color = 'purple', ha = 'right')
sns.despine(left=True)

In [None]:
# Turbidity density for non-potable vs. potable samples, with the 5 NTU
# threshold marked.
plt.figure(figsize=(16,8))
sns.set_style('white')
sns.set_context("poster", font_scale = .85)
plt.title('Turbidity', size = 25, y = 1.03, fontname = 'monospace', color = '#283655')
# `color` is passed as a plain colour name: kdeplot documents a single
# matplotlib colour here, and a one-element list is not a valid colour spec.
turbidity_not_potable = df.loc[df['Potability'] == 0, 'Turbidity']
turbidity_potable = df.loc[df['Potability'] == 1, 'Turbidity']
a = sns.kdeplot(turbidity_not_potable, fill=True, color='cyan', alpha=0.7)
sns.kdeplot(turbidity_potable, fill=True, color='yellow', alpha=0.6)
# Legend labels are assigned positionally to the artists drawn so far, so the
# legend is created before any reference lines are added.
plt.legend(['Not potable', 'Potable'], bbox_to_anchor = (0.15, 1.07), ncol = 1, borderpad = 3, frameon = False, fontsize = 11)
# xmin/xmax of axhline are fractions of the axes width (0..1), not data
# units, so the full-width baseline is expressed as 0..1.
a.axhline(y = 0.04, linestyle = '-', color = '#1995ad', xmin = 0, xmax = 1, lw = 3)
a.axvline(x = 5, linestyle = '-', color = '#1995ad', ymin = 0, ymax = 0.072, lw = 3)
plt.figtext(0.355, 0.145, 'safe', fontsize = 13, fontname = 'monospace', color = 'red')
plt.figtext(0.755, 0.145, 'unsafe', fontsize = 13, fontname = 'monospace', color = 'red')
plt.figtext(0.9, 0.64, '''Turbidity is a measure of the degree to which the water
loses its transparency due to the presence of suspended
particulates.The more total suspended solids in the 
water,the murkier it seems and the higher the 
turbidity.The WHO, establishes that the 
turbidity of drinking water shouldn't 
be more than 5 NTU.''', fontsize = 12, fontname = 'monospace', color = 'purple', ha = 'right')
sns.despine(left=True)

In [None]:
# Conductivity density for non-potable vs. potable samples, with the
# 400 and 800 μS/cm thresholds marked.
plt.figure(figsize=(16,8))
sns.set_style('white')
sns.set_context("poster", font_scale = .85)
plt.title('Conductivity', size = 28,x=0.45, y = 1.08, fontname = 'monospace', color = '#283655')
# `color` is passed as a plain colour name: kdeplot documents a single
# matplotlib colour here, and a one-element list is not a valid colour spec.
conductivity_not_potable = df.loc[df['Potability'] == 0, 'Conductivity']
conductivity_potable = df.loc[df['Potability'] == 1, 'Conductivity']
a = sns.kdeplot(conductivity_not_potable, fill=True, color='cyan', alpha=0.7)
sns.kdeplot(conductivity_potable, fill=True, color='yellow', alpha=0.6)
# Legend labels are assigned positionally to the artists drawn so far, so the
# legend is created before any reference lines are added.
plt.legend(['Not potable', 'Potable'], bbox_to_anchor = (0.15, 1.07), ncol = 1, borderpad = 3, frameon = False, fontsize = 11)
# xmin/xmax of axhline are fractions of the axes width (0..1), not data
# units, so the full-width baseline is expressed as 0..1.
a.axhline(y = 0.00038, linestyle = '-', color = '#1995ad', xmin = 0, xmax = 1, lw = 3)
for threshold in (400, 800):
    a.axvline(x = threshold, linestyle = '-', color = '#1995ad', ymin = 0, ymax = 0.07, lw = 3)
plt.figtext(0.25, 0.14, 'WHO Recommended', fontsize = 13, fontname = 'monospace', color = 'red')
plt.figtext(0.60, 0.14, 'Safe', fontsize = 13, fontname = 'monospace', color = 'red')
plt.figtext(0.86, 0.14, 'Unsafe', fontsize = 13, fontname = 'monospace', color = 'red')
plt.figtext(0.9, 0.64, '''Electrical conductivity is a measure of the saltiness
of the water. Good drinking water for humans (provided
there is no organicpollution and not too much suspended
clay material) should not exceeded 800 μS/cm.''', fontsize = 12, fontname = 'monospace', color = 'purple', ha = 'right')
sns.despine(left=True)

In [None]:
# Three side-by-side views of a feature against the Potability class:
# a box plot (Hardness), a bar plot of the mean (ph) and a violin plot
# (Conductivity).
plt.figure(figsize=(18,10))
sns.set_style('white')
sns.set_context("poster", font_scale = .75)
plt.suptitle('Potability', size = 35 , fontname = 'monospace',color = '#283655')

# Panel 1: Hardness distribution per class.
# (A stray `hspace=0.9` assignment used to sit here; it was never passed to
# any layout call, so it has been removed.)
plt.subplot(1,3,1)
sns.boxplot(x='Potability',y='Hardness',data=df,color='limegreen')
plt.title('Hardness Vs Potability')
plt.xticks(rotation='vertical')
plt.grid()

# Panel 2: mean ph per class, y-axis fixed to 0..8.
plt.subplot(1,3,2)
s = sns.barplot(x='Potability',y='ph',data=df,color='cyan')
s.set_ylim([0,8])
plt.title('ph Vs Potability')
plt.xticks(rotation='vertical')
plt.grid()

# Panel 3: Conductivity distribution per class.
plt.subplot(1,3,3)
sns.violinplot(x='Potability',y='Conductivity',data=df,color='coral')
plt.title('Conductivity Vs Potability')
plt.xticks(rotation='vertical')
plt.grid()

plt.tight_layout()
plt.show()

In [None]:
# Three side-by-side views split by Potability class: Solids against ph
# (scatter), Organic_carbon density (KDE) and Trihalomethanes (strip plot).
plt.figure(figsize=(18,10))
sns.set_style('white')
sns.set_context("poster", font_scale = .75)
plt.suptitle('Potability', size = 35 , fontname = 'monospace',color = '#283655')

# Panel 1: Solids vs ph, coloured by class.
# (A stray `hspace=0.9` assignment used to sit here; it was never passed to
# any layout call, so it has been removed.)
plt.subplot(1,3,1)
sns.scatterplot(y='Solids',data=df,x='ph',hue='Potability',palette=['red','purple'],alpha=0.9)
plt.title('Solids Vs Potability')
plt.xticks(rotation='vertical')
plt.grid()

# Panel 2: Organic_carbon density per class.
plt.subplot(1,3,2)
s = sns.kdeplot(x='Organic_carbon',data=df,color='cyan',hue='Potability',fill=True,palette=["#ffa600","limegreen"],alpha=0.7)
plt.title('Organic Carbon Vs Potability')
plt.xticks(rotation='vertical')
plt.grid()

# Panel 3: Trihalomethanes per class. set_palette changes the default
# seaborn palette from this point on.
plt.subplot(1,3,3)
sns.set_palette("RdBu",)
sns.stripplot(x='Potability',y='Trihalomethanes',data=df)
plt.title('Trihalomethanes Vs Potability')
plt.xticks(rotation='vertical')
plt.grid()

plt.tight_layout()
plt.show()

In [None]:
# Turbidity against ph, one point per sample, coloured by Potability class.
plt.figure(figsize=(16, 9))
sns.set_style("white")
marker_style = dict(sizes=(1, 8), linewidth=0.5, edgecolor="black")
a = sns.scatterplot(data=df, x="ph", y="Turbidity", hue="Potability",
                    palette=['#bcbabe', '#a1d6e2'], **marker_style)
a.set_title('Turbidity Vs ph', size=24)
sns.despine(left=True)

In [None]:
# Organic_carbon against Conductivity, one point per sample, coloured by
# Potability class.
plt.figure(figsize=(16, 8))
sns.set_style("white")
marker_style = dict(sizes=(1, 8), linewidth=0.5, edgecolor="black")
a = sns.scatterplot(data=df, x="Conductivity", y="Organic_carbon", hue="Potability",
                    palette=['#bcbabe', '#a1d6e2'], **marker_style)
a.set_title('Conductivity Vs Organic Carbon', size=24, y=1.08)
sns.despine(left=True)

In [None]:
# Chloramines against Solids, one point per sample, coloured by Potability
# class.
plt.figure(figsize=(16, 9))
sns.set_style("white")
marker_style = dict(sizes=(1, 8), linewidth=0.5, edgecolor="black")
a = sns.scatterplot(data=df, x="Solids", y="Chloramines", hue="Potability",
                    palette=['#bcbabe', '#a1d6e2'], **marker_style)
a.set_title('Solids Vs Chloramines', size=24)
sns.despine(left=True)

# Conclusions

> 1. There don't seem to be any major differences in any of the features between potable and non-potable water.
> 2. Features such as Sulphate, Trihalomethanes, etc. have values that are above WHO-recommended standards.

In [None]:
# Lower-triangle heatmap of the pairwise feature correlations.
plt.figure(figsize=(14, 8))
correlations = df.corr()
# Mask for the upper triangle (diagonal included) so each pair is shown once.
matrix = np.triu(correlations)

heatmap_options = dict(annot=True, cmap='Blues', fmt=".2f", vmin=-1, vmax=1,
                       linewidths=0.1, linecolor='white', cbar=False,
                       annot_kws={'fontsize': 15})
sns.heatmap(correlations, mask=matrix, **heatmap_options)
plt.figtext(0.88, 0.76, '''As it's visible here that
all of features are not much
correlated.''', fontsize = 14, fontname = 'monospace', color = 'purple', ha = 'right')
plt.title('Correlations', size=24, x=0.4, y=1.09)
plt.show()

# Model Building

# Random Forest

In [None]:
# Random forest baseline on a 70/30 train/test split.
X = df.drop('Potability',axis=1)
y = df['Potability']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,random_state=101)
# Seed the forest as well as the split so the reported scores are the same
# on every Restart & Run All.
rfc = RandomForestClassifier(n_estimators=100, random_state=101)
rfc.fit(X_train, y_train)
rfc_pred = rfc.predict(X_test)
# `k` holds the accuracy (in percent) for the final model comparison chart.
k = accuracy_score(y_test, rfc_pred) * 100
print('\n')
print('Random Forest Accuracy score is: ', k)
print('\n')
print(confusion_matrix(y_test,rfc_pred))
print('\n')
print(classification_report(y_test,rfc_pred))

# KNN Algorithm

In [None]:
# 1-nearest-neighbour baseline on standardised features.
X = df.drop('Potability', axis=1)
y = df['Potability']
# Split first, then fit the scaler on the training rows only, so that no
# statistics from the test rows leak into the features the model is trained
# and evaluated on.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)
scale = StandardScaler()
scale.fit(X_train)
# Column names come from X itself rather than assuming the target is the
# last column of df.
X_train = pd.DataFrame(scale.transform(X_train), columns=X.columns, index=X_train.index)
X_test = pd.DataFrame(scale.transform(X_test), columns=X.columns, index=X_test.index)
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X_train, y_train)
pred = knn.predict(X_test)
print('\n')
print('KNN Accuracy score is: ',accuracy_score(y_test,pred)*100)
print('\n')
print(confusion_matrix(y_test,pred))
print('\n')
print(classification_report(y_test,pred))

In [None]:
#Choose KNN Value
# The error rate for each K is estimated with 5-fold cross-validation on the
# training split. Picking K from the error on X_test would tune the model on
# the same rows later used to report its accuracy.
fig = plt.figure(figsize=(15.5,6))
sns.set_context("poster", font_scale = 0.7)
sns.set_style("white")
k_values = list(range(1, 40))
k_search = GridSearchCV(KNeighborsClassifier(), {'n_neighbors': k_values},
                        scoring='accuracy', cv=5)
k_search.fit(X_train, y_train)
# Results are reported in the same order as k_values; error = 1 - accuracy.
error = [1 - score for score in k_search.cv_results_['mean_test_score']]

sns.set_style('whitegrid')
plt.plot(k_values, error, color='red', linestyle='dashed', marker='o',
         markerfacecolor='yellow', markersize=10)
plt.title('Error Rate vs. K Value')
plt.xlabel('K')
plt.ylabel('Error Rate')
plt.show()

In [None]:
#Best KNN Accuracy Score
# Refit KNN with K=25 and report its test-set metrics.
knn = KNeighborsClassifier(n_neighbors=25)
knn.fit(X_train, y_train)
pred = knn.predict(X_test)
# `m` holds the accuracy (in percent) for the final model comparison chart.
m = accuracy_score(y_test, pred) * 100
print('\n')
print('Best KNN Accuracy score is: ', m)
print('\n')

print(confusion_matrix(y_test, pred))
print('\n')
print(classification_report(y_test, pred))

# Logistic Regression

In [None]:
# Logistic regression on a 70/30 train/test split.
X = df.drop('Potability',axis=1)
y = df['Potability']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30,random_state=101)
# The raw features live on very different scales, which hampers the solver.
# Standardising inside a pipeline fits the scaler on the training rows only
# and leaves X_train / X_test themselves unscaled for later cells.
logmodel = make_pipeline(StandardScaler(), LogisticRegression())
logmodel.fit(X_train,y_train)
predictions = logmodel.predict(X_test)
# `p` holds the accuracy (in percent) for the final model comparison chart.
p = accuracy_score(y_test, predictions) * 100
print('\n')
print('Logistics Accuracy score is: ', p)
print('\n')
print(confusion_matrix(y_test,predictions))
print('\n')
print(classification_report(y_test,predictions))

In [None]:
# Confusion matrix of the logistic regression predictions as an annotated
# heatmap (rows: actual class, columns: predicted class).
plt.figure(figsize=(8, 5))
cm = confusion_matrix(y_test, predictions)
axis_labels = dict(yticklabels=['Not potable', 'Potable'],
                   xticklabels=['Predicted not potable', 'Predicted potable'])
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False, linewidths=5,
            annot_kws={'fontsize': 15}, **axis_labels)
plt.yticks(rotation=0)
plt.show()

# Support Vector Machine

In [None]:
# Support vector classifier with default hyper-parameters.
# The default RBF kernel is distance-based, so the features are standardised
# first; the pipeline fits the scaler on X_train only and applies it to
# X_test at prediction time.
model = make_pipeline(StandardScaler(), SVC())
model.fit(X_train,y_train)
pred = model.predict(X_test)
print('Vector Machine Accuracy score is: ',accuracy_score(y_test,pred)*100)
print('\n')
print(confusion_matrix(y_test,pred))
print('\n')
print(classification_report(y_test,pred))

In [None]:
#Grid Search
#Choosing C and Gamma values
# The search runs over a scaler + SVC pipeline so the features are
# standardised inside every cross-validation fold (the gamma values only make
# sense on comparably scaled features). Parameters of the SVC step are
# addressed with the `svc__` prefix that make_pipeline assigns to it.
param_grid = {'svc__C': [0.1,1, 10, 100], 'svc__gamma': [1,0.1,0.01,0.001], 'svc__kernel': ['rbf']} 
grid = GridSearchCV(make_pipeline(StandardScaler(), SVC()),param_grid,refit=True,verbose=3)
grid.fit(X_train,y_train)
print(grid.best_params_)
print(grid.best_estimator_)
print(grid.best_score_)

In [None]:
# Evaluate the refitted best estimator from the grid search on the test set.
grid_pred = grid.predict(X_test)
# `g` holds the accuracy (in percent) for the final model comparison chart.
g = accuracy_score(y_test, grid_pred) * 100
print('Best Vector Machine Accuracy score is: ', g)
print('\n')
print(confusion_matrix(y_test, grid_pred))
print('\n')
print(classification_report(y_test, grid_pred))

In [None]:
# Donut chart comparing the test accuracy of the four models.
# k, m, p and g are the accuracies (in percent) stored by the Random Forest,
# KNN, Logistic Regression and SVM cells respectively.
label = ['Random Forest','K Nearest Neighbours','Logistics Regression','Vector Machine']
# px's `labels` argument is a column-name -> display-name dict, not the slice
# names, so only `names` is passed.
fig = px.pie(values=[k,m,p,g], names = label,width = 800, height = 700)
# Show each model's own accuracy on its slice. The pie's built-in "percent"
# is the slice's share of the summed accuracies, which is not a meaningful
# number for comparing models.
fig.update_traces(textposition = 'inside', 
                  texttemplate = '%{label}<br>%{value:.2f}%', 
                  hole = 0.65, 
                  marker = dict(colors = ['#8d230f','#ff6361','#ffa600','#bc5090'], line = dict(color = 'white', width = 2)))

# Put the chart title in the hole of the donut.
fig.update_layout(annotations = [dict(text = 'Performance Comparison', 
                                      x = 0.5, y = 0.5, font_size = 28, showarrow = False, 
                                      font_family = 'monospace',
                                      font_color = 'black')],
                  showlegend = False)

# Thanks
With thanks to Dmitry Uarov, whose work I drew on for this notebook.