In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import plotly
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
import cufflinks as cf
from collections import Counter

from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
# Put plotly into offline notebook mode (connected = True is forwarded to
# plotly unchanged) and switch cufflinks to offline mode as well.
init_notebook_mode(connected = True)
cf.go_offline()

In [None]:
# Read the water-potability CSV into the frame used by every later cell.
wp = pd.read_csv(filepath_or_buffer='../input/water-potability/water_potability.csv')

In [None]:
# Summarise the loaded frame: column dtypes and non-null counts.
wp.info()

In [None]:
# Peek at the first five rows of the raw data.
wp.head(n=5)

In [None]:
# Count samples per class and attach the human-readable label to each row
# explicitly. Building the two columns by name keeps this cell working on both
# pandas 1.x and 2.x (where value_counts() names its result 'count'), and the
# slice names no longer depend on the order in which value_counts() returns
# the classes. Potability is coded 0/1 (see the ==1 / ==0 filters further down).
wp_pot = (
    wp['Potability']
    .map({0: 'Not Potable', 1: 'Potable'})
    .value_counts()
    .rename_axis('label')
    .reset_index(name='Potability')
)

# Donut chart of the class balance; 'Potability' holds the per-class counts.
fig = px.pie(
    wp_pot,
    values='Potability',
    names='label',
    labels={'label': 'Potability', 'Potability': 'No. Of Samples'},
    hole=0.3,
)

fig.update_layout(hoverlabel=dict(bgcolor='white'))

# Show percentage and class name next to each slice, outside the ring.
fig.update_traces(textposition='outside', textinfo='percent+label')

fig.show()

In [None]:
# Distribution of pH, with one bar series per Potability class.
fig = px.histogram(
    wp,
    x='ph',
    color='Potability',
    template='plotly_white',
    opacity=0.6,
    nbins=100,
    barmode='group',
)

# Shade the 6.5-8 band referred to in the annotation. The bounds are given as
# numbers rather than strings because the x axis is numeric.
fig.add_vrect(x0=6.5, x1=8, fillcolor='green', opacity=0.2)

fig.update_layout(hoverlabel=dict(bgcolor='white'))

# Caption pinned to the top-right of the plotting area (domain coordinates).
fig.add_annotation(
    text='Water is safe inbetween <br> 6.5 and 8',
    xref='x domain', yref='y domain',
    x=0.95, y=0.9, showarrow=False,
)

fig.show()

In [None]:
# Distribution of Hardness, with one bar series per Potability class.
fig = px.histogram(
    wp,
    x='Hardness',
    color='Potability',
    template='plotly_white',
    opacity=0.6,
    barmode='group',
    nbins=100,
)

# Shade the 120-170 band referred to in the annotation. The bounds are given
# as numbers rather than strings because the x axis is numeric.
fig.add_vrect(x0=120, x1=170, fillcolor='green', opacity=0.2)

# Caption pinned to the top-right of the plotting area (domain coordinates).
fig.add_annotation(
    text='Water is potable inbetween<br>120(mg/L) and 170(mg/L)',
    xref='x domain', yref='y domain',
    x=0.95, y=0.85, showarrow=False,
)

fig.update_layout(hoverlabel=dict(bgcolor='white'))

fig.show()

In [None]:
# Distribution of Solids (TDS), with one bar series per Potability class.
fig = px.histogram(
    wp,
    x='Solids',
    color='Potability',
    opacity=0.6,
    barmode='group',
    template='plotly_white',
    nbins=100,
)

# Shade everything up to the 1200 limit quoted in the annotation. The bounds
# are given as numbers rather than strings because the x axis is numeric.
fig.add_vrect(x0=0, x1=1200, fillcolor='green', opacity=0.2)

# Caption pinned to the top-right of the plotting area (domain coordinates).
fig.add_annotation(
    text='TDS <600(mg/L) is desireable <br>and >1200(mg/L) is unacceptable',
    xref='x domain', yref='y domain',
    x=0.95, y=0.9, showarrow=False,
)

fig.update_layout(hoverlabel=dict(bgcolor='white'))

fig.show()

In [None]:
# Distribution of Chloramines, with one bar series per Potability class.
fig = px.histogram(
    wp,
    x='Chloramines',
    color='Potability',
    barmode='group',
    template='plotly_white',
    nbins=100,
    opacity=0.6,
)

# Dashed marker at the threshold of 4 quoted in the annotation. The position
# is numeric (not a string) to match the numeric axis. The former `fillcolor`
# argument was dropped: a vertical line has no interior to fill, and the other
# threshold lines in this notebook do not set it either.
fig.add_vline(x=4, opacity=0.6, line_dash='dash')

# Caption pinned to the top-right of the plotting area (domain coordinates).
fig.add_annotation(
    text='Chloramines <4 (mg/L) <br>is safe',
    xref='x domain', yref='y domain',
    x=0.95, y=0.9, showarrow=False,
)

fig.update_layout(hoverlabel=dict(bgcolor='white'))

fig.show()

In [None]:
# Mean, minimum and maximum of the Sulfate column.
wp.loc[:, 'Sulfate'].aggregate(['mean', 'min', 'max'])

In [None]:
# Distribution of Sulfate, with one bar series per Potability class.
fig = px.histogram(
    wp,
    x='Sulfate',
    color='Potability',
    barmode='group',
    template='plotly_white',
    nbins=100,
    opacity=0.6,
)

# Dashed marker at the threshold of 250 quoted in the annotation; numeric
# (not a string) to match the numeric axis.
fig.add_vline(x=250, opacity=0.6, line_dash='dash')

# Caption pinned to the top-right of the plotting area (domain coordinates).
fig.add_annotation(
    text='Sulfate <250 (mg/L) <br>is safe',
    xref='x domain', yref='y domain',
    x=0.95, y=0.9, showarrow=False,
)

fig.update_layout(hoverlabel=dict(bgcolor='white'))

fig.show()

In [None]:
# Distribution of Conductivity, with one bar series per Potability class.
fig = px.histogram(
    wp,
    x='Conductivity',
    color='Potability',
    barmode='group',
    template='plotly_white',
    nbins=100,
    opacity=0.6,
)

# Dashed marker at the threshold of 400 quoted in the annotation; numeric
# (not a string) to match the numeric axis.
fig.add_vline(x=400, opacity=0.6, line_dash='dash')

# Caption pinned to the top-right of the plotting area (domain coordinates).
fig.add_annotation(
    text='Conductivity <400 (μS/cm) <br>is safe',
    xref='x domain', yref='y domain',
    x=0.95, y=0.9, showarrow=False,
)

fig.update_layout(hoverlabel=dict(bgcolor='white'))

fig.show()

In [None]:
# Distribution of Organic_carbon, with one bar series per Potability class.
# No threshold marker is drawn on this chart; only the caption is added.
fig = px.histogram(
    wp,
    x='Organic_carbon',
    color='Potability',
    barmode='group',
    template='plotly_white',
    nbins=100,
    opacity=0.6,
)

# Caption pinned to the top-right of the plotting area (domain coordinates).
fig.add_annotation(
    text='There is no official<br>rules on TOC levels',
    xref='x domain',
    yref='y domain',
    x=0.95,
    y=0.9,
    showarrow=False,
)

fig.update_layout(hoverlabel={'bgcolor': 'white'})

fig.show()

In [None]:
# Distribution of Trihalomethanes, with one bar series per Potability class.
fig = px.histogram(
    wp,
    x='Trihalomethanes',
    color='Potability',
    barmode='group',
    template='plotly_white',
    nbins=100,
    opacity=0.6,
)

# Dashed marker at the threshold of 80 quoted in the annotation; numeric
# (not a string) to match the numeric axis.
fig.add_vline(x=80, opacity=0.6, line_dash='dash')

# Caption pinned to the top-right of the plotting area (domain coordinates).
fig.add_annotation(
    text='Trihalomethanes <80 (μg/L) <br>is safe',
    xref='x domain', yref='y domain',
    x=0.95, y=0.9, showarrow=False,
)

fig.update_layout(hoverlabel=dict(bgcolor='white'))

fig.show()

In [None]:
# Distribution of Turbidity, with one bar series per Potability class.
fig = px.histogram(
    wp,
    x='Turbidity',
    color='Potability',
    barmode='group',
    template='plotly_white',
    nbins=100,
    opacity=0.6,
)

# Dashed marker at the threshold of 5 quoted in the annotation; numeric
# (not a string) to match the numeric axis.
fig.add_vline(x=5, opacity=0.6, line_dash='dash')

# Caption pinned to the top-right of the plotting area (domain coordinates).
# The text previously said "Trihalomethanes" (copied from the previous chart);
# this chart plots Turbidity, so the caption now names the right measure.
fig.add_annotation(
    text='Turbidity <5 (NTU) <br>is safe',
    xref='x domain', yref='y domain',
    x=0.95, y=0.9, showarrow=False,
)

fig.update_layout(hoverlabel=dict(bgcolor='white'))

fig.show()

This dataset looks very strange: some measurements are far above the accepted standards, yet the samples are still classified as potable. This is especially true for TDS (the `Solids` column). It seems the author made mistakes when collecting the data, so I will drop the entire `Solids` column.

### Missing Values

In [None]:
# Heatmap of the boolean missing-value mask (one cell per row/column entry).
sns.heatmap(data=wp.isna())

In [None]:
# Number of missing entries in each column.
wp.isnull().sum()

In [None]:
# Median and mean of the three columns, computed over all rows.
wp.loc[:, ['ph', 'Sulfate', 'Trihalomethanes']].aggregate(['median', 'mean'])

In [None]:
# Same median/mean summary, restricted to rows where Potability equals 1.
wp.loc[wp['Potability'] == 1, ['ph', 'Sulfate', 'Trihalomethanes']].agg(['median', 'mean'])

In [None]:
# Descriptive statistics for the rows where Potability equals 1.
wp.query('Potability == 1').describe()

In [None]:
# Descriptive statistics for the rows where Potability equals 0.
wp.loc[wp['Potability'].eq(0)].describe()

In [None]:
# Pairwise correlation between all columns except the Potability target,
# rendered as an image-style heatmap.
cor = wp.drop(columns='Potability').corr()
fig = px.imshow(cor)
fig.show()

In [None]:
# Mean-impute ph, Sulfate and Trihalomethanes.
# The result is assigned back to `wp` instead of calling
# `wp[col].fillna(..., inplace=True)`: that form is chained assignment through
# an inplace method, which pandas warns about and which leaves `wp` untouched
# once Copy-on-Write is active. fillna() with a Series fills each column with
# the entry whose label matches the column name.
# NOTE(review): the means are taken over every row, before the train/test
# split made later in the notebook, so test rows influence the imputed values.
cols_to_impute = ['ph', 'Sulfate', 'Trihalomethanes']
wp[cols_to_impute] = wp[cols_to_impute].fillna(wp[cols_to_impute].mean())

### Modeling

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# Work on a copy so the feature selection below never touches `wp`.
wp1 = wp.copy()

# Target is the Potability column; the features are every other column
# except Solids.
y = wp1['Potability']
X = wp1.drop(columns=['Potability', 'Solids'])

In [None]:
# Hold out 20% of the rows for evaluation. random_state pins the shuffle so the
# split, and every metric computed from it, is the same on each run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Fit a decision tree on the training split. random_state is fixed so the
# fitted tree does not vary between runs.
dtree = DecisionTreeClassifier(random_state=42)
dtree.fit(X_train, y_train)

In [None]:
# Predict the class of every held-out row with the fitted tree.
pred = dtree.predict(X_test)

In [None]:
# Print the confusion matrix followed by the per-class report for the
# held-out predictions, one after the other.
print(
    confusion_matrix(y_test, pred),
    classification_report(y_test, pred),
    sep='\n',
)

### Standard Scaler

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Standardise every feature column (everything except the Potability target).
# NOTE(review): unlike the first model's feature matrix, 'Solids' is kept
# here - confirm that is intended.
# NOTE(review): the scaler is fitted on all rows, before the train/test split
# that follows, so test rows contribute to the scaling statistics.
wp2 = wp.drop(columns='Potability')
scaler = StandardScaler()
scaled_wp = scaler.fit_transform(wp2)

In [None]:
# Wrap the scaled array back into a DataFrame. Column names and index are
# taken from `wp2`, the frame that was actually scaled, rather than from a
# positional slice of `wp.columns` that assumes Potability is the last column.
dfscaled = pd.DataFrame(scaled_wp, columns=wp2.columns, index=wp2.index)

In [None]:
# Peek at the first five rows of the standardised features.
dfscaled.head(n=5)

In [None]:
# Features are the standardised columns; the target is still taken from `wp`.
X, y = dfscaled, wp['Potability']

In [None]:
# Hold out 20% of the rows for evaluation. random_state pins the shuffle so the
# split, and every metric computed from it, is the same on each run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Fit a decision tree on the standardised training split. random_state is
# fixed so the fitted tree does not vary between runs.
dtree = DecisionTreeClassifier(random_state=42)
dtree.fit(X_train, y_train)

In [None]:
# Predict the class of every held-out row with the tree fitted on scaled data.
pred = dtree.predict(X_test)

In [None]:
# Print the confusion matrix followed by the per-class report for the
# scaled-feature model, one after the other.
print(
    confusion_matrix(y_test, pred),
    classification_report(y_test, pred),
    sep='\n',
)