In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Collect the full path of every file below the Kaggle input directory,
# then print them one per line in walk order.
input_paths = [
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
]
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**Data Collection**

In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
sns.set_theme()
%config InlineBackend.figure_format = 'retina'

from sklearn.model_selection import train_test_split,GridSearchCV,RandomizedSearchCV,cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier,BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score,classification_report,precision_score,recall_score,roc_auc_score,confusion_matrix,roc_curve
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings("ignore")

In [None]:
df = pd.read_csv('../input/water-potability/water_potability.csv')

In [None]:
df.sample(10)

**DESCRIPTION**

1. **pH value:**
PH is an important parameter in evaluating the acid–base balance of water. It is also the indicator of acidic or alkaline condition of water status. WHO has recommended maximum permissible limit of pH from 6.5 to 8.5. The current investigation ranges were 6.52–6.83 which are in the range of WHO standards.

2. **Hardness:**
Hardness is mainly caused by calcium and magnesium salts. These salts are dissolved from geologic deposits through which water travels. The length of time water is in contact with hardness producing material helps determine how much hardness there is in raw water. Hardness was originally defined as the capacity of water to precipitate soap caused by Calcium and Magnesium.

3. **Solids (Total dissolved solids - TDS):**
Water has the ability to dissolve a wide range of inorganic and some organic minerals or salts such as potassium, calcium, sodium, bicarbonates, chlorides, magnesium, sulfates etc. These minerals produce an unwanted taste and a diluted color in the appearance of water. This is an important parameter for the use of water. Water with a high TDS value is highly mineralized. The desirable limit for TDS is 500 mg/l and the maximum limit is 1000 mg/l, as prescribed for drinking purposes.

4. **Chloramines:**
Chlorine and chloramine are the major disinfectants used in public water systems. Chloramines are most commonly formed when ammonia is added to chlorine to treat drinking water. Chlorine levels up to 4 milligrams per liter (mg/L or 4 parts per million (ppm)) are considered safe in drinking water.

5. **Sulfate:**
Sulfates are naturally occurring substances that are found in minerals, soil, and rocks. They are present in ambient air, groundwater, plants, and food. The principal commercial use of sulfate is in the chemical industry. Sulfate concentration in seawater is about 2,700 milligrams per liter (mg/L). It ranges from 3 to 30 mg/L in most freshwater supplies, although much higher concentrations (1000 mg/L) are found in some geographic locations.

6. **Conductivity:**
Pure water is not a good conductor of electric current; rather, it is a good insulator. An increase in ion concentration enhances the electrical conductivity of water. Generally, the amount of dissolved solids in water determines the electrical conductivity. Electrical conductivity (EC) actually measures the ionic process of a solution that enables it to transmit current. According to WHO standards, the EC value should not exceed 400 μS/cm.

7. **Organic_carbon:**
Total Organic Carbon (TOC) in source waters comes from decaying natural organic matter (NOM) as well as synthetic sources. TOC is a measure of the total amount of carbon in organic compounds in pure water. According to the US EPA, TOC should be < 2 mg/L in treated / drinking water and < 4 mg/L in source water which is used for treatment.

8. **Trihalomethanes:**
THMs are chemicals which may be found in water treated with chlorine. The concentration of THMs in drinking water varies according to the level of organic material in the water, the amount of chlorine required to treat the water, and the temperature of the water that is being treated. THM levels up to 80 ppm are considered safe in drinking water.

9. **Turbidity:**
The turbidity of water depends on the quantity of solid matter present in the suspended state. It is a measure of light emitting properties of water and the test is used to indicate the quality of waste discharge with respect to colloidal matter. The mean turbidity value obtained for Wondo Genet Campus (0.98 NTU) is lower than the WHO recommended value of 5.00 NTU.

10. **Potability:**
Indicates if water is safe for human consumption where 1 means Potable and 0 means Not potable.

In [None]:
df.info()

In [None]:
df.shape

In [None]:
df.describe()

In [None]:
df.columns

**DATA CLEANING & EXPLORATION**

In [None]:
df.isnull().sum() #finding total number of missing values in each columns

In [None]:
x = df.shape[0]  # total number of rows in the frame
# Missing-value count per column, expressed as a percentage of all rows.
df.isna().sum().div(x).mul(100).to_frame(name='Percentage of Null Values')

In [None]:
df.info()

In [None]:
df.dropna(inplace=True)

In [None]:
df.describe()

In [None]:
df.shape

In [None]:
df.skew(axis=0,skipna=True).to_frame(name='Skewed Values')

In [None]:
# Column names of the dataset in file order: nine measurements, then the target.
feature = (
    'ph Hardness Solids Chloramines Sulfate Conductivity '
    'Organic_carbon Trihalomethanes Turbidity Potability'
).split()
# Show each column name together with its positional index.
list(enumerate(feature))

In [None]:
# 3x3 grid of box plots: one panel per measured feature, split by Potability class.
boxplot_columns = ['ph', 'Hardness', 'Solids',
                   'Chloramines', 'Sulfate', 'Conductivity',
                   'Organic_carbon', 'Trihalomethanes', 'Turbidity']

fig, axes = plt.subplots(3, 3, figsize=(22,20))

# axes.flat walks the grid row by row, matching the order of the column list.
for panel, column in zip(axes.flat, boxplot_columns):
    sns.boxplot(ax=panel, data=df, x='Potability', y=column)


1. From the boxplots we can clearly see that outliers are present in the data set. They were intended to be removed later, but no cell in this notebook currently does so.

In [None]:
plt.figure(figsize=(8,5))

sns.countplot(x='Potability',data=df,saturation=0.95)

In [None]:
df['Potability'].value_counts()

In [None]:
# Class balance: share of potable (1) and non-potable (0) samples, in percent.
class_counts = df['Potability'].value_counts()
total_rows = len(df)
print("Potable water %",(class_counts[1]/total_rows*100))
print("Non-Potable water %",(class_counts[0]/total_rows*100))

In [None]:
df.iloc[:,:-1].mean().to_frame().T

In [None]:
# Overlaid 'ph' distributions: potable samples in maroon, non-potable in blue.
plt.figure(figsize=(13,12))
# Each distribution carries its own legend label. Passing a bare list to
# plt.legend() assigns labels by artist draw order, which depends on the
# matplotlib/seaborn versions and can attach both labels to the first
# (maroon) distribution.
sns.distplot(df['ph'][df.Potability==1],hist=True, rug=True,color='Maroon',hist_kws={'alpha':0.50},label='Potability')
sns.distplot(df['ph'][df.Potability==0],hist=True, rug=True,color='Blue',hist_kws={'alpha':0.60},label='Non Potability')
plt.legend()

In [None]:
# Overlaid 'Solids' distributions: potable samples in maroon, non-potable in blue.
plt.figure(figsize=(13,12))
# Each distribution carries its own legend label. Passing a bare list to
# plt.legend() assigns labels by artist draw order, which depends on the
# matplotlib/seaborn versions and can attach both labels to the first
# (maroon) distribution.
sns.distplot(df['Solids'][df.Potability==1],hist=True, rug=True,color='Maroon',hist_kws={'alpha':0.50},label='Potability')
sns.distplot(df['Solids'][df.Potability==0],hist=True, rug=True,color='Blue',hist_kws={'alpha':0.60},label='Non Potability')
plt.legend()

In [None]:
# Overlaid 'Sulfate' distributions: potable samples in maroon, non-potable in blue.
plt.figure(figsize=(13,12))
# Each distribution carries its own legend label. Passing a bare list to
# plt.legend() assigns labels by artist draw order, which depends on the
# matplotlib/seaborn versions and can attach both labels to the first
# (maroon) distribution.
sns.distplot(df['Sulfate'][df.Potability==1],hist=True, rug=True,color='Maroon',hist_kws={'alpha':0.50},label='Potability')
sns.distplot(df['Sulfate'][df.Potability==0],hist=True, rug=True,color='Blue',hist_kws={'alpha':0.60},label='Non Potability')
plt.legend()

In [None]:
# Overlaid 'Trihalomethanes' distributions: potable samples in maroon, non-potable in blue.
plt.figure(figsize=(13,12))
# Each distribution carries its own legend label. Passing a bare list to
# plt.legend() assigns labels by artist draw order, which depends on the
# matplotlib/seaborn versions and can attach both labels to the first
# (maroon) distribution.
sns.distplot(df['Trihalomethanes'][df.Potability==1],hist=True, rug=True,color='Maroon',hist_kws={'alpha':0.50},label='Potability')
sns.distplot(df['Trihalomethanes'][df.Potability==0],hist=True, rug=True,color='Blue',hist_kws={'alpha':0.60},label='Non Potability')
plt.legend()

In [None]:
# Overlaid 'Conductivity' distributions: potable samples in maroon, non-potable in blue.
plt.figure(figsize=(13,12))
# Each distribution carries its own legend label. Passing a bare list to
# plt.legend() assigns labels by artist draw order, which depends on the
# matplotlib/seaborn versions and can attach both labels to the first
# (maroon) distribution.
sns.distplot(df['Conductivity'][df.Potability==1],hist=True, rug=True,color='Maroon',hist_kws={'alpha':0.50},label='Potability')
sns.distplot(df['Conductivity'][df.Potability==0],hist=True, rug=True,color='Blue',hist_kws={'alpha':0.60},label='Non Potability')
plt.legend()

1. Both potable and non-potable water are within the WHO standard pH range, i.e. 6.5 to 8.5.
2. The solids content of both potable and non-potable water is high, but it is lower in potable water.
3. The sulfate content of non-potable water is much higher than that of potable water, but both are still drinkable in this respect, as the sulfate content is less than 1000 mg/L.
4. The trihalomethanes content is higher in non-potable water but below the permissible limit; nevertheless, this non-potable water is not recommended for drinking.
5. The conductivity of non-potable water is far above the permissible limit. This is because of its content of solids, organic matter, etc.
6. From the mean table we can also see that most water is below pH 8.5, but the conductivity is high, which makes it unhealthy.
**As the conductivity, trihalomethanes and other levels are considerably higher than in potable water, it is unhealthy to drink. Therefore almost 60% of the water is non-potable.**

In [None]:
# Annotated heatmap of pairwise correlations between all columns except the
# last one in the frame.
plt.figure(figsize=(18,15))
# Compute the correlation matrix once and plot it directly; re-selecting the
# same columns and calling .corr() a second time yields the identical matrix.
corr=df.iloc[:,:-1].corr()
sns.heatmap(corr,cmap='cividis',annot=True,linewidths=2)

**From the correlation table we see that no independent variable is highly correlated with another independent variable.**

In [None]:
# Split the observed pH range into 14 equal-width bins labelled 1..14.
# The edge array is deliberately not called `bin`, which would shadow the builtin.
ph_edges=np.linspace(df['ph'].min(),df['ph'].max(),15)
groups=list(range(1,15))
# include_lowest=True closes the first interval on the left; without it the
# sample holding the minimum pH falls outside every bin and is binned as NaN.
df['ph-binned']=pd.cut(df['ph'],bins=ph_edges,labels=groups,include_lowest=True)

In [None]:
# Number of potable / non-potable samples in each pH bin.
plt.figure(figsize=(10,8))
# Pass the frame and column names by keyword: newer seaborn releases treat the
# first positional argument as `data`, so handing the x vector positionally
# is not portable across versions.
sns.countplot(data=df,x='ph-binned',hue='Potability',palette='viridis')

In [None]:
# Remove the temporary binning column again. `columns=` already selects the
# axis, and errors='ignore' keeps the cell safe to re-run once the column is gone.
df.drop(columns='ph-binned',inplace=True,errors='ignore')

In [None]:
sns.pairplot(df,hue='Potability')

**PREPARATION**

In [None]:
#Old Skewness
df.skew()

In [None]:
from scipy import stats
import pylab
def normality(data,feature):
    """Show a KDE plot and a probability plot of one column side by side.

    Parameters
    ----------
    data : object indexable by column name (a DataFrame in this notebook)
        Container holding the values to inspect.
    feature : str
        Name of the column in ``data`` to plot.

    Returns
    -------
    None
        The figure is displayed via ``plt.show()``.
    """
    # Explicit axes replace the implicit pyplot/pylab state machine: the left
    # panel receives the density estimate, the right panel the probability plot.
    fig, (kde_ax, qq_ax) = plt.subplots(1, 2, figsize=(10,5))
    sns.kdeplot(data[feature], ax=kde_ax)
    # probplot accepts any object exposing plot/text methods, so it can draw
    # straight onto the Axes.
    stats.probplot(data[feature], plot=qq_ax)
    plt.show()

In [None]:
normality(df,'Solids')

In [None]:
# Box-Cox transform the 'Solids' column in place; `param` receives the second
# value returned by stats.boxcox (presumably the fitted lambda — confirm in the SciPy docs).
# NOTE(review): the column is overwritten, so re-running this cell transforms
# already-transformed values.
# NOTE(review): this runs before the train/test split further down, so the
# transform is fitted on all rows — confirm that is acceptable.
df['Solids'],param=stats.boxcox(df['Solids'])

In [None]:
normality(df,'Solids')

In [None]:
normality(df,'Conductivity')

In [None]:
# Box-Cox transform the 'Conductivity' column in place; `param` receives the
# second value returned by stats.boxcox and replaces whatever `param` held before.
# NOTE(review): the column is overwritten, so re-running this cell transforms
# already-transformed values.
# NOTE(review): this runs before the train/test split further down, so the
# transform is fitted on all rows — confirm that is acceptable.
df['Conductivity'],param=stats.boxcox(df['Conductivity'])

In [None]:
normality(df,'Conductivity')

In [None]:
df.skew()

**We apply a Gaussian transform, i.e. the Box-Cox transformation, to Solids and Conductivity only, as they were the most skewed; the other features already had a skew value close to 0. We didn't transform the target, as it is a discrete value.**

In [None]:
# Feature matrix: every column of the frame except the target.
X = df.drop(columns="Potability")
X.head()

In [None]:
y = df['Potability']

In [None]:
from sklearn.ensemble import ExtraTreesClassifier
# Fit an extra-trees ensemble on all features to obtain impurity-based
# importances. The seed is fixed because the ensemble is randomised; without
# it the importances change on every run.
model = ExtraTreesClassifier(random_state=42)
model.fit(X,y)
print(model.feature_importances_)


In [None]:
# Label each importance with its feature name and draw a horizontal bar chart.
feat_importances = pd.Series(data=model.feature_importances_, index=X.columns)
feat_importances.plot.barh()
plt.show()

**Model Training**

**LOGISTIC REGRESSION**

In [None]:
# Hold out 20% of the rows for testing. stratify=y keeps the potable /
# non-potable ratio the same in both partitions, which matters because the
# two classes are not equally frequent.
x_train,x_test,y_train,y_test=train_test_split(X,y,test_size=0.20,random_state=45,stratify=y)

In [None]:
# Standardise the features: statistics are learned from the training split
# only and then applied unchanged to the test split.
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

In [None]:
logireg = LogisticRegression()

In [None]:
logireg.fit(x_train,y_train)

In [None]:
logireg.score(x_test,y_test)

In [None]:
y_pred_train = logireg.predict(x_train)

In [None]:
y_pred_test = logireg.predict(x_test)
y_pred_test

In [None]:
print("Training accuracy: ",accuracy_score(y_train,y_pred_train))
lg_acc=accuracy_score(y_test,y_pred_test)
print("Test accuracy: ",lg_acc)

In [None]:
plt.figure(figsize=(9,6),facecolor='lightyellow')
sns.heatmap(confusion_matrix(y_test,y_pred_test),annot=True,linewidths=2,cmap='crest_r',fmt='.3g',linecolor='orange',square=True)
plt.xlabel('Predicted')
plt.ylabel('Actual')

In [None]:

# ROC AUC ranks samples by a continuous score, so use the predicted
# probability of class 1. Hard 0/1 predictions collapse the curve to a single
# threshold.
roc_auc_score(y_test,logireg.predict_proba(x_test)[:,1])

In [None]:
print(classification_report(y_test,y_pred_test))

In [None]:
print((cross_val_score(logireg, x_train,y_train, cv=6)).mean())

**NAIVE BAYES**

In [None]:
gb = GaussianNB()
gb.fit(x_train,y_train)

In [None]:
gb.score(x_test,y_test)

In [None]:
y_pred_test_gb = gb.predict(x_test)
y_pred_test_gb

In [None]:
y_pred_train_gb = gb.predict(x_train)

In [None]:
print("Training accuracy: ",accuracy_score(y_train,y_pred_train_gb))
gb_acc=accuracy_score(y_test,y_pred_test_gb)
print("Test accuracy: ",gb_acc)

In [None]:
plt.figure(figsize=(9,6),facecolor='lightyellow')
sns.heatmap(confusion_matrix(y_test,y_pred_test_gb),annot=True,linewidths=2,cmap='crest_r',fmt='.3g',linecolor='orange',square=True)
plt.xlabel('Predicted')
plt.ylabel('Actual')

In [None]:
# ROC AUC ranks samples by a continuous score, so use the predicted
# probability of class 1. Hard 0/1 predictions collapse the curve to a single
# threshold.
roc_auc_score(y_test,gb.predict_proba(x_test)[:,1])

In [None]:
print((cross_val_score(gb, x_train,y_train, cv=6)).mean())

In [None]:
print(classification_report(y_test,y_pred_test_gb))

**KNN Classifier**

In [None]:
knn = KNeighborsClassifier(n_neighbors=6,n_jobs=-1)

In [None]:
knn.fit(x_train,y_train)

In [None]:
knn.score(x_test,y_test)

In [None]:
y_pred_test_knn = knn.predict(x_test)
y_pred_test_knn

In [None]:
y_pred_train_knn = knn.predict(x_train)

In [None]:
print("Training accuracy: ",accuracy_score(y_train,y_pred_train_knn))
knn_acc=accuracy_score(y_test,y_pred_test_knn)
print("Test accuracy: ",knn_acc)

In [None]:

# ROC AUC ranks samples by a continuous score, so use the predicted
# probability of class 1. Hard 0/1 predictions collapse the curve to a single
# threshold.
roc_auc_score(y_test,knn.predict_proba(x_test)[:,1])

In [None]:
plt.figure(figsize=(9,6),facecolor='lightyellow')
sns.heatmap(confusion_matrix(y_test,y_pred_test_knn),annot=True,linewidths=2,cmap='crest_r',fmt='.3g',linecolor='orange',square=True)
plt.xlabel('Predicted')
plt.ylabel('Actual')

In [None]:
print(classification_report(y_test,y_pred_test_knn))

In [None]:
print((cross_val_score(knn, x_train,y_train, cv=6)).mean())

**DECISION TREE**

In [None]:
# Decision tree with gini splits. random_state is fixed so the fitted tree is
# reproducible between runs, like the seeded xgb / mlp / svc models below.
dt = DecisionTreeClassifier(criterion='gini',min_samples_leaf=2,min_samples_split=2,max_depth=70,random_state=42)

In [None]:
dt.fit(x_train,y_train)

In [None]:
dt.score(x_test,y_test)

In [None]:
y_pred_test_dt = dt.predict(x_test)
y_pred_test_dt

In [None]:
y_pred_train_dt = dt.predict(x_train)

In [None]:
print("Training accuracy: ",accuracy_score(y_train,y_pred_train_dt))
dt_acc=accuracy_score(y_test,y_pred_test_dt)
print("Test accuracy: ",dt_acc)

In [None]:
# ROC AUC ranks samples by a continuous score, so use the predicted
# probability of class 1. Hard 0/1 predictions collapse the curve to a single
# threshold.
roc_auc_score(y_test,dt.predict_proba(x_test)[:,1])

In [None]:
plt.figure(figsize=(9,6),facecolor='lightyellow')
sns.heatmap(confusion_matrix(y_test,y_pred_test_dt),annot=True,linewidths=2,cmap='crest_r',fmt='.3g',linecolor='orange',square=True)
plt.xlabel('Predicted')
plt.ylabel('Actual')

In [None]:
print(classification_report(y_test,y_pred_test_dt))

**ADABOOST with Decision Tree**

In [None]:
# AdaBoost ensemble using the decision tree configured above as base learner.
# random_state is fixed so the boosted ensemble is reproducible between runs.
ada_dt = AdaBoostClassifier(dt,n_estimators=75,learning_rate=0.01,random_state=42)

In [None]:
ada_dt.fit(x_train,y_train)

In [None]:
# Report accuracy on the held-out test split, as done for every other model
# in this notebook (the training split was being scored here by mistake).
ada_dt.score(x_test,y_test)

In [None]:
y_pred_test_adadt = ada_dt.predict(x_test)
y_pred_test_adadt

In [None]:
y_pred_train_adadt= ada_dt.predict(x_train)

In [None]:
print("Training accuracy: ",accuracy_score(y_train,y_pred_train_adadt))
adadt_acc=accuracy_score(y_test,y_pred_test_adadt)
print("Test accuracy: ",adadt_acc)

In [None]:
plt.figure(figsize=(9,6),facecolor='lightyellow')
sns.heatmap(confusion_matrix(y_test,y_pred_test_adadt),annot=True,linewidths=2,cmap='crest_r',fmt='.3g',linecolor='orange',square=True)
plt.xlabel('Predicted')
plt.ylabel('Actual')

In [None]:
# ROC AUC ranks samples by a continuous score, so use the predicted
# probability of class 1. Hard 0/1 predictions collapse the curve to a single
# threshold.
roc_auc_score(y_test,ada_dt.predict_proba(x_test)[:,1])

In [None]:
print(classification_report(y_test,y_pred_test_adadt))

**XGBOOST**

In [None]:
xgb = XGBClassifier(random_state=40)

In [None]:
xgb.fit(x_train,y_train)

In [None]:
xgb.score(x_test,y_test)

In [None]:
y_pred_test_xgb= xgb.predict(x_test)
y_pred_test_xgb

In [None]:
y_pred_train_xgb = xgb.predict(x_train)

In [None]:
print("Training accuracy: ",accuracy_score(y_train,y_pred_train_xgb))
xgb_acc=accuracy_score(y_test,y_pred_test_xgb)
print("Test accuracy: ",xgb_acc)

In [None]:
plt.figure(figsize=(9,6),facecolor='lightyellow')
sns.heatmap(confusion_matrix(y_test,y_pred_test_xgb),annot=True,linewidths=2,cmap='crest_r',fmt='.3g',linecolor='orange',square=True)
plt.xlabel('Predicted')
plt.ylabel('Actual')

In [None]:
# ROC AUC ranks samples by a continuous score, so use the predicted
# probability of class 1. Hard 0/1 predictions collapse the curve to a single
# threshold.
roc_auc_score(y_test,xgb.predict_proba(x_test)[:,1])

In [None]:
print(classification_report(y_test,y_pred_test_xgb))

**MLP**


In [None]:
mlp = MLPClassifier(max_iter=600,random_state=5)

In [None]:
mlp.fit(x_train,y_train)

In [None]:
mlp.score(x_test,y_test)

In [None]:
y_pred_test_mlp= mlp.predict(x_test)
y_pred_test_mlp

In [None]:
y_pred_train_mlp= mlp.predict(x_train)

In [None]:
print("Training accuracy: ",accuracy_score(y_train,y_pred_train_mlp))
mlp_acc=accuracy_score(y_test,y_pred_test_mlp)
print("Test accuracy: ",mlp_acc)

In [None]:
plt.figure(figsize=(9,6),facecolor='lightyellow')
sns.heatmap(confusion_matrix(y_test,y_pred_test_mlp),annot=True,linewidths=2,cmap='crest_r',fmt='.3g',linecolor='orange',square=True)
plt.xlabel('Predicted')
plt.ylabel('Actual')

In [None]:
# ROC AUC ranks samples by a continuous score, so use the predicted
# probability of class 1. Hard 0/1 predictions collapse the curve to a single
# threshold.
roc_auc_score(y_test,mlp.predict_proba(x_test)[:,1])

In [None]:
print(classification_report(y_test,y_pred_test_mlp))

**Random Forest**

In [None]:
# Random forest of 300 gini trees. random_state is fixed so the forest (and
# its reported accuracy) is reproducible between runs.
rfc = RandomForestClassifier(n_estimators=300,criterion='gini',random_state=42)

In [None]:
rfc.fit(x_train,y_train)

In [None]:
rfc.score(x_test,y_test)

In [None]:
y_pred_test_rfc= rfc.predict(x_test)
y_pred_test_rfc

In [None]:
y_pred_train_rfc= rfc.predict(x_train)

In [None]:
print("Training accuracy: ",accuracy_score(y_train,y_pred_train_rfc))
rfc_acc=accuracy_score(y_test,y_pred_test_rfc)
print("Test accuracy: ",rfc_acc)

In [None]:
plt.figure(figsize=(9,6),facecolor='lightyellow')
sns.heatmap(confusion_matrix(y_test,y_pred_test_rfc),annot=True,linewidths=2,cmap='crest_r',fmt='.3g',linecolor='orange',square=True)
plt.xlabel('Predicted')
plt.ylabel('Actual')

In [None]:
# ROC AUC ranks samples by a continuous score, so use the predicted
# probability of class 1. Hard 0/1 predictions collapse the curve to a single
# threshold.
roc_auc_score(y_test,rfc.predict_proba(x_test)[:,1])

In [None]:
print(classification_report(y_test,y_pred_test_rfc))

**Support Vector Classifier**

In [None]:
svc=SVC(gamma='auto',kernel='rbf',random_state=3,probability=True)

In [None]:
svc.fit(x_train,y_train)

In [None]:
svc.score(x_test,y_test)

In [None]:
y_pred_test_svc= svc.predict(x_test)
y_pred_test_svc

In [None]:
y_pred_train_svc= svc.predict(x_train)

In [None]:
# Train / test accuracy of the SVC. The test score is stored once in svc_acc
# and printed from it, matching the other model sections.
print("Training accuracy: ",accuracy_score(y_train,y_pred_train_svc))
svc_acc=accuracy_score(y_test,y_pred_test_svc)
print("Test accuracy: ",svc_acc)

In [None]:
plt.figure(figsize=(9,6),facecolor='lightyellow')
sns.heatmap(confusion_matrix(y_test,y_pred_test_svc),annot=True,linewidths=2,cmap='crest_r',fmt='.3g',linecolor='orange',square=True)
plt.xlabel('Predicted')
plt.ylabel('Actual')

In [None]:
# ROC AUC ranks samples by a continuous score, so use the predicted
# probability of class 1 (the SVC was created with probability=True). Hard
# 0/1 predictions collapse the curve to a single threshold.
roc_auc_score(y_test,svc.predict_proba(x_test)[:,1])

In [None]:
print(classification_report(y_test,y_pred_test_svc))

**Models Overview**

In [None]:
# Summary table: one row per fitted model with its test-set accuracy.
model_names = [
    'Logistic Regression', 'GaussianNB', 'KNN', 'Decision Tree',
    'AdaBoost with Decision Tree', 'XGBoost', 'MLP', 'Random Forest', 'SVC',
]
test_accuracies = [
    lg_acc, gb_acc, knn_acc, dt_acc,
    adadt_acc, xgb_acc, mlp_acc, rfc_acc, svc_acc,
]
model_df = pd.DataFrame({'Models': model_names, 'Accuracy Score': test_accuracies})

In [None]:
model_df.sort_values(by = 'Accuracy Score', ascending = False)

In [None]:
# Bar chart comparing the test accuracy of all models.
plt.figure(figsize = (25,10))
sns.set_theme(style="whitegrid")
sns.barplot(x = 'Models', y = 'Accuracy Score', data = model_df)
# Keep the original 0.75 ceiling, but raise it when a model scores higher so
# that no bar is clipped by the hard-coded limit.
plt.ylim(0,max(0.75,model_df['Accuracy Score'].max()))
plt.show()
