# Water quality is a very important topic in the modern world.
# Using this dataset, I will create a model to predict the potability of water.

In [None]:
import numpy as np 
import pandas as pd 

In [None]:
# Read the water-potability CSV into a DataFrame and preview its first rows.
csv_path = "../input/water-potability/water_potability.csv"
df = pd.read_csv(csv_path)
df.head()

In [None]:
# Display the (rows, columns) dimensions of the loaded DataFrame.
df.shape

# The dataset has 3276 rows (samples) and 10 columns.
# Let's check if there are any missing values in the data

In [None]:
# Count the missing values in each column.
df.isna().sum()

# Since the data is sensitive and based on real measurements, we will not fill in missing values; instead, we drop every row that contains one.

In [None]:
# Remove, in place, every row that has at least one missing value,
# then re-count the missing values per column to confirm none remain.
df.dropna(axis=0, how='any', inplace=True)
df.isna().sum()

# Now that we have gotten rid of the missing data, let's check if the data is balanced or imbalanced

In [None]:
# Number of samples in each Potability class.
df['Potability'].value_counts()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

In [None]:
# Donut chart of the class balance in the Potability target.
#
# The slice labels are derived from the class value itself (0 -> 'Unsafe',
# 1 -> 'Safe') and the counts live in an explicitly named column. The earlier
# version relied on value_counts() producing a column called 'Potability' and
# on the rows coming out in "unsafe first" order; pandas >= 2.0 names that
# column 'count', which makes the px.pie(values='Potability') lookup fail.
d = (
    df['Potability']
    .value_counts()
    .rename_axis('class')
    .reset_index(name='count')
)
d['label'] = d['class'].map({0: 'Unsafe', 1: 'Safe'})

fig = px.pie(d, values='count', names='label', hole=0.4, opacity=0.7,
             color_discrete_sequence=['#E5D9FF', '#D0E2F5'])

# Title text placed in the hole of the donut.
fig.add_annotation(text='Potability',
                   x=0.5, y=0.5, showarrow=False, font_size=18, opacity=0.7,
                   font_family='monospace')

fig.update_layout(
    font_family='monospace',
    title=dict(text='Potability Ratio', x=0.5, y=0.98,
               font=dict(color='#636363', size=20)),
    legend=dict(x=0.37, y=-0.05, orientation='h', traceorder='reversed'),
    hoverlabel=dict(bgcolor='white'))

# Show both the percentage and the class label next to each slice.
fig.update_traces(textposition='outside', textinfo='percent+label')

fig.show()

# The roughly 3:2 ratio of unpotable to potable samples shows that the data is slightly imbalanced, but not enough to require upsampling or downsampling. The imbalance also means a model is more likely to produce false negatives (safe water labelled unsafe) than false positives. That is the safer kind of error for this data set, since we especially don't want to predict that water is safe to drink when it is not.

In [None]:
# Annotated heatmap of the pairwise correlations between all columns
# (features and the Potability target).
correlations = df.corr()
plt.figure(figsize=(14, 8))
sns.heatmap(correlations, annot=True)

# This heatmap shows that the correlations between the features are low, and that no single feature has a strong linear relationship with the binary Potability value.

In [None]:
# Plot the distribution of every feature on a 3x3 grid of subplots.
#
# One loop replaces nine copy-pasted subplot stanzas, and sns.histplot replaces
# sns.distplot, which seaborn has deprecated. kde=True with stat='density'
# (and kde_kws cut=3) is seaborn's documented equivalent of the old
# histogram-plus-KDE output.
feature_columns = [
    'ph', 'Hardness', 'Solids',
    'Chloramines', 'Sulfate', 'Conductivity',
    'Organic_carbon', 'Trihalomethanes', 'Turbidity',
]

fig = plt.figure(figsize=(22, 11))
fig.subplots_adjust(hspace=0.4, wspace=0.4)
for position, column in enumerate(feature_columns, start=1):
    ax = fig.add_subplot(3, 3, position)
    sns.histplot(df[column], bins=10, kde=True, stat='density',
                 kde_kws=dict(cut=3), edgecolor='k', linewidth=1, ax=ax)
plt.show()

# The distribution of each feature is centered around its mean and appears approximately normal.

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler

# Separate the target column from the feature columns.
y = df['Potability']
x = df.drop(columns=['Potability'])

# Hold out 30% of the rows for testing; stratify on the target so both
# splits keep the same class proportions.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, stratify=y, random_state=101
)

# Learn the scaling parameters from the training split only, then apply the
# same transformation to both splits.
sc = StandardScaler()
sc.fit(x_train)
x_train = sc.transform(x_train)
x_test = sc.transform(x_test)

from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

# Compare several off-the-shelf classifiers by their mean 5-fold
# cross-validated accuracy on the training split.
#
# Every estimator that accepts a seed is given the same one used for the
# train/test split (101), so the scores and ranking are reproducible between
# runs. KNeighborsClassifier has no random component and takes no seed.
#
# NOTE(review): x_train was standardised before cross-validation, so each
# validation fold has already influenced the scaler. If unbiased fold scores
# matter, put the scaler and classifier in a Pipeline and cross-validate that.
seed = 101
model = [
    AdaBoostClassifier(random_state=seed),
    BaggingClassifier(random_state=seed),
    GradientBoostingClassifier(random_state=seed),
    DecisionTreeClassifier(random_state=seed),
    ExtraTreeClassifier(random_state=seed),
    KNeighborsClassifier(),
]
mod = list(model)
cvscore = [
    cross_val_score(m, x_train, y_train, scoring='accuracy', cv=5).mean()
    for m in model
]

# One row per estimator, best mean CV accuracy first.
model_df = pd.DataFrame(columns=['model', 'cv score'])
model_df['model'] = mod
model_df['cv score'] = cvscore
model_df.sort_values(by=['cv score'], ascending=False)