# Data Info
* ph : pH of water
* Hardness : Capacity of water to precipitate soap in mg/L
* Solids : Total dissolved solids in ppm
* Chloramines : Amount of Chloramines in ppm
* Sulfate : Amount of Sulfates dissolved in mg/L
* Conductivity : Electrical conductivity of water in μS/cm
* Organic_carbon : Amount of organic carbon in ppm
* Trihalomethanes : Amount of Trihalomethanes in μg/L
* Turbidity : Measure of the light-scattering property (cloudiness) of water in NTU (Nephelometric Turbidity Units)
* Potability : Indicates if water is safe for human consumption
    + 1 means Potable 
    + 0 means Not potable

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.patches as mpatches

from pandas_profiling import ProfileReport
from scipy.stats import shapiro
from scipy.stats import levene
import missingno
%matplotlib inline
import seaborn as sns
sns.set_style('whitegrid')
import plotly.graph_objs as go
from plotly.subplots import make_subplots
from plotly.offline import iplot
from collections import Counter
from lightgbm import LGBMClassifier
from sklearn.decomposition import PCA

from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import VotingClassifier

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import auc
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import f1_score
from sklearn.metrics import roc_curve
from sklearn.metrics import plot_roc_curve


from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the raw CSV once and keep it untouched in `water`;
# every later step works on the independent copy `df`.
water = pd.read_csv('/kaggle/input/water-potability/water_potability.csv')
df = water.copy(deep=True)
# Preview the first five rows.
df.head(5)

In [None]:
def check_df(dataframe, head=5):
    """Print a quick structural overview of ``dataframe``.

    Sections are printed in this order: shape, column dtypes, the first and
    last ``head`` rows, missing values per column, number of duplicated rows,
    and selected quantiles of the numeric columns.

    Args:
        dataframe: The pandas DataFrame to inspect.
        head: Number of rows shown in the HEAD and TAIL sections.

    Returns:
        None. The report is written to standard output.
    """

    def banner(title):
        # Section header: the title centred in a 70-character line of '#'.
        return f" {title} ".center(70, '#')

    n_rows, n_columns = dataframe.shape

    print(banner("SHAPE"))
    print(f'Rows: {n_rows}')
    print(f'Columns: {n_columns}')
    print(banner("TYPES"))
    print(dataframe.dtypes)
    print(banner("HEAD"))
    print(dataframe.head(head))
    print(banner("TAIL"))
    print(dataframe.tail(head))
    print(banner("MISSING VALUES"))
    print(dataframe.isnull().sum())
    print(banner("DUPLICATED VALUES"))
    print(dataframe.duplicated().sum())
    print(banner("QUANTILES"))
    # numeric_only=True is passed explicitly: since pandas 2.0 the default is
    # False, which makes quantile() raise on any non-numeric column.
    quantile_levels = [0, 0.05, 0.50, 0.95, 0.99, 1]
    print(dataframe.quantile(quantile_levels, numeric_only=True).T)
    
# Print the overview report for the working copy (five rows in HEAD/TAIL).
check_df(df, head=5)

In [None]:
# Summary statistics (one row per feature) rendered as an annotated heatmap.
numeric_columns = [name for name in df.columns if df[name].dtype != 'O']
desc = df.describe().T
# Build the plotted table from the summary: rows are the non-object columns,
# columns are the statistics in the order describe() reports them.
desc_df = pd.DataFrame(data=desc, index=numeric_columns,
                       columns=desc.columns.tolist())

f, ax = plt.subplots(figsize=(12, 8))
heatmap_options = dict(
    annot=True,
    cmap="Blues",
    fmt='.0f',
    linewidths=5,
    cbar=False,
    annot_kws={"size": 16},
)
sns.heatmap(desc_df, ax=ax, **heatmap_options)

plt.title("Descriptive Statistics", size=16)
plt.xticks(size=18)
plt.yticks(size=14, rotation=0)
plt.show()

In [None]:
# Pearson correlation heatmap showing only the strictly lower triangle.
sns.set_style("white")
corr = df.corr(method="pearson")
# np.triu keeps the upper triangle (diagonal included) and zeroes the rest;
# the array is handed to heatmap() as the mask of cells to hide.
matrix = np.triu(corr)

f, ax = plt.subplots(figsize=(15, 15))
sns.heatmap(
    corr,
    mask=matrix,
    annot=True,
    fmt=".2f",
    annot_kws={"size": 12},
    vmin=-1,
    vmax=1,
    cmap="Blues",
    linewidth=0.4,
    linecolor="white",
    ax=ax,
)
plt.title('Pearson Correlation Map', size=14)
plt.xticks(rotation=90, size=10)
plt.yticks(rotation=0, size=10)
plt.show()

In [None]:
# Potability counts per binned feature value, drawn as a 3x3 grid of overlaid
# bar charts (one subplot per feature).
df_clean = df.copy()
df_clean.dropna(inplace=True)  # the bin indices are cast to int, which cannot hold NaN
fig, ax = plt.subplots(nrows=3, ncols=3, figsize=(30, 20))
sns.set(style="darkgrid")

# One entry per subplot, in row-major grid order:
# (source column, bin-index column written to df_clean, bin width, display label).
# A width of None means "round to the nearest integer and label with that integer";
# a numeric width means fixed-width bins labelled "<lower>-<upper>".
feature_bins = [
    ('ph', 'temp_pH', None, 'pH'),
    ('Hardness', 'temp_Hardness', 40, 'Hardness'),
    ('Solids', 'temp_Solids', 15000, 'Solids'),
    ('Chloramines', 'temp_Chloramines', None, 'Chloramines'),
    ('Sulfate', 'temp_Sulfate', 50, 'Sulfate'),
    ('Conductivity', 'temp_Conductivity', 75, 'Conductivity'),
    ('Organic_carbon', 'temp_Organic_carbon', 2, 'Organic Carbon'),
    ('Trihalomethanes', 'temp_Trihalomethanes', 20, 'Trihalomethanes'),
    ('Turbidity', 'temp_Turbidity', None, 'Turbidity'),
]


def _count_by_bin(frame, column, bin_column, width=None):
    """Count rows per bin of ``column``: all rows and rows with Potability == 0.

    Writes the integer bin index to ``frame[bin_column]`` and returns a table
    with columns ``bin_column``, ``count_x`` (all rows in the bin) and
    ``count_y`` (rows with Potability == 0 in the bin).
    """
    if width is None:
        frame[bin_column] = frame[column].round().astype('int')
    else:
        # Floor rather than round, so bin k holds exactly the values in
        # [k*width, (k+1)*width) -- the interval its tick label states.
        frame[bin_column] = np.floor(frame[column] / width).astype('int')

    total = frame.groupby([bin_column])['Potability'].count().reset_index(name='count')
    not_potable = (
        frame[frame['Potability'] == 0]
        .groupby([bin_column])['Potability']
        .count()
        .reset_index(name='count')
    )
    # Both tables have a 'count' column, so the merge yields count_x / count_y.
    # The outer join keeps bins without any non-potable row; fillna turns their
    # missing count_y into 0.
    counts = total.merge(not_potable, on=bin_column, how='outer').fillna(0)

    if width is not None:
        counts[bin_column] = counts[bin_column].apply(
            lambda k: str(k * width) + "-" + str((k + 1) * width)
        )
    return counts


def _draw_stacked_counts(axis, counts, bin_column, label, legend_handles):
    """Draw the overlaid total / not-potable bars for one feature on ``axis``."""
    # Back layer: every sample in the bin.
    sns.barplot(x=bin_column, y='count_x', data=counts, ax=axis, color='darkblue')
    # Front layer: the not-potable samples. The dark part left visible above
    # the light bar is therefore the potable count.
    sns.barplot(x=bin_column, y='count_y', data=counts, ax=axis,
                estimator=sum, ci=None, color='lightblue')
    axis.legend(handles=legend_handles)
    axis.set_title(label + " vs Potability Plot")
    axis.set_xlabel(label)
    axis.set_ylabel("Count")


# Legend patches shared by all subplots.
top_bar = mpatches.Patch(color='darkblue', label='Potable')
bottom_bar = mpatches.Patch(color='lightblue', label='Not Potable')

for axis, (column, bin_column, width, label) in zip(ax.flat, feature_bins):
    counts = _count_by_bin(df_clean, column, bin_column, width)
    _draw_stacked_counts(axis, counts, bin_column, label, [top_bar, bottom_bar])

fig.suptitle("Potability VS Quantitative Data", fontsize=18)

# show the graph
plt.show()

# NOTE: df_clean still carries the temp_* bin-index columns listed in
# feature_bins; drop them before reusing df_clean for anything but this plot.

In [None]:
# Distribution of pH for each Potability class.
sns.violinplot(data=df, x='Potability', y='ph', palette='Blues')

In [None]:
# Pairwise feature relationships, coloured by Potability class.
sns.pairplot(data=df, palette='Blues', hue="Potability")