In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the Kaggle input directory,
# then print the paths one per line in os.walk order.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from warnings import filterwarnings
filterwarnings('ignore')
import seaborn as sn
import matplotlib.pyplot as plt


In [None]:
# Load the water-potability CSV from the Kaggle input directory into a DataFrame.
df= pd.read_csv('../input/water-potability/water_potability.csv')

In [None]:
# Preview the first few rows of the raw data.
df.head()

In [None]:
# Count the missing values in each column of the raw data.
df.isnull().sum()

In [None]:
# Show column dtypes and non-null counts for the raw data.
df.info()

In [None]:
# Summary statistics (count, mean, std, quartiles, ...) of the numeric columns.
df.describe()

In [None]:
# Keep only complete rows: any row with at least one missing value is dropped.
# dropna() returns a new frame, so the raw `df` stays available unchanged.
df_new= df.dropna()

In [None]:
# Sanity check: every column should report zero missing values after dropna().
df_new.isnull().sum()

In [None]:
# Re-inspect dtypes and the remaining row count after dropping incomplete rows.
df_new.info()

In [None]:
# Class balance of the target variable in the cleaned data.
# The Series is passed explicitly as `x`: from seaborn 0.12 on, the first
# positional argument of countplot is `data`, so a positional Series is no
# longer interpreted as the variable to count.
sn.countplot(x=df_new['Potability'])
plt.show()

In [None]:
# Histogram with a KDE overlay for every column of the raw frame,
# one column per panel on a 5x2 grid.
numeric_columns = list(df.columns)
fig, axs = plt.subplots(5, 2, figsize=(20, 30))
fig.text(.5, .9, 'Distributions of various features', fontsize=50, ha='center')
axs = axs.ravel()
for idx, column in enumerate(numeric_columns):
    panel = axs[idx]
    sn.histplot(x=df[column], ax=panel, kde=True)
    # The panel title already names the column, so the x-label is removed.
    panel.set(xlabel=None)
    panel.set_title(f'Distribution of column : {column}')


In [None]:
# Box plot for every column of the raw frame to make outliers visible,
# one column per panel on a 5x2 grid.
numeric_columns = list(df.columns)
fig, axs = plt.subplots(5, 2, figsize=(20, 30))
fig.text(.5, .9, 'Detecting Outliers', fontsize=50, ha='center')
axs = axs.ravel()
for idx, column in enumerate(numeric_columns):
    panel = axs[idx]
    sn.boxplot(x=df[column], ax=panel)
    # The panel title already names the column, so the x-label is removed.
    panel.set(xlabel=None)
    panel.set_title(f'Box Plot : {column}')


In [None]:
# Group the cleaned data by the target label so each class can be examined separately.
G= df_new.groupby('Potability')
# Descriptive statistics of every column, computed per class.
G.describe()

In [None]:
# Split the grouped data into one frame per class:
# p0 holds the rows with Potability == 0, p1 the rows with Potability == 1.
p0, p1 = (G.get_group(label) for label in (0, 1))

In [None]:
# Histogram with a KDE overlay for every feature of the Potability == 0 rows.
# 'Potability' itself is excluded: it is constant within this group, so its
# histogram would be a single bar and a KDE cannot be estimated from
# zero-variance data.
numeric_columns = [col for col in p0.columns if col != 'Potability']
fig, axs = plt.subplots(5, 2, figsize=(20, 30))
plt.figtext(.5, .9, 'Features when potability= 0', fontsize=50, ha='center')
axs = axs.flatten()
for n, i in enumerate(numeric_columns):
    g = sn.histplot(x=p0[i], ax=axs[n], kde=True)
    # The panel title already names the column, so the x-label is removed.
    g.set(xlabel=None)
    axs[n].set_title('Distribution of column : {}'.format(i))
# Hide any grid cell that did not receive a plot.
for unused_ax in axs[len(numeric_columns):]:
    unused_ax.set_visible(False)


In [None]:
# Histogram with a KDE overlay for every feature of the Potability == 1 rows.
# 'Potability' itself is excluded: it is constant within this group, so its
# histogram would be a single bar and a KDE cannot be estimated from
# zero-variance data.
numeric_columns = [col for col in p1.columns if col != 'Potability']
fig, axs = plt.subplots(5, 2, figsize=(20, 30))
plt.figtext(.5, .9, 'Features when Potability = 1', fontsize=50, ha='center')
axs = axs.flatten()
for n, i in enumerate(numeric_columns):
    g = sn.histplot(x=p1[i], ax=axs[n], kde=True)
    # The panel title already names the column, so the x-label is removed.
    g.set(xlabel=None)
    axs[n].set_title('Distribution of column : {}'.format(i))
# Hide any grid cell that did not receive a plot.
for unused_ax in axs[len(numeric_columns):]:
    unused_ax.set_visible(False)


In [None]:
# Pairwise feature correlations for the Potability == 0 rows.
# The target column is dropped before computing correlations.
plt.figure(figsize=(20,10))
sn.heatmap(p0.drop(columns='Potability').corr(),annot=True)
# Title the figure so it can be told apart from the Potability == 1 heatmap.
plt.title('Feature correlations when Potability = 0')
plt.show()

In [None]:
# Pairwise feature correlations for the Potability == 1 rows.
# The target column is dropped before computing correlations.
plt.figure(figsize=(20,10))
sn.heatmap(p1.drop(columns='Potability').corr(),annot=True)
# Title the figure so it can be told apart from the Potability == 0 heatmap.
plt.title('Feature correlations when Potability = 1')
plt.show()

# What's wrong with the data?

**This is unusual. I expected a certain degree of difference in skewness and kurtosis when comparing the feature distributions of the two classes separately. Surprisingly, they follow the same pattern, which means there is not much to distinguish potable water from unsafe water. For example, we can see the pH value of water labelled as 'Potable' exceeding 10. I observed the same thing with other features too. How does that make sense?**

**Still, just to see whether a neural network or a random forest classifier can catch some underlying patterns in these distributions, I'll proceed with training two models: a neural network and a random forest classifier (RFC).** 

# Training the models: 

In [None]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Input , Dropout , Concatenate , BatchNormalization as BN
from tensorflow.keras.utils import to_categorical

# ANN : 

In [None]:
# Fully connected binary classifier built with the Keras functional API:
# input -> 64 -> 128 -> 256 -> 512 -> 1024 -> 2048 -> 1 (sigmoid).
# Every hidden Dense layer is followed by batch normalisation, and dropout (0.5)
# is applied after the 64-unit and the 256-unit blocks.

# One input per column of df except one (the target is not fed to the network).
input_layer= Input(shape= [len(df.columns)-1])
fc1= BN()(Dense(64,activation='relu')(input_layer))
drop1=Dropout(0.5)(fc1)
fc2= BN()(Dense(128,activation='relu')(drop1))
fc3= BN()(Dense(256,activation='relu')(fc2))

drop2= Dropout(0.5)(fc3)

fc4= BN()(Dense(512,activation= 'relu')(drop2))

fc5= BN()(Dense(1024,activation= 'relu')(fc4))
# The last hidden layer consumes fc5. It was previously wired to fc2, which left
# fc3, drop2, fc4 and fc5 disconnected from the output and out of the model.
fc6= BN()(Dense(2048,activation= 'relu')(fc5))

# Single sigmoid unit: the output is the predicted probability of the positive class.
out= Dense(1,activation= 'sigmoid')(fc6)
model = Model(inputs= input_layer, outputs= out)
model.summary()

In [None]:
# Feature matrix: every column of the cleaned frame except the last one,
# copied into a NumPy array.
# NOTE(review): assumes the target ('Potability') is the last column — confirm.
data = df_new.iloc[:, :-1].to_numpy(copy=True)

In [None]:
from sklearn.model_selection import train_test_split as tts
# Hold out a test set from the cleaned data.
# random_state fixes the shuffle so the split (and every score computed from it)
# is the same on each run; stratify keeps the class ratio of 'Potability'
# identical in the training and test parts.
train_data,test_data,train_lab,test_lab= tts(
    data,
    df_new.Potability,
    random_state=42,
    stratify=df_new.Potability,
)

In [None]:
from sklearn.preprocessing import MinMaxScaler as MMS
# Learn the per-feature min/max from the training split only, then use the
# fitted scaler to rescale that same split.
scaler = MMS().fit(train_data)
X = scaler.transform(train_data)

In [None]:
# Training labels as a plain NumPy array (the pandas index is discarded).
Y = train_lab.to_numpy(copy=True)

In [None]:
# Test labels as a plain NumPy array (the pandas index is discarded).
y_test = test_lab.to_numpy(copy=True)
# Rescale the test features with the scaler already fitted on the training split;
# the scaler is not refitted here.
x_test = scaler.transform(test_data)

In [None]:
# Configure training: Adam optimizer, binary cross-entropy loss, accuracy as the reported metric.
model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])

In [None]:
# Train on the scaled training features for 100 epochs in mini-batches of 64.
# No validation data is passed, so only training loss/accuracy is reported per epoch.
model.fit(X,Y,epochs= 100, batch_size=64)

In [None]:
# Evaluate the trained network on the held-out test split (loss and accuracy).
model.evaluate(x_test,y_test)

# Random Forest Classifier : 

In [None]:
from sklearn.ensemble import RandomForestClassifier as RFC
# Random forest with default hyper-parameters. random_state is fixed so the
# bootstrap samples and feature choices — and therefore the scores — are the
# same on every run.
model1= RFC(random_state=42)

In [None]:
# Fit the random forest on the same scaled training data used for the neural network.
model1.fit(X,Y)

**Training score :**

In [None]:
# Mean accuracy on the data the forest was trained on.
model1.score(X,Y)

In [None]:
# Mean accuracy on the held-out test split.
model1.score(x_test,y_test)

**One would expect a neural network to capture complicated relationships between the various features and the output, but the score on the held-out test data clearly isn't good enough. What do you think about the data?**