In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import random

# random.seed() returns None, so store the seed value itself in `seed`
# and then seed the RNGs that are already imported at this point.
seed = 100
random.seed(seed)
np.random.seed(seed)

In [None]:
# Read the water-potability CSV into a DataFrame and preview its first rows.
path = '../input/water-potability/water_potability.csv'
dataLoad = pd.read_csv(path)
dataLoad.head()

In [None]:
# Number of missing values in each column of the raw data.
dataLoad.isnull().sum()

In [None]:
# Column dtypes and non-null counts of the raw data.
dataLoad.info()

In [None]:
# Summary statistics of the raw data.
dataLoad.describe()

## EDA (Exploratory Data Analysis)

## Univariate Analysis

In [None]:
# Class balance of the target column. A descriptive name is used here because
# `a` is assigned a different object (the feature DataFrame) in a later cell.
potability_counts = dataLoad['Potability'].value_counts()
ax = sns.barplot(x=potability_counts.index, y=potability_counts.values)
# Label the figure so it can be read on its own.
ax.set(title='Potability class counts', xlabel='Potability', ylabel='Number of samples');

In [None]:
# Draw one histogram per column of the raw data, each as its own figure.
for column in dataLoad.columns:
    sns.histplot(data=dataLoad, x=column)
    plt.show()

In [None]:
# Feature-only frame: every column of the raw data except the target.
# `drop` returns a new DataFrame, so `dataLoad` itself is left untouched.
a = dataLoad.drop(columns='Potability')
a

In [None]:
from scipy import stats

# Shapiro-Wilk normality test for each feature, judged at the 5% level.
for i in a.columns:
    # Features may contain missing values (they are only imputed later in the
    # notebook). Drop them first: NaNs in the input make the test statistic
    # and p-value meaningless.
    stat, p = stats.shapiro(a[i].dropna())

    if p > 0.05:
        print('{} feature does have normal distribution (p : {})'.format(i, round(p, 3)))
    else:
        print('{} feature doesn\'t have normal distribution (p : {})'.format(i, round(p, 3)))
         

In [None]:
# (row, column) positions of a 3x3 grid in row-major order, and the feature names.
ls = [(m, n) for m in range(3) for n in range(3)]
col = list(a.columns)

# One boxplot per feature, each drawn in its own grid cell.
figh, axes = plt.subplots(nrows=3, ncols=3, figsize=(18, 15))
for num, (row, column) in enumerate(ls):
    sns.boxplot(data=a, y=col[num], ax=axes[row, column])
    

## Bivariate Analysis

In [None]:
# (row, column) positions of a 3x3 grid, built locally in `ls1` so this cell
# neither depends on nor mutates the `ls` list of the previous cell.
ls1 = [(m, n) for m in range(3) for n in range(3)]
col1 = list(a.columns)

# One boxplot per feature, split by the Potability class on the x axis.
figh, axes1 = plt.subplots(3, 3, figsize=(18, 15))
for num1, (row, column) in enumerate(ls1):
    sns.boxplot(ax=axes1[row, column], y=a[col1[num1]], x=dataLoad['Potability'])

In [None]:
# Pairwise relationships between all feature columns.
sns.pairplot(a)

In [None]:
# Annotated heatmap of the pairwise correlations between the features.
plt.figure(figsize=(15, 8))
dataCorr = a.corr()
sns.heatmap(data=dataCorr, annot=True)

## Data Cleaning

In [None]:
# Features plus target in one frame; shares its row index with `a`.
dataPre = pd.concat([a, dataLoad['Potability']], axis=1)

# Per-feature quartiles and interquartile range.
Q1 = a.quantile(0.25)
Q3 = a.quantile(0.75)
IQR = Q3 - Q1

# Flag rows where any feature lies outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
# The mask is built from the feature frame `a`, whose columns match the labels
# of Q1/Q3 exactly. Comparing `dataPre` (which also holds 'Potability') against
# these Series relies on automatic alignment, which recent pandas versions
# reject with a ValueError.
outlierMask = ((a < (Q1 - 1.5 * IQR)) | (a > (Q3 + 1.5 * IQR))).any(axis=1)

# Keep only the rows without outliers; the target column is carried along.
dataCleaned = dataPre[~outlierMask]
dataCleaned.head()

In [None]:
# Missing values that remain after the outlier rows were removed.
dataCleaned.isnull().sum()

In [None]:
from sklearn.impute import SimpleImputer

# Replace every NaN with the mean of its column.
# NOTE(review): the imputer is fitted on all rows before the train/test split,
# so test rows contribute to the imputed means - consider fitting on the
# training split only.
imputer = SimpleImputer(strategy='mean', missing_values=np.nan)

# fit_transform returns a plain array, so the column labels are restored here.
# Pass the Index itself: wrapping it in a list creates a one-level MultiIndex
# instead of flat column names.
dataImputed = pd.DataFrame(imputer.fit_transform(dataCleaned), columns=dataCleaned.columns)
dataImputed.head()

In [None]:
# Check that no missing values are left after imputation.
dataImputed.isnull().sum()

## Data Splitting & Data Preprocessing

In [None]:
from sklearn.model_selection import train_test_split

# Target vector and feature matrix.
y = dataImputed['Potability']
X = dataImputed.drop('Potability', axis=1)

# Hold out 20% of the rows as the test set, then 20% of the remaining rows as
# the validation set. A fixed random_state makes both splits - and therefore
# every metric reported below - reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=100)
x_Train, x_valid, y_Train, y_valid = train_test_split(x_train, y_train, test_size=0.2, random_state=100)

## Building a Baseline Model

In [None]:
from tensorflow import keras
from tensorflow.keras import layers

# Baseline network: one hidden ReLU layer of 16 units on 9 input features,
# followed by a single sigmoid output unit.
modelBase = keras.Sequential()
modelBase.add(layers.Dense(units=16, activation='relu', input_shape=[9]))
modelBase.add(layers.Dense(units=1, activation='sigmoid'))


In [None]:
# Adam optimizer, binary cross-entropy loss, accuracy as the reported metric.
modelBase.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['binary_accuracy'],
)

# Stop once the monitored quantity (left at the Keras default) has failed to
# improve by at least 0.001 for 5 consecutive epochs, and restore the best weights.
earlyStopping = keras.callbacks.EarlyStopping(
    min_delta=0.001,
    patience=5,
    restore_best_weights=True,
)

# Train silently for at most 200 epochs, validating on the held-out validation split.
history = modelBase.fit(
    x_Train,
    y_Train,
    validation_data=(x_valid, y_valid),
    batch_size=256,
    epochs=200,
    callbacks=[earlyStopping],
    verbose=0,
)

# Learning curves: loss and accuracy for the training and validation data.
history_df = pd.DataFrame(history.history)
history_df[['loss', 'val_loss']].plot(title='Cross-Entropy')
history_df[['binary_accuracy', 'val_binary_accuracy']].plot(title='Binary-Accuracy')

print('best val loss : ', history_df['val_loss'].min())
print('best val accuracy : ', history_df['val_binary_accuracy'].max())

In [None]:
# Two hidden ReLU layers of 16 units each on 9 input features, sigmoid output.
model1 = keras.Sequential()
model1.add(layers.Dense(units=16, activation='relu', input_shape=[9]))
model1.add(layers.Dense(units=16, activation='relu'))
model1.add(layers.Dense(units=1, activation='sigmoid'))
                            

In [None]:
# Adam optimizer, binary cross-entropy loss, accuracy as the reported metric.
model1.compile(optimizer='adam',
               loss='binary_crossentropy',
               metrics=['binary_accuracy'])

# Train silently for at most 200 epochs; `earlyStopping` is the callback
# created in the baseline training cell.
history1 = model1.fit(x_Train, y_Train, validation_data=(x_valid, y_valid),
                      epochs=200, batch_size=256, callbacks=[earlyStopping], verbose=0)

history1_df = pd.DataFrame(history1.history)

# Learning curves: loss and accuracy for the training and validation data.
history1_df.loc[:, ['loss', 'val_loss']].plot(title='Cross-Entropy')
# Title spelling corrected (was 'Binary-accuaracy').
history1_df.loc[:, ['binary_accuracy', 'val_binary_accuracy']].plot(title='Binary-accuracy')

print('best val loss : ', history1_df['val_loss'].min())
print('best val accuracy : ', history1_df['val_binary_accuracy'].max())

In [None]:
# Two hidden ReLU layers of 32 units each on 9 input features, sigmoid output.
model2 = keras.Sequential()
model2.add(layers.Dense(units=32, activation='relu', input_shape=[9]))
model2.add(layers.Dense(units=32, activation='relu'))
model2.add(layers.Dense(units=1, activation='sigmoid'))

In [None]:
# Adam optimizer, binary cross-entropy loss, accuracy as the reported metric.
model2.compile(optimizer='adam',
               loss='binary_crossentropy',
               metrics=['binary_accuracy'])

# Train silently for at most 200 epochs; `earlyStopping` is the callback
# created in the baseline training cell.
history2 = model2.fit(x_Train, y_Train, validation_data=(x_valid, y_valid),
                      epochs=200, batch_size=256, callbacks=[earlyStopping], verbose=0)

history2_df = pd.DataFrame(history2.history)

# Learning curves: loss and accuracy for the training and validation data.
history2_df.loc[:, ['loss', 'val_loss']].plot(title='Cross-Entropy')
# Title spelling corrected (was 'Binary-accuaracy').
history2_df.loc[:, ['binary_accuracy', 'val_binary_accuracy']].plot(title='Binary-accuracy')

print('best val loss : ', history2_df['val_loss'].min())
print('best val accuracy : ', history2_df['val_binary_accuracy'].max())

In [None]:
# Wider network with batch normalization and dropout around two 512-unit ReLU layers.
# The input shape is declared with an explicit Input entry at the start of the
# stack: an `input_shape` argument on a layer that is not first is ignored,
# which leaves the model unbuilt until the first call to fit().
model3 = keras.Sequential([keras.Input(shape=(9,)),
                           layers.BatchNormalization(),
                           layers.Dense(units=512, activation='relu'),
                           layers.BatchNormalization(),
                           layers.Dropout(0.3),
                           layers.Dense(units=512, activation='relu'),
                           layers.BatchNormalization(),
                           layers.Dropout(0.3),
                           layers.Dense(units=1, activation='sigmoid')])

In [None]:
# Adam optimizer, binary cross-entropy loss, accuracy as the reported metric.
model3.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['binary_accuracy'],
)

# Train silently for at most 200 epochs with a batch size of 512;
# `earlyStopping` is the callback created in the baseline training cell.
history3 = model3.fit(
    x_Train,
    y_Train,
    validation_data=(x_valid, y_valid),
    batch_size=512,
    epochs=200,
    callbacks=[earlyStopping],
    verbose=0,
)

# Learning curves: loss and accuracy for the training and validation data.
history3_df = pd.DataFrame(history3.history)
history3_df[['loss', 'val_loss']].plot(title='Cross-Entropy')
history3_df[['binary_accuracy', 'val_binary_accuracy']].plot(title='Binary-accuracy')

print('best val loss : ', history3_df['val_loss'].min())
print('best val accuracy : ', history3_df['val_binary_accuracy'].max())

In [None]:
# NOTE(review): this architecture is identical to model3 - change a
# hyperparameter here if a distinct experiment was intended.
# The input shape is declared with an explicit Input entry at the start of the
# stack: an `input_shape` argument on a layer that is not first is ignored,
# which leaves the model unbuilt until the first call to fit().
model4 = keras.Sequential([keras.Input(shape=(9,)),
                           layers.BatchNormalization(),
                           layers.Dense(units=512, activation='relu'),
                           layers.BatchNormalization(),
                           layers.Dropout(0.3),
                           layers.Dense(units=512, activation='relu'),
                           layers.BatchNormalization(),
                           layers.Dropout(0.3),
                           layers.Dense(units=1, activation='sigmoid')])

In [None]:
# Adam optimizer, binary cross-entropy loss, accuracy as the reported metric.
model4.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['binary_accuracy'],
)

# Train for at most 200 epochs with a batch size of 512, printing per-epoch
# progress (verbose=1); `earlyStopping` is the callback created in the
# baseline training cell.
history4 = model4.fit(
    x_Train,
    y_Train,
    validation_data=(x_valid, y_valid),
    batch_size=512,
    epochs=200,
    callbacks=[earlyStopping],
    verbose=1,
)

# Learning curves: loss and accuracy for the training and validation data.
history4_df = pd.DataFrame(history4.history)
history4_df[['loss', 'val_loss']].plot(title='Cross-Entropy')
history4_df[['binary_accuracy', 'val_binary_accuracy']].plot(title='Binary-accuracy')

print('best val loss : ', history4_df['val_loss'].min())
print('best val accuracy : ', history4_df['val_binary_accuracy'].max())

##### 