In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk '/kaggle/input' recursively and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the water-potability CSV into a DataFrame.
data = pd.read_csv(
    filepath_or_buffer='../input/water-potability/water_potability.csv',
)

In [None]:
# Preview the first five rows of the raw data.
data.head(n=5)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
# Pairwise correlation between the columns of `data`.
corr = data.corr()

# Boolean mask that is True on and above the diagonal, so only the lower
# triangle of the (symmetric) matrix is drawn.
mask = np.triu(np.ones(corr.shape, dtype=bool))

# Figure and axes the heatmap is drawn on.
f, ax = plt.subplots(figsize=(11, 9))

# Diverging palette between hues 230 and 20, returned as a matplotlib colormap.
cmap = sns.diverging_palette(230, 20, as_cmap=True)

# Colour scale is centred on 0 and capped at 0.3; cells are square with thin
# separator lines and a shrunken colour bar.
heatmap_options = dict(
    mask=mask,
    cmap=cmap,
    vmax=.3,
    center=0,
    square=True,
    linewidths=.5,
    cbar_kws={"shrink": .5},
)
sns.heatmap(corr, **heatmap_options)

# Preprocessing

In [None]:
# Feature matrix: every column of `data` except the 'Potability' target.
X = data.drop(columns=['Potability'])
X.head()

In [None]:
# Target vector: the 'Potability' column of `data`.
y = data.loc[:, 'Potability']
y.head()

# Imputation

In [None]:
#Imputation

from sklearn.impute import SimpleImputer

# Fill missing feature values using SimpleImputer's default strategy (column mean).
# NOTE: the imputer is fitted on all rows, before the train/test split further
# down, so the fill values also reflect rows that later land in the test set.
my_imputer = SimpleImputer()

# fit_transform returns a bare NumPy array; rebuild the DataFrame with the
# original column labels AND row index so X stays aligned with y.
X_imputed = pd.DataFrame(
    my_imputer.fit_transform(X),
    columns=X.columns,
    index=X.index,
)

X = X_imputed
X

# Scaling

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardise every feature to zero mean and unit variance.
# StandardScaler returns a bare NumPy array, so the column labels and row index
# are passed back in; without them the features are renamed 0..n-1 and can no
# longer be identified downstream.
# NOTE: the scaler is fitted on all rows, before the train/test split below,
# so test-set statistics influence the scaling.
X = pd.DataFrame(
    StandardScaler().fit_transform(X),
    columns=X.columns,
    index=X.index,
)
X

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing.
# random_state pins the shuffle so the split (and every score computed from it)
# is reproducible across runs; stratify=y keeps the class ratio the same in the
# train and test partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.7, random_state=42, stratify=y
)

# Training

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC 
from sklearn.neural_network import MLPClassifier 

In [None]:
# Three baseline classifiers that are compared on the same train/test split.
log_model = LogisticRegression()
svm_model = SVC(C=0.1, kernel='rbf')
# MLPClassifier initialises its weights randomly; random_state is pinned so the
# reported score does not change from run to run.
nn_model = MLPClassifier(hidden_layer_sizes=(8, 16), random_state=42)

In [None]:
# Fit every baseline on the training partition. The last fit is left as a bare
# expression so the cell still displays the fitted estimator.
for estimator in (log_model, svm_model):
    estimator.fit(X_train, y_train)
nn_model.fit(X_train, y_train)

In [None]:
from sklearn.metrics import f1_score

# Report test-set accuracy and F1 for each baseline model.
# `estimator.score` is the mean accuracy for sklearn classifiers, so it is
# printed under "Accuracy"; the F1 score is computed separately from the
# predicted labels.
for label, estimator in (
    ('Logistic Regression', log_model),
    ('SVM', svm_model),
    ('NN Model', nn_model),
):
    accuracy = estimator.score(X_test, y_test)
    # zero_division=0 reports 0 (without a warning) when a model predicts no
    # positive samples at all.
    f1 = f1_score(y_test, estimator.predict(X_test), zero_division=0)
    print(f'{label}:\nAccuracy: {accuracy}\nF1 score: {f1}')

# Custom Neural Network

In [None]:
from tensorflow import keras
from tensorflow.keras import layers
import tensorflow as tf

In [None]:
# Per-sample input shape for the Keras model: a one-element list holding the
# number of feature columns in X_train.
input_shape = list(X_train.shape[1:])

In [None]:
# Feed-forward binary classifier: 8 -> 32 -> 128 ReLU units with light dropout,
# ending in a single sigmoid unit trained with binary cross-entropy.
model = tf.keras.Sequential([
    # Declare the input explicitly instead of passing `input_shape` to the
    # first Dense layer.
    tf.keras.Input(shape=tuple(input_shape)),

    layers.Dense(8, activation='relu'),
    layers.Dropout(rate=0.15),

    layers.Dense(32, activation='relu'),
    layers.Dropout(rate=0.15),

    # No extra activation layer is needed at this point: the ReLU + Dropout
    # output is already non-negative, so a LeakyReLU here would be an identity.
    layers.Dense(128, activation='relu'),

    layers.Dense(1, activation='sigmoid'),
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    
    


In [None]:
# Training hyper-parameters for the Keras model:
#   epochs     - upper bound on the number of passes over the training data
#   BATCH_SIZE - number of samples per gradient update
epochs, BATCH_SIZE = 1000, 32

In [None]:
# Stop once val_loss has not improved for 25 consecutive epochs and roll the
# model back to the weights of the best epoch.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    min_delta=0,
    patience=25,
    verbose=0,
    mode='auto',
    baseline=None,
    restore_best_weights=True,
)

# Validate on a held-back slice of the training data rather than on
# (X_test, y_test): early stopping selects the best epoch from the validation
# loss, so validating on the test set would tune the model on the very data
# that is later used to report its final score.
history = model.fit(
    X_train,
    y_train,
    validation_split=0.2,
    batch_size=BATCH_SIZE,
    epochs=epochs,
    callbacks=[early_stopping],
)

In [None]:
# Show which metrics were recorded during training.
print(history.history.keys())

# One figure per metric, each with the training curve and its `val_` counterpart.
# The second curve is labelled 'validation' because it comes from Keras's
# `val_*` history entries, i.e. whatever data was used for validation in fit().
for metric, val_metric, title in (
    ('accuracy', 'val_accuracy', 'model accuracy'),
    ('loss', 'val_loss', 'model loss'),
):
    plt.plot(history.history[metric])
    plt.plot(history.history[val_metric])
    plt.title(title)
    plt.ylabel(metric)
    plt.xlabel('epoch')
    plt.legend(['train', 'validation'], loc='upper left')
    plt.show()

In [None]:
# Score the trained network on the test partition; with the compiled
# 'accuracy' metric, `results` holds the loss followed by the accuracy.
results = model.evaluate(x=X_test, y=y_test, batch_size=128)
print("test loss, test acc:", results)
