In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.utils import resample
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler

# Importing The Dataset

### Feature Description

* ph: pH of the water (0 to 14).

* Hardness: Capacity of water to precipitate soap in mg/L.

* Solids: Total dissolved solids in ppm.

* Chloramines: Amount of Chloramines in ppm.

* Sulfate: Amount of Sulfates dissolved in mg/L.

* Conductivity: Electrical conductivity of water in μS/cm.

* Organic_carbon: Amount of organic carbon in ppm.

* Trihalomethanes: Amount of Trihalomethanes in μg/L.

* Turbidity: Measure of the light-emitting property of water in NTU.

* Potability: Indicates if water is safe for human consumption. Potable - 1 and Not potable - 0

### Importing

In [None]:
# Location of the water-potability CSV (Kaggle input directory layout).
DATA_PATH = '../input/water-potability/water_potability.csv'

df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Class balance of the target before any cleaning.
df.Potability.value_counts()

In [None]:
# Keep only complete rows: dropna() with its defaults removes every row that
# has at least one missing value.
df = df.dropna()
df.head()

In [None]:
# Class balance of the target after the incomplete rows were removed.
df.Potability.value_counts()

In [None]:
df_notpotable  = df[df['Potability']==0]
df_potable = df[df['Potability']==1]

df_notpotable.head()

In [None]:
df_potable_resample = resample(df_potable, replace = True, n_samples = 1200, random_state = 0)

In [None]:
df = pd.concat([df_notpotable, df_potable_resample])
df.Potability.value_counts()

In [None]:
df = shuffle(df, random_state=0) 

# EDA

In [None]:
# One histogram per column; the trailing semicolon keeps the array-of-Axes
# repr out of the cell output so only the figure is shown.
df.hist(bins=10, figsize=(20, 15), color='teal');

In [None]:
fig = plt.figure(figsize=(25, 10))

# (column in df, text used for the title and x-label) for each panel, listed
# in the order the panels fill the 2x2 grid.
panels = [
    ('ph', 'pH'),
    ('Hardness', 'Hardness'),
    ('Solids', 'Solids'),
    ('Chloramines', 'Chloramines'),
]

for position, (column, label) in enumerate(panels, start=1):
    ax = fig.add_subplot(2, 2, position)
    # Overlay one semi-transparent histogram per class; each class gets its
    # own 20 bins computed from its own values.
    for potability in (0, 1):
        ax.hist(df.loc[df.Potability == potability, column], bins=20, alpha=.4)
    ax.set_title(label)
    ax.set_xlabel(label)
    ax.set_ylabel('Count')
    # Legend entries follow the plotting order above: class 0, then class 1.
    ax.legend(["0", "1"])

fig.subplots_adjust(wspace=.1, hspace=.3)
plt.show()

In [None]:
fig = plt.figure(figsize=(25, 10))

# Columns shown in this figure, in the order the panels fill the 2x2 grid.
# Each column name is also used as the panel title and x-label.
columns = ['Sulfate', 'Conductivity', 'Organic_carbon', 'Trihalomethanes']

for position, column in enumerate(columns, start=1):
    ax = fig.add_subplot(2, 2, position)
    # Overlay one semi-transparent histogram per class; each class gets its
    # own 20 bins computed from its own values.
    for potability in (0, 1):
        ax.hist(df.loc[df.Potability == potability, column], bins=20, alpha=.4)
    ax.set_title(column)
    ax.set_xlabel(column)
    ax.set_ylabel('Count')
    # Legend entries follow the plotting order above: class 0, then class 1.
    ax.legend(["0", "1"])

fig.subplots_adjust(wspace=.1, hspace=.3)
plt.show()

In [None]:
fig = plt.figure(figsize=(25, 10))

# Single remaining feature; it is placed in the first cell of a 2x2 grid so
# the panel has the same size as the panels of a four-panel 2x2 figure.
ax = fig.add_subplot(2, 2, 1)
for potability in (0, 1):
    ax.hist(df.loc[df.Potability == potability, 'Turbidity'], bins=20, alpha=.4)
ax.set_title('Turbidity')
ax.set_xlabel('Turbidity')
ax.set_ylabel('Count')
# Legend entries follow the plotting order above: class 0, then class 1.
ax.legend(["0", "1"])

fig.subplots_adjust(wspace=.1, hspace=.3)
plt.show()

In [None]:
from pandas.plotting import scatter_matrix

# Pairwise scatter plots with per-column histograms on the diagonal.
# The returned grid of Axes is bound to its own name (which also keeps its
# repr out of the cell output) rather than to `x`, because `x` is used further
# down for the feature DataFrame.
scatter_axes = scatter_matrix(df, alpha=1, figsize=(40, 20), diagonal='hist')

In [None]:
# Annotated heatmap of the pairwise correlations between all columns.
plt.figure(figsize = (15,9))
sns.heatmap(df.corr(), annot = True)

In [None]:
# Correlation of every column with the target, strongest positive first.
df_corr = df.corr()
df_corr["Potability"].sort_values(ascending=False)

# Deep Learning

In [None]:
from sklearn.model_selection import train_test_split
from tensorflow import keras
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.layers import LeakyReLU
from tensorflow.keras import models
from tensorflow.keras.layers import BatchNormalization, Dropout
from tensorflow.keras.optimizers import Adam, Adagrad, RMSprop, SGD
from tensorflow.keras.layers import Activation
from sklearn.metrics import accuracy_score

In [None]:
x = df.drop(['Potability'], axis = 1)
y = df['Potability']

In [None]:
st = StandardScaler()
x_columns= x.columns
x[x_columns] = st.fit_transform(x[x_columns])

In [None]:
x.head()

In [None]:
x.describe()

In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(x,y, test_size = 0.2, random_state = 0)

In [None]:
X_val, X_test, Y_val, Y_test = train_test_split(X_test,Y_test, test_size = 0.5, random_state = 0)

In [None]:
X_train.shape

In [None]:
# Alternative variant of the network (LeakyReLU activations plus Dropout),
# left commented out for reference; it is not built or trained in this notebook.
# model = models.Sequential()

# model.add(layers.Dense(16, input_shape=(9,)))
# model.add(LeakyReLU(alpha=0.01))

# model.add(layers.Dense(32))
# model.add(BatchNormalization())
# model.add(LeakyReLU(alpha=0.01))
# model.add(Dropout(rate=0.1))

# model.add(layers.Dense(16))
# model.add(BatchNormalization())
# model.add(LeakyReLU(alpha=0.01))
# model.add(Dropout(rate=0.1))

# model.add(layers.Dense(1))
# model.add(Activation("sigmoid"))

In [None]:
# Seed TensorFlow before any layer is created. The notebook otherwise seeds
# only right before fit(), i.e. after the layers below have already drawn
# their initial weights, so the initialisation would differ between runs.
tf.random.set_seed(0)

model = models.Sequential()

# Input width follows the training frame instead of a hard-coded 9, so the
# network still matches the data if a feature column is added or removed.
n_features = X_train.shape[1]

# Three hidden blocks (16 -> 32 -> 16 units), each Dense -> BatchNorm -> ReLU.
model.add(layers.Dense(16, input_shape=(n_features,)))
model.add(BatchNormalization())
model.add(Activation("relu"))

model.add(layers.Dense(32))
model.add(BatchNormalization())
model.add(Activation("relu"))

model.add(layers.Dense(16))
model.add(BatchNormalization())
model.add(Activation("relu"))

# Single sigmoid unit: output is the predicted probability of class 1.
model.add(layers.Dense(1))
model.add(Activation("sigmoid"))

In [None]:
opt = Adam(learning_rate=0.001)

model.compile(loss="binary_crossentropy",
              optimizer=opt,
              metrics=['accuracy'])

In [None]:
model.summary()

In [None]:
# Seed TensorFlow's global random generator right before training.
tf.random.set_seed(0)

# Train for 300 epochs in mini-batches of 32, evaluating on the validation
# split after every epoch; the per-epoch metrics are kept in `history`.
history = model.fit(
    X_train,
    Y_train,
    validation_data=(X_val, Y_val),
    batch_size=32,
    epochs=300,
)

In [None]:
def plot_history(history, acc_ylim=(0.5, 0.9), loss_ylim=(0.3, 0.8)):
    """Plot training vs. validation accuracy and loss for each epoch.

    Two separate figures are shown: accuracy first, then loss.

    Parameters
    ----------
    history
        Object returned by ``model.fit``. It must expose ``.epoch`` and a
        ``.history`` mapping with the keys ``accuracy``, ``val_accuracy``,
        ``loss`` and ``val_loss``.
    acc_ylim, loss_ylim : (low, high) pair or None
        y-axis limits of the accuracy and loss figures. The defaults are the
        limits this function has always used. Pass ``None`` to let matplotlib
        autoscale, which avoids clipping curves that leave the fixed range.
    """
    hist = pd.DataFrame(history.history)
    hist['epoch'] = history.epoch

    # (y-label, train column, validation column, train label, validation label, y-limits)
    panels = [
        ('Accuracy', 'accuracy', 'val_accuracy', 'Train Accuracy', 'Val Accuracy', acc_ylim),
        ('Loss', 'loss', 'val_loss', 'Train Error', 'Val Error', loss_ylim),
    ]

    for ylabel, train_col, val_col, train_label, val_label, ylim in panels:
        plt.figure()
        plt.xlabel('Epoch')
        plt.ylabel(ylabel)
        plt.plot(hist['epoch'], hist[train_col], label=train_label)
        plt.plot(hist['epoch'], hist[val_col], label=val_label)
        if ylim is not None:
            plt.ylim(ylim)
        plt.legend()
        plt.show()


plot_history(history)

In [None]:
# Evaluate on the held-out test split. The model was compiled with one loss
# and one metric, so `score` is [loss, accuracy].
score = model.evaluate(X_test, Y_test, verbose=1)
test_loss, test_accuracy = score

print("Test Error", test_loss)
print("Test accuracy", test_accuracy)

### Final result: the deep learning model reaches roughly 70–75% test accuracy