In [None]:
import pandas as pd
import numpy as np
import keras
import sklearn
from keras.layers import Dense, Dropout
from keras.models import Sequential
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')
sns.set(style = 'whitegrid')
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler


%matplotlib inline

In [None]:
# Read the water-potability CSV into the frame that every later cell works on.
df = pd.read_csv(filepath_or_buffer = '../input/water-potability/water_potability.csv')

# Peek at the first five rows.
df.head(n = 5)

In [None]:
# Print the frame's column names, non-null counts and dtypes.
df.info()

In [None]:
# Number of missing entries in each column.
df.isna().sum()

In [None]:
# Show (rows, columns), then the summary statistics of the columns.
print(df.shape)
df.describe()

In [None]:
# Fill the gaps in each of the three columns handled here with that column's own median.
for column in ('ph', 'Sulfate', 'Trihalomethanes'):
    df[column] = df[column].fillna(df[column].median())

# Re-count the missing entries per column after the fill.
df.isna().sum()

In [None]:
# Re-check the shape and summary statistics now that the medians have been filled in.
print(df.shape)
df.describe()

In [None]:
# Pairwise correlation of every column, drawn as an annotated heatmap.
plt.figure(figsize = (11, 11))
sns.set(style = 'whitegrid')
corr_matrix = df.corr()
sns.heatmap(data = corr_matrix, annot = True, cmap = 'Blues')

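The pairwise correlations are all weak: no single feature is strongly linearly related to Potability or to any other feature. Judging by correlation alone, every column looks about as informative as the next, so there is no obvious candidate to drop.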

In [None]:
# Distribution of pH: density-normalised histogram with a KDE curve on top.
# `sns.distplot` is deprecated; `histplot(..., stat='density', kde=True)` is the
# documented replacement that draws the same picture.
plt.figure(figsize = (8, 6))
sns.histplot(data = df, x = 'ph', stat = 'density', kde = True, bins = 45)

In [None]:
# jointplot is a figure-level function: it builds its own figure and ignores the
# current one, so a preceding plt.figure(...) only leaves an empty extra figure
# in the output. Size the plot through `height` (inches per side) instead.
sns.jointplot(x = 'ph', y = 'Potability', data = df, height = 10)

In [None]:
# Bar chart of how many rows fall into each Potability class.
count_ax = plt.figure(figsize = (8, 6)).gca()
count_ax.set_title('Counts of Potability')
sns.countplot(x = 'Potability', data = df, ax = count_ax)

In [None]:
# Grid of pairwise plots across the columns of df.
sns.pairplot(df)

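Next, let's standardize the features (zero mean, unit variance per column) so that columns measured on very different scales contribute comparably; this generally helps the network train more reliably and can improve accuracy.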

In [None]:
# Target: the Potability column.  Features: every other column.
y = df['Potability']
X = df.drop(columns = ['Potability'])

X.head()

In [None]:
# Z-score every feature column: subtract its mean, divide by its sample standard deviation.
# NOTE(review): both statistics are taken over all rows of X, i.e. before any train/test split.
feature_means = X.mean()
feature_stds = X.std()
X = X.sub(feature_means, axis = 'columns').div(feature_stds, axis = 'columns')

print(X.shape)
X.head()

In [None]:
# NOTE(review): inert placeholder -- this bare string is evaluated and discarded, so
# nothing is scaled here. If it is ever turned into real code, the sklearn class
# imported at the top of the file is `StandardScaler`, not `StandardScalar`.
'''
scalar = StandardScalar()
'''

In [None]:
from keras.layers import BatchNormalization

def nn(input_dim = None, units = 120, dropout_rates = (0.8, 0.6, 0.63, 0.7)):
    """Build and compile the feed-forward binary classifier.

    The network is a stack of identical hidden blocks -- Dense(relu) ->
    BatchNormalization -> Dropout -- followed by a single sigmoid unit.
    Calling ``nn()`` with no arguments builds the same four-block network
    as before.

    Parameters
    ----------
    input_dim : int, optional
        Number of input features. When omitted it falls back to
        ``X.shape[1]`` of the notebook-level feature frame, which is what
        the zero-argument call has always used.
    units : int, optional
        Width of every hidden Dense layer.
    dropout_rates : sequence of float, optional
        One dropout rate per hidden block; its length sets the depth.

    Returns
    -------
    keras.models.Sequential
        Model compiled with the Adam optimizer, binary cross-entropy loss
        and the accuracy metric.
    """
    if input_dim is None:
        # Backward-compatible default: read the width from the global feature frame.
        input_dim = X.shape[1]

    model = Sequential()

    # Only the very first layer is told the input width; after it has been
    # consumed the extra keyword set is emptied for all following layers.
    first_layer_kwargs = {'input_dim': input_dim}
    for rate in dropout_rates:
        model.add(Dense(units, activation = 'relu', **first_layer_kwargs))
        first_layer_kwargs = {}
        model.add(BatchNormalization())
        model.add(Dropout(rate))

    # If no hidden block was requested, the output layer is the first layer
    # and still receives the input width.
    model.add(Dense(1, activation = 'sigmoid', **first_layer_kwargs))

    model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

    return model

In [None]:
# Hold out 20% of the rows for the final test; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 12)

# Scale with statistics learned from the training rows only, so nothing about the
# held-out test rows leaks into preprocessing. Whether or not X was already
# z-scored upstream with whole-dataset statistics (as it currently is), z-scoring
# is an affine map, so re-centring and re-scaling here with the training mean/std
# yields exactly what fitting the scaler on the raw training rows would produce.
train_mean = X_train.mean()
train_std = X_train.std()
X_train = (X_train - train_mean) / train_std
X_test = (X_test - train_mean) / train_std

print(X_train.shape)
print(y_train.shape)

In [None]:
# Build a fresh network and train it; validation_split asks Keras to set aside
# a quarter of the training data for validation.
model = nn()

fit_options = dict(validation_split = 0.25, batch_size = 28, epochs = 350, verbose = 2)
history = model.fit(X_train, y_train, **fit_options)

In [None]:
# Score the trained model on the held-out split and report the second entry of
# the result (the accuracy metric the model is compiled with in this notebook).
scores = model.evaluate(X_test, y_test, verbose = 2)
test_accuracy = scores[1]
print('Test Accuracy: ', test_accuracy)

In [None]:
# Learning curves: per-epoch accuracy on the training and validation portions.
fig, ax = plt.subplots()
ax.plot(history.history['accuracy'], label = 'Training')
ax.plot(history.history['val_accuracy'], label = 'Validation')
# The curves show accuracy, not an error measure, so the title says so.
ax.set_title('Training vs. Validation Accuracy')
ax.set_xlabel('Epochs')
ax.set_ylabel('Accuracy')
# 'best' lets matplotlib put the legend where it overlaps the curves least.
ax.legend(loc = 'best')
plt.show()