In [None]:
import numpy as np
import pandas as pd
import itertools # advanced tools
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import KNNImputer
from sklearn.pipeline import Pipeline
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import ADASYN
from sklearn.neural_network import MLPClassifier
from datetime import datetime
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

In [None]:
# Load the water-potability dataset (the relative path follows the Kaggle input layout)
df = pd.read_csv(filepath_or_buffer='../input/water-potability/water_potability.csv')
# Preview the first five rows as the cell output
df.head(n=5)

Data Cleansing & Transformation

In [None]:
# Impute missing values with a k-nearest-neighbours imputer
# (5 neighbours, uniform weights, NaN-aware Euclidean distance).
# NOTE(review): the imputer is fitted on the whole frame, i.e. before any
# train/test split and with the 'Potability' target among its inputs.
# Consider fitting it on the training features only to avoid leakage.
imputer = KNNImputer(n_neighbors=5, weights='uniform', metric='nan_euclidean')
# Fit the imputer on the dataset
imputer.fit(df)
# Transformed dataset (NumPy array, same column order as `df`)
dftrans = imputer.transform(df)
# Build a new DataFrame with the imputed values, reusing the original
# column labels and index instead of a hand-maintained list of names
df1 = pd.DataFrame(dftrans, columns=df.columns, index=df.index)
# Only the class label is cast to integer. The feature columns stay as floats:
# truncating measurements to whole numbers would throw away most of the
# information the classifier needs.
df1 = df1.astype({'Potability': 'int'})
print(df1.dtypes)

In [None]:
# Preview the first five rows of the imputed frame
df1.head(n=5)

In [None]:
# Class-balance summary of the target column in the raw frame
cases = len(df)
potabiliy_count = int((df['Potability'] == 1).sum())
nonpotability_count = int((df['Potability'] == 0).sum())
nonpotability_percentage = round(nonpotability_count / cases * 100, 2)

# Assemble the report first, then emit it one line per entry
report_lines = [
    'CASE COUNT',
    '--------------------------------------------',
    'Total number of cases are {}'.format(cases),
    'Number of potability cases are {}'.format(potabiliy_count),
    'Number of Non-potability cases are {}'.format(nonpotability_count),
    'Percentage of Non-potability cases is {}%'.format(nonpotability_percentage),
    '--------------------------------------------',
]
print(*report_lines, sep='\n')

Data Pre-Processing

In [None]:
#Data Pre-Processing
# Fixed seed so the split and the synthetic oversampling give the same
# result on every Restart & Run All.
SEED = 42

#Data Split
# Stratify on the label so train and test keep the same class ratio.
x = df1.drop('Potability', axis = 1).values
y = df1['Potability'].values
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size = 0.2, random_state = SEED, stratify = y)

#Min-Max Data Scaling
# The scalers are fitted on the real training rows only; the test set is
# transformed with the training statistics.
minmaxscaler = MinMaxScaler()
x_train = minmaxscaler.fit_transform(x_train)
x_test = minmaxscaler.transform(x_test)

#Standardization
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

#Oversampling Data
# ADASYN builds synthetic minority samples from nearest neighbours, so it is
# applied after scaling: on raw features the neighbour search would be
# dominated by the columns with the largest numeric range. Only the training
# set is resampled.
ada = ADASYN(sampling_strategy='auto', random_state=SEED)
x_train, y_train = ada.fit_resample(x_train, y_train)

Model Building

In [None]:
start = datetime.now()
#Multi Layer Perceptron Classifier
# A fixed random_state makes weight initialisation (and therefore the selected
# hyper-parameters) reproducible between runs.
# NOTE(review): max_iter=100 keeps the 48-candidate search affordable, but
# candidates that have not converged by then are compared in that state.
mlp = MLPClassifier(max_iter=100, random_state=42)
mlp_parameter = {
    'hidden_layer_sizes': [(50,50,50), (50,100,50), (100,)],
    'activation': ['tanh', 'relu'],
    'solver': ['sgd', 'adam'],
    'alpha': [0.0001, 0.05],
    'learning_rate': ['constant','adaptive'],
}
# NOTE(review): x_train is oversampled before this cell, so the validation
# folds contain synthetic rows; CV scores are likely optimistic.
mlp_tuned = GridSearchCV(mlp, mlp_parameter, n_jobs=-1, cv=3)
mlp_tuned.fit(x_train, y_train)

# Stop the clock right after fitting so the reported time covers the search
# only, not the prediction below.
end = datetime.now()
time_taken = end - start

# Test-set predictions of the refitted best estimator
mlp_model = mlp_tuned.predict(x_test)

# Best parameter set
print('Best parameters found:\n', mlp_tuned.best_params_)
print('Training Time: ',time_taken)

Tuned Model

In [None]:
start = datetime.now()

#Conducting Final Model
# Start from every hyper-parameter chosen by the grid search, then raise the
# iteration budget and pin the seed so the final fit is reproducible.
final_params = {**mlp_tuned.best_params_, 'max_iter': 10000, 'random_state': 42}
mlp_final = MLPClassifier(**final_params)
mlp_final.fit(x_train, y_train)

# Stop the clock before predicting so 'Training Time' measures the fit only
end = datetime.now()
time_taken = end - start

# Despite its name, `final_model` holds the predicted labels for x_test
final_model = mlp_final.predict(x_test)

print('MLP Classifier Performance')
print('Precision {}'.format(precision_score(y_test, final_model)))
print('Recall {}'.format(recall_score(y_test, final_model)))
print('F1 Score {}'.format(f1_score(y_test, final_model)))
print('Accuracy {}'.format(accuracy_score(y_test, final_model)))
print('Training Time ',time_taken)

Confusion Matrix

In [None]:
#Confusion Matrix
#Defining the plot function
def plot_confusion_matrix(cm, classes, title, normalize = False, cmap = plt.cm.Blues):
    """Draw a confusion matrix as an annotated heat map on the current figure.

    Parameters
    ----------
    cm : array-like, 2-D
        Confusion matrix; it is indexed as ``cm[true, predicted]`` to match
        the axis labels set below.
    classes : sequence of str
        Tick labels, used for both axes.
    title : str
        Text appended to ``'Confusion Matrix of '`` to form the plot title.
    normalize : bool, default False
        If True, each row is divided by its sum. Rows that sum to zero are
        shown as zeros instead of NaN.
    cmap : matplotlib colormap, default ``plt.cm.Blues``
        Colormap passed to ``plt.imshow``.

    Returns
    -------
    matplotlib.axes.Axes
        The axes the matrix was drawn on.
    """
    title = 'Confusion Matrix of {}'.format(title)
    cm = np.asarray(cm)
    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # Divide only where the row total is non-zero; a class with no true
        # samples would otherwise produce NaN cells and a runtime warning.
        cm = np.divide(cm.astype(float), row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)

    plt.imshow(cm, interpolation = 'nearest', cmap = cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation = 45)
    plt.yticks(tick_marks, classes)

    # Integer counts print as integers; any float matrix (normalised or
    # passed in as floats) prints with two decimals instead of failing on 'd'.
    fmt = 'd' if np.issubdtype(cm.dtype, np.integer) else '.2f'
    # Cells darker than half the maximum get white text for contrast
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment = 'center',
                 color = 'white' if cm[i, j] > thresh else 'black')

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Lay out after the axis labels exist so they are included in the fit
    plt.tight_layout()
    return plt.gca()

#Compute confusion matrix for the final MLP predictions
mlp_matrix = confusion_matrix(y_test, final_model, labels = [0, 1])
plt.rcParams['figure.figsize'] = (6, 6)
mlp_cm_plot = plot_confusion_matrix(mlp_matrix, 
                                classes = ['Negative(0)','Positive(1)'], 
                                normalize = False, title = 'the Classifier')
# The figure shows the MLP model, so the file is named after it (the previous
# 'lr_' prefix suggested a logistic-regression plot).
plt.savefig('mlp_cm_plot.png')
plt.show()