In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings
from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score
from sklearn import preprocessing
from collections import Counter
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
warnings.filterwarnings("ignore", category=FutureWarning)

In [None]:
# Load the water potability CSV into a DataFrame.
data = pd.read_csv('../input/water-potability/water_potability.csv')

### EDA

In [None]:
# Preview the first rows of the dataset.
data.head()

In [None]:
# Inspect 10 randomly sampled rows (no seed is set, so the rows differ per run).
data.sample(10)

In [None]:
# Column dtypes and non-null counts.
data.info()

In [None]:
# Summary statistics, transposed so that each feature is a row.
data.describe().T

In [None]:
# Number of missing values per column.
data.isnull().sum()

In [None]:
# Number of samples in each Potability class.
data.Potability.value_counts()

In [None]:
# 2x5 grid: one histogram (with KDE overlay) per feature plus a class-count chart.
# Panels are filled column by column: feature i lands in row i % 2, column i // 2.
fig, axes = plt.subplots(nrows=2, ncols=5, figsize=(26, 14))
feature_columns = [
    "ph", "Hardness", "Solids", "Chloramines", "Sulfate",
    "Conductivity", "Organic_carbon", "Trihalomethanes", "Turbidity",
]
for position, feature in enumerate(feature_columns):
    sns.histplot(data=data, x=feature, kde=True, ax=axes[position % 2][position // 2])
# The remaining bottom-right panel shows how many samples fall in each class.
sns.countplot(data=data, x="Potability", ax=axes[1][4])


In [None]:
def distp(x):
    """Overlay the distribution of feature ``x`` for potable vs. non-potable samples.

    Draws one 12x12 figure containing, for each class, a density histogram with
    a KDE curve and rug marks. Values are read from the notebook-level ``data``
    frame. Nothing is drawn when ``x`` is the target column ``'Potability'``.

    Parameters
    ----------
    x : str
        Name of the column in ``data`` to plot.
    """
    if x == 'Potability':
        return
    # Create and keep a handle on this plot's own figure so tight_layout() is
    # applied to it rather than to a `fig` variable leaked from another cell.
    fig, ax = plt.subplots(figsize=(12, 12))
    per_class = (("Potable", 1, "darkturquoise"), ("Not Potable", 0, "lightcoral"))
    for label, flag, color in per_class:
        values = data.loc[data.Potability == flag, x].dropna()
        # histplot + rugplot replace the deprecated sns.distplot(..., rug=True).
        sns.histplot(x=values, kde=True, stat="density", color=color, label=label, ax=ax)
        sns.rugplot(x=values, color=color, ax=ax)
    ax.legend()
    fig.tight_layout()


for column in data.columns:
    distp(column)

In [None]:
# Pairwise feature plots coloured by class, with KDE curves on the diagonal.
g = sns.pairplot(data, diag_kind="kde",hue="Potability")
# Overlay 4-level KDE contours on the lower-triangle panels.
g.map_lower(sns.kdeplot, levels=4, color=".2")

In [None]:
# Annotated heatmap of the pairwise correlations between all columns.
plt.figure(figsize=(16,8))
sns.heatmap(data.corr(),annot=True)

In [None]:
# Feature subset (ph, Sulfate, Trihalomethanes) and target used by the
# imputation experiments that follow.
X= data[["ph","Sulfate",'Trihalomethanes']]
y=data["Potability"]
X.shape

In [None]:
# Missing values per column in the selected feature subset.
X.isnull().sum()

### Preprocessing

In [None]:
# Compare SimpleImputer strategies: impute X, fit a depth-10 decision tree and
# record its weighted F1. `resultado` collects the scores and is extended by
# the later imputer-comparison cells.
# NOTE(review): the tree is scored on the same rows it was trained on, so these
# numbers measure fit to the imputed data rather than generalisation.
# NOTE(review): the printed label says "escalado" (scaling) although the value
# shown is the imputation strategy.
resultado = []
tipos = ['mean', 'median', 'most_frequent', 'constant']

for t in tipos:
    imputer=SimpleImputer(strategy=t)
    X_trans= imputer.fit_transform(X)
    tree = DecisionTreeClassifier(max_depth=10,random_state=42)
    tree.fit(X_trans,y)
    y_pred = tree.predict(X_trans)
    f1sc=f1_score(y, y_pred, average='weighted')
    resultado.append(f1sc)
    print("El escalado Utilizado--->",t)
    print("f1 segun el tipo de estrategia:",f1sc)
    print("----------------------------------------")
   

In [None]:
# Compare KNNImputer neighbourhood sizes with the same decision-tree check and
# append each weighted F1 to the existing `resultado` list.
# NOTE(review): F1 is measured on the training rows, so it reflects fit to the
# imputed data rather than generalisation.
vecinos = [1,3,6,9,12]
for v in vecinos:
    KNN_imputer=KNNImputer(n_neighbors=v)
    X_knn= KNN_imputer.fit_transform(X)
    tree = DecisionTreeClassifier(max_depth=10,random_state=42)
    tree.fit(X_knn,y)
    y_pred = tree.predict(X_knn)
    f1sc=f1_score(y, y_pred, average='weighted')
    resultado.append(f1sc)
    # Report this run's neighbour count (not the loop variable of another cell).
    print("El escalado Utilizado--->",v)
    print("f1 segun el tipo de estrategia:",f1sc)
    print("----------------------------------------")
   

In [None]:
# Compare IterativeImputer imputation orders with the same decision-tree check
# and append each weighted F1 to the existing `resultado` list.
# NOTE(review): F1 is measured on the training rows, so it reflects fit to the
# imputed data rather than generalisation.
tipos2 = ['ascending', 'descending', 'roman', 'arabic', 'random']
for t in tipos2:
    # Seeded so that the 'random' order gives the same result on every run.
    iter_imputer=IterativeImputer(imputation_order=t, random_state=42)
    X_iter= iter_imputer.fit_transform(X)
    tree = DecisionTreeClassifier(max_depth=10,random_state=42)
    tree.fit(X_iter,y)
    # Predict with the tree fitted on this imputation; without this line the
    # score would be computed from predictions left over from an earlier cell.
    y_pred = tree.predict(X_iter)
    f1sc=f1_score(y, y_pred, average='weighted')
    resultado.append(f1sc)
    print("El escalado Utilizado--->",t)
    print("f1 segun el tipo de estrategia:",f1sc)
    print("----------------------------------------")
   

In [None]:
# List the column names of the dataset.
data.columns

In [None]:
# Impute ph, Sulfate and Trihalomethanes in place with SimpleImputer's
# "constant" strategy (no fill_value is passed, so the library default applies).
imputed_columns = ["ph", "Sulfate", 'Trihalomethanes']
imputer = SimpleImputer(strategy="constant").fit(data[imputed_columns])
data[imputed_columns] = imputer.transform(data[imputed_columns])


In [None]:
# Re-check missing values per column after imputation.
data.isnull().sum()

In [None]:
# Boxplot of ph before and after dropping rows whose |z-score| is 3 or more.
# Rebinds `data`, so re-running the cell filters the already-filtered frame.
plt.figure(figsize=(8, 4))
sns.boxplot(data=data, x=data["ph"], color='lightblue')
ph_abs_z = np.abs(stats.zscore(data["ph"]))
data = data.loc[ph_abs_z < 3]
plt.figure(figsize=(8, 4))
sns.boxplot(data=data, x=data["ph"], color='lightblue')

In [None]:
# Boxplot of Hardness before and after dropping rows whose |z-score| is 3 or more.
# Rebinds `data`, so re-running the cell filters the already-filtered frame.
plt.figure(figsize=(8, 4))
sns.boxplot(data=data, x=data["Hardness"], color='lightblue')
hardness_abs_z = np.abs(stats.zscore(data["Hardness"]))
data = data.loc[hardness_abs_z < 3]
plt.figure(figsize=(8, 4))
sns.boxplot(data=data, x=data["Hardness"], color='lightblue')

In [None]:
# Boxplot of Solids before and after dropping rows whose |z-score| is 3 or more.
# Rebinds `data`, so re-running the cell filters the already-filtered frame.
plt.figure(figsize=(8, 4))
sns.boxplot(data=data, x=data["Solids"], color='lightblue')
solids_abs_z = np.abs(stats.zscore(data["Solids"]))
data = data.loc[solids_abs_z < 3]
plt.figure(figsize=(8, 4))
sns.boxplot(data=data, x=data["Solids"], color='lightblue')

In [None]:
# Boxplot of Chloramines before and after dropping rows whose |z-score| is 3 or more.
# Rebinds `data`, so re-running the cell filters the already-filtered frame.
plt.figure(figsize=(8, 4))
sns.boxplot(data=data, x=data["Chloramines"], color='lightblue')
chloramines_abs_z = np.abs(stats.zscore(data["Chloramines"]))
data = data.loc[chloramines_abs_z < 3]
plt.figure(figsize=(8, 4))
sns.boxplot(data=data, x=data["Chloramines"], color='lightblue')

In [None]:
# Boxplot of Sulfate before and after dropping rows whose |z-score| is 3 or more.
# Rebinds `data`, so re-running the cell filters the already-filtered frame.
plt.figure(figsize=(8, 4))
sns.boxplot(data=data, x=data["Sulfate"], color='lightblue')
sulfate_abs_z = np.abs(stats.zscore(data["Sulfate"]))
data = data.loc[sulfate_abs_z < 3]
plt.figure(figsize=(8, 4))
sns.boxplot(data=data, x=data["Sulfate"], color='lightblue')

In [None]:
# Boxplot of Conductivity before and after dropping rows whose |z-score| is 3 or more.
# Rebinds `data`, so re-running the cell filters the already-filtered frame.
plt.figure(figsize=(8, 4))
sns.boxplot(data=data, x=data["Conductivity"], color='lightblue')
conductivity_abs_z = np.abs(stats.zscore(data["Conductivity"]))
data = data.loc[conductivity_abs_z < 3]
plt.figure(figsize=(8, 4))
sns.boxplot(data=data, x=data["Conductivity"], color='lightblue')

In [None]:
# Boxplot of Organic_carbon before and after dropping rows whose |z-score| is 3 or more.
# Rebinds `data`, so re-running the cell filters the already-filtered frame.
plt.figure(figsize=(8, 4))
sns.boxplot(data=data, x=data["Organic_carbon"], color='lightblue')
organic_carbon_abs_z = np.abs(stats.zscore(data["Organic_carbon"]))
data = data.loc[organic_carbon_abs_z < 3]
plt.figure(figsize=(8, 4))
sns.boxplot(data=data, x=data["Organic_carbon"], color='lightblue')

In [None]:
# Boxplot of Trihalomethanes before and after dropping rows whose |z-score| is 3 or more.
# Rebinds `data`, so re-running the cell filters the already-filtered frame.
plt.figure(figsize=(8, 4))
sns.boxplot(data=data, x=data["Trihalomethanes"], color='lightblue')
trihalomethanes_abs_z = np.abs(stats.zscore(data["Trihalomethanes"]))
data = data.loc[trihalomethanes_abs_z < 3]
plt.figure(figsize=(8, 4))
sns.boxplot(data=data, x=data["Trihalomethanes"], color='lightblue')

In [None]:
# Boxplot of Turbidity before and after dropping rows whose |z-score| is 3 or more.
# Rebinds `data`, so re-running the cell filters the already-filtered frame.
plt.figure(figsize=(8, 4))
sns.boxplot(data=data, x=data["Turbidity"], color='lightblue')
turbidity_abs_z = np.abs(stats.zscore(data["Turbidity"]))
data = data.loc[turbidity_abs_z < 3]
plt.figure(figsize=(8, 4))
sns.boxplot(data=data, x=data["Turbidity"], color='lightblue')

In [None]:
# Correlation heatmap recomputed on the current (imputed and filtered) data.
plt.figure(figsize=(16,8))
sns.heatmap(data.corr(),annot=True)

In [None]:
# Candidate feature scalers, all with default settings.
robust_sc = preprocessing.RobustScaler()
standard_sc = preprocessing.StandardScaler() 
minmax_sc = preprocessing.MinMaxScaler()

In [None]:
# Features are every column except the target; the target is Potability.
X = data.drop(['Potability'],axis=1)
y = data["Potability"]

In [None]:
# Compare the three scalers: scale X, fit a depth-15 decision tree and record
# its weighted F1.
# NOTE(review): F1 is measured on the training rows, so it reflects fit rather
# than generalisation.
# The results list is created once, before the loop, so every scaler's score is
# kept. The bare `%time` magic was dropped: with no statement after it, it
# timed nothing.
resultado = []
for x in [robust_sc,standard_sc,minmax_sc]:
    X_new = x.fit_transform(X)
    tree = DecisionTreeClassifier(max_depth=15,random_state=42)
    tree.fit(X_new,y)
    y_pred = tree.predict(X_new)
    f1sc=f1_score(y, y_pred, average='weighted')
    resultado.append(f1sc)
    print("El escalado Utilizado--->",x)
    print("f1 segun el tipo de estrategia:",f1sc)
    print("----------------------------------------")

In [None]:
# Class counts of the target in the current data.
Counter(data['Potability'])

In [None]:
# Resampling pipeline: SMOTE over-sampling followed by random under-sampling.
# Both samplers are seeded so the resampled training set is identical on every
# run, matching the random_state=42 used elsewhere in this notebook.
over = SMOTE(random_state=42)
under = RandomUnderSampler(random_state=42)
steps = [('o', over), ('u', under)]
pipeline = Pipeline(steps=steps)

In [None]:
# Stratified 70/30 train/test split with a fixed seed.
X_train,X_test,y_train,y_test= train_test_split(X,y,test_size=0.3,random_state=42,stratify=y)

In [None]:
# Resample the training split only; the test split is left untouched.
X_train, y_train = pipeline.fit_resample(X_train, y_train)

In [None]:
# Class counts in the resampled training target.
Counter(y_train)

In [None]:
# Class counts in the test target.
Counter(y_test)

In [None]:
# Fit the robust scaler on the training data and apply the same fitted
# transform to the test data.
X_train=robust_sc.fit_transform(X_train)
X_test=robust_sc.transform(X_test)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV

### Model

In [None]:
def confusion(y_test,y_test_pred,X):
    """Draw an annotated confusion-matrix heatmap and show it.

    Parameters
    ----------
    y_test : array-like
        True labels.
    y_test_pred : array-like
        Predicted labels.
    X : str
        Text used as the plot title.
    """
    class_labels = ['non_potable', 'potable']
    matrix = confusion_matrix(y_test, y_test_pred)
    figure, axis = plt.subplots(figsize=(10, 10))
    sns.heatmap(
        matrix,
        annot=True,
        linewidth=.5,
        linecolor="r",
        fmt=".0f",
        ax=axis,
    )
    plt.title(X, size=25)
    plt.xlabel("y_pred")
    plt.ylabel("y_true")
    # Replace the numeric tick labels with the class names on both axes.
    axis.set_xticklabels(class_labels)
    axis.set_yticklabels(class_labels)
    plt.show()

In [None]:
gbm = XGBClassifier(verbosity=1)
# Hyper-parameter grid for the XGBoost search. Only n_estimators and
# learning_rate vary (3 x 3 = 9 combinations); the single-value entries pin the
# GPU-related settings. The "updater" key is listed once: a repeated key in a
# dict literal silently overwrites the earlier entry.
params_xgb = {
        "n_estimators":[500,1000,1500],
        "learning_rate":[0.1,0.3,0.6],
        'gpu_id': [0],
        "predictor":["gpu_predictor"],
        'tree_method': ['gpu_hist'],
        "updater":["grow_gpu_hist"],
        "sampling_method":["gradient_based"],
}

In [None]:
# 3-fold cross-validated grid search over params_xgb, run with n_jobs=-1.
model_xgb = GridSearchCV(gbm,param_grid=params_xgb, cv=3,n_jobs=-1)
model_xgb.fit(X_train,y_train)

In [None]:
# Report the selected XGBoost hyper-parameters and the search's best score.
print(f"Best params: {model_xgb.best_params_!s}")
print(f"Best Score: {model_xgb.best_score_!s}\n")

In [None]:
# Full cross-validation results, best-ranked configuration first.
scores = pd.DataFrame(model_xgb.cv_results_)
scores.sort_values(by="rank_test_score")

In [None]:
# Predictions of the fitted search object on the training and test features.
y_train_pred_xgb = model_xgb.predict(X_train)
y_test_pred_xgb = model_xgb.predict(X_test)

In [None]:
# Per-class precision, recall and F1 on the test split.
print(classification_report(y_test, y_test_pred_xgb))

In [None]:
# Confusion-matrix heatmap for the XGBoost test predictions.
confusion(y_test,y_test_pred_xgb,"XGB")

In [None]:
clf = MLPClassifier(random_state=42)
# Grid for the MLP search: 3 layer sizes x 4 activations x 3 solvers x
# 3 learning-rate schedules x 2 iteration caps = 216 combinations.
params_MLP = {
        "hidden_layer_sizes":[64,128,256],
        "activation":["identity", "logistic", "tanh", "relu"],
        'solver': ["lbfgs", "sgd", "adam"],
        "learning_rate":["constant", "invscaling", "adaptive"],
        'max_iter': [100,200],
        "warm_start":[True]
}

In [None]:
# 3-fold cross-validated grid search over params_MLP, run with n_jobs=-1.
model_MLP = GridSearchCV(clf,param_grid=params_MLP, cv=3,n_jobs=-1)
model_MLP.fit(X_train,y_train)


In [None]:

# Report the selected MLP hyper-parameters and the search's best score.
print(f"Best params: {model_MLP.best_params_!s}")
print(f"Best Score: {model_MLP.best_score_!s}\n")

In [None]:
# Full cross-validation results, best-ranked configuration first.
scores = pd.DataFrame(model_MLP.cv_results_)
scores.sort_values(by="rank_test_score")

In [None]:
# Predictions of the fitted search object on the training and test features.
y_train_pred_MLP = model_MLP.predict(X_train)
y_test_pred_MLP = model_MLP.predict(X_test)

In [None]:
# Per-class precision, recall and F1 on the test split.
print(classification_report(y_test, y_test_pred_MLP))

In [None]:
# Confusion-matrix heatmap for the MLP test predictions.
confusion(y_test,y_test_pred_MLP,"MLP")

In [None]:
clf = RandomForestClassifier(random_state=42)
# Hyper-parameter grid for the random-forest search.
# 'auto' is no longer listed under max_features: for classifiers it was an
# alias of 'sqrt', and recent scikit-learn releases reject it, which would make
# every fit using it fail during the grid search.
params_RF = {
        "max_depth":[250,500,1000],
        "criterion":["gini", "entropy"],
        'min_samples_split': [2,4,6],
        "min_samples_leaf":[1,2,3],
        "max_features":['sqrt', 'log2'],
        'warm_start':[True],
        'class_weight':['balanced', 'balanced_subsample']
}

In [None]:
# 3-fold cross-validated grid search over params_RF, run with n_jobs=-1.
model_RF = GridSearchCV(clf,param_grid=params_RF, cv=3,n_jobs=-1)
model_RF.fit(X_train,y_train)

In [None]:
# Report the selected random-forest hyper-parameters and the search's best score.
print(f"Best params: {model_RF.best_params_!s}")
print(f"Best Score: {model_RF.best_score_!s}\n")

In [None]:
# Full cross-validation results, best-ranked configuration first.
scores = pd.DataFrame(model_RF.cv_results_)
scores.sort_values(by="rank_test_score")

In [None]:
# Predictions of the fitted search object on the training and test features.
y_train_pred_RF = model_RF.predict(X_train)
y_test_pred_RF = model_RF.predict(X_test)

In [None]:
# Per-class precision, recall and F1 on the test split.
print(classification_report(y_test, y_test_pred_RF))

In [None]:
# Confusion-matrix heatmap for the random-forest test predictions.
confusion(y_test,y_test_pred_RF,"RF")

## Deep Learning

In [None]:
import tensorflow as tf
import keras
from keras import Sequential
from keras.layers import Dense
# BatchNormalization is imported from keras.layers: the keras.layers.normalization
# module path is not available in current Keras releases.
from keras.layers import BatchNormalization
from keras.layers import Dropout

In [None]:
# Feed-forward binary classifier:
# Dense(64) -> BatchNormalization -> Dense(32) -> Dense(16) -> Dropout(0.5) -> Dense(1, sigmoid).
# The input width is taken from the training matrix instead of a hard-coded 9,
# so the cell keeps working if the feature set changes.
n_features = X_train.shape[1]

model = Sequential()

model.add(Dense(64, activation='relu', kernel_initializer='random_normal', input_dim=n_features))

model.add(BatchNormalization())

model.add(Dense(32, activation='relu', kernel_initializer='random_normal'))

model.add(Dense(16, activation='relu', kernel_initializer='random_normal'))

model.add(Dropout(0.5))

model.add(Dense(1, activation='sigmoid', kernel_initializer='random_normal'))

In [None]:
# Adam optimizer with binary cross-entropy loss; accuracy is tracked as a metric.
model.compile(optimizer ='adam',loss='binary_crossentropy', metrics =['accuracy'])

In [None]:
# Train for 50 epochs with mini-batches of 32.
# NOTE(review): the test split is passed as validation data, so the reported
# val_* metrics are computed on the same rows as the later test evaluation.
model.fit(X_train,y_train, batch_size=32, epochs=50, validation_data =(X_test,y_test))

In [None]:
# Evaluate the model on the training data and display the result.
eval_model=model.evaluate(X_train, y_train)
eval_model

In [None]:
# Turn the sigmoid outputs into hard labels with a 0.5 threshold, then flatten
# to a 1-D integer array so the predictions have the same shape and label type
# as y_test when passed to the sklearn metrics.
y_pred=model.predict(X_test)
y_pred =(y_pred>0.5).astype(int).ravel()

In [None]:
# Raw confusion matrix for the neural-network test predictions.
# confusion_matrix is re-imported here although the notebook's first cell
# already imports it.
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)

In [None]:
# Per-class precision, recall and F1 on the test split.
print(classification_report(y_test, y_pred)) 

In [None]:
# Confusion-matrix heatmap for the neural-network test predictions.
confusion(y_test,y_pred,"NN")

### With LRA

In [None]:
class LRA(keras.callbacks.Callback):
    """Keras callback that lowers the learning rate when training stops improving.

    While training accuracy is below ``threshold`` the callback watches training
    accuracy; once accuracy reaches ``threshold`` it watches validation loss
    instead. After ``patience`` consecutive epochs without improvement of the
    watched quantity, the optimizer's learning rate is multiplied by ``factor``.
    The weights of the best epoch seen so far are stored in ``LRA.best_weights``.

    Depends on the notebook-level ``model`` (initial weights and initial
    learning rate) and on ``print_in_color`` being defined before the callback
    is instantiated.

    Parameters
    ----------
    patience : int
        Epochs without improvement before the learning rate is adjusted.
    threshold : float
        Training accuracy at which monitoring switches to validation loss.
    factor : float
        Multiplier applied to the learning rate on each adjustment.
    """
    best_weights=model.get_weights() # set a class variable so weights can be loaded after training is completed
    def __init__(self, patience=2, threshold=.95, factor=.5):
        super(LRA, self).__init__()
        self.patience=patience # specifies how many epochs without improvement before learning rate is adjusted
        self.threshold=threshold # specifies training accuracy threshold when lr will be adjusted based on validation loss
        self.factor=factor # factor by which to reduce the learning rate
        # The initial rate is read from the notebook-level model because the
        # callback has no model attached yet at construction time.
        self.lr=float(tf.keras.backend.get_value(model.optimizer.lr)) # get the initial learning rate and save it in self.lr
        self.highest_tracc=0.0 # set highest training accuracy to 0
        self.lowest_vloss=np.inf # set lowest validation loss to infinity
        self.count=0 # consecutive epochs without improvement
        msg='\n Starting Training - Initializing Custom Callback'
        print_in_color (msg, (244, 252, 3), (55,65,80))
        
    def on_epoch_end(self, epoch, logs=None):  # method runs on the end of each epoch
        logs = logs or {}
        # Use the model attached to this callback rather than a global name.
        lr=float(tf.keras.backend.get_value(self.model.optimizer.lr)) # get the current learning rate
        v_loss=logs.get('val_loss')  # get the validation loss for this epoch
        acc=logs.get('accuracy')  # get training accuracy        
        if acc < self.threshold: # if training accuracy is below threshold adjust lr based on training accuracy
            if acc>self.highest_tracc: # training accuracy improved in the epoch
                msg= f'\n training accuracy improved from  {self.highest_tracc:7.2f} to {acc:7.2f} learning rate held at {lr:9.6f}'
                print_in_color(msg, (0,255,0), (55,65,80))
                self.highest_tracc=acc # set new highest training accuracy
                LRA.best_weights=self.model.get_weights() # training accuracy improved so save the weights
                self.count=0 # reset the instance counter (a bare `count=0` would only bind a local)
                if v_loss<self.lowest_vloss:
                    self.lowest_vloss=v_loss                    
            else:  # training accuracy did not improve check if this has happened for patience number of epochs if so adjust learning rate
                if self.count>=self.patience -1:
                    self.lr= lr* self.factor # adjust the learning by factor
                    tf.keras.backend.set_value(self.model.optimizer.lr, self.lr) # set the learning rate in the optimizer
                    self.count=0 # reset the count to 0
                    if v_loss<self.lowest_vloss:
                        self.lowest_vloss=v_loss
                    # Report the new (reduced) rate, not the rate before the adjustment.
                    msg=f'\nfor epoch {epoch +1} training accuracy did not improve for {self.patience } consecutive epochs, learning rate adjusted to {self.lr:9.6f}'
                    print_in_color(msg, (255,0,0), (55,65,80))
                else:
                    self.count=self.count +1
                    msg=f'\nfor  epoch {epoch +1} training accuracy did not improve, patience count incremented to {self.count}'
                    print_in_color(msg, (255,255,0), (55,65,80))
        else: # training accuracy is above threshold so adjust learning rate based on validation loss
            if v_loss< self.lowest_vloss: # check if the validation loss improved
                msg=f'\n for epoch {epoch+1} validation loss improved from  {self.lowest_vloss:7.4f} to {v_loss:7.4}, saving best weights'
                print_in_color(msg, (0,255,0), (55,65,80))
                self.lowest_vloss=v_loss # replace lowest validation loss with new validation loss                
                LRA.best_weights=self.model.get_weights() # validation loss improved so save the weights
                self.count=0 # reset count since validation loss improved               
            else: # validation loss did not improve
                if self.count>=self.patience-1:
                    self.lr=lr * self.factor # derive the new rate from the optimizer's current rate, as in the accuracy branch
                    msg=f' \nfor epoch {epoch+1} validation loss failed to improve for {self.patience} consecutive epochs, learning rate adjusted to {self.lr:9.6f}'
                    self.count=0 # reset counter
                    print_in_color(msg, (255,0,0), (55,65,80))
                    tf.keras.backend.set_value(self.model.optimizer.lr, self.lr) # set the learning rate in the optimizer
                else: 
                    self.count =self.count +1 # increment the count
                    msg=f' \nfor epoch {epoch+1} validation loss did not improve patience count incremented to {self.count}'
                    print_in_color(msg, (255,255,0), (55,65,80))
                    

In [None]:
def print_in_color(txt_msg,fore_tupple,back_tupple,):
    """Print ``txt_msg`` using 24-bit ANSI foreground and background colors.

    Parameters
    ----------
    txt_msg : str
        Text to print. It is emitted verbatim, so it may contain ``{`` or ``}``.
    fore_tupple : tuple
        Foreground color as ``(r, g, b)``.
    back_tupple : tuple
        Background color as ``(r, g, b)``.

    After the message, a reset sequence is printed on its own line so later
    output uses the terminal's default colors.
    """
    rf, gf, bf = fore_tupple
    rb, gb, bb = back_tupple
    color_prefix = f'\33[38;2;{rf};{gf};{bf};48;2;{rb};{gb};{bb}m'
    # Concatenate instead of running str.format over the message: formatting the
    # message text would raise on any literal brace it contains.
    print(color_prefix + txt_msg)
    print('\33[0m') # returns default print color to back to black
    return

In [None]:
# Number of epochs for the training run that uses the LRA callback.
epochs=50

In [None]:
# Continue training the existing `model` with the LRA callback attached.
# Validation uses the held-out split, as in the earlier fit; validating on the
# training data would make val_loss a second copy of the training signal.
# NOTE(review): a dedicated validation split would avoid steering the learning
# rate with the test data.
callbacks=[LRA()]
history = model.fit(X_train, y_train,epochs=epochs,
                              verbose=1,
                              validation_data =(X_test,y_test),callbacks=callbacks,shuffle=True)
# Load the best weights recorded by the callback; LRA stores them in a class
# attribute for this purpose.
model.set_weights(LRA.best_weights)
history


In [None]:
# Evaluate the model on the training data after the LRA run and display the result.
eval_model=model.evaluate(X_train, y_train)
eval_model

In [None]:
# Turn the sigmoid outputs into hard labels with a 0.5 threshold, then flatten
# to a 1-D integer array so the predictions have the same shape and label type
# as y_test when passed to the sklearn metrics.
y_pred=model.predict(X_test)
y_pred =(y_pred>0.5).astype(int).ravel()

In [None]:
# Raw confusion matrix for the test predictions after the LRA run.
cm = confusion_matrix(y_test, y_pred)
print(cm)

In [None]:
# Per-class precision, recall and F1 on the test split after the LRA run.
print(classification_report(y_test, y_pred)) 

In [None]:
# Confusion-matrix heatmap for the test predictions after the LRA run.
confusion(y_test,y_pred,"NN with LRA")