In [None]:
#johnniekips@gmail.com

import numpy as np 
import pandas as pd 


import os
# Print the full path of every file found under the Kaggle input directory.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)



In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.simplefilter("ignore")

In [None]:
# Load the water-potability CSV into a DataFrame and preview its first rows.
csv_path = "/kaggle/input/water-potability/water_potability.csv"
quality_df = pd.read_csv(csv_path)
quality_df.head(n=5)

In [None]:
# Print the DataFrame summary (column names, dtypes, non-null counts).
quality_df.info()

In [None]:
# Count the missing values in each column.
missing_mask = quality_df.isna()
missing_mask.sum()

In [None]:
# Replace missing values with the mean of their column. The frame is rebound
# rather than mutated with inplace=True, so the cell has no hidden in-place
# side effect and stays safe to re-run.
# NOTE(review): the means are computed over the whole dataset, i.e. before the
# train/test split performed later in this notebook, so test rows influence
# the imputed values.
quality_df = quality_df.fillna(quality_df.mean())

In [None]:
# Sanity check: is any NaN left anywhere in the frame?
np.isnan(quality_df.values).any()

In [None]:
# Summary statistics for every column of the frame.
quality_df.describe()

In [None]:
# Bar chart of the class balance. The column is passed through explicit
# x=/data= keywords: a Series passed positionally is interpreted as `data`
# (not `x`) by newer seaborn releases, which changes what gets plotted.
sns.countplot(x="Potability", data=quality_df)
# Exact per-class counts shown as the cell output.
quality_df["Potability"].value_counts()

In [None]:
# Annotated heatmap of the pairwise column correlations on a 15x15 figure.
corr_matrix = quality_df.corr()
plt.figure(figsize=(15, 15))
sns.heatmap(corr_matrix, annot=True)

In [None]:
# Pairwise scatter/distribution grid, coloured by the Potability class.
sns.pairplot(quality_df, hue="Potability")

In [None]:
# One histogram per column, drawn on a 20x20 figure.
quality_df.hist(figsize=(20,20))

In [None]:
# Box plot of every feature split by Potability class, one panel per feature.
# The label column is removed up front so panels are assigned by feature order,
# not by the column's position in the frame; this avoids gaps or an IndexError
# when "Potability" is not the last column.
int_cols = [
    col
    for col in quality_df.select_dtypes(exclude=['category']).columns.to_list()
    if col != "Potability"
]

# Three panels per row; the row count follows the number of features
# (9 features -> the same 3x3, 15x15 layout as before).
n_cols = 3
n_rows = max(1, -(-len(int_cols) // n_cols))
fig, ax = plt.subplots(nrows=n_rows, ncols=n_cols,
                       figsize=(15, 5 * n_rows), constrained_layout=True)
# NOTE(review): the title mentions an "Approved limit", but no limit lines are
# drawn by this cell.
plt.suptitle('Feature distribution by Potability class and Approved limit', size=20, weight='bold')
ax = ax.flatten()

for panel, col in zip(ax, int_cols):
    sns.boxplot(data=quality_df, y=col, x='Potability', ax=panel)
    # Hide the frame around each panel.
    for side in ['left', 'right', 'top', 'bottom']:
        panel.spines[side].set_visible(False)

# Hide any panels left over when the feature count is not a multiple of 3.
for panel in ax[len(int_cols):]:
    panel.set_visible(False)

In [None]:
from sklearn.model_selection import train_test_split

# Separate the predictors from the target column.
label = quality_df["Potability"]
features = quality_df.drop(columns=["Potability"])

# Hold out 20% of the rows for testing, with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    features,
    label,
    test_size=0.2,
    random_state=1,
)

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply that same fitted
# transform to both splits.
scaler = StandardScaler().fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

In [None]:
# Class counts in the training labels before any resampling.
n_label_one = sum(y_train == 1)
n_label_zero = sum(y_train == 0)
print("Before OverSampling, counts of label '1': {}".format(n_label_one))
print("Before OverSampling, counts of label '0': {} \n".format(n_label_zero))

In [None]:
from imblearn.over_sampling import SMOTE

# Oversample the training split with SMOTE (fixed seed) and rebind
# x_train / y_train to the resampled data.
# NOTE(review): the grid searches later in this notebook cross-validate on this
# resampled set, so validation folds contain synthetic samples.
sm = SMOTE(random_state=2)
resampled = sm.fit_resample(x_train, y_train)
x_train, y_train = resampled

In [None]:
# Shapes of the training data after resampling.
train_x_shape = x_train.shape
train_y_shape = y_train.shape
print('After OverSampling, the shape of train_X: {}'.format(train_x_shape))
print('After OverSampling, the shape of train_y: {} \n'.format(train_y_shape))

In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix

def print_score(y_pred, y_real):
    """Print accuracy, averaged precision/recall/F-score and the confusion matrix.

    Parameters
    ----------
    y_pred : array-like
        Predicted labels.
    y_real : array-like
        Ground-truth labels.

    Notes
    -----
    The argument order is predictions first, ground truth second. The metric
    functions are called internally as ``metric(y_real, y_pred)``.
    Nothing is returned; all results are written to stdout.
    """
    print("Accuracy: ", accuracy_score(y_real, y_pred))

    # One section per averaging strategy. Each heading names only the average
    # actually used (previously every heading started with "Macro").
    for average in ("macro", "micro", "weighted"):
        print()
        print("precision_recall_fscore_support ({} average)".format(average))
        print(precision_recall_fscore_support(y_real, y_pred, average=average))

    print()
    print("Confusion Matrix")
    print(confusion_matrix(y_real, y_pred))

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
from sklearn.linear_model import SGDClassifier

%time
grid_params = { "loss": ["hinge", "log", "modified_huber"],
               "penalty": ["l1", "l2", "elasticnet"]
    
}

grid = GridSearchCV(SGDClassifier(), grid_params, refit=True, cv=3, verbose=1)
grid.fit(x_train, y_train)

In [None]:
# Show the best hyper-parameters and the refitted estimator of the current `grid`.
for summary in (grid.best_params_, grid.best_estimator_):
    print(summary)

# Predict on the held-out test split and print the metrics.
y_pred = grid.predict(x_test)
print_score(y_pred, y_test)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

%time
param_grid = { "n_neighbors": np.arange(1,50)}


grid = GridSearchCV(KNeighborsClassifier(), param_grid, refit=True, cv=3, verbose=1)
grid.fit(x_train, y_train)

In [None]:
# Show the best hyper-parameters and the refitted estimator of the current `grid`.
for summary in (grid.best_params_, grid.best_estimator_):
    print(summary)

# Predict on the held-out test split and print the metrics.
y_pred = grid.predict(x_test)
print_score(y_pred, y_test)

In [None]:
from sklearn.naive_bayes import GaussianNB

%time

param_grid = { "var_smoothing": [1e-09] }

grid = GridSearchCV(GaussianNB(), param_grid, refit=True, cv=3, verbose=1)
grid.fit(x_train, y_train)

In [None]:
# Show the best hyper-parameters and the refitted estimator of the current `grid`.
for summary in (grid.best_params_, grid.best_estimator_):
    print(summary)

# Predict on the held-out test split and print the metrics.
y_pred = grid.predict(x_test)
print_score(y_pred, y_test)

In [None]:
from sklearn.ensemble import RandomForestClassifier

%time

param_grid = {'n_estimators': [100, 200, 300], 
              'max_features': ['auto', 'sqrt', 'log2'], 
              'bootstrap': [True, False], 
              'criterion':['entropy', 'gini']}

grid = GridSearchCV(RandomForestClassifier(), param_grid, refit=True, verbose=1)
grid.fit(x_train, y_train)

In [None]:
# Show the best hyper-parameters and the refitted estimator of the current `grid`.
for summary in (grid.best_params_, grid.best_estimator_):
    print(summary)

# Predict on the held-out test split and print the metrics.
y_pred = grid.predict(x_test)
print_score(y_pred, y_test)

In [None]:
from sklearn import svm

%time
param_grid = { "kernel": ["linear", "poly", "rbf", "sigmoid"],
               "degree": [1, 2 ,3, 4, 5, 6] }

grid = GridSearchCV(svm.SVC(), param_grid, refit=True, verbose=1)
grid.fit(x_train, y_train)

In [None]:
# Show the best hyper-parameters and the refitted estimator of the current `grid`.
for summary in (grid.best_params_, grid.best_estimator_):
    print(summary)

# Predict on the held-out test split and print the metrics.
y_pred = grid.predict(x_test)
print_score(y_pred, y_test)

In [None]:
from sklearn.naive_bayes import BernoulliNB

%time
param_grid = {'alpha': [0.25, 0.5, 1]}

grid = GridSearchCV(BernoulliNB(), param_grid, refit=True, cv=3, verbose=1)
grid.fit(x_train, y_train)

In [None]:
# Show the best hyper-parameters and the refitted estimator of the current `grid`.
for summary in (grid.best_params_, grid.best_estimator_):
    print(summary)

# Predict on the held-out test split and print the metrics.
y_pred = grid.predict(x_test)
print_score(y_pred, y_test)

In [None]:
from sklearn.tree import DecisionTreeClassifier

%time
param_grid = {'criterion':['gini','entropy'],
              'random_state':[5,10]}

grid = GridSearchCV(DecisionTreeClassifier(), param_grid, refit=True, cv=3, verbose=1)
grid.fit(x_train, y_train)

In [None]:
# Show the best hyper-parameters and the refitted estimator of the current `grid`.
for summary in (grid.best_params_, grid.best_estimator_):
    print(summary)

# Predict on the held-out test split and print the metrics.
y_pred = grid.predict(x_test)
print_score(y_pred, y_test)

In [None]:
from xgboost import XGBClassifier

%time
param_grid = {'learning_rate': [0.01, 0.05, 0.1], 
              'eval_metric': ['error']}

grid = GridSearchCV(XGBClassifier(), param_grid, refit=True, cv=3, verbose=1)
grid.fit(x_train, y_train)


In [None]:
# Show the best hyper-parameters and the refitted estimator of the current `grid`.
for summary in (grid.best_params_, grid.best_estimator_):
    print(summary)

# Predict on the held-out test split and print the metrics.
y_pred = grid.predict(x_test)
print_score(y_pred, y_test)

In [None]:
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score
from keras.models import Sequential
from keras.layers import Dense, Dropout

In [None]:
def ann_model(optimizer="adam"):
    """Build and compile the feed-forward binary classifier.

    The network stacks Dense(32) -> Dropout(0.5) -> Dense(16) -> Dropout(0.5)
    -> Dense(8) with ReLU activations, followed by a single sigmoid output
    unit. It is compiled with binary cross-entropy loss and an accuracy metric.

    Parameters
    ----------
    optimizer : str, default "adam"
        Optimizer passed straight through to ``model.compile``.

    Returns
    -------
    The compiled ``Sequential`` model.
    """
    stack = [
        Dense(units=32, activation="relu"),
        Dropout(0.5),
        Dense(units=16, activation="relu"),
        Dropout(0.5),
        Dense(units=8, activation="relu"),
        Dense(units=1, activation="sigmoid"),
    ]
    network = Sequential(stack)
    network.compile(optimizer=optimizer, loss="binary_crossentropy", metrics=["accuracy"])
    return network

In [None]:
# Default training settings for the scikit-learn wrapper around ann_model.
batch_size, epochs = 32, 50
wrapper_settings = dict(batch_size=batch_size, epochs=epochs, verbose=1)
ANN_Classifier = KerasClassifier(build_fn=ann_model, **wrapper_settings)



In [None]:
!pip install livelossplot

In [None]:
# Candidate settings for the Keras wrapper. The batch_size and epochs values
# listed here are the ones searched, alongside the optimizer passed to ann_model.
parameters = {
    'batch_size': [25, 32],
    'epochs': [50, 100, 150],
    'optimizer': ['adam', 'rmsprop'],
}

# 5-fold search scored on accuracy, using all available cores.
grid = GridSearchCV(
    ANN_Classifier,
    parameters,
    scoring="accuracy",
    cv=5,
    n_jobs=-1,
)

grid.fit(x_train, y_train)

In [None]:
# Show the best hyper-parameters and the refitted estimator of the current `grid`.
for summary in (grid.best_params_, grid.best_estimator_):
    print(summary)

# Predict on the held-out test split and print the metrics.
y_pred = grid.predict(x_test)
print_score(y_pred, y_test)