In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
# Apply seaborn's 'darkgrid' style to the plots drawn after this cell.
sns.set_style('darkgrid')

In [None]:
# Load the water-potability CSV. `read_csv` already returns a DataFrame and
# uses "," as its default separator, so no re-wrapping or `sep` is needed.
data = pd.read_csv("/kaggle/input/water-potability/water_potability.csv")
# `data` keeps pointing at the raw frame; `df` is the working name used below.
df = data
df.head()

In [None]:
# Summarise the frame: column names, non-null counts and dtypes.
df.info()

In [None]:
# Count the missing values in each column.
df.isnull().sum()

In [None]:
# Fill every missing value with the mean of its own column.
# NOTE(review): the means are computed over the whole frame, i.e. before the
# train/test split performed further down, so held-out rows influence the
# imputed values. Consider imputing after the split (fit on train only).
df = df.fillna(df.mean())

In [None]:
# Re-count missing values per column to check the result of the imputation.
df.isnull().sum()

In [None]:
# Bar chart of how many rows fall into each Potability class.
sns.countplot(data=df, x='Potability')

# Boxplot of parameters

In [None]:
# One box plot per feature, split by Potability class.
# The target itself is skipped: plotting Potability against Potability only
# produces a degenerate panel.
feature_cols = [col for col in df.columns if col != 'Potability']

# Size the grid from the number of features instead of hard-coding 4x3, so
# the cell does not fail when the frame has more than 12 columns.
n_grid_cols = 3
n_grid_rows = max(1, -(-len(feature_cols) // n_grid_cols))  # ceiling division

plt.figure(figsize=(15, 19))
for i, col in enumerate(feature_cols, start=1):
    plt.subplot(n_grid_rows, n_grid_cols, i)
    sns.boxplot(x='Potability', y=col, data=df)
    plt.title(col)
# Keep titles and axis labels of neighbouring panels from overlapping.
plt.tight_layout()
    

In [None]:
# Annotated heatmap of the pairwise correlations between all columns.
plt.figure(figsize=(15, 10))
corr = df.corr()
sns.heatmap(
    corr,
    annot=True,
    cmap='coolwarm',
    linewidth=0.5,
)

In [None]:
# Pairwise scatter/distribution grid over the columns, coloured by Potability.
sns.pairplot(df, hue = 'Potability')

In [None]:
# Separate the feature matrix from the target column.
# `columns=` replaces the positional `axis` argument (`df.drop([...], 1)`),
# which was deprecated in pandas 1.3 and raises a TypeError from pandas 2.0.
X = df.drop(columns=['Potability'])
y = df['Potability']

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=43,
)

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to the training and the test features.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Imports for the candidate classifiers and the evaluation helpers.
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score
try:
    from sklearn.metrics import plot_roc_curve
except ImportError:
    # `plot_roc_curve` was removed in scikit-learn 1.2 (its replacement is
    # `RocCurveDisplay.from_estimator`). Keep the name bound so that this cell
    # still runs on current releases instead of failing on import.
    plot_roc_curve = None
from xgboost import XGBClassifier
from sklearn import metrics
from sklearn.metrics import mean_squared_error

# Gradient-boosted trees.
# NOTE(review): `use_label_encoder` is kept from the original; newer xgboost
# releases may ignore or warn about it - confirm against the installed version.
xgb = XGBClassifier(learning_rate = 0.03, max_depth = 8, n_estimators = 1000,
                    verbosity = 1, random_state = 44, use_label_encoder=False)

rf = RandomForestClassifier(random_state=43)
# The base learner is passed positionally: the keyword was `base_estimator`
# in older scikit-learn and is `estimator` from 1.2 on (the old name was later
# removed), while the first positional parameter is the base learner in both.
ad = AdaBoostClassifier(rf)
dt = DecisionTreeClassifier()
kn = KNeighborsClassifier()
rbf = RBF()
gp = GaussianProcessClassifier(1.0 * RBF(1.0))
mlp = MLPClassifier(alpha=1, max_iter=1000)
gnb = GaussianNB()
svc = SVC(random_state = 43, C = 10, gamma = 0.1, kernel ='rbf')

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Fit every candidate on the (standardized) training split, print its
# classification report on the test split and its mean 5-fold CV score.
# Note that `gp` and `gnb` are defined above but are not part of this list.
models = [rf,ad, dt, kn, svc, mlp,xgb ]
for model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # `X` holds the raw, unscaled features, whereas the model above was fitted
    # on standardized data. Cross-validate the model behind a StandardScaler so
    # each fold gets the same preprocessing (fitted on that fold's training
    # part only); otherwise the CV score of scale-sensitive models (KNN, SVC,
    # MLP) would describe a different model than the report printed next to it.
    # cross_val_score clones the pipeline, so the fitted `model` is untouched.
    cv_estimator = make_pipeline(StandardScaler(), model)
    scores = cross_val_score(cv_estimator, X, y, cv=5).mean().round(3)
    # Despite its name, `accuracy` holds the full text classification report.
    accuracy = metrics.classification_report(y_test, y_pred)
    print(model, '\n', 'REPORT:','\n', accuracy,'\n', 'mean_score:',scores, '\n' )

In [None]:
from sklearn.model_selection import RandomizedSearchCV
# Candidate values for the randomized hyper-parameter search.

# Number of boosting rounds: 10 evenly spaced values from 200 to 2000
n_estimators = np.linspace(200, 2000, num=10).astype(int).tolist()
# Maximum tree depth: 10, 20, ..., 110, plus None
max_depth = np.linspace(10, 110, num=11).astype(int).tolist() + [None]
# Random seeds to try: 10, 20, ..., 100
random_state = np.linspace(10, 100, num=10).astype(int).tolist()
# Learning rates to try
learning_rate = [0.03, 0.05, 0.07, 0.1, 0.2]

# Assemble the search space, keyed by estimator parameter name
random_grid = dict(
    n_estimators=n_estimators,
    max_depth=max_depth,
    random_state=random_state,
    learning_rate=learning_rate,
)
print(random_grid)

In [None]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
#xgb = XGBClassifier()
# Random search of parameters, using 3-fold cross-validation:
# sample 70 parameter combinations (n_iter = 70) and use all available cores
#xgb_random = RandomizedSearchCV(estimator = xgb, param_distributions = random_grid, n_iter = 70,
#                                cv = 3, verbose=2, n_jobs = -1)
# Fit the random search model
#xgb_random.fit(X_train, y_train)

In [None]:
#xgb_random.best_params_

In [None]:
# Rebuild XGBoost with a fixed set of hyper-parameters, fit it on the training
# split, then print its test-set classification report and mean 3-fold CV score.
xgb = XGBClassifier(
    learning_rate=0.07,
    max_depth=200,
    n_estimators=200,
    random_state=90,
)
model = xgb.fit(X_train, y_train)
y_pred = model.predict(X_test)
fold_scores = cross_val_score(model, X, y, cv=3)
scores = fold_scores.mean().round(3)
# `accuracy` holds the full text classification report, not a single number.
accuracy = metrics.classification_report(y_test, y_pred)
print(model, '\n', 'REPORT:','\n', accuracy,'\n', 'mean_score:',scores, '\n' )

In [None]:
from yellowbrick.classifier import confusion_matrix
from yellowbrick.classifier import ClassificationReport
# Yellowbrick classification-report visualizer for the XGBoost model,
# including the per-class support.
classes = [0, 1]
visualizer = ClassificationReport(xgb, support=True, classes=classes)

visualizer.fit(X_train, y_train)    # fit the visualizer on the training split
visualizer.score(X_test, y_test)    # score it on the held-out test split
visualizer.show()                   # render the finished figure

In [None]:
#confusion_matrix(XGbClassifier(), X_train, y_train, X_test, y_test, classes=[0, 1])
#plt.tight_layout()
from yellowbrick.classifier import ConfusionMatrix

# Confusion matrix of the XGBoost model on the test split.
# The display labels were inverted: in the water-potability dataset the target
# is encoded as 1 = potable and 0 = not potable, so class 0 must be shown as
# 'Not_Potable' and class 1 as 'Potable'.
# NOTE(review): encoding taken from the dataset description - confirm.
cm = ConfusionMatrix(xgb, classes=[0,1], label_encoder={0: 'Not_Potable', 1: 'Potable'})

cm.fit(X_train, y_train)
cm.score(X_test, y_test)
cm.show()

In [None]:
from yellowbrick.classifier import ROCAUC
# ROC/AUC curves of the XGBoost model on the test split.
# `classes` is matched to the sorted class values (0, 1). The dataset encodes
# 0 = not potable and 1 = potable, so the names must be listed in that order;
# the original order labelled class 0 as "Potable".
# NOTE(review): encoding taken from the dataset description - confirm.
visualizer = ROCAUC(xgb, classes=["Not_Potable", "Potable"])

visualizer.fit(X_train, y_train)        # Fit the training data to the visualizer
visualizer.score(X_test, y_test)        # Evaluate the model on the test data
visualizer.show()                       # Finalize and show the figure