In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Importing important libraries

In [None]:
# Import data analysis libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Importing sklearn libraries
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit, GridSearchCV, cross_val_score, cross_val_predict
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
# Importing model evaluation libraries
from sklearn.metrics import precision_score, recall_score, precision_recall_curve, auc, roc_auc_score, roc_curve, confusion_matrix, fbeta_score
%matplotlib inline

## Data loading

In [None]:
# Load the water-potability CSV into a DataFrame and preview the first rows
data = pd.read_csv("/kaggle/input/water-potability/water_potability.csv")
data.head()

## Data Descriptive Analysis - Data as is

### Checking for datatypes and missing values

In [None]:
# Column dtypes and non-null counts
data.info()

In [None]:
# Percentage of missing values, listed only for the columns that have any
missing_per_column = data.isnull().sum()
columns_with_missing = missing_per_column[missing_per_column > 0]
np.round(columns_with_missing / len(data) * 100, 1)

In [None]:
# Absolute number of missing values in every column
data.isnull().sum()

The percentage of missing values in a column helps to decide how to treat it: drop the column if the majority of its values are missing.
In this dataset, the "ph", "Sulfate" and "Trihalomethanes" columns contain a few missing values. Since the % of missing values is very small, we may impute them.

### Checking for Class Distribution

In [None]:
# Share of each Potability class in percent, rounded to one decimal
np.round(data["Potability"].value_counts(normalize= True)*100.0,1)

In [None]:
# Histogram of the Potability labels (KDE overlay disabled)
sns.displot(x = data["Potability"], kde = False)
plt.title("Distribution of classes", fontsize = 14)
plt.show()


### Observations
There is an imbalance in the data. Non-Potable label has 15% more records than the Potable class.
So, we have two options:
* To undersample the Class 0 - Non Potable class to take only 1250 records, instead of all the 2000 records
* To use SMOTE to synthesize new records of Class 1 - Potable

Let's use the 1st option - to undersample the Non-Potable class


### Exploratory Data Analysis

In [None]:
# Summary statistics of every numeric column
data.describe()

In [None]:
# Features of each class: group the rows by the Potability label and show the
# per-class mean of every feature, transposed so that features are rows
gro_by_Portability = data.groupby("Potability")
gro_by_Portability.mean().T

### Pair Plotting
Visualize the separation of each class with the features

In [None]:
# # For Potable Class only
# plt.figure(figsize = (20,20))
# sns.pairplot(data.query("Potability ==1" ), diag_kind = "hist")
# plt.legend()
# plt.title("Pair Plot")
# plt.show()

In [None]:
# sns.pairplot is a figure-level function: it creates its own figure and draws
# its own hue legend. A preceding plt.figure() therefore only leaves an empty
# figure behind, plt.legend() finds no handles, and plt.title() would label a
# single subplot. Title the whole grid with suptitle instead.
pair_grid = sns.pairplot(data, diag_kind = "hist", hue = "Potability")
plt.suptitle("Pair Plot", y = 1.02)  # y > 1 keeps the title clear of the top row
plt.show()

### Observations from Pair Plot
1. The classes are not linearly separable, so linear classifiers may not give accurate results
2. From the diagonal histograms, it is evident that the mean values of the features overlap between the classes

In [None]:
# Mean Chloramines value of the Potability == 0 group
gro_by_Portability["Chloramines"].mean()[0]

In [None]:
# sns.displot is figure-level and creates its own figure, so the size is set
# through height/aspect (6 in tall, 8 in wide) rather than through a separate
# plt.figure(), which would only produce an extra empty figure.
chloramines_grid = sns.displot(data = data, x = "Chloramines", hue = "Potability", height = 6, aspect = 8/6)
# Per-class mean, computed once; the index holds the Potability labels 0 and 1
chloramines_class_means = gro_by_Portability["Chloramines"].mean()
plt.axvline(x = chloramines_class_means.loc[0], c = 'red')   # mean of class 0
plt.axvline(x = chloramines_class_means.loc[1], c = 'blue')  # mean of class 1
plt.title("Distribution of Class with Chloramines", fontsize = 13)
plt.show()


In [None]:
# Plotting the correlation heatmap to check for multicollinearity in the feature space.
# data.corr() is computed over every column of `data`, so the Potability target is included.
plt.figure(figsize = (8,8))
sns.heatmap(data.corr(), annot = True, cmap = 'YlGnBu')
plt.show()

In [None]:
# Names of the predictor columns (rows with NaNs dropped, target column removed)
data.dropna(axis =0).drop("Potability", axis = 1).columns

In [None]:
# Checking the VIF of all the predictors
from statsmodels.stats.outliers_influence import variance_inflation_factor
# Complete rows only, target removed, plus a column of ones named "Constant"
data_for_vif = (
    data.dropna(axis = 0)
        .drop("Potability", axis = 1)
        .assign(Constant = 1)
)

In [None]:
# One VIF per column of data_for_vif (the "Constant" column included)
vif_values = [
    variance_inflation_factor(data_for_vif.values, column_position)
    for column_position in range(data_for_vif.shape[1])
]
vif_df = pd.DataFrame({"Feature": data_for_vif.columns, "vif": vif_values})
vif_df

In [None]:
# Number of missing values per Potability class, for the columns that contain NaNs
attribs_with_nan = ["ph","Sulfate","Trihalomethanes"]
nan_flags = data[attribs_with_nan].isna()
nan_flags.groupby(data["Potability"]).sum()

### Data Preparation Pipelines

In [None]:
from imblearn.under_sampling import RandomUnderSampler
# Seed the sampler so that the same rows are kept on every run; without a fixed
# random_state each Restart & Run All resamples differently and every score
# further down changes.
under = RandomUnderSampler(random_state = 1)


In [None]:
# Feature matrix (every column except the target) and target vector as NumPy arrays
X = data.drop("Potability", axis = 1).values
y = data["Potability"].values
# NOTE(review): y_best is not referenced anywhere else in the visible notebook
y_best = data["Potability"].values

from collections import Counter
# Class counts before resampling
Counter(y)

In [None]:
# Resample with the under-sampler; X and y are overwritten with the resampled arrays
X, y = under.fit_resample(X, y)
# Class counts after resampling
Counter(y)


In [None]:
# Preprocessing pipeline: fill missing values with the column mean, then standardise
pipeline = Pipeline(
[("imputer", SimpleImputer(strategy = "mean")),
 ("std_sclaer", StandardScaler())
 ])

In [None]:
# NOTE(review): the imputer and scaler are fitted here on every row of X,
# including the rows that train_test_split later assigns to the test set
X_prepared = pipeline.fit_transform(X)
X_prepared

### Splitting the data into Train and Test

In [None]:
# Split the raw (un-imputed, un-scaled) rows first, then fit the preprocessing
# pipeline on the training rows only and merely apply it to the test rows.
# Fitting the imputer/scaler on the full matrix before splitting leaks test-set
# statistics (column means and variances) into training.
# random_state and test_size are unchanged, so the same rows land in each split.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, random_state = 1, test_size = 0.2)
X_train = pipeline.fit_transform(X_train_raw)
X_test = pipeline.transform(X_test_raw)
X_train.shape, X_test.shape, y_train.shape, y_test.shape


In [None]:
def create_confusion_matrix_df(y_actual, y_pred):
    """Return the binary confusion matrix as a labelled DataFrame.

    Parameters
    ----------
    y_actual : array-like of 0/1
        True labels (0 = not potable, 1 = potable).
    y_pred : array-like of 0/1
        Predicted labels.

    Returns
    -------
    pandas.DataFrame
        2x2 frame; rows are the actual class, columns the predicted class.
    """
    # Pin the label order so the matrix is always 2x2. Without `labels`, a
    # sample that contains a single class yields a 1x1 matrix and the
    # DataFrame constructor fails on the index/column length mismatch.
    matrix = confusion_matrix(y_actual, y_pred, labels = [0, 1])
    return pd.DataFrame(matrix, index = ["Actual Not Potable", "Actual Potable"], columns = ["Pred Not Potable", "Pred Potable"])

### Before we continue with the Modeling, let us decide our selection criteria

Class 0 -> Not Potable
Class 1 -> Potable

* True Positive -> Actual Potable, Predicted Potable
* True Negative -> Actual Not Potable, Predicted Not Potable
* False Positive -> Actual Not Potable, Predicted Potable
* False Negative -> Actual Potable, Predicted Not Potable

For Model evaluation, we should choose a metric that -
* reduces the risk of classifying a Not potable water as Potable(False Positives) 
* should correctly classify Potable water(True Positive).


For now, it matters less if potable water is misclassified as Non Potable (False Negatives)

An ideal model evaluation metric would be - Precision and specificity

Precision = True Positives / (True Positives + False Positives)

Specificity = True Negatives / (True Negatives + False Positives)

F1-Score can also be used since it takes into account both the precision and recall - Both the False Positives and the False Negatives

### Logistic Regression for Classification

In [None]:
# Using Logistic Regression - the classes are not linearly separable, so logistic regression may not be the best model. Let's give it a try
log_reg_clf = LogisticRegression()
log_reg_clf.fit(X_train, y_train)
# Predictions on the training set itself
y_train_pred_log_reg = log_reg_clf.predict(X_train)


In [None]:
# Confusion matrix of the logistic regression on the training set
cm_log_reg_train = create_confusion_matrix_df(y_train, y_train_pred_log_reg)
cm_log_reg_train

### Analysis of Logistic Regression Training Results

In [None]:
# Per-row training results of the logistic regression.
# predict_proba is evaluated once and reused, and the frame is built column by
# column so the integer label columns never pass through a float array.
train_proba_log_reg = log_reg_clf.predict_proba(X_train)
log_reg_train_score = pd.DataFrame({
    "Predict_Proba": train_proba_log_reg[:, 0],   # same values as Class0_Prob
    "Y_predicted": y_train_pred_log_reg.astype(int),
    "Class0_Prob": train_proba_log_reg[:, 0],
    "Class1 Prob": train_proba_log_reg[:, 1],
    "Y_Actual": np.asarray(y_train).astype(int),
})
log_reg_train_score

In [None]:
# Cross-validating the Logistic Regression model: out-of-fold class
# probabilities for every training row (5 folds)

y_scores = cross_val_predict(log_reg_clf, X_train, y_train, cv = 5, method = "predict_proba")
y_scores

In [None]:
# Plotting the precision/recall vs threshold curve for the Logistic Regression model.
# The score must be the probability of the positive class (Potable = 1), i.e.
# column 1 of the predict_proba output; column 0 is the class-0 probability and
# would produce the curve of an inverted classifier.

prec, rec, thrs = precision_recall_curve(y_train, y_scores[:,1])

fig = plt.figure(figsize = (8,6))
plt.plot(thrs, prec[:-1], 'b--', label = "Precision")  # last point has no threshold
plt.plot(thrs, rec[:-1], 'g-', label = "Recall")
plt.xlabel("Threshold - Probability Score")
plt.ylabel("Precision /  Recall")
plt.title("Precision/Recall vs Threshold curve of Logistic Regression")
plt.legend()
plt.show()


In [None]:
# Precision against recall, using the prec/rec arrays from the precision_recall_curve call
plt.plot(rec, prec)
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title("Precision Vs Recall for Logistics Regression")
plt.show()

Plotting ROC AUC curve for Logistic Regression

In [None]:
# ROC curve of the logistic regression, scored with the out-of-fold class-1 probability
positive_scores_log_reg = y_scores[:,1]
roc_auc_log_reg = roc_auc_score(y_train, positive_scores_log_reg)
fpr, tpr, threholds = roc_curve(y_train, positive_scores_log_reg)
plt.plot(fpr, tpr, 'b-')
plt.title("ROC Curve of Logistic Regression")
plt.xlabel("FPR")
plt.ylabel("TPR")
# AUC value printed inside the axes
plt.text(0, 0.8, s = "ROC AUC score : {}".format(roc_auc_log_reg))
plt.show()

### SGD Classifier

In [None]:
# Linear classifier trained with stochastic gradient descent on the log loss
# NOTE(review): newer scikit-learn releases spell this loss "log_loss" - confirm
# against the installed version
sgd_clf = SGDClassifier(random_state = 1, loss = "log")
sgd_clf.fit(X_train, y_train)
y_train_pred_sgd =sgd_clf.predict(X_train) # In the training Set

In [None]:
# Confusion matrix of the SGD classifier on the training set
cm_sgd_train = create_confusion_matrix_df(y_train, y_train_pred_sgd)
cm_sgd_train

In [None]:
from scipy.stats import hmean # harmonic mean
# Read the counts from the confusion-matrix frame (rows = actual, columns = predicted)
true_pos_sgd = cm_sgd_train.iloc[1,1]
false_pos_sgd = cm_sgd_train.iloc[0,1]
false_neg_sgd = cm_sgd_train.iloc[1,0]
precisions_train_sgd = true_pos_sgd/(true_pos_sgd + false_pos_sgd)
recalls_train_sgd = true_pos_sgd/(true_pos_sgd + false_neg_sgd)
# F1 is the harmonic mean of precision and recall
f1_train_sgd = hmean([precisions_train_sgd, recalls_train_sgd])
precisions_train_sgd, recalls_train_sgd, f1_train_sgd

In [None]:
# Checking teh Precision and Recall Score for SGDCLassifier
# precsion_SGD_train = precision_score(y_train, y_train_pred_sgd)
# Recall_SGD_train = recall_score(y_train, y_train_pred_sgd)
# f1_train_sgd = f1_score(y_train, y_train_pred_sgd)
# precsion_SGD_train, Recall_SGD_train, f1_sgd_train

In [None]:
# Out-of-fold class probabilities of the SGD classifier (10 folds), used to
# study how precision and recall vary with the threshold
from sklearn.model_selection import cross_val_predict

y_train_scores_sgd = cross_val_predict(sgd_clf, X_train, y_train, cv = 10, method = 'predict_proba', verbose = 10)
y_train_scores_sgd.shape

In [None]:
def display_scores(scores, metric_name = "Score"):
    """Print per-fold scores together with their mean and standard deviation.

    Parameters
    ----------
    scores : array-like of float
        One score per cross-validation fold.
    metric_name : str, optional
        Name used in the printed labels (e.g. "F1", "Precision"). The default
        is deliberately neutral: the function is called with both F1 and
        precision scores, so a hard-coded "Precision" label mislabels output.
    """
    scores = np.asarray(scores)  # also accepts plain lists
    print(f"{metric_name}s: ", scores)
    print(f"Mean {metric_name}: ", scores.mean())
    print(f"Std. of {metric_name}s: ", scores.std())


In [None]:
# 10-fold cross-validated F1 scores of the SGD classifier
y_cross_val_scores = cross_val_score(sgd_clf, X_train, y_train , cv = 10, scoring = 'f1')
display_scores(y_cross_val_scores)



In [None]:
# Precision, recall and thresholds of the SGD classifier, computed from the
# out-of-fold class-1 probabilities (nothing is plotted in this cell)
from sklearn.metrics import precision_recall_curve

precisions, recalls, thresholds = precision_recall_curve(y_train, y_train_scores_sgd[:,1])
thresholds

In [None]:
from sklearn.metrics import plot_precision_recall_curve
# Precision-recall curve of the fitted SGD classifier, evaluated on the training data
# NOTE(review): plot_precision_recall_curve appears to have been removed from
# recent scikit-learn releases in favour of PrecisionRecallDisplay.from_estimator -
# confirm against the installed version
plot_precision_recall_curve(sgd_clf, X_train, y_train,response_method = "predict_proba" )
plt.show()


In [None]:
# Precision and recall of the SGD classifier as functions of the decision threshold
fig, ax = plt.subplots(figsize = (8,6))
# precisions/recalls carry one more entry than thresholds, hence the [:-1]
ax.plot(thresholds, precisions[:-1], 'b--', label = "Precision")
ax.plot(thresholds, recalls[:-1], 'g-', label = "Recalls")
ax.legend()
ax.set_xlabel("Thresholds")
ax.set_ylabel("Precisions and Recalls")
ax.set_title("Precisions/Recalls vs Thresholds", fontsize = 14)
plt.show()

In [None]:
# Plotting the threshold with Precision only (last precision entry dropped so
# both arrays have the same length)
plt.plot(thresholds, precisions[:-1])
plt.xlabel("Threshold")
plt.ylabel("Precision")
plt.title("Precisions vs Threshold", fontsize = 13)
plt.show()

In [None]:
# Precision against recall for the SGD classifier (all curve points are plotted)

fig, ax = plt.subplots(figsize = (8,6))
ax.plot(recalls, precisions)
ax.set_xlabel("Recalls")
ax.set_ylabel("Precisions")
ax.set_title("Precisions vs Recalls", fontsize = 14)
plt.show()


In [None]:
# ROC curve and ROC AUC of the SGD classifier, from its out-of-fold class-1
# probabilities. The AUC is computed once and reused in the annotation; the
# previous bare roc_auc_score(...) call discarded its result.
roc_auc_sgd = roc_auc_score(y_train, y_train_scores_sgd[:,1])
fpr, tpr, threholds = roc_curve(y_train, y_train_scores_sgd[:,1])
plt.plot(fpr, tpr, 'b-')
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.title("ROC Curve of SGD Classifier")
plt.text(0, 0.8, s = "ROC AUC score : {}".format(roc_auc_sgd))
plt.show()

### Decision Tree Classifier

In [None]:
# Decision tree with default hyper-parameters and a fixed seed
dtree_clf = DecisionTreeClassifier(random_state=1)
dtree_clf.fit(X_train, y_train)

In [None]:
# Predictions on the training set itself
y_train_pred_dtree = dtree_clf.predict(X_train)

# Confusion matrix of those training-set predictions
create_confusion_matrix_df(y_train, y_train_pred_dtree)

In [None]:
# Out-of-fold class probabilities of the decision tree (10 folds)
from sklearn.model_selection import cross_val_predict

y_train_scores_dtree = cross_val_predict(dtree_clf, X_train, y_train, cv = 10, method = 'predict_proba')
y_train_scores_dtree

In [None]:
# 10-fold cross-validated F1 scores of the decision tree
y_cross_val_scores = cross_val_score(dtree_clf, X_train, y_train , cv = 10, scoring = 'f1')
display_scores(y_cross_val_scores)

In [None]:
from sklearn.metrics import precision_recall_curve

# Precision/recall of the decision tree across thresholds, from its out-of-fold class-1 scores
precisions, recalls, thresholds = precision_recall_curve(y_train, y_train_scores_dtree[:,1])

fig, ax = plt.subplots(figsize = (8,6))
# precisions/recalls carry one more entry than thresholds, hence the [:-1]
ax.plot(thresholds, precisions[:-1], 'b--', label = "Precision")
ax.plot(thresholds, recalls[:-1], 'g-', label = "Recalls")
ax.legend()
ax.set_xlabel("Thresholds")
ax.set_ylabel("Precisions and Recalls")
ax.set_title("Precisions/Recalls vs Thresholds", fontsize = 14)
plt.show()

In [None]:
# Plotting the threshold with Precision only (last precision entry dropped so
# both arrays have the same length)
plt.plot(thresholds, precisions[:-1])
plt.xlabel("Threshold")
plt.ylabel("Precision")
plt.title("Precisions vs Threshold", fontsize = 13)
plt.show()

In [None]:
# Precision against recall, using the most recently computed precisions/recalls arrays
fig, ax = plt.subplots(figsize = (8,6))
ax.plot(recalls, precisions)
ax.set_xlabel("Recalls")
ax.set_ylabel("Precisions")
ax.set_title("Precisions vs Recalls", fontsize = 14)
plt.show()

In [None]:
# ROC curve and ROC AUC of the decision tree, from its out-of-fold class-1
# probabilities. The AUC is computed once and reused in the annotation; the
# previous bare roc_auc_score(...) call discarded its result.
roc_auc_dtree = roc_auc_score(y_train, y_train_scores_dtree[:,1])
fpr, tpr, threholds = roc_curve(y_train, y_train_scores_dtree[:,1])
plt.plot(fpr, tpr, 'b-')
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.title("ROC Curve of Decision Tree Classifier")
plt.text(0, 0.8, s = "ROC AUC score : {}".format(roc_auc_dtree))
plt.show()

### Random Forest Classifier

In [None]:
# Random forest with default hyper-parameters and a fixed seed
rand_frst = RandomForestClassifier(random_state = 1)
rand_frst.fit(X_train, y_train)

# Predictions on the training set itself
y_train_pred_forest = rand_frst.predict(X_train)

# Confusion matrix of those training-set predictions
create_confusion_matrix_df(y_train, y_train_pred_forest)

In [None]:
# Out-of-fold class probabilities of the random forest (10 folds)
from sklearn.model_selection import cross_val_predict

y_train_scores_forest = cross_val_predict(rand_frst, X_train, y_train, cv = 10, method = 'predict_proba')
#y_train_scores_forest

In [None]:
# 10-fold cross-validated F1 scores of the random forest
y_cross_val_scores = cross_val_score(rand_frst, X_train, y_train , cv = 10, scoring = 'f1')
display_scores(y_cross_val_scores)

In [None]:
from sklearn.metrics import precision_recall_curve

# Precision/recall of the random forest across thresholds, from its out-of-fold class-1 scores
precisions, recalls, thresholds = precision_recall_curve(y_train, y_train_scores_forest[:,1])

fig, ax = plt.subplots(figsize = (8,6))
# precisions/recalls carry one more entry than thresholds, hence the [:-1]
ax.plot(thresholds, precisions[:-1], 'b--', label = "Precision")
ax.plot(thresholds, recalls[:-1], 'g-', label = "Recalls")
ax.legend()
ax.set_xlabel("Thresholds")
ax.set_ylabel("Precisions and Recalls")
ax.set_title("Precisions/Recalls vs Thresholds", fontsize = 14)
plt.show()

In [None]:
# Plotting the threshold with Precision only (last precision entry dropped so
# both arrays have the same length)
plt.plot(thresholds, precisions[:-1])
plt.xlabel("Threshold")
plt.ylabel("Precision")
plt.title("Precisions vs Threshold", fontsize = 13)
plt.show()

A much better baseline model. We can expect a very good precision after tuning.

In [None]:
# Precision against recall, using the most recently computed precisions/recalls arrays
fig, ax = plt.subplots(figsize = (8,6))
ax.plot(recalls, precisions)
ax.set_xlabel("Recalls")
ax.set_ylabel("Precisions")
ax.set_title("Precisions vs Recalls", fontsize = 14)
plt.show()

In [None]:
# ROC curve and ROC AUC of the random forest, from its out-of-fold class-1
# probabilities. The AUC is computed once and reused in the annotation; the
# previous bare roc_auc_score(...) call discarded its result.
roc_auc_forest = roc_auc_score(y_train, y_train_scores_forest[:,1])
fpr, tpr, threholds = roc_curve(y_train, y_train_scores_forest[:,1])
plt.plot(fpr, tpr, 'b-')
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.title("ROC Curve of Random Forest Classifier")
plt.text(0, 0.8, s = "ROC AUC score : {}".format(roc_auc_forest))
plt.show()

### SVM Classifier

In [None]:
from sklearn.svm import SVC
# probability = True so that later cells can request predict_proba from this model
svc = SVC(random_state = 1, probability = True)
svc.fit(X_train, y_train)
# Predictions and confusion matrix on the training set itself
y_train_pred_svm = svc.predict(X_train)
create_confusion_matrix_df(y_train, y_train_pred_svm)

In [None]:
# Out-of-fold class probabilities of the SVC (10 folds)
y_scores_svc = cross_val_predict(svc, X_train, y_train, cv = 10, method = "predict_proba")

In [None]:
# 10-fold cross-validated F1 scores of the SVC
y_cross_val_scores = cross_val_score(svc, X_train, y_train , cv = 10, scoring = 'f1')
display_scores(y_cross_val_scores)

In [None]:
## Plotting a precision recall curve of SVC
# Dashed blue = precision, solid green = recall, both against the threshold
precision, recall, threshold = precision_recall_curve(y_train, y_scores_svc[:,1])
plt.plot(threshold, precision[:-1], 'b--')
plt.plot(threshold, recall[:-1], 'g-')
plt.show()

In [None]:
# Precision against recall for the SVC.
# Uses `precision`/`recall` from the SVC precision_recall_curve call; the
# plural `precisions`/`recalls` arrays still hold the Random Forest's curve
# and would silently plot the wrong model here.
fig = plt.figure(figsize = (8,6))
ax = plt.subplot(111)
plt.plot(recall, precision)
plt.xlabel("Recalls")
plt.ylabel("Precisions")
plt.title("Precisions vs Recalls", fontsize = 14)
plt.show()

In [None]:
# ROC curve and ROC AUC of the SVC, from its out-of-fold class-1
# probabilities. The AUC is computed once and reused in the annotation; the
# previous bare roc_auc_score(...) call discarded its result.
roc_auc_svc = roc_auc_score(y_train, y_scores_svc[:,1])
fpr, tpr, threholds = roc_curve(y_train, y_scores_svc[:,1])
plt.plot(fpr, tpr, 'b-')
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.title("ROC Curve of SVC Classifier")
plt.text(0, 0.8, s = "ROC AUC score : {}".format(roc_auc_svc))
plt.show()

### KNN classifier

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# K-nearest-neighbours classifier with 9 neighbours
knn = KNeighborsClassifier(n_neighbors = 9)
knn.fit(X_train, y_train)
# Predictions and confusion matrix on the training set itself
y_train_pred_knn = knn.predict(X_train)
create_confusion_matrix_df(y_train, y_train_pred_knn)


In [None]:
# Out-of-fold class probabilities of the KNN classifier (10 folds)
y_scores_knn = cross_val_predict(knn, X_train, y_train, cv = 10, method = "predict_proba")
y_scores_knn

In [None]:
# 10-fold cross-validated precision of the KNN classifier
y_cross_val_scores = cross_val_score(knn, X_train, y_train , cv = 10, scoring = 'precision')
display_scores(y_cross_val_scores)

In [None]:
## Plotting a precision recall curve of KNN
# Score with the KNN's own out-of-fold probabilities; this cell previously
# reused y_scores_svc and therefore re-plotted the SVC curve.
# Dashed blue = precision, solid green = recall, both against the threshold
precision, recall, threshold = precision_recall_curve(y_train, y_scores_knn[:,1])
plt.plot(threshold, precision[:-1], 'b--')
plt.plot(threshold, recall[:-1], 'g-')
plt.show()


In [None]:
# Precision against recall for the KNN classifier.
# The curve is computed here from the KNN's own out-of-fold probabilities; the
# global `precisions`/`recalls` arrays still hold the Random Forest's curve
# and would silently plot the wrong model.
knn_precisions, knn_recalls, knn_thresholds = precision_recall_curve(y_train, y_scores_knn[:,1])
fig = plt.figure(figsize = (8,6))
ax = plt.subplot(111)
plt.plot(knn_recalls, knn_precisions)
plt.xlabel("Recalls")
plt.ylabel("Precisions")
plt.title("Precisions vs Recalls", fontsize = 14)
plt.show()

In [None]:
# ROC curve and ROC AUC of the KNN classifier, from its out-of-fold class-1
# probabilities. The AUC is computed once and reused in the annotation; the
# previous bare roc_auc_score(...) call discarded its result.
roc_auc_knn = roc_auc_score(y_train, y_scores_knn[:,1])
fpr, tpr, threholds = roc_curve(y_train, y_scores_knn[:,1])
plt.plot(fpr, tpr, 'b-')
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.title("ROC Curve of KNN Classifier")
plt.text(0, 0.8, s = "ROC AUC score : {}".format(roc_auc_knn))
plt.show()

In [None]:
# Cross-validated comparison of the six baseline models on four metrics.
# cross_validate scores every metric on the same 10 folds in a single pass, so
# each model is fitted 10 times instead of 40 (one cross_val_score call per
# metric). The folds and estimator seeds are unchanged, so the per-fold scores
# are the same as before.
from sklearn.model_selection import cross_validate

baseline_models = ["Logistic Regression", "SGD Classifier", "Decisions Tree Classifer", "Random Forest", "SVC", "KNN"]
# Same order as baseline_models
baseline_estimators = [log_reg_clf, sgd_clf, dtree_clf, rand_frst, svc, knn]
baseline_scoring = ["precision", "f1", "accuracy", "balanced_accuracy"]

baseline_cv_results = [
    cross_validate(clf, X_train, y_train, cv = 10, scoring = baseline_scoring)
    for clf in baseline_estimators
]

# Unpack into the per-model, per-metric arrays that the following cells read.
# cross_validate exposes each metric under the key "test_<metric>".
(log_reg_scores_prec, sgd_scores_prec, dtree_scores_prec,
 rand_frst_scores_prec, SVC_scores_prec, knn_scores_prec) = [res["test_precision"] for res in baseline_cv_results]

(log_reg_scores_f1, sgd_scores_f1, dtree_scores_f1,
 rand_frst_scores_f1, SVC_scores_f1, knn_scores_f1) = [res["test_f1"] for res in baseline_cv_results]

(log_reg_scores_acc, sgd_scores_acc, dtree_scores_acc,
 rand_frst_scores_acc, SVC_scores_acc, knn_scores_acc) = [res["test_accuracy"] for res in baseline_cv_results]

(log_reg_scores_bal_acc, sgd_scores_bal_acc, dtree_scores_bal_acc,
 rand_frst_scores_bal_acc, SVC_scores_bal_acc, knn_scores_bal_acc) = [res["test_balanced_accuracy"] for res in baseline_cv_results]

In [None]:
# Per-fold precision of every baseline model (rows = models, columns = folds)
fold_columns = [f"Fold_{fold}" for fold in range(1, 11)]
precision_per_fold = np.array([log_reg_scores_prec, sgd_scores_prec, dtree_scores_prec,
                               rand_frst_scores_prec, SVC_scores_prec, knn_scores_prec])
scores_df_prec = pd.DataFrame(precision_per_fold, index = baseline_models, columns = fold_columns)

# Transposed so that each model becomes one line across the folds
scores_df_prec.T.plot(figsize = (18,6))
plt.title("Precisions score of Baseline models")
plt.xlabel("Folds")
plt.ylabel("Precisions")
plt.show()


In [None]:
# Per-fold F1 score of every baseline model (rows = models, columns = folds)
fold_columns = [f"Fold_{fold}" for fold in range(1, 11)]
f1_per_fold = np.array([log_reg_scores_f1, sgd_scores_f1, dtree_scores_f1,
                        rand_frst_scores_f1, SVC_scores_f1, knn_scores_f1])
scores_df_f1 = pd.DataFrame(f1_per_fold, index = baseline_models, columns = fold_columns)

# Transposed so that each model becomes one line across the folds
scores_df_f1.T.plot(figsize = (18,6))
plt.title("F1 score of Baseline models")
plt.xlabel("Folds")
plt.ylabel("F1 Scores")
plt.show()

In [None]:
# Per-fold accuracy of every baseline model (rows = models, columns = folds)
fold_columns = [f"Fold_{fold}" for fold in range(1, 11)]
accuracy_per_fold = np.array([log_reg_scores_acc, sgd_scores_acc, dtree_scores_acc,
                              rand_frst_scores_acc, SVC_scores_acc, knn_scores_acc])
scores_df_acc = pd.DataFrame(accuracy_per_fold, index = baseline_models, columns = fold_columns)

# Transposed so that each model becomes one line across the folds
scores_df_acc.T.plot(figsize = (18,6))
plt.title("Accuracy score of Baseline models")
plt.xlabel("Folds")
plt.ylabel("Accuracy")
plt.show()

In [None]:
# Per-fold balanced accuracy of every baseline model (rows = models, columns = folds)
scores_df_bal_acc = pd.DataFrame(np.array([log_reg_scores_bal_acc, sgd_scores_bal_acc,dtree_scores_bal_acc,rand_frst_scores_bal_acc, SVC_scores_bal_acc,knn_scores_bal_acc]), columns = ["Fold_" + str(i) for i in range(1,11)], index = baseline_models)

# Plot the balanced-accuracy frame built above. This cell previously plotted
# scores_df_acc, i.e. plain accuracy under a "Balanced Accuracy" title.
scores_df_bal_acc.T.plot(figsize = (18,6))
plt.xlabel("Folds")
plt.ylabel("Balanced Accuracy")
plt.title("Balanced Accuracy score of Baseline models")
plt.show()

In [None]:
# Mean cross-validated score per model (rows) and metric (columns).
# Each list follows the order of baseline_models.
fold_scores_by_metric = {
    "Precision": [log_reg_scores_prec, sgd_scores_prec, dtree_scores_prec, rand_frst_scores_prec, SVC_scores_prec, knn_scores_prec],
    "F1": [log_reg_scores_f1, sgd_scores_f1, dtree_scores_f1, rand_frst_scores_f1, SVC_scores_f1, knn_scores_f1],
    "Accuracy": [log_reg_scores_acc, sgd_scores_acc, dtree_scores_acc, rand_frst_scores_acc, SVC_scores_acc, knn_scores_acc],
    "Balanced": [log_reg_scores_bal_acc, sgd_scores_bal_acc, dtree_scores_bal_acc, rand_frst_scores_bal_acc, SVC_scores_bal_acc, knn_scores_bal_acc],
}
mean_scores_cv_df = pd.DataFrame(
    {metric: [model_scores.mean() for model_scores in per_model]
     for metric, per_model in fold_scores_by_metric.items()},
    index = baseline_models,
)
mean_scores_cv_df

In [None]:
# Annotated heat map of the mean cross-validated scores (models x metrics)
plt.figure(figsize = (8,6))
sns.heatmap(mean_scores_cv_df, annot = True)
plt.title("Heat Map representation of Mean accuracy scores", fontsize = 14)
plt.show()

## Hyperparameter Tuning - Random Forest

In [None]:
# Let us separate a part of training set as a Hold Out set:
# the last 100 rows of the training arrays
X_ho = X_train[-100:,:]
y_ho = y_train[-100:]
X_ho.shape, y_ho.shape

In [None]:
# Random-forest hyper-parameter grid: 2 x 3 x 3 x 3 x 2 = 108 combinations
param_grid = [{"criterion":["gini", "entropy"],
               "max_depth": [10,50,100],
               "min_samples_split": [10,100,500],
               "n_estimators": [10,100,1000],
               "bootstrap" :[True, False]
              }]
# 5-fold grid search scored by F1, fitted on the training rows minus the last
# 100 (the hold-out rows)
rand_frst_grid = GridSearchCV(rand_frst,param_grid, cv = 5, scoring = "f1", verbose = 1, n_jobs = -1)
rand_frst_grid.fit(X_train[:-100,:], y_train[:-100])

In [None]:
# Best estimator found by the grid search and its best cross-validated score
rand_frst_grid.best_estimator_, rand_frst_grid.best_score_

In [None]:
# Full cross-validation results of the random-forest grid search
rand_frst_grid_res = rand_frst_grid.cv_results_
#rand_frst_grid_res

In [None]:
# Table of every tried random-forest configuration with its mean CV score
rand_frst_grid_res = rand_frst_grid.cv_results_
rand_forest_grid_res_df = pd.DataFrame({
    "Params": [str(candidate) for candidate in rand_frst_grid_res["params"]],
    "Scores": rand_frst_grid_res["mean_test_score"].astype("float64"),
})

# Horizontal bars for the 30 best-scoring configurations
top_forest_candidates = rand_forest_grid_res_df.sort_values(by="Scores" , ascending = False).head(30)
fig, ax = plt.subplots(figsize = (15, 15))
sns.barplot(data = top_forest_candidates, x = "Scores", y = "Params")
plt.xticks(rotation = 90)
plt.show()

In [None]:
# All random-forest configurations, best mean CV score first
rand_forest_grid_res_df.sort_values(by="Scores" , ascending = False)

## Testing the tuned Random Forest Classifier in the hold out dataset

In [None]:
from sklearn.metrics import classification_report

In [None]:
# Best configuration found by the random-forest grid search
estimator = rand_frst_grid.best_estimator_
# NOTE(review): with GridSearchCV's default refit, best_estimator_ is presumably
# already fitted on this same slice - confirm whether this extra fit is needed
estimator.fit(X_train[:-100,:], y_train[:-100])

# Predict the 100 hold-out rows, which were excluded from the fit above
y_ho_pred = estimator.predict(X_ho)

print(classification_report(y_ho, y_ho_pred))
create_confusion_matrix_df(y_ho, y_ho_pred)

## Testing the accuracy of the best estimator in Test Dataset

In [None]:
# Refit on the complete training set (hold-out rows included) and evaluate on the test set.
# `estimator` was assigned without copying, so this fit modifies that object in place.
estimator.fit(X_train, y_train)
y_pred = estimator.predict(X_test)

print(classification_report(y_test, y_pred))
create_confusion_matrix_df(y_test, y_pred)

## Hyperparameter Tuning - SVM

In [None]:
# Grid search over polynomial-kernel SVCs: 6 values of C x 2 degrees = 12
# candidates, each scored by F1 with 5-fold CV on the training rows minus the
# 100 hold-out rows.
# This cell has to run: the following cells read svc_grid, so leaving it
# commented out makes Restart & Run All fail with a NameError.
param_grid = [{"kernel": ["poly"],
               "C": [0.001,0.01,0.1,10,100,1000],
               "degree": [2,3],
               "coef0": [1]
              }]
svc_grid = GridSearchCV(svc,param_grid, cv = 5, scoring = "f1", verbose = 1, n_jobs = -1)
svc_grid.fit(X_train[:-100,:], y_train[:-100])

In [None]:
# Best estimator and best cross-validated score of the SVC grid search
svc_grid.best_estimator_, svc_grid.best_score_

In [None]:
# Table of every tried SVC configuration with its mean CV score
svc_grid_res = svc_grid.cv_results_
svc_grid_res_df = pd.DataFrame({
    "Params": [str(candidate) for candidate in svc_grid_res["params"]],
    "Scores": svc_grid_res["mean_test_score"].astype("float64"),
})

# Horizontal bars for the best-scoring configurations (at most 30)
top_svc_candidates = svc_grid_res_df.sort_values(by="Scores" , ascending = False).head(30)
fig, ax = plt.subplots(figsize = (15, 15))
sns.barplot(data = top_svc_candidates, x = "Scores", y = "Params")
plt.xticks(rotation = 90)
plt.show()

In [None]:
# All SVC configurations, best mean CV score first
svc_grid_res_df.sort_values(by="Scores" , ascending = False)

In [None]:
# Best configuration found by the SVC grid search, fitted on the training rows
# minus the last 100
estimator = svc_grid.best_estimator_
estimator.fit(X_train[:-100,:], y_train[:-100])

# Predict the 100 hold-out rows, which were excluded from the fit above
y_ho_pred = estimator.predict(X_ho)

print(classification_report(y_ho, y_ho_pred))
create_confusion_matrix_df(y_ho, y_ho_pred)

## Testing the accuracy of the best estimator in Test Dataset

In [None]:
# Refit on the complete training set (hold-out rows included) and evaluate on the test set.
# `estimator` was assigned without copying, so this fit modifies that object in place.
estimator.fit(X_train, y_train)
y_pred = estimator.predict(X_test)

print(classification_report(y_test, y_pred))
create_confusion_matrix_df(y_test, y_pred)

I have tuned both an SVC with a polynomial kernel (degree 2 and 3, with C values from 0.001 to 1000) and a Random Forest Classifier with different hyperparameters. The Random Forest performed better than the SVC. However, the SVC had a slightly better training accuracy. The model has a lot of scope for improvement, and ensemble models can also be tried to improve the accuracy scores.