In [None]:
##Data Analysis Packages##
import pandas as pd
import numpy as np
import sklearn as sk
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score
from sklearn.feature_selection import RFE, RFECV
from scipy import stats
import statsmodels.api as sm
import statsmodels.stats.api as sms


##Data Visualization Packages ##
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

In [None]:
##Importing Water Potability Dataset##
water_potability_data = pd.read_csv(r'../input/water-potability/water_potability.csv')
water_potability_data.info() #High Level Overview of Water Potability Dataset before Splitting and Performining Exploratory Data Analysis.

In [None]:
##Separating predictor (X) variables from response variable (y)##
y = water_potability_data.loc[:,'Potability'] #Response Variable (y): Potability (Water that is safe to drink)
X = water_potability_data.drop('Potability', axis=1) #Removing response variable in order to segregate from predictor set.

##Splitting data into train (80%) and test (20%) portions##
X_train,X_test,y_train, y_test = train_test_split(X,y, train_size=0.80, test_size=0.20, random_state=2021)

--Step 1. Performing Data Pre-Processing--

In [None]:
##Overview of Training Dataset Columns before Pre-Processing ##
X_train.info() #Columns with highest number of non-null observations (2620): Hardness, Solids, Chloramines, Conductivity, Organic_carbon, Turbidity.
X_train.head() 

In [None]:
##Descriptive statistics for predictor variables before removing null values##
X_train.describe().round(1)

In [None]:
##Descriptive statistics for response variable before removing null values##
y_train_cat_1 = y_train.astype('category')
y_train_cat_1.describe() #Highest frequency is 0 (non-potable) ~ 1606/2620 = 61% of observations correspond to non-potable water. Therefore, 39% of observations corresponse to potable water.

In [None]:
##Re-joining training dataset before removing observations with null values to ensure that responses are removed simultaneously when removing observations with nulls##
combined_train_data = X_train.join(y_train)

combined_train_data.dropna(inplace=True) #Dropping all null values.
combined_train_data.info() #1613 observations remaining.

In [None]:
##Separating cleaned training dataset into predictors and a response once again##
X_train_cleaned = combined_train_data.drop('Potability', axis=1)

y_train_cleaned = combined_train_data.loc[:, 'Potability']

In [None]:
##Displaying descriptive statistics of 'X_train_cleaned' dataframe##
X_train_cleaned.describe().round(1) #Rounding to 1 decimal place.

In [None]:
##Displaying descriptive statistics of 'y_train_cleaned' dataframe##
y_train_cat_2 = y_train_cleaned.astype('category')
y_train_cat_2.describe() #Highest frequency is 0 (non-potable) ~ 964/1613 = 60% of observations correspond to non-potable water. Therefore, 40% of observations corresponse to potable water.

In [None]:
##Creating correlation matrix for all variables from 'combined_train_data' dataframe for context##
combined_train_data_corr = combined_train_data.corr()
matrix = np.tril(combined_train_data_corr)
f, ax = plt.subplots(figsize=(15,12))
sns.heatmap(combined_train_data_corr, vmax=0.8, annot=True, mask=matrix) #No visibly strong correlations seen for predictors amongst themselves or with response variablee (all weak, pearson/point biserial correlation coefficients 0.20 or less)

In [None]:
##Histograms of all predictor variables in 'X_train_cleaned' dataframe##
X_train_cleaned.hist(bins=10, figsize=(20,15), layout=(3,3), color='green'); #All histograms appear relatively symmetrical (no drastic relative skew upon first glance)

Step 3. Performing Data Modelling

The following models will be attempted: Logistic Regression & Gradient Boost Classification.

Step 3a. Logistic Regression Modelling 

In [None]:
##Importing Logistic Regression Package##
from sklearn.linear_model import LogisticRegression

In [None]:
##Standardizing input variables for logistic regression model due to magnitude of differences between predictor variables##
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler() #Re-scaling so that mean =0 and standard deviation = 1.

X_train_clean_scale = scaler.fit_transform(X_train_cleaned) #Standardizing training data.

##Converting scaled data to dataframe and re-adding column names
X_train_clean_scale_df = pd.DataFrame(X_train_clean_scale, columns = X_train_cleaned.columns)

In [None]:
##Applying Recursive Elimination with Cross-Validation (RFECV) for Logistic Regression in support of Feature Selection##
logit_estimator = LogisticRegression(solver='liblinear', random_state=2021)

##Identification of optimal number of features to select with RFECV. Applying 10 folds.
logit_opt_feat_num_rfecv = RFECV(estimator = logit_estimator, step=1, cv=StratifiedKFold(10), scoring='balanced_accuracy', min_features_to_select=1)

logit_opt_feat_num_rfecv.fit(X_train_clean_scale_df, np.ravel(y_train_cleaned))

##Extracting optimal number of features
print(logit_opt_feat_num_rfecv.n_features_) #3 out of the 9 features were selected as important.

##Plotting chart in support of optimum number of features
plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross Validation Score")
plt.plot(range(1,
               len(logit_opt_feat_num_rfecv.grid_scores_) + 1),
         logit_opt_feat_num_rfecv.grid_scores_)
plt.show() 

In [None]:
##Training the logit classifier with optimal number of features already identified.
logit_classifier = RFE(estimator=logit_estimator, n_features_to_select=3, step=1)
logit_classifier.fit(X_train_clean_scale_df, np.ravel(y_train_cleaned))

##Extracting the 6 factors with highest importance for the logistic regression model.##
logit_feat = pd.DataFrame()
logit_feat['feature_name'] = X_train_clean_scale_df.columns
logit_feat['importance'] = logit_classifier.support_
print(logit_classifier.ranking_)
logit_feat

In [None]:
##Creating filtered training and testing datasets having the highest importance to the logistic regression model##
X_train_cleanreduce_scale = X_train_clean_scale_df.filter(['ph', 'Solids', 'Turbidity'])

X_test_reduce = X_test.filter(['ph', 'Solids', 'Turbidity'])

In [None]:
##Checking for Multicollinearity before building model##
from statsmodels.stats.outliers_influence import variance_inflation_factor

X_train_cleanreduce_vif = sm.add_constant(X_train_cleanreduce_scale) #For evaluating VIF only.

vif = [variance_inflation_factor(X_train_cleanreduce_vif.values,i) for i in range(X_train_cleanreduce_vif.shape[1])]

pd.DataFrame({'vif': vif[1:]}, index=X_train_cleanreduce_scale.columns).T #Multicollinearity interpretted as high when VIF > 5. All appear acceptable (<5).

In [None]:
##Evaluating sample size before building model##

##Rule of Thumb: (10*k)/p, where k = number of predictor variables. P = fraction of positive/'1' observations. 

##Minimum Sample size should be: (10*6)/0.40 = 150 observations; Fulfilled.

In [None]:
##Building Logistic Regression Model with statsmodel for visibility to coefficients##
logit_sm = sm.Logit(np.ravel(y_train_cleaned), X_train_cleanreduce_scale)
result_logit_sm = logit_sm.fit()
result_logit_sm.summary() #No p-values appear to be identified as statistically signficant (p-value all >0.05)

In [None]:
##Building Logistic Regression Model for Sklearn Prediction##
logit_skmodel = LogisticRegression(solver='liblinear',random_state=2021).fit(X_train_cleanreduce_scale, np.ravel(y_train_cleaned))

In [None]:
##Cross-Validation Accuracy Score##
logit_skmodel_cvs = cross_val_score(logit_skmodel, X_train_cleanreduce_scale, np.ravel(y_train_cleaned), cv=StratifiedKFold(10))
logit_skmodel_cvs.mean() #0.5976458860516832

In [None]:
##When originally attempting to run model prediction with scaled test dataset, a notification indicated that there were null values present. Therefore, null values were filled with a constant placeholder and data was re-scaled.##
X_test_reduce = X_test_reduce.fillna(0)

X_test_scaled = scaler.fit_transform(X_test_reduce) #Standardizing test data separate from training data to avoid information leakage.

##Converting scaled data to dataframe and re-adding column names
X_test_scalereduce_df = pd.DataFrame(X_test_scaled, columns = X_test_reduce.columns)

In [None]:
##Response Prediction##
logit_ypred = logit_skmodel.predict(X_test_scaled)

In [None]:
##Determining accuracy scores##
accuracy_score(y_test,logit_ypred) #0.6051829268292683

In [None]:
##Creating classification report for logistic regression model.##
print(classification_report(y_test, logit_ypred))

In [None]:
##Creating confusion matrix for logistic regression model. True negatives (TN) are in the upper-left position, False Negatives (FN) are in the lower-left position, False Positives (FP) are in the upper-right position, True Positives (TP) are in the lower-right position.##
confusion_matrix(y_test, logit_ypred)

In [None]:
##Determining AUC score for logistic regression model.##
roc_auc_score(y_test, logit_ypred) #0.5100881261595548

In [None]:
##Determining F1 score for the logistic regresion model##
f1_score(y_test, logit_ypred,average='binary') #0.04428044280442805; a poor F1 score, is close to 0.0. Best F1 score is close to 1.

**The final metrics obtained for this Logistic Regression Model were:

1. Final Test Score: 0.60; 
2. Sensitivity: TP/(TP+FN) = (6/(6+258)) = 0.02;
3. Specificity: TN/(TN+FP) = (391/(391+1)) = 1.00;
4. AUC Score: 0.5;
5. F1 Score: 0;**

Step 3b:Gradient Boost Classification Modelling

In [None]:
##Importing Gradient Boosting Classifier##
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
##Building GB Classification Model for Sklearn Prediction##
gb_classifier = GradientBoostingClassifier(n_estimators=100, random_state=2021)

##Identification of optimal number of features to select with RFECV approach. Selecting 10 folds 
opt_gb_rfecv = RFECV(estimator = gb_classifier, step=1, cv=StratifiedKFold(10), scoring='balanced_accuracy', min_features_to_select=1)
opt_gb_rfecv.fit(X_train_cleaned, np.ravel(y_train_cleaned))
print(opt_gb_rfecv.n_features_) #5 out of 8 selected as important.

##Plotting Cross Validation performance
plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score")
plt.plot(range(1,
               len(opt_gb_rfecv.grid_scores_) + 1),
         opt_gb_rfecv.grid_scores_)
plt.show() 

In [None]:
##Training the gradient boost classifier with optimal number of features already identified.
gb_rfe_classifier = RFE(estimator=gb_classifier, n_features_to_select=5, step=1)
gb_rfe_classifier.fit(X_train_cleaned, np.ravel(y_train_cleaned))

In [None]:
##Determining features of highest importance for the gradient boost model.##
gb_feat = pd.DataFrame()
gb_feat['feature_name'] = X_train_cleaned.columns
gb_feat['importance'] = gb_rfe_classifier.support_
print(gb_rfe_classifier.ranking_)
gb_feat

In [None]:
##Only columns found to have  importance to the gradient boost model via RFECV.##
X_train_reduced_gb = X_train_cleaned.filter(['ph', 'Hardness', 'Solids', 'Chloramines', 'Sulfate'])
X_test_reduced_gb = X_test.filter(['ph', 'Hardness', 'Solids', 'Chloramines', 'Sulfate'])

In [None]:
##Building Gradient Boost Classification Model with Selected Variables##
model_varimp = GradientBoostingClassifier(n_estimators=100, random_state=2021).fit(X_train_reduced_gb, np.ravel(y_train_cleaned))

In [None]:
##Cross-Validation Accuracy Score##
gb_model_cvs = cross_val_score(model_varimp, X_train_reduced_gb, np.ravel(y_train_cleaned), cv=StratifiedKFold(10))
gb_model_cvs.mean() #0.6602676175139944

In [None]:
##Filling null values with 0 for purpose of prediction##
X_test_reduced_gb = X_test_reduced_gb.fillna(0) 

##Response Prediction
y_pred_gb = model_varimp.predict(X_test_reduced_gb)

In [None]:
##Determining test accuracy score##
accuracy_score(y_test, y_pred_gb) #0.6036585365853658

In [None]:
##Creating classification report for GB Classification Model##
print(classification_report(y_test, y_pred_gb))

In [None]:
##Creating confusion matrix for GB Classification Model. True negatives (TN) are in the upper-left position, False Negatives (FN) are in the lower-left position, False Positives (FP) are in the upper-right position, True Positives (TP) are in the lower-right position.##
confusion_matrix(y_test, y_pred_gb)

In [None]:
##Determining AUC score for the GB Classification Model.##
roc_auc_score(y_test, y_pred_gb) #0.5725108225108225

In [None]:
##Determining F1 score for the GB Classification Model.##
f1_score(y_test, y_pred_gb,average='binary') #0.45606694560669453; a poor F1 score, is close to 0.0. Best F1 score is close to 1.

In [None]:
##Generating values for feature importance plot.##
reduced_list_gb = list(['ph', 'Hardness', 'Solids', 'Chloramines', 'Sulfate'])

##Numerical Importance of Predictors
importance_gb = list(model_varimp.feature_importances_)

##Merged and Sorted with Predictors of importance
var_importance_merge_gb = [(predictor,round(importance,2)) for predictor, importance in zip(reduced_list_gb,importance_gb)]

var_importance_merge_gb = sorted(var_importance_merge_gb, key = lambda x: x[1], reverse = True)

print(var_importance_merge_gb)

In [None]:
##Plotting feature importance.
df_importance_gb = pd.DataFrame(var_importance_merge_gb, columns = ['PREDICTOR','IMPORTANCE_LEVEL'])

#Predictor Rank Plot
sns.catplot(x="IMPORTANCE_LEVEL", y='PREDICTOR', data = df_importance_gb, kind = "bar", height =14)

In [None]:
##Hypertuning with GridSearchCV Part 1 - max_depth and min_samples_split##
param_grid1 = {
    'max_depth':[4,8,12],
    'min_samples_split':[1,40,2],
}

gb_gscv1 = GridSearchCV(estimator = gb_classifier, param_grid = param_grid1, cv=StratifiedKFold(10), n_jobs=-1, verbose = 2)

In [None]:
##Fitting GSCV with training data##
gb_gscv1.fit(X_train_reduced_gb, np.ravel(y_train_cleaned))

In [None]:
##Extracting best params from GSCV 1##
gb_gscv1.best_params_ #{'max_depth': 4, 'min_samples_split': 2}

In [None]:
##Hypertuning with GridSearchCV Part 2 - min_samples_leaf##
param_grid2 = {
    'min_samples_leaf':[1,20,2],
    'max_depth':[4],
    'min_samples_split': [2]
}

gb_gscv2 = GridSearchCV(estimator = gb_classifier, param_grid = param_grid2, cv=StratifiedKFold(10), n_jobs=-1, verbose = 2)

In [None]:
##Fitting GSCV with training data##
gb_gscv2.fit(X_train_reduced_gb, np.ravel(y_train_cleaned))

In [None]:
##Extracting best params from GSCV 2##
gb_gscv2.best_params_ #{'max_depth': 4, 'min_samples_leaf': 20, 'min_samples_split': 2}

In [None]:
##Hypertuning with GridSearchCV Part 3 - max_features##
param_grid3 = {
    'min_samples_leaf':[20],
    'max_depth':[4],
    'min_samples_split': [2],
    'max_features':['sqrt','log2']
}

gb_gscv3 = GridSearchCV(estimator = gb_classifier, param_grid = param_grid3, cv=StratifiedKFold(10), n_jobs=-1, verbose = 2)

In [None]:
##Fitting GSCV with training data##
gb_gscv3.fit(X_train_reduced_gb, np.ravel(y_train_cleaned))

In [None]:
##Extracting best params from GSCV 3##
gb_gscv3.best_params_ #{'max_depth': 4, 'min_samples_leaf': 20, 'min_samples_split': 2, 'max_features': 'sqrt'}

In [None]:
##Hypertuning with GridSearchCV Part 4 - learning rate and n_estimators##
param_grid4 = {
    'min_samples_leaf':[20],
    'max_depth':[4],
    'min_samples_split': [2],
    'max_features':['sqrt'],
    'learning_rate':[0.01,0.1,0.02],
    'n_estimators':[80,120,5]
}

gb_gscv4 = GridSearchCV(estimator = gb_classifier, param_grid = param_grid4, cv=StratifiedKFold(10), n_jobs=-1, verbose = 2)

In [None]:
##Fitting GSCV with training data##
gb_gscv4.fit(X_train_reduced_gb, np.ravel(y_train_cleaned))

In [None]:
##Extracting best params from GSCV 3##
gb_gscv4.best_params_ #{'learning_rate': 0.1,'max_depth': 4, 'max_features': 'sqrt','min_samples_leaf': 20,'min_samples_split': 2,'n_estimators': 120}

In [None]:
##Re-fitting a second gradient boosting classification model with hypertuned parameters##
model_gb_final= GradientBoostingClassifier(learning_rate=0.1, n_estimators=120, min_samples_split=2, min_samples_leaf=20, max_depth=4, random_state=2021).fit(X_train_reduced_gb, np.ravel(y_train_cleaned))

In [None]:
##Cross-Validation Accuracy Score##
model_gb_final_cvs = cross_val_score(model_gb_final, X_train_reduced_gb, np.ravel(y_train_cleaned), cv=StratifiedKFold(10))
model_gb_final_cvs.mean() #0.6807415075531017

In [None]:
##Response Prediction##
y_pred_gb_final = model_gb_final.predict(X_test_reduced_gb)

In [None]:
##Determining Test Score##
accuracy_score(y_test, y_pred_gb_final)#0.6067073170731707

In [None]:
##Creating classification report for gradient boosting classification.##
print(classification_report(y_test, y_pred_gb_final))

In [None]:
##Creating confusion matrix for gradient boosting model. True negatives (TN) are in the upper-left position, False Negatives (FN) are in the lower-left position, False Positives (FP) are in the upper-right position, True Positives (TP) are in the lower-right position.##
confusion_matrix(y_test, y_pred_gb_final)

In [None]:
##Determining AUC score for gradient boosting classification model.##
roc_auc_score(y_test, y_pred_gb_final) #0.5762987012987013

In [None]:
##Determining F1 score for the for classification model.##
f1_score(y_test, y_pred_gb_final,average='binary') #0.4625; a poor F1 score, is close to 0.0. Best F1 score is close to 1.

In [None]:
##Generating values for feature importance plot.
reduced_list_gb_final = list(['ph', 'Hardness', 'Solids', 'Chloramines', 'Sulfate'])

##Numerical Importance of Predictors
importance_gb_final = list(model_gb_final.feature_importances_)

##Merged and Sorted with Predictors of importance
var_importance_merge_gb_final = [(predictor,round(importance,2)) for predictor, importance in zip(reduced_list_gb_final,importance_gb_final)]

var_importance_merge_gb_final = sorted(var_importance_merge_gb_final, key = lambda x: x[1], reverse = True)

print(var_importance_merge_gb_final)


In [None]:
##Plotting feature importance.##
df_importance_gb_final= pd.DataFrame(var_importance_merge_gb_final, columns = ['PREDICTOR','IMPORTANCE_LEVEL'])

##Predictor Rank Plot
sns.catplot(x="IMPORTANCE_LEVEL", y='PREDICTOR', data = df_importance_gb_final, kind = "bar", height =14)

**The final metrics obtained for this Gradient Boosting Classification Model were:                                        1. Final Test Score: 0.68;                                      2. Sensitivity: TP/(TP+FN) = (111/(111+153)) = 0.42;              3. Specificity: TN/(TN+FP) = (287/(287+153)) = 0.65; 
4. AUC Score: 0.6; 
5. F1 Score: 0.5**
