In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, QuantileTransformer
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from imblearn.over_sampling import SMOTE
from tpot import TPOTClassifier
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score, balanced_accuracy_score, classification_report, confusion_matrix, matthews_corrcoef
# Apply a global matplotlib style given as an rcParams dict (base font size of 16).
plt.style.use({'font.size': 16})

In [None]:
# Notebook-wide configuration.
# Relative path to the input CSV that is loaded into the `data` frame below.
data_file = '../input/water-potability/water_potability.csv'
# Shared seed intended for the stochastic steps of the notebook.
random_state = 42

### Import data
read data from a csv file into pandas dataframe

In [None]:
# Load the raw CSV into a DataFrame and preview its first rows.
data = pd.read_csv(data_file)
data.head()

the dataset has **3276** rows and **10** columns in total, with `Potability` being our target for classification

In [None]:
# Column dtypes, non-null counts and memory usage of the raw frame.
data.info()

### Observe data
before feeding data to machine learning (ML) algorithms, we need to deal with missing values. <br>
here the **Sulfate** column has the highest number of missing values **(781)**. If we **drop** rows with missing values, we will **lose at least 781** rows

In [None]:
# Number of missing values per column before imputation.
data.isna().sum()

select column names that contain missing values <br>
fill missing values with target group mean value which will result in zero missing values for all columns in dataset

In [None]:
# Names of the columns that contain at least one missing value, in column order.
has_missing = data.isna().any()
na_cols = data.columns[has_missing].tolist()

In [None]:
# Impute every incomplete column with the mean of that column within the
# row's own Potability class.
# The result is assigned back to the frame instead of calling
# `data[col].fillna(..., inplace=True)`: the in-place call operates on an
# intermediate Series (chained assignment), which emits a FutureWarning on
# pandas 2.x and leaves `data` unchanged once copy-on-write is enabled.
# NOTE(review): the imputation uses the target label and runs before the
# train/test split, so label information leaks into the features - consider
# fitting the imputation on the training split only.
for col in na_cols:
    class_means = data.groupby('Potability')[col].transform('mean')
    data[col] = data[col].fillna(class_means)

no more null or missing values !!

In [None]:
# Re-count missing values per column after the imputation step.
data.isna().sum()


look at summary statistics
we can see that the columns are not on the same scale (i.e. their maximum values differ by orders of magnitude), which can make the learning process take longer to converge (we will address this later with feature scaling)

In [None]:
# Summary statistics with one row per column (transposed for readability).
data.describe().T

separate features and target column

In [None]:
# Name of the label column; everything else is treated as a feature.
TARGET = 'Potability'
X = data.drop(columns=[TARGET])  # feature matrix
y = data[TARGET]                 # label vector

look at correlation (relationship) between each pair of column

The higher the magnitude of correlation coefficient, the stronger the linear relation is. Sign tells the direction of relationship <br>

<img src="https://www.investopedia.com/thmb/PXAx5y_OS5z7n-Rn9m--QOC29rw=/1500x1000/filters:no_upscale():max_bytes(150000):strip_icc()/TC_3126228-how-to-calculate-the-correlation-coefficient-5aabeb313de423003610ee40.png" />
<br>

- a positive correlation between columns A and B means that when A increases, B tends to increase (and vice versa)
- a negative correlation between columns A and B means that when A increases, B tends to decrease (and vice versa)
- a correlation of zero between columns A and B means that A and B have no linear relationship

the higher the magnitude of the correlation between columns A and B, the more confidently we can conclude that A and B have a linear relationship

we can have correlation value from -1 to 1

In [None]:
# One feature-correlation heatmap per target class, drawn side by side.
target_classes = list(y.unique())
fig, ax = plt.subplots(1, len(target_classes), figsize=(20,8))
sns.set(font_scale=1.1)
for panel, target_class in zip(ax, target_classes):
    # Correlations are computed only on the rows of this class, without the label column.
    class_features = data[data[TARGET] == target_class].drop(columns=[TARGET])
    _ = sns.heatmap(
        class_features.corr(),
        ax=panel,
        annot=True,
        fmt='.2f',
        vmin=-1,
        cmap='RdBu',
        linewidths=1,
    )
    panel.set_title(f'Correlation plot for {TARGET.lower()}: {target_class}', fontdict={'fontsize': 18})

for better understanding, we can visualize the relationship between each pair of feature columns <br>

we can conclude that pairs of feature columns have weak relationships. Thus, we can use all of the feature columns for our classification task, as it is unlikely to be affected by correlated features

In [None]:
# Pairwise scatter plots with a regression fit, coloured by the target class.
# The result is bound to `_` so the returned grid object is not echoed as cell output.
_ = sns.pairplot(data, kind='reg', hue=TARGET)

look for target class imbalance <br>

we can conclude that we are unlikely to be strongly affected by class imbalance because <br> 
we have samples with `Potability = 0` around `61 %` and samples with `Potability = 1` around `39 %` of all data

In [None]:
# Share of samples that falls into each target class.
ratio = y.value_counts(normalize=True).reset_index()
ratio.columns = [TARGET, 'Ratio']
ratio

plot boxplot to see if any outliers exist for each feature column separated by `Potability` class

In [None]:
# One boxplot per feature, split by target class, laid out on a 3x3 grid.
sns.set(font_scale=1.1)
fig, axs = plt.subplots(3,3, figsize=(18,16))
axs = axs.flatten()
for idx, panel in enumerate(axs):
    feature_name = X.columns[idx]
    sns.boxplot(x=data[TARGET], y=data[feature_name], ax=panel)

### Data splitting
split data into `2 sets` for `training` model and `testing` the trained model

In [None]:
# Hold out a test set (default test size of train_test_split).
# `stratify=y` keeps the class ratio of the full data in both splits, so the
# held-out metrics are not distorted by a split that happens to be skewed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=random_state
)

### Over sampling
do over sampling to reduce impact of class imbalance on classification by synthesizing new examples from the minority class.

In [None]:
# Oversample the minority class on the TRAINING split only.
# SMOTE synthesises samples randomly, so it is seeded with the notebook-wide
# `random_state` to make "Restart & Run All" reproduce the same training set.
# NOTE(review): resampling happens before cross-validation, so synthetic
# neighbours of a sample can land in its validation fold; an imblearn Pipeline
# inside the CV loop would avoid that.
samp = SMOTE(random_state=random_state)
X_train, y_train = samp.fit_resample(X_train, y_train)

### Data scaling
scale the features to make the learning converge faster and give equal importance to all features in this classification task

In [None]:
# Standardise features; the statistics are learned from the training split
# only and then re-used for the test split.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

### Transformation
transform features using quantile transformation to get features having nearly normal distribution and reduce impact of outliers on classification

In [None]:
# Map each feature onto a normal distribution via its empirical quantiles.
# As with the scaler, the mapping is fitted on the training split only.
trans = QuantileTransformer(output_distribution='normal').fit(X_train)
X_train, X_test = trans.transform(X_train), trans.transform(X_test)

### Algorithm selection
find learning algorithm giving highest performance on unseen data <br><br>

In [None]:
def doTuneCV(model, X, y, space, cv=5, scoring='accuracy'):
    """Grid-search `model` over `space` and print a summary of the best result.

    Parameters
    ----------
    model : estimator
        Unfitted estimator handed to GridSearchCV.
    X, y : array-like
        Training features and labels used for the cross-validated search.
    space : dict
        Parameter grid (parameter name -> iterable of candidate values).
    cv : int or cross-validation splitter, default 5
        Cross-validation strategy forwarded to GridSearchCV.
    scoring : str, default 'accuracy'
        Metric used to rank the candidate parameter sets.

    Returns
    -------
    GridSearchCV
        The fitted search object (exposes `best_params_`, `best_score_`, ...).
    """
    search = GridSearchCV(model, space, cv=cv, scoring=scoring)
    # fit() returns the search object itself, so no separate result variable is needed.
    search.fit(X, y)
    print(f"Best params: {search.best_params_}")
    print(f"Best validation {scoring} : {search.best_score_}")
    return search

keep result from each time we do cross validation in `scores` dictionary so that later we can compare results.

In [None]:
# Shared evaluation protocol for the algorithm comparison below.
scoring = 'balanced_accuracy'
cv = StratifiedKFold(n_splits=5)
# Maps a learning-algorithm name to its cross-validation result.
scores = dict()

#### Logistic regression

In [None]:
# Cross-validate a default logistic regression on the training split.
scores['logistic regression'] = cross_validate(
    estimator=LogisticRegression(),
    X=X_train,
    y=y_train,
    cv=cv,
    scoring=scoring,
)

#### Decision tree

In [None]:
# Seed the tree so the cross-validation score is reproducible across runs.
scores['decision tree'] = cross_validate(
    DecisionTreeClassifier(random_state=random_state),
    X_train, y_train, cv=cv, scoring=scoring,
)

#### Random forest

In [None]:
# Seed the forest so the cross-validation score is reproducible across runs.
scores['random forest'] = cross_validate(
    RandomForestClassifier(random_state=random_state),
    X_train, y_train, cv=cv, scoring=scoring,
)

#### Support vector machine

In [None]:
# Use the shared `cv` splitter (not a hard-coded fold count) so every
# algorithm is evaluated under the same protocol, defined in one place.
scores['support vector machine'] = cross_validate(SVC(), X_train, y_train, cv=cv, scoring=scoring)

#### K-nearest neighbor

In [None]:
# Use the shared `cv` splitter (not a hard-coded fold count) so every
# algorithm is evaluated under the same protocol, defined in one place.
scores['k nearest neighbor'] = cross_validate(KNeighborsClassifier(), X_train, y_train, cv=cv, scoring=scoring)

#### Ada boost

In [None]:
# Seeded estimator + shared `cv` splitter: reproducible score under the same
# evaluation protocol as the other algorithms.
scores['adaboost'] = cross_validate(
    AdaBoostClassifier(random_state=random_state),
    X_train, y_train, cv=cv, scoring=scoring,
)

#### Bagging

In [None]:
# Seeded estimator + shared `cv` splitter: reproducible score under the same
# evaluation protocol as the other algorithms.
scores['bagging'] = cross_validate(
    BaggingClassifier(random_state=random_state),
    X_train, y_train, cv=cv, scoring=scoring,
)

#### Gradient boosting

In [None]:
# Seeded estimator + shared `cv` splitter: reproducible score under the same
# evaluation protocol as the other algorithms.
scores['gradient boosting'] = cross_validate(
    GradientBoostingClassifier(random_state=random_state),
    X_train, y_train, cv=cv, scoring=scoring,
)

#### Extra tree

In [None]:
# Seeded estimator + shared `cv` splitter: reproducible score under the same
# evaluation protocol as the other algorithms.
scores['extra tree'] = cross_validate(
    ExtraTreeClassifier(random_state=random_state),
    X_train, y_train, cv=cv, scoring=scoring,
)

### Comparison
create a score map so that we can better compare the performance of different classifiers

In [None]:
# Collapse each cross-validation result to its mean validation score,
# keeping the algorithm names and scores aligned by position.
score_map = {
    'algorithm': list(scores.keys()),
    'test_score': [cv_result['test_score'].mean() for cv_result in scores.values()],
}

the best performing classifier can be seen in the top row of the table below

In [None]:
# Leaderboard: best mean validation score first, with a fresh 0..n-1 index.
score_df = (
    pd.DataFrame(score_map)
    .sort_values('test_score', ascending=False)
    .reset_index(drop=True)
)
score_df

In [None]:
def print_scores(model, X_test, y_test):
    """Print imbalance-robust evaluation metrics of a fitted binary classifier.

    Reports balanced accuracy, ROC AUC and the Matthews correlation
    coefficient (MCC) of `model` on the given data.

    Parameters
    ----------
    model : fitted classifier
        Must implement `predict`; `predict_proba` or `decision_function` is
        used for the ROC AUC when available.
    X_test : array-like
        Features to evaluate on.
    y_test : array-like
        True binary labels for `X_test`.
    """
    print(model)
    # Predict once and reuse the labels for every label-based metric.
    y_pred = model.predict(X_test)

    # ROC AUC is a ranking metric: fed with hard 0/1 predictions it collapses
    # to the balanced accuracy, so prefer continuous scores when the model can
    # provide them and only fall back to the labels as a last resort.
    y_score = y_pred
    for method_name in ('predict_proba', 'decision_function'):
        score_method = getattr(model, method_name, None)
        if score_method is None:
            continue
        try:
            raw_scores = np.asarray(score_method(X_test))
        except (AttributeError, NotImplementedError, RuntimeError):
            # Wrapper exposes the method but the underlying model cannot serve it.
            continue
        # predict_proba yields one column per class; column 1 is the positive class.
        y_score = raw_scores[:, 1] if raw_scores.ndim == 2 else raw_scores
        break

    print(f'test accuracy: {balanced_accuracy_score(y_test, y_pred)}')
    print(f'test roc auc: {roc_auc_score(y_test, y_score)}')
    print(f'test mcc: {matthews_corrcoef(y_test, y_pred)}')

### Parameter tuning
select best classifiers for parameter tuning

In [None]:
# Candidate ensemble sizes for the random forest: 100, 200, ..., 700 trees.
space_rf = {
    'n_estimators': range(100, 800, 100)
}
# Seed the forest so differences between grid points come from the parameter,
# not from run-to-run randomness, and the search is reproducible.
rf_classifier = doTuneCV(
    RandomForestClassifier(random_state=random_state),
    X_train, y_train, space_rf, cv, scoring,
)

In [None]:
# Candidate numbers of boosting stages: 100, 200, ..., 700.
space_gb = {
    'n_estimators': range(100, 800, 100)
}
# Seed the booster so differences between grid points come from the parameter,
# not from run-to-run randomness, and the search is reproducible.
gb_classifier = doTuneCV(
    GradientBoostingClassifier(random_state=random_state),
    X_train, y_train, space_gb, cv, scoring,
)

In [None]:
# GridSearchCV (refit=True by default) has already retrained the best
# configuration on the full training split, so reuse that model instead of
# mutating the search's template `estimator` and fitting it a second time.
rf_model = rf_classifier.best_estimator_
print_scores(rf_model, X_test, y_test)

In [None]:
# GridSearchCV (refit=True by default) has already retrained the best
# configuration on the full training split, so reuse that model instead of
# mutating the search's template `estimator` and fitting it a second time.
gb_model = gb_classifier.best_estimator_
print_scores(gb_model, X_test, y_test)

### Choose final classifier
here we have used `Matthews correlation coefficient (MCC)` to figure out which classifier is performing best

In [None]:
# Keep whichever tuned model reaches the higher MCC on the test split
# (the random forest wins ties).
# NOTE(review): choosing the model on the test split makes the test metrics
# reported afterwards optimistic; the cross-validated scores could be used instead.
gb_test_mcc = matthews_corrcoef(y_test, gb_model.predict(X_test))
rf_test_mcc = matthews_corrcoef(y_test, rf_model.predict(X_test))
if gb_test_mcc > rf_test_mcc:
    best_classifier = gb_model
else:
    best_classifier = rf_model

In [None]:
def plot_conf_mat(classifier, X, y):
    """Plot the confusion matrix of `classifier` on the given data.

    Shows the number of correct and incorrect predictions per class.

    Parameters
    ----------
    classifier : fitted classifier
        Model whose `predict` output is compared against `y`.
    X : array-like
        Features to predict on.
    y : array-like
        True labels for `X`.
    """
    sns.set(font_scale=1.25)
    fig, ax = plt.subplots(figsize=(6,4))

    # Entries are integer counts, hence the integer annotation format.
    _ = sns.heatmap(confusion_matrix(y, classifier.predict(X)), ax=ax, annot=True, fmt='d', cmap='Blues')
    # sklearn's confusion_matrix puts the true labels on the rows (heatmap
    # y-axis) and the predicted labels on the columns (heatmap x-axis).
    ax.set_xlabel('predicted')
    ax.set_ylabel('actual')
    _ = plt.yticks(rotation=0)

In [None]:
# Confusion matrix plus per-class precision/recall/F1 of the selected model
# on the held-out test split.
plot_conf_mat(best_classifier, X_test, y_test)
best_test_report = classification_report(y_test, best_classifier.predict(X_test))
print(best_test_report)

### Auto ML
use auto ML to automate model training and selection

In [None]:
# AutoML search with TPOT, run under the same CV splitter and metric as the
# manual comparison and capped at 15 minutes of wall-clock time.
pipeline_optimizer = TPOTClassifier(
    scoring=scoring,
    cv=cv,
    max_time_mins=15,
    n_jobs=-1,
    verbosity=2,
)
pipeline_optimizer.fit(X_train, y_train)

evaluate classifier from auto ML

In [None]:
# Report the same test metrics for the AutoML pipeline as for the hand-tuned models.
print_scores(pipeline_optimizer, X_test, y_test)

In [None]:
# Confusion matrix plus per-class precision/recall/F1 of the AutoML pipeline
# on the held-out test split.
plot_conf_mat(pipeline_optimizer, X_test, y_test)
automl_test_report = classification_report(y_test, pipeline_optimizer.predict(X_test))
print(automl_test_report)