# Table of Contents
* [Target](#1)
* [Features](#2)
* [Target vs Features](#3)
* [PCA Visualization](#4)
* [Model](#5)

In [None]:
# packages

# standard
import numpy as np
import pandas as pd
import time

# plots
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
from statsmodels.graphics.mosaicplot import mosaic

# missing values visualization
import missingno as msno

# PCA / Clustering
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# machine learning tools
import h2o
from h2o.estimators import H2OGeneralizedLinearEstimator, H2ORandomForestEstimator, H2OGradientBoostingEstimator

In [None]:
# load data
df = pd.read_csv('../input/water-potability/water_potability.csv')
# number of rows; used further down to express missing-value counts in percent
n = df.shape[0]
# preview of the first rows -- kept as the LAST expression of the cell,
# otherwise the notebook discards the result and nothing is rendered
df.head()

In [None]:
# structure of data: prints column names, dtypes and non-null counts,
# which is where missing values first become visible
df.info()

#### We observe missing values in some of the columns.

In [None]:
# show structure of missings: missingno's nullity matrix of the full data frame
msno.matrix(df)
plt.show()

In [None]:
# percentage of missing values (relative to all n rows) for the columns reported below
missing_report = (
    ('Missings ph             :', 'ph'),
    ('Missings Sulfate        :', 'Sulfate'),
    ('Missings Trihalomethanes:', 'Trihalomethanes'),
)
for label, column in missing_report:
    n_missing = df[column].isna().sum()
    print(label, np.round(100 * n_missing / n, 2), '%')

<a id='1'></a>
# Target

In [None]:
# class frequencies of the target: first as a table, then as a bar chart
target_counts = df['Potability'].value_counts()
print(target_counts)
target_counts.plot(kind='bar')
plt.grid()
plt.show()

#### Target = 1 means potable water!

<a id='2'></a>
# Features

In [None]:
# numerical feature columns used for the plots, the PCA and the model
features_num = [
    'ph',
    'Hardness',
    'Solids',
    'Chloramines',
    'Sulfate',
    'Conductivity',
    'Organic_carbon',
    'Trihalomethanes',
    'Turbidity',
]

In [None]:
# one figure per numerical feature: histogram on top, boxplot below (shared x axis)
for col in features_num:
    values = df[col]
    fig, (ax_hist, ax_box) = plt.subplots(2, 1, figsize=(10,6), sharex=True)

    # upper panel: histogram of the raw column
    ax_hist.hist(values, bins=30)
    ax_hist.grid()
    ax_hist.set_title(col)

    # lower panel: horizontal boxplot; the NaNs have to be removed for it first
    ax_box.boxplot(values.dropna(), vert=False)
    ax_box.grid()
    ax_box.set_title(col + ' - boxplot')

    plt.show()

### Correlations

In [None]:
# pairwise correlations of the numerical features, linear and rank based
numeric_part = df[features_num]
corr_pearson = numeric_part.corr(method='pearson')
corr_spearman = numeric_part.corr(method='spearman')

# identical rendering options for both heatmaps; color scale fixed to [-1, +1]
heatmap_kws = dict(annot=True, cmap='RdYlGn', vmin=-1, vmax=+1)

plt.figure(figsize=(16,6))

# left panel
ax1 = plt.subplot(1,2,1)
sns.heatmap(corr_pearson, **heatmap_kws)
plt.title('Pearson Correlation')

# right panel, x axis shared with the left one
ax2 = plt.subplot(1,2,2, sharex=ax1)
sns.heatmap(corr_spearman, **heatmap_kws)
plt.title('Spearman Correlation')

plt.show()

In [None]:
# scatter plot matrix of the numerical features; the wall-clock time is reported
# afterwards
pairplot_kws = dict(
    diag_kws={'alpha': 1.0},   # diagonal plots fully opaque
    plot_kws={'alpha': 0.1},   # off-diagonal scatter points strongly transparent
)
start = time.time()
sns.pairplot(df[features_num], **pairplot_kws)
plt.show()
stop = time.time()
print('Elapsed time:', np.round(stop - start, 2))

<a id='3'></a>
# Target vs Features

In [None]:
# plot target vs BINNED numerical features using mosaic plot
plt_para_save = plt.rcParams['figure.figsize'] # remember plot settings

# increase plot size for mosaics; set once because it is the same for every feature
plt.rcParams['figure.figsize'] = (16,6)
try:
    for f in features_num:
        # add binned version (10 quantile bins) of each numerical feature first;
        # the new column stays in df under the name '<feature>_bin'
        new_var = f + '_bin'
        df[new_var] = pd.qcut(df[f], 10)
        # then create mosaic plot of bin vs target
        mosaic(df, [new_var, 'Potability'], title='Target vs ' + f + ' [binned]')
        plt.show()
finally:
    # reset plot size again -- also when one of the plots above fails, so that
    # later cells are not left with the enlarged figure size
    plt.rcParams['figure.figsize'] = plt_para_save

### Alternative Visualization:

In [None]:
# violin plot of every numerical feature, split by the target class
for f in features_num:
    plt.figure(figsize=(6,4))
    sns.violinplot(y=f, x='Potability', data=df)
    my_title = f + ' - split by target'
    plt.title(my_title)
    plt.grid()
    # render explicitly, like every other plotting cell, so the figures also
    # appear when the code is not run with an inline notebook backend
    plt.show()

<a id='4'></a>
# PCA Visualization

In [None]:
# project the standardized numerical features onto their first three principal components

# PCA input must be complete: work on a copy without the rows that contain missings
df4pca = df.copy().dropna(axis=0)

# standardize the features, then fit the 3D PCA model and transform in one go
df4pca_std = StandardScaler().fit_transform(df4pca[features_num])
pc_model = PCA(n_components=3)
pc = pc_model.fit_transform(df4pca_std)

# attach the components as columns pc_1 .. pc_3 (one row of pc.T per component)
for pc_name, pc_values in zip(('pc_1', 'pc_2', 'pc_3'), pc.T):
    df4pca[pc_name] = pc_values

# show extended data frame
df4pca.head()

In [None]:
# interactive 3D scatter plot of the principal components, colored by target
df4pca['size'] = 1  # constant helper column: same marker size for every point
# categorical target for the color coding
df4pca['Potability'] = df4pca['Potability'].astype('category')

scatter_kws = dict(
    x='pc_1', y='pc_2', z='pc_3',
    color='Potability',
    size='size',
    size_max=10,
    opacity=0.5,
)
fig = px.scatter_3d(df4pca, **scatter_kws)
fig.update_layout(title='PCA 3D Interactive')
fig.show()

#### The plot suggests that we will have some trouble finding a good model that discriminates between potable and non-potable water.

<a id='5'></a>
# Model

In [None]:
# start H2O; the frames and models created in the following cells live in this instance
h2o.init(max_mem_size='12G', nthreads=4) # Use maximum of 12 GB RAM and 4 cores

In [None]:
# convert the pandas data frame into an H2OFrame and report how long the upload took
start = time.time()
df_hex = h2o.H2OFrame(df)
elapsed = time.time() - start
print('Elapsed time [s]: ', np.round(elapsed,2))

In [None]:
# target column; explicitly converted to a factor (categorical) so that the
# model is fitted as a classification problem
target = 'Potability'
df_hex[target] = df_hex[target].asfactor()

# the numerical features are used as predictors
predictors = features_num
print('Number of predictors: ', len(predictors))
print(predictors)

In [None]:
# random split into training and test frame; train_perc is the training share
train_perc = 0.5
split_frames = df_hex.split_frame(ratios=[train_perc], seed=999)
train_hex, test_hex = split_frames

# pandas counterparts of both H2O frames
df_train, df_test = (frame.as_data_frame() for frame in (train_hex, test_hex))

In [None]:
# (distributed) random forest: hyper-parameters collected in one place
drf_params = dict(
    ntrees=30,
    max_depth=20,
    min_rows=5,
    nfolds=5,     # 5-fold cross validation on the training frame
    seed=999,
)
fit_DRF = H2ORandomForestEstimator(**drf_params)

# fit on the training frame and report the elapsed wall-clock time
start = time.time()
fit_DRF.train(x=predictors, y=target, training_frame=train_hex)
elapsed = time.time() - start
print('Elapsed time [s]: ', np.round(elapsed,2))

In [None]:
# show training scoring history recorded while the random forest was fitted
fit_DRF.plot()

In [None]:
# variable importance of the predictors in the fitted random forest, as a plot
fit_DRF.varimp_plot()

In [None]:
# cross validation metrics (the model was trained with nfolds=5);
# last expression of the cell, so the notebook renders the summary
fit_DRF.cross_validation_metrics_summary()

### Performance on Training Data / CV

In [None]:
# training performance: metrics evaluated on the data the model was fitted on
# (expected to look optimistic compared to CV / test), followed by their plot
perf_train = fit_DRF.model_performance(train=True)
perf_train.plot()

In [None]:
# CV performance: metrics from the cross validation requested via nfolds
# at model definition, followed by their plot
perf_cv = fit_DRF.model_performance(xval=True)
perf_cv.plot()

In [None]:
# score the training frame, attach the actual target as column 'target'
# and pull the result into pandas
pred_train_hex = fit_DRF.predict(train_hex)
pred_train_hex['target'] = train_hex[target]
pred_train = pred_train_hex.as_data_frame()
pred_train.head()

In [None]:
# distribution of the predicted probabilities (column p1) on the training set
fig_train, ax_train = plt.subplots(figsize=(8,4))
ax_train.hist(pred_train['p1'], bins=30)
ax_train.set_title('Predictions on Train Set')
ax_train.grid()
plt.show()

In [None]:
# check calibration: compare the number of actual positives in the training set
# with the sum of the predicted probabilities p1 (the expected number of positives);
# a ratio close to 1 means the probabilities match the observed frequency on average.
# Vectorized Series.sum() is used instead of the builtin sum(), which would
# iterate over the pandas Series element by element.
n_actual = df_train['Potability'].sum()
n_pred = pred_train['p1'].sum()

print('Actual Frequency    :', n_actual)
print('Predicted Frequency :', n_pred)
print('Calibration Ratio   :', n_pred / n_actual)

In [None]:
# translate probabilities into 0/1 predictions;
# the threshold was chosen such that the actual frequency is (approximately) met
binary_threshold = 0.4212
above_threshold = pred_train['p1'] > binary_threshold
pred_train_binary = above_threshold.astype(int).to_numpy()
print('Actual Frequency      :', n_actual)
print('Calibrated Prediction :', pred_train_binary.sum())

In [None]:
# confusion matrix on the training set; rows ~ actual observations, cols ~ predictions
conf_train = pd.crosstab(pred_train['target'], pred_train_binary)

# visualize as annotated heatmap with integer cell labels
conf_heatmap_kws = dict(
    cmap='Blues',
    annot=True,
    cbar=False,
    fmt='d',
    linecolor='black',
    linewidths=0.1,
)
sns.heatmap(conf_train, **conf_heatmap_kws)
plt.show()

### Performance on Test Set

In [None]:
# score the test frame, attach the actual target as column 'target'
# and pull the result into pandas
pred_test_hex = fit_DRF.predict(test_hex)
pred_test_hex['target'] = test_hex[target]
pred_test = pred_test_hex.as_data_frame()
pred_test.head()

In [None]:
# distribution of the predicted probabilities (column p1) on the test set
fig_test, ax_test = plt.subplots(figsize=(8,4))
ax_test.hist(pred_test['p1'], bins=30)
ax_test.set_title('Predictions on Test Set')
ax_test.grid()
plt.show()

In [None]:
# convert test probabilities to 0/1 using the threshold chosen on the training set
test_above_threshold = pred_test['p1'] > binary_threshold
pred_test_binary = test_above_threshold.astype(int).to_numpy()

In [None]:
# confusion matrix on the test set; rows ~ actual observations, cols ~ predictions
conf_test = pd.crosstab(pred_test['target'], pred_test_binary)

# visualize as annotated heatmap with integer cell labels
conf_heatmap_kws = dict(
    cmap='Blues',
    annot=True,
    cbar=False,
    fmt='d',
    linecolor='black',
    linewidths=0.1,
)
sns.heatmap(conf_test, **conf_heatmap_kws)
plt.show()

In [None]:
# accuracy on the test set = share of observations on the diagonal of the confusion matrix.
# The table is first reindexed to the full 2x2 layout: if class 0 or 1 never occurs among
# the actual labels or among the predictions, crosstab omits that row/column and a direct
# .loc[0,0] / .loc[1,1] lookup would raise a KeyError; missing cells count as 0 instead.
conf_test_full = conf_test.reindex(index=[0, 1], columns=[0, 1], fill_value=0)
n_correct = conf_test_full.loc[0, 0] + conf_test_full.loc[1, 1]
print('Accuracy - Test Set:', np.round(n_correct / conf_test.sum().sum(), 4))