# Binary Classification of Pulsars

## Table of Contents
* [Target](#1)
* [Numerical Features](#2)
* [Target vs Features](#3)
* [Visualization using PCA](#4)
* [Fit Model](#5)
* [Evaluate Model Performance](#6)
* [Explanations](#7)


In [None]:
# packages

# standard
import numpy as np
import pandas as pd
import time

# plots
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns

# PCA
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Machine Learning/H2O
import h2o
from h2o.estimators import H2ORandomForestEstimator

In [None]:
# read the pulsar data set from the Kaggle input directory
df = pd.read_csv(
    filepath_or_buffer='../input/pulsar-classification-for-class-prediction/Pulsar.csv'
)

# store the target column as a pandas categorical
df['Class'] = df['Class'].astype('category')

In [None]:
# quick look at the first five rows
df.head(n=5)

In [None]:
# structure of data frame: print the summary produced by DataFrame.info()
# (used here to check column types and look for missing values)
df.info()

#### The data frame contains no missing values.

<a id='1'></a>
# Target

In [None]:
# absolute and relative frequencies of the target classes
class_counts = df['Class'].value_counts()
print(class_counts)
print()
print(df['Class'].value_counts(normalize=True))

# bar chart of the absolute class frequencies
plt.figure(figsize=(8, 6))
class_counts.plot(kind='bar')
plt.title('Target (Class)')
plt.grid()
plt.show()

<a id='2'></a>
# Numerical Features

In [None]:
# columns treated as numerical features throughout the notebook
features_num = [
    'Mean_Integrated',
    'SD',
    'EK',
    'Skewness',
    'Mean_DMSNR_Curve',
    'SD_DMSNR_Curve',
    'EK_DMSNR_Curve',
    'Skewness_DMSNR_Curve',
]

# descriptive statistics for the selected feature columns
df[features_num].describe()

In [None]:
# one figure per feature: histogram on top, horizontal boxplot below,
# both sharing the same x-axis
for feature in features_num:
    values = df[feature]
    fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(11, 7), sharex=True)
    hist_ax, box_ax = axes

    # upper panel: histogram with 20 bins
    hist_ax.hist(values, bins=20)
    hist_ax.set_title(feature)
    hist_ax.grid()

    # lower panel: boxplot of the same values
    box_ax.boxplot(values, vert=False)
    box_ax.set_title(feature + ' - boxplot')
    box_ax.grid()

    plt.show()

In [None]:
# pairwise scatter plots of all numerical features, drawn as regression plots
regression_style = {
    'line_kws': {'color': 'magenta'},   # colour of the fitted line
    'scatter_kws': {'alpha': 0.25},     # semi-transparent points
}
sns.pairplot(df[features_num], kind='reg', plot_kws=regression_style)
plt.show()

### Correlations

In [None]:
# correlation matrices of the numerical features (Pearson and Spearman)
numeric_part = df[features_num]
corr_pearson = numeric_part.corr(method='pearson')
corr_spearman = numeric_part.corr(method='spearman')

# heatmap of the Pearson coefficients on a fixed [-1, +1] colour scale
plt.figure(figsize=(8, 6))
sns.heatmap(data=corr_pearson, vmin=-1, vmax=+1, cmap='RdYlGn', annot=True)
plt.title('Pearson Correlation')
plt.show()

In [None]:
# heatmap of the Spearman coefficients, same colour scale as the Pearson plot
heatmap_options = dict(annot=True, cmap='RdYlGn', vmin=-1, vmax=+1)
plt.figure(figsize=(8, 6))
sns.heatmap(corr_spearman, **heatmap_options)
plt.title('Spearman Correlation')
plt.show()

<a id='3'></a>
# Target vs Features

In [None]:
# one violin plot per numerical feature, split by target class
for f in features_num:
    plt.figure(figsize=(10,4))
    sns.violinplot(x=f, y='Class', data=df)
    my_title = 'Distribution by Class for ' + f
    plt.title(my_title)
    plt.grid()
    # display each figure as soon as it is complete, like the other
    # plotting loops in this notebook, instead of relying on the
    # notebook backend to flush open figures at the end of the cell
    plt.show()

<a id='4'></a>
# Visualization using PCA

In [None]:
# project the numerical features onto three principal components
df4pca = df[features_num]

# standardize the features before running PCA
scaler = StandardScaler()
df4pca_std = scaler.fit_transform(df4pca)

# fit a 3-component PCA and transform the standardized data in one step
pc_model = PCA(n_components=3)
pc = pc_model.fit_transform(df4pca_std)

# append one column per component to the original data frame
for idx, col_name in enumerate(['pc_1', 'pc_2', 'pc_3']):
    df[col_name] = pc[:, idx]

# show extended data frame
df.head()

In [None]:
# interactive plot - click on legend to filter for individual classes
# Only a random subset is plotted to keep the 3D plot performant.
# - the sample size is capped at the number of available rows, because
#   DataFrame.sample raises if more rows are requested than exist
# - the sample is seeded (same seed as the model cells) so reruns show
#   the same points
# - the constant marker-size column is added via assign(), which returns a
#   new frame instead of writing into the sampled frame
n_plot = min(2500, len(df))
df_subset = df.sample(n=n_plot, random_state=999).assign(size=1)
fig = px.scatter_3d(df_subset, x='pc_1', y='pc_2', z='pc_3',
                    color='Class',
                    size='size',
                    size_max=10,
                    opacity=0.5)
fig.update_layout(title='PCA 3D Interactive')
fig.show()

<a id='5'></a>
# Fit Model

In [None]:
# init H2O with a memory limit of 12 GB and 4 threads
h2o.init(max_mem_size='12G', nthreads=4)

In [None]:
# convert the pandas data frame into an H2OFrame and report how long it took
upload_start = time.time()
df_hex = h2o.H2OFrame(df)
upload_seconds = time.time() - upload_start
print('Elapsed time [s]: ', np.round(upload_seconds, 2))

In [None]:
# response column and predictor columns used for modelling
target, features = 'Class', features_num
print('Features used:', features)

# turn the response into a factor so the model is fitted as a classifier
df_hex[target] = df_hex[target].asfactor()

In [None]:
# split the H2O frame into a training part (ratio train_perc) and a test
# part (the remainder); the seed keeps the split reproducible
train_perc = 0.7
split_parts = df_hex.split_frame(ratios=[train_perc], seed=999)
train_hex, test_hex = split_parts

In [None]:
# check target distribution in train set:
# pull the target column into pandas and count the rows per class
train_hex[target].as_data_frame().value_counts()

In [None]:
# check target distribution in test set:
# pull the target column into pandas and count the rows per class
test_hex[target].as_data_frame().value_counts()

In [None]:
# number of cross-validation folds (reused later when plotting per-fold results)
n_cv = 5

# settings of the (distributed) random forest, including early stopping on AUC
drf_settings = dict(
    ntrees=50,
    max_depth=20,
    min_rows=5,
    nfolds=n_cv,
    score_each_iteration=True,
    stopping_metric='auc',
    stopping_rounds=5,
    stopping_tolerance=1e-4,
    seed=999,
)
fit_DRF = H2ORandomForestEstimator(**drf_settings)

# fit on the training frame and report the elapsed wall-clock time
fit_start = time.time()
fit_DRF.train(x=features, y=target, training_frame=train_hex)
fit_seconds = time.time() - fit_start
print('Elapsed time [s]: ', np.round(fit_seconds, 2))

In [None]:
# variable importance plot of the fitted random forest
fit_DRF.varimp_plot()

In [None]:
# SHAP summary plot on the training frame as an alternative view on
# variable importance => shows direction as well as severity of feature impact
shap_start = time.time()
fit_DRF.shap_summary_plot(train_hex);
shap_seconds = time.time() - shap_start
print('Elapsed time [s]: ', np.round(shap_seconds, 2))

In [None]:
# cross validation metrics: summary table for the fitted model
fit_DRF.cross_validation_metrics_summary()

In [None]:
# show scoring history - training vs cross validations
# one scatter plot per cross-validation model: AUC against number of trees

# fetch the fold models once instead of on every loop iteration
cv_models = fit_DRF.cross_validation_models()
for fold_no, cv_model_temp in enumerate(cv_models, start=1):
    # scoring_history() is the supported accessor; score_history() is only
    # a deprecated alias for it
    df_cv_score_history = cv_model_temp.scoring_history()
    my_title = 'CV ' + str(fold_no) + ' - Scoring History [AUC]'
    plt.scatter(df_cv_score_history.number_of_trees,
                y=df_cv_score_history.training_auc, 
                c='blue', label='training')
    plt.scatter(df_cv_score_history.number_of_trees,
                y=df_cv_score_history.validation_auc, 
                c='darkorange', label='validation')
    plt.title(my_title)
    plt.xlabel('Number of Trees')
    plt.ylabel('AUC')
    # zoom into the upper AUC range so differences between curves stay visible
    plt.ylim(0.9,1.0)
    plt.legend()
    plt.grid()
    plt.show()

<a id='6'></a>
# Evaluate Model Performance

### Train Set / CV Performance

In [None]:
# training performance: metrics object for the training data, then its plot
perf_train = fit_DRF.model_performance(train=True)
perf_train.plot()

In [None]:
# cross validation performance: metrics object for the cross-validation
# results, then its plot
perf_cv = fit_DRF.model_performance(xval=True)
perf_cv.plot()

In [None]:
# score the training frame with the fitted model
pred_train_hex = fit_DRF.predict(train_hex)
# attach the actual target next to the predictions
pred_train_hex['target'] = train_hex[target]
# pull the result into pandas for the confusion matrix below
pred_train = pred_train_hex.as_data_frame()
# preview
pred_train.head()

In [None]:
# confusion matrix on the training set
# rows ~ actual observations, cols ~ predictions
conf_train = pd.crosstab(index=pred_train['target'],
                         columns=pred_train['predict'])

# annotated heatmap of the integer counts, without colour bar
sns.heatmap(
    conf_train,
    annot=True,
    fmt='d',
    cmap='Blues',
    cbar=False,
    linewidths=0.1,
    linecolor='black',
)
plt.show()

### Test Set Performance

In [None]:
# score the test frame with the fitted model
pred_test_hex = fit_DRF.predict(test_hex)
# attach the actual target next to the predictions
pred_test_hex['target'] = test_hex[target]
# pull the result into pandas and preview it
pred_test = pred_test_hex.as_data_frame()
pred_test.head()

In [None]:
# confusion matrix on the test set
# rows ~ actual observations, cols ~ predictions
conf_test = pd.crosstab(index=pred_test['target'],
                        columns=pred_test['predict'])

# annotated heatmap of the integer counts, without colour bar
sns.heatmap(
    conf_test,
    annot=True,
    fmt='d',
    cmap='Blues',
    cbar=False,
    linewidths=0.1,
    linecolor='black',
)
plt.show()

<a id='7'></a>
# Explanations

In [None]:
# pick an example: row index into the training frame, reused by the
# row-level explanation in the next cell
my_row = 5
# display all columns of that row
train_hex[my_row,:]

In [None]:
# show explanations for this row (row my_row of the training frame)
fit_DRF.explain_row(frame=train_hex, row_index=my_row);