# Table of contents
* [Feature exploration](#1)
* [Target vs Features](#2)
* [PCA Visualization](#3)
* [Fit Model](#4)

In [None]:
# packages

# standard
import numpy as np
import pandas as pd
import time

# plots
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
from statsmodels.graphics.mosaicplot import mosaic

# PCA
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# start H2O
import h2o
from h2o.estimators import H2ORandomForestEstimator

In [None]:
# load the orbit classification CSV into a data frame and preview the first rows
csv_path = '../input/orbitclassification/classast - pha.csv'
df = pd.read_csv(csv_path)
df.head()

In [None]:
# structure of data frame: prints the columns with their non-null counts and dtypes
df.info()

In [None]:
# summary stats for numerical features (count, mean, std, min, quartiles, max)
df.describe()

In [None]:
# evaluate target: class frequencies as a printed table and as a bar chart
class_counts = df['class'].value_counts()
print(class_counts)

fig, ax = plt.subplots(figsize=(8,6))
class_counts.plot(kind='bar', ax=ax)
ax.set_title('Target')
ax.grid()
plt.show()

<a id='1'></a>
# Feature exploration

In [None]:
# names of the numerical columns that are explored (and later used as model inputs)
features_num = [
    'a (AU)',
    'e',
    'i (deg)',
    'w (deg)',
    'Node (deg)',
    'M (deg)',
    'q (AU)',
    'Q (AU)',
    'P (yr)',
    'H (mag)',
    'MOID (AU)',
]

In [None]:
# one figure per numerical feature: histogram on top, horizontal boxplot below
for col in features_num:
    values = df[col]
    fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(12,7))
    ax_hist, ax_box = axes
    ax_hist.hist(values, bins=50)
    ax_hist.set_title(col)
    ax_hist.grid()
    ax_box.boxplot(values, vert=False)
    ax_box.set_title(col + '- boxplot')
    ax_box.grid()
    plt.show()

In [None]:
# show the row(s) whose 'a (AU)' value lies above 17
is_outlier = df['a (AU)'] > 17
df[is_outlier]

In [None]:
# remove outlier for the following
# .copy() makes the filtered frame independent of the original one, so the
# columns that are added to df further down are written to a real frame and
# not to a slice (avoids pandas' SettingWithCopyWarning)
df = df[df['a (AU)'] <= 17].copy()

In [None]:
# horizontal boxplots once more, now on the data without the outlier
for col in features_num:
    fig, ax = plt.subplots(figsize=(10,2))
    df[col].plot.box(vert=False, ax=ax)
    ax.set_title(col + ' - outlier removed')
    ax.grid()
    plt.show()

In [None]:
# Pearson and Spearman (rank) correlation matrices of the numerical features
num_data = df[features_num]
corr_pearson = num_data.corr(method='pearson')
corr_spearman = num_data.corr(method='spearman')

# one annotated heatmap per method, both on the same fixed [-1, +1] colour scale
for corr_matrix, heading in ((corr_pearson, 'Pearson Correlation'),
                             (corr_spearman, 'Spearman Correlation')):
    fig = plt.figure(figsize=(10,8))
    sns.heatmap(corr_matrix, annot=True, cmap='RdYlGn', vmin=-1, vmax=+1)
    plt.title(heading)
    plt.show()

### There are a few really strong correlations; let's have a closer look:

In [None]:
# scatter plot of 'Q (AU)' against 'a (AU)'
fx, fy = 'a (AU)', 'Q (AU)'
fig, ax = plt.subplots()
ax.scatter(df[fx], df[fy], alpha=0.5)
ax.set_xlabel(fx)
ax.set_ylabel(fy)
ax.grid()
plt.show()

In [None]:
# scatter plot of 'P (yr)' against 'Q (AU)'
fx, fy = 'Q (AU)', 'P (yr)'
fig, ax = plt.subplots()
ax.scatter(df[fx], df[fy], alpha=0.5)
ax.set(xlabel=fx, ylabel=fy)
ax.grid()
plt.show()

In [None]:
# scatter plot of 'P (yr)' against 'a (AU)'
fx, fy = 'a (AU)', 'P (yr)'
x_values, y_values = df[fx], df[fy]
fig, ax = plt.subplots()
ax.scatter(x_values, y_values, alpha=0.5)
ax.set_xlabel(fx)
ax.set_ylabel(fy)
ax.grid()
plt.show()

In [None]:
# Spearman (rank) correlation between 'a (AU)' and 'P (yr)'; the columns are
# named explicitly so the result does not depend on which scatter cell
# happened to set fx / fy last
df['a (AU)'].corr(df['P (yr)'], method='spearman')

#### Well, P (yr) is (almost) 100% rank-correlated with a (AU) - as expected from Kepler's third law (P² ∝ a³). Therefore, we will not use it as a feature later...

<a id='2'></a>
# Target vs Features

In [None]:
# distribution of every numerical feature, split by target class
for f in features_num:
    plt.figure(figsize=(10,5))
    sns.violinplot(x=f, y='class', data=df)
    plt.title('Distribution by class for ' + f)
    plt.grid()
    # render each figure inside the loop, like the other plotting loops of
    # this notebook do; without it the figures are only shown in inline mode
    plt.show()

<a id='3'></a>
# PCA Visualization

In [None]:
# reduce the numerical features to 3 principal components
df4pca = df[features_num]
# PCA is applied to the standardized features
df4pca_std = StandardScaler().fit_transform(df4pca)
pc_model = PCA(n_components=3)
pc = pc_model.fit_transform(df4pca_std)
# attach the component scores to the original data frame, one column per component
for idx, pc_name in enumerate(('pc_1', 'pc_2', 'pc_3')):
    df[pc_name] = pc[:, idx]
# show extended data frame
df.head()

In [None]:
# interactive 3D scatter of the principal components, coloured by class
# (click on legend to filter for individual classes)
# constant helper column: every marker gets the same size
df['size'] = 1
fig = px.scatter_3d(
    df,
    x='pc_1', y='pc_2', z='pc_3',
    color='class',
    size='size', size_max=10,
    opacity=0.5,
)
fig.update_layout(title='PCA 3D')
fig.show()

<a id='4'></a>
# Fit Model

In [None]:
# init H2O: connect to a local H2O instance (one is started if none is running),
# limited to 12 GB of memory and 4 threads
h2o.init(max_mem_size='12G', nthreads=4)

In [None]:
# convert the pandas data frame into an H2OFrame and report how long the upload took
t1 = time.time()
df_hex = h2o.H2OFrame(df)
t2 = time.time()
elapsed = np.round(t2 - t1, 2)
print('Elapsed time [s]: ', elapsed)

In [None]:
# define target
target = 'class'
# select features: all numerical columns except 'P (yr)', which is left out
# due to its 100% (rank) correlation with another feature.
# A new list is built instead of aliasing features_num and calling .remove()
# on it: the alias also deleted 'P (yr)' from features_num itself and made
# this cell raise ValueError when it was run a second time.
features = [f for f in features_num if f != 'P (yr)']

# explicitly convert target to categorical => classification problem
df_hex[target] = df_hex[target].asfactor()

In [None]:
# random train / test split of the H2O frame (fixed seed for reproducibility)
train_perc = 0.7
splits = df_hex.split_frame(ratios=[train_perc], seed=999)
train_hex, test_hex = splits

### Check distribution of target in train / test

In [None]:
# class frequencies in the training part (pulled into pandas for counting)
train_hex['class'].as_data_frame().value_counts()

In [None]:
# class frequencies in the test part (pulled into pandas for counting)
test_hex['class'].as_data_frame().value_counts()

In [None]:
# factors for class sampling => mitigate unbalanced target distribution
# one factor per target class, passed to the model as class_sampling_factors
# NOTE(review): H2O documents these factors as being matched to the classes in
# lexicographic order - confirm the order against the levels of the target
csf = [1.0,10.0,10.0,60.0,120.0,120.0]

In [None]:
# settings of the (distributed) random forest; 5-fold cross validation and
# class balancing with the sampling factors from csf
drf_params = dict(
    ntrees=100,
    max_depth=20,
    min_rows=5,
    nfolds=5,
    balance_classes=True,
    class_sampling_factors=csf,
    seed=999,
)
fit_DRF = H2ORandomForestEstimator(**drf_params)

# train model on the training frame and report the elapsed wall-clock time
t1 = time.time()
fit_DRF.train(x=features, y=target, training_frame=train_hex)
t2 = time.time()
print('Elapsed time [s]: ', np.round(t2-t1,2))

In [None]:
# show training scoring history of the fitted random forest as a plot
fit_DRF.plot()

In [None]:
# variable importance of the features used by the fitted model, as a plot
fit_DRF.varimp_plot()

In [None]:
# cross validation metrics (the model was trained with nfolds=5)
fit_DRF.cross_validation_metrics_summary()

### Performance on Training Data

In [None]:
# score the training frame
train_scored = fit_DRF.predict(train_hex)
# put the actual target next to the predictions
train_scored['target'] = train_hex[target]
# continue in pandas
pred_train = train_scored.as_data_frame()
pred_train.head()

In [None]:
# confusion matrix; rows ~ actual observations, cols ~ predictions
conf_train = pd.crosstab(pred_train['target'], pred_train['predict'])
# use the same class labels on both axes: crosstab leaves out a class that is
# never predicted (or never observed), which would give a non-square matrix
# whose diagonal no longer marks the correct predictions
labels = sorted(set(conf_train.index) | set(conf_train.columns))
conf_train = conf_train.reindex(index=labels, columns=labels, fill_value=0)
# visualize
sns.heatmap(conf_train, cmap='Blues', annot=True,
            cbar=False, fmt='d',
            linecolor='black',
            linewidths=0.1)
plt.show()

### Performance on Test Set

In [None]:
# score the test frame
test_scored = fit_DRF.predict(test_hex)
# put the actual target next to the predictions
test_scored['target'] = test_hex[target]
# continue in pandas
pred_test = test_scored.as_data_frame()
pred_test.head()

In [None]:
# confusion matrix; rows ~ actual observations, cols ~ predictions
conf_test = pd.crosstab(pred_test['target'], pred_test['predict'])
# use the same class labels on both axes: crosstab leaves out a class that is
# never predicted (or never observed), which would give a non-square matrix
# whose diagonal no longer marks the correct predictions
labels = sorted(set(conf_test.index) | set(conf_test.columns))
conf_test = conf_test.reindex(index=labels, columns=labels, fill_value=0)
# visualize
sns.heatmap(conf_test, cmap='Blues', annot=True,
            cbar=False, fmt='d',
            linecolor='black',
            linewidths=0.1)
plt.show()