# Table of Contents
* [Target](#target)
* [Numerical Features](#num)
* [Categorical Features](#cat)
* [Target vs Features](#target_feats)
* [Fit Model](#model)
* [Predict on Test Set](#pred_test)

In [None]:
# packages

# standard
import numpy as np
import pandas as pd
import time
import gc

# plots
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.graphics.mosaicplot import mosaic

# machine learning tools
import h2o
from h2o.estimators import H2OGradientBoostingEstimator

In [None]:
# Show floats with two decimal places in all pandas outputs
pd.set_option("display.float_format", "{:.2f}".format)

In [None]:
# load data
# All competition files live in the same input folder; keep that folder in a
# single constant so the location only has to be changed in one place.
DATA_DIR = '../input/tabular-playground-series-dec-2021'

t1 = time.time()
df_train = pd.read_csv(DATA_DIR + '/train.csv')
df_test = pd.read_csv(DATA_DIR + '/test.csv')
df_sub = pd.read_csv(DATA_DIR + '/sample_submission.csv')
t2 = time.time()

# wall-clock time needed to read the three CSV files
print('Elapsed time [s]:' , np.round(t2-t1,4))

In [None]:
# first glance (training data): display the first rows of the training frame
df_train.head()

In [None]:
# (rows, columns) of the training and the test frame
print(f'Train Set: {df_train.shape}')
print(f'Test Set : {df_test.shape}')

In [None]:
# structure: print the column overview of the training frame
df_train.info()

<a id='target'></a>
# Target

In [None]:
# Store the target column as a pandas categorical
df_train['Cover_Type'] = df_train['Cover_Type'].astype('category')

In [None]:
# Class frequencies of the target: print the counts as returned by
# value_counts(), then draw them as a bar chart ordered by class label.
class_counts = df_train['Cover_Type'].value_counts()
print(class_counts)
class_counts.sort_index().plot(kind='bar')
plt.grid()
plt.show()

<a id='num'></a>
# Numerical Features

In [None]:
# Names of the numerical feature columns (one per line for easy editing)
features_num = [
    'Elevation',
    'Aspect',
    'Slope',
    'Horizontal_Distance_To_Hydrology',
    'Vertical_Distance_To_Hydrology',
    'Horizontal_Distance_To_Roadways',
    'Hillshade_9am',
    'Hillshade_Noon',
    'Hillshade_3pm',
    'Horizontal_Distance_To_Fire_Points',
]

In [None]:
# basic summary stats of the numerical features (training data)
df_train[features_num].describe()

In [None]:
# One 50-bin histogram per numerical feature (training data)
for col in features_num:
    df_train[col].plot.hist(bins=50)
    plt.title(f'{col} - Training Data')
    plt.grid()
    plt.show()

In [None]:
# Pearson correlation between the numerical features, shown as a heatmap
# on a fixed colour scale from -1 to +1
corr_pearson = df_train[features_num].corr(method='pearson')
heatmap_opts = dict(annot=False, cmap='RdYlGn', vmin=-1, vmax=+1)
plt.figure(figsize=(5,4))
sns.heatmap(corr_pearson, **heatmap_opts)
plt.title('Pearson Correlation')
plt.show()

In [None]:
# rank correlation
# Keep the Spearman matrix in its own variable so that it neither overwrites
# the Pearson result nor carries a misleading name.
corr_spearman = df_train[features_num].corr(method='spearman')
plt.figure(figsize=(5,4))
# fixed colour scale from -1 to +1 (same as used for the Pearson heatmap)
sns.heatmap(corr_spearman, annot=False, cmap='RdYlGn', vmin=-1, vmax=+1)
plt.title('Spearman Correlation')
plt.show()

<a id='cat'></a>
# Categorical Features

In [None]:
# Wilderness area indicator columns: Wilderness_Area1 ... Wilderness_Area4
feature_list_wild = [f'Wilderness_Area{k}' for k in range(1, 5)]
# store the indicators as unsigned 8-bit integers in both data sets
df_train[feature_list_wild] = df_train[feature_list_wild].astype('uint8')
df_test[feature_list_wild] = df_test[feature_list_wild].astype('uint8')
# number of set indicators per row - a unique encoding would give exactly 1
df_train['Wilderness_Area_Sum'] = df_train[feature_list_wild].sum(axis=1)
df_train['Wilderness_Area_Sum'].value_counts().sort_index()

### Hmm, this is not a strict one-hot encoding: many rows contain more than one "1".

In [None]:
# Number of rows flagged for each wilderness area: printed, then as bar chart
wild_counts = df_train[feature_list_wild].sum()
print(wild_counts)
wild_counts.plot(kind='bar')
plt.grid()
plt.show()

In [None]:
# Soil type indicator columns: Soil_Type1 ... Soil_Type40
feature_list_soil = [f'Soil_Type{k}' for k in range(1, 41)]

# store the indicators as unsigned 8-bit integers in both data sets
df_train[feature_list_soil] = df_train[feature_list_soil].astype('uint8')
df_test[feature_list_soil] = df_test[feature_list_soil].astype('uint8')

# number of set indicators per row - a unique encoding would give exactly 1
df_train['Soil_Type_Sum'] = df_train[feature_list_soil].sum(axis=1)
df_train['Soil_Type_Sum'].value_counts().sort_index()

### Soil_Type is not uniquely encoded either... and the majority of rows actually have no entry at all.

In [None]:
# Number of rows flagged for each soil type: printed, then as a wide bar chart
soil_counts = df_train[feature_list_soil].sum()
print(soil_counts)
plt.figure(figsize=(12,4))
soil_counts.plot(kind='bar')
plt.grid()
plt.show()

In [None]:
# show structure again - the indicator columns were converted to uint8 above
# and the two *_Sum helper columns were added
df_train.info()

In [None]:
# garbage collection (the trailing ';' keeps the return value of collect()
# from being echoed as cell output)
gc.collect();

<a id='target_feats'></a>
# Target vs Features

### Numerical Features:

In [None]:
# violinplots by class
# One figure per numerical feature, showing its distribution for each Cover_Type.
t1 = time.time()
for f in features_num:
    plt.figure(figsize=(10,5))
    sns.violinplot(x=f, y='Cover_Type', data=df_train)
    my_title = 'Distribution by class for ' + f
    plt.title(my_title)
    plt.grid()
    # Render each figure inside the loop (as the other plotting cells do) so
    # the elapsed time below covers the drawing and figures do not pile up
    # until the end of the cell.
    plt.show()
t2 = time.time()
print('Elapsed time [s]:' , np.round(t2-t1,4))

#### We can ignore class 5 in the pictures above as we have only one observation for this class!

### Categorical Features:

In [None]:
# Show floats with eight decimal places from here on (small shares would
# otherwise be displayed as 0.00)
pd.set_option("display.float_format", "{:.8f}".format)

In [None]:
# Class distribution of the target within each value of every
# Wilderness_Area indicator
for flag in feature_list_wild:
    # absolute counts: rows = Cover_Type, columns = values of the indicator
    counts = pd.crosstab(df_train['Cover_Type'], df_train[flag])
    # divide every column by its total, i.e. normalize by column
    shares = counts.div(counts.sum(axis=0), axis='columns')
    print(shares)

In [None]:
# Class distribution of the target within each value of every
# Soil_Type indicator
for flag in feature_list_soil:
    # absolute counts: rows = Cover_Type, columns = values of the indicator
    counts = pd.crosstab(df_train['Cover_Type'], df_train[flag])
    # divide every column by its total, i.e. normalize by column
    shares = counts.div(counts.sum(axis=0), axis='columns')
    print(shares)

<a id='model'></a>
# Fit Model

In [None]:
# Model inputs: numerical features followed by the wilderness and soil indicators.
# The helper columns 'Wilderness_Area_Sum' and 'Soil_Type_Sum' are currently not included.
predictors = [*features_num, *feature_list_wild, *feature_list_soil]
print('Number of predictors: ', len(predictors))

In [None]:
# start H2O (connects to a running H2O instance or launches a local one)
h2o.init(max_mem_size='12G', nthreads=4) # Use maximum of 12 GB RAM and 4 cores

In [None]:
# upload data frame in H2O environment
# (the complete pandas frame is converted, including the *_Sum helper columns;
#  the model fit further below only uses the columns listed in `predictors`)
t1 = time.time()
train_hex = h2o.H2OFrame(df_train)
t2 = time.time()
print('Elapsed time [s]: ', np.round(t2-t1,2))

# force categorical target
# NOTE(review): presumably required so that H2O fits a classification model
# rather than a regression on the numeric class labels - confirm
train_hex['Cover_Type'] = train_hex['Cover_Type'].asfactor()

In [None]:
# memory management: remove original data frame + garbage collection
# (df_train is not referenced by any later cell; the H2O copy `train_hex` is
#  used from here on. Re-running earlier cells requires reloading the data.)
del df_train
gc.collect();

In [None]:
# Gradient Boosting model with cross validation
n_cv = 5  # number of cross validation folds (reused when plotting the CV models)

# all hyper-parameters in one place
gbm_params = dict(
    ntrees=100,
    max_depth=6,
    min_rows=50,
    learn_rate=0.05,  # default: 0.1
    sample_rate=0.25,
    col_sample_rate=0.5,
    nfolds=n_cv,
    # score after every tree; early stopping is configured on logloss
    score_each_iteration=True,
    stopping_metric='logloss',
    stopping_rounds=5,
    stopping_tolerance=0.0001,
    seed=999,
)
fit_GBM = H2OGradientBoostingEstimator(**gbm_params)

# train the model and report the wall-clock time
t1 = time.time()
fit_GBM.train(x=predictors, y='Cover_Type', training_frame=train_hex)
t2 = time.time()
print('Elapsed time [s]: ', np.round(t2-t1,2))

In [None]:
# show cross validation metrics (summary over the n_cv folds of fit_GBM)
fit_GBM.cross_validation_metrics_summary()

In [None]:
# show scoring history - training vs cross validations
# Fetch the list of cross validation models once instead of once per fold.
cv_models = fit_GBM.cross_validation_models()
for i in range(n_cv):
    cv_model_temp = cv_models[i]
    # scoring_history() is the supported accessor; score_history() is only a
    # deprecated alias of it
    df_cv_score_history = cv_model_temp.scoring_history()
    my_title = 'CV ' + str(1+i) + ' - Scoring History'
    # logloss on the fold's training part vs. its hold-out part per number of trees
    plt.scatter(df_cv_score_history.number_of_trees,
                y=df_cv_score_history.training_logloss,
                c='blue', label='training')
    plt.scatter(df_cv_score_history.number_of_trees,
                y=df_cv_score_history.validation_logloss,
                c='darkorange', label='validation')
    plt.title(my_title)
    plt.xlabel('Number of Trees')
    plt.ylabel('logloss')
    # same y-range for all folds so the plots are directly comparable
    plt.ylim(0,2)
    plt.legend()
    plt.grid()
    plt.show()

In [None]:
# variable importance
# NOTE(review): -1 is passed as the number of features to plot; presumably
# meant as "show all predictors" - confirm against the H2O varimp_plot docs
fit_GBM.varimp_plot(-1)

### Evaluate on training data:

In [None]:
# Score the training frame and pull the predictions into pandas
pred_train_GBM = fit_GBM.predict(train_hex).as_data_frame()
# attach the actual class next to the predictions
pred_train_GBM['Cover_Type'] = train_hex['Cover_Type'].as_data_frame()['Cover_Type']
pred_train_GBM.head()

In [None]:
# Predicted class frequencies = column sums of the class probabilities p1..p7
# (back to two decimals for display)
pd.set_option("display.float_format", "{:.2f}".format)
prob_cols = [f'p{k}' for k in range(1, 8)]
pred_train_GBM[prob_cols].sum()

In [None]:
# actual frequencies of the target classes in the training frame, ordered by
# class label (for comparison with the predicted frequencies)
train_hex['Cover_Type'].as_data_frame().value_counts().sort_index()

In [None]:
# Confusion matrix on the training data: rows = actual class, columns = predicted class
conf_train = pd.crosstab(pred_train_GBM['Cover_Type'], pred_train_GBM['predict'])
heatmap_opts = dict(
    cmap='Blues',
    annot=True, fmt='d',   # print the integer counts in the cells
    vmin=0, vmax=3e6,      # fixed colour scale
    linecolor='black',
    linewidths=0.1,
)
sns.heatmap(conf_train, **heatmap_opts)
plt.title('Confusion Matrix - Training')
plt.show()

<a id='pred_test'></a>
# Predict on Test Set

In [None]:
# upload the test data frame in the H2O environment and report the elapsed time
t1 = time.time()
test_hex = h2o.H2OFrame(df_test)
t2 = time.time()
print('Elapsed time [s]: ', np.round(t2-t1,2))

In [None]:
# memory management: remove original data frame + garbage collection
# (df_test is not referenced by any later cell; the H2O copy `test_hex` is used instead)
del df_test
gc.collect();

In [None]:
# predict on test set and convert the H2O result to a pandas data frame
# (the columns 'predict' and 'p1'..'p7' are used elsewhere in this notebook)
pred_test_GBM = fit_GBM.predict(test_hex).as_data_frame()
pred_test_GBM

In [None]:
# export predictions incl. probabilities (CSV in the working directory, without the index column)
pred_test_GBM.to_csv('pred_test_GBM.csv', index=False)

In [None]:
# submission
# Start from a copy of the sample submission so that its layout is kept.
df_sub_GBM = df_sub.copy()

# The predictions are expected in the same row order as the sample submission.
# Fail loudly if the row counts differ instead of silently producing NaN labels.
if len(pred_test_GBM) != len(df_sub_GBM):
    raise ValueError('Number of predictions (%d) does not match number of submission rows (%d)'
                     % (len(pred_test_GBM), len(df_sub_GBM)))

# Assign by position (plain array) rather than by index label, and use
# bracket access so the column name cannot be mistaken for an attribute.
df_sub_GBM['Cover_Type'] = pred_test_GBM['predict'].to_numpy()
df_sub_GBM

In [None]:
# check frequencies of the predicted classes in the submission
df_sub_GBM.Cover_Type.value_counts()

In [None]:
# export submission (CSV in the working directory, without the index column)
df_sub_GBM.to_csv('submission_GBM.csv', index=False)

## WORK IN PROGRESS...