# Introduction

This notebook is aimed at experimenting with the capabilities and performance of H2O.ai's AutoML technology on the multi-class prediction problem for this competition.

It contains a log of the results for different model training scenarios (different numbers of models, different training times, different feature-engineering compositions, etc.).

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import datetime as dt
from typing import Tuple

In [None]:
# main flow
# Record the wall-clock start time; the last cell subtracts it from the
# finish time to report the total duration of the run.
start_time = dt.datetime.now()
print("Started at ", start_time)

In [None]:
%%time
# Kaggle input locations of the competition files.
trainfile = '/kaggle/input/tabular-playground-series-dec-2021/train.csv'
testfile = '/kaggle/input/tabular-playground-series-dec-2021/test.csv'
subfile = '/kaggle/input/tabular-playground-series-dec-2021/sample_submission.csv'

# Read the training set, the test set and the sample submission, in that order.
train, test, sub = (
    pd.read_csv(csv_path) for csv_path in (trainfile, testfile, subfile)
)

# Quick sanity check of what was loaded: shapes, then the first training rows.
print(train.shape, test.shape)
train.head()

In [None]:
%%time
target = 'Cover_Type'

# Rows labelled with class 4 or class 5 are removed from the training data.
# NOTE(review): the reason for excluding these two classes is not stated in
# the notebook - presumably they are too rare to learn; confirm.
is_dropped_class = train[target].isin([4, 5])
print('rows dropped = ', train[is_dropped_class].shape)
train = train[~is_dropped_class]
print(train.shape)

In [None]:
%%time
# additional Feature Engineering (FE)

# Columns removed from both frames before modelling: two soil-type flags and
# the row identifier 'Id'.
zero_variance_features = ['Soil_Type7', 'Soil_Type15', 'Id']

train = train.drop(columns=zero_variance_features)
test = test.drop(columns=zero_variance_features)

# extra feature engineering
def r(x):
    """Return ``x`` shifted by 180.

    180 is added to ``x``; when that sum would exceed 360, 180 is subtracted
    from ``x`` instead. A sum of exactly 360 is returned as 360.
    """
    shifted = x + 180
    return x - 180 if shifted > 360 else shifted

def fe(df):
    """Add the engineered features to ``df`` in place and return the same frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the raw columns ``Elevation``, ``Aspect``,
        ``Horizontal_Distance_To_Roadways``, ``Horizontal_Distance_To_Hydrology``,
        ``Vertical_Distance_To_Hydrology``, ``Horizontal_Distance_To_Fire_Points``,
        ``Hillshade_9am``, ``Hillshade_Noon`` and ``Hillshade_3pm``.

    Returns
    -------
    pandas.DataFrame
        The very same object that was passed in, with ``Aspect`` wrapped into
        range, the hillshade columns clamped to ``[0, 255]`` and the new
        feature columns appended.
    """
    features_Hillshade = ['Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm']

    df['EHiElv'] = df['Horizontal_Distance_To_Roadways'] * df['Elevation']
    df['EViElv'] = df['Vertical_Distance_To_Hydrology'] * df['Elevation']
    # Aspect2 is derived from the raw Aspect, i.e. before the wrap-around
    # correction applied just below.
    df['Aspect2'] = df.Aspect.map(r)
    ### source: https://www.kaggle.com/c/tabular-playground-series-dec-2021/discussion/293373
    # Wrap out-of-range aspects by one full turn. `.loc` is used instead of
    # chained indexing (df["Aspect"][mask] += ...), which writes to a temporary
    # and is silently ignored when pandas Copy-on-Write is active.
    df.loc[df["Aspect"] < 0, "Aspect"] += 360
    df.loc[df["Aspect"] > 359, "Aspect"] -= 360
    # Clamp every hillshade column to the [0, 255] range.
    for hillshade_col in features_Hillshade:
        df[hillshade_col] = df[hillshade_col].clip(lower=0, upper=255)
    ########
    df['Highwater'] = (df.Vertical_Distance_To_Hydrology < 0).astype(int)
    df['EVDtH'] = df.Elevation - df.Vertical_Distance_To_Hydrology
    df['EHDtH'] = (df.Elevation - df.Horizontal_Distance_To_Hydrology * 0.2).astype(int)
    df['Euclidean_Distance_to_Hydrolody'] = ((df['Horizontal_Distance_To_Hydrology']**2 + df['Vertical_Distance_To_Hydrology']**2)**0.5).astype(int)
    df['Manhattan_Distance_to_Hydrolody'] = df['Horizontal_Distance_To_Hydrology'] + df['Vertical_Distance_To_Hydrology']
    df['Hydro_Fire_1'] = df['Horizontal_Distance_To_Hydrology'] + df['Horizontal_Distance_To_Fire_Points']
    df['Hydro_Fire_2'] = abs(df['Horizontal_Distance_To_Hydrology'] - df['Horizontal_Distance_To_Fire_Points'])
    df['Hydro_Road_1'] = abs(df['Horizontal_Distance_To_Hydrology'] + df['Horizontal_Distance_To_Roadways'])
    df['Hydro_Road_2'] = abs(df['Horizontal_Distance_To_Hydrology'] - df['Horizontal_Distance_To_Roadways'])
    df['Fire_Road_1'] = abs(df['Horizontal_Distance_To_Fire_Points'] + df['Horizontal_Distance_To_Roadways'])
    df['Fire_Road_2'] = abs(df['Horizontal_Distance_To_Fire_Points'] - df['Horizontal_Distance_To_Roadways'])
    # Computed after clamping, so originally negative values also count as zero.
    df['Hillshade_3pm_is_zero'] = (df.Hillshade_3pm == 0).astype(int)

    df["Hillshade_mean"] = df[features_Hillshade].mean(axis=1).astype(int)
    # Parenthesised so the integer cast applies to the whole max-min range,
    # not only to the `min` term.
    df['amp_Hillshade'] = (df[features_Hillshade].max(axis=1) - df[features_Hillshade].min(axis=1)).astype(int)
    return df

# Run the feature-engineering function over both data sets.
train, test = fe(train), fe(test)

# Summed features pointed out by @craigmthomas (https://www.kaggle.com/c/tabular-playground-series-dec-2021/discussion/292823)
# The column groups are taken from the training frame and reused for the test frame.
soil_features = [col for col in train.columns if col.startswith("Soil_Type")]
wilderness_features = [col for col in train.columns if col.startswith("Wilderness_Area")]

# Row-wise totals of the soil-type and wilderness-area columns.
for frame in (train, test):
    frame["soil_type_count"] = frame[soil_features].sum(axis=1)
    frame["wilderness_area_count"] = frame[wilderness_features].sum(axis=1)

In [None]:
%%time
# Features retained for modelling, taken from the permutation-importance study at
# https://www.kaggle.com/gvyshnya/eli5-perm-fi-with-additional-fe-dec-21-tpc
pi_features = [
    'Elevation',
    'EVDtH',
    'EHDtH',
    'soil_type_count',
    'Wilderness_Area3',
    'Wilderness_Area1',
    'Soil_Type39',
    'EHiElv',
    'Soil_Type38',
    'Horizontal_Distance_To_Roadways',
    'Wilderness_Area4',
    'Soil_Type40',
    'Fire_Road_1',
    'Hydro_Fire_2',
    'Horizontal_Distance_To_Fire_Points',
    'Fire_Road_2',
    'Soil_Type2',
    'Hydro_Fire_1',
    'Soil_Type4',
    'Soil_Type22',
    'Soil_Type10',
    'Hydro_Road_1',
    'Hydro_Road_2',
    'Soil_Type35',
    'Soil_Type37',
    'Soil_Type3',
    'Soil_Type23',
    'Manhattan_Distance_to_Hydrolody',
    'Soil_Type36',
    'Soil_Type33',
    'wilderness_area_count',
    'Euclidean_Distance_to_Hydrolody',
    'Soil_Type1',
    'Vertical_Distance_To_Hydrology',
    'Soil_Type5',
    'EViElv',
    'Horizontal_Distance_To_Hydrology',
    'Soil_Type32',
    'Soil_Type31',
    'Soil_Type13',
    'Hillshade_Noon',
    'Soil_Type11',
    'Aspect',
]

# Same list with the target column appended, used to subset the training frame.
pi_features_and_target = [*pi_features, target]

In [None]:
%%time
# Keep only the selected features in both frames; the training frame also
# keeps the target column.
train = train[pi_features_and_target]
test = test[pi_features]

In [None]:
%%time
# Starting H2O
import h2o
from h2o.automl import H2OAutoML

# Log the library version so the leaderboard results can be traced to it.
print(h2o.__version__)

# Initialise the H2O backend with a 16 GB memory cap.
h2o.init(max_mem_size='16G')

In [None]:
%%time
# Convert the pandas frames into H2OFrames, the input type used by AutoML below.
train_h2o, test_h2o = (h2o.H2OFrame(frame) for frame in (train, test))

In [None]:
%%time
# Predictors are all columns of the test frame (it carries no target column);
# the response is the target column.
x, y = test_h2o.columns, target

In [None]:
# Run AutoML with a budget of up to 200 models and `max_runtime_secs` of training time.
# Budgets used across the logged experiments:
#   1800 s  = 30 min
#   14400 s = 4 h
#   16200 s = 4.5 h (current setting)
# NOTE(review): `y` is never converted with `.asfactor()`, so the numeric
# target is presumably modelled as a regression (a later cell rounds float
# predictions) - confirm this is intended for a multi-class problem.
max_runtime_secs = 16200
aml = H2OAutoML(
    max_models=200,
    seed=47,
    max_runtime_secs=max_runtime_secs,
)
aml.train(x=x, y=y, training_frame=train_h2o)

In [None]:
# View the AutoML Leaderboard
lb = aml.leaderboard
# Passing rows=lb.nrows requests every row of the leaderboard frame.
lb.head(rows=lb.nrows)  # Print all rows instead of default (10 rows)

In [None]:
%%time
# The leader model is stored here; leaving it as the last expression
# makes the notebook display it.
aml.leader

In [None]:
%%time
# Generate predictions for the test frame. Predictions can be made directly
# on the `H2OAutoML` object (as done here) or on the leader model object.

preds = aml.predict(test_h2o)

In [None]:
%%time
# Preview the predictions: convert the H2O frame to pandas and flatten its
# values into a 1-D NumPy array.
preds.as_data_frame().values.flatten()

In [None]:
# Read H2O's "predict" column explicitly instead of flattening the whole
# prediction frame. With a numeric target that column is the only one, so the
# result is unchanged; with a categorical target the frame would also carry
# per-class probability columns and flattening it would yield a list of the
# wrong length.
# Since H2O predicts the numeric target as a float, round each value to the
# nearest int to obtain a class label.
predicted_values = preds.as_data_frame()["predict"].values
target_class = [round(a) for a in predicted_values]

In [None]:
%%time
# Fill the target column of the sample submission with the predicted classes
# (assumes `target_class` is in the same row order as `sub` - TODO confirm),
# then write the submission file without the index column.
sub[target] = target_class
sub.to_csv('h2o_automl_submission.csv', index=False)

# Log of Model Performance

- **0.89266** - the baseline prediction (useful raw features only, 30 min training cap, up to 200 models to train)
- **0.91640** - the baseline prediction (useful raw features only, 4.5 h training cap, up to 200 models to train)
- **0.91702** - the prediction with additional feature engineering (important features as detected by https://www.kaggle.com/gvyshnya/eli5-perm-fi-with-additional-fe-dec-21-tpc, 30 min training cap, up to 200 models to train)

In [None]:
print('We are done. That is all, folks!')
# Report when the run finished and how long it took since `start_time`,
# which was recorded in the first code cell of the main flow.
finish_time = dt.datetime.now()
print("Finished at ", finish_time)
elapsed = finish_time - start_time
print("Elapsed time: ", elapsed)