# Notebook 3 - Feature Engineering

In this notebook we consider various feature engineering techniques. Some of these were suggested in the forums for TPS 12 as well as in the original forest cover type competition.

In [1]:
# Global variables for testing changes to this notebook quickly
# RANDOM_SEED: seed handed to the cross-validation splitter and to every model defined below
RANDOM_SEED = 0
# NUM_FOLDS: number of stratified cross-validation folds used by train_original
NUM_FOLDS = 12

In [2]:
import numpy as np
import pandas as pd
import time
import pyarrow
import gc

# Model evaluation
from functools import partial
from sklearn.base import clone
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import accuracy_score, recall_score
from sklearn.inspection import partial_dependence, permutation_importance
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier, RandomForestClassifier

# Plotting
import matplotlib
import seaborn as sns
from matplotlib import pyplot as plt

# Hide warnings
import warnings
warnings.filterwarnings('ignore')

# Prepare Data

In [3]:
# Encode soil type
def categorical_encoding(input_df):
    """Collapse the 40 indicator columns Soil_Type1..Soil_Type40 into one column.

    The new 'Soil_Type' column is the sum of i * Soil_Type{i} over i = 1..40,
    so a row with exactly one indicator set receives that indicator's number
    and a row with none set receives 0.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Must contain every column Soil_Type1 .. Soil_Type40. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy holding the non-indicator columns in their original order,
        followed by 'Soil_Type'.
    """
    indicator_columns = [f'Soil_Type{number}' for number in range(1, 41)]

    encoded = input_df.copy()
    encoded['Soil_Type'] = 0
    # Weight each indicator by its number and accumulate into the single code
    for number, column in enumerate(indicator_columns, start=1):
        encoded['Soil_Type'] += number * encoded[column]

    kept_columns = [name for name in encoded.columns if name not in indicator_columns]
    return encoded[kept_columns]

In [4]:
%%time

# Load original data
original = categorical_encoding(pd.read_feather('../data/original.feather'))

# Label Encode
old_encoder = LabelEncoder()
original["Cover_Type"] = old_encoder.fit_transform(original["Cover_Type"])
y_train = original['Cover_Type'].iloc[:15119]
y_test = original['Cover_Type'].iloc[15119:]

# Get feature columns
features = [x for x in original.columns if x not in ['Id','Cover_Type']]

# Data structures for summary scores
bagging_scores = list()
extratrees_scores = list()
adaboost_scores = list()
random_scores = list()

Wall time: 303 ms


# Scoring Function

In [5]:
def train_original(sklearn_model, processing = None):
    """Cross-validate a classifier on the training rows and score it on the holdout rows.

    Reads the notebook globals ``original``, ``features``, ``NUM_FOLDS`` and
    ``RANDOM_SEED``. The first 15119 rows of ``original`` are used for stratified
    k-fold cross-validation; the remaining rows are the holdout set. A fresh clone
    of ``sklearn_model`` is fitted per fold, and the holdout prediction is the class
    with the largest summed ``predict_proba`` across all fold models.

    Parameters
    ----------
    sklearn_model : estimator
        Unfitted classifier supporting ``fit``, ``predict`` and ``predict_proba``.
        It is cloned for each fold and never fitted itself.
    processing : callable, optional
        Feature-engineering function applied separately to the training and
        holdout feature frames. Skipped when falsy.

    Returns
    -------
    tuple
        ``(mean fold accuracy, out-of-fold predictions, holdout accuracy)``.
        The out-of-fold predictions are a float array with one entry per
        training row.

    Notes
    -----
    The value printed as "Train Accuracy" is the mean accuracy on the validation
    folds, not accuracy on the rows the models were fitted on. "Training Time"
    covers fitting, predicting and scoring for every fold.
    """
    # Original Training/Test Split
    n_train = 15119
    X_temp = original[features].iloc[:n_train]
    X_test = original[features].iloc[n_train:]
    y_temp = original['Cover_Type'].iloc[:n_train]
    y_test = original['Cover_Type'].iloc[n_train:]

    # Feature Engineering
    if processing:
        X_temp = processing(X_temp)
        X_test = processing(X_test)

    # Sorted class labels in the training target; the probability buffer gets one
    # column per label instead of a hard-coded count.
    classes = np.unique(y_temp)

    # Accumulators: summed holdout probabilities, out-of-fold predictions, per-fold stats
    test_proba = np.zeros((X_test.shape[0], len(classes)))
    oof_preds = np.zeros((X_temp.shape[0],))
    scores, times = np.zeros(NUM_FOLDS), np.zeros(NUM_FOLDS)

    # Stratified k-fold cross-validation
    skf = StratifiedKFold(n_splits = NUM_FOLDS, shuffle = True, random_state = RANDOM_SEED)
    for fold, (train_idx, valid_idx) in enumerate(skf.split(X_temp, y_temp)):

        # Training and Validation Sets
        X_train, X_valid = X_temp.iloc[train_idx], X_temp.iloc[valid_idx]
        y_fold, y_valid = y_temp.iloc[train_idx], y_temp.iloc[valid_idx]

        # Fit a fresh copy so nothing learned in one fold carries into the next
        start = time.perf_counter()
        model = clone(sklearn_model)
        model.fit(X_train, y_fold)

        # Validation predictions fill the out-of-fold slots for this fold
        valid_preds = np.ravel(model.predict(X_valid))
        oof_preds[valid_idx] = valid_preds

        # predict_proba columns follow model.classes_; place them by label so the
        # sum stays aligned even if a fold model saw only a subset of the labels.
        class_columns = np.searchsorted(classes, model.classes_)
        test_proba[:, class_columns] += model.predict_proba(X_test)

        # Save scores and times
        scores[fold] = accuracy_score(y_valid, valid_preds)
        times[fold] = time.perf_counter() - start
        # Pause between folds (excluded from the recorded times); the reason is not
        # stated in this notebook - presumably to let parallel workers wind down.
        time.sleep(0.5)

    # Turn the winning column back into its class label
    test_preds = classes[np.argmax(test_proba, axis = 1)]
    test_score = accuracy_score(y_test, test_preds)
    print('\n'+sklearn_model.__class__.__name__)
    print("Train Accuracy:", round(scores.mean(), 5))
    print('Test Accuracy:', round(test_score, 5))
    print(f'Training Time: {round(times.sum(), 2)}s')

    return scores.mean(), oof_preds, test_score

# Models

We use the following 4 models from the scikit-learn library:

1. AdaBoost 
2. ExtraTrees
3. Bagging
4. Random Forest

In [6]:
# Unfitted model templates. train_original clones them for every fold, so these
# objects themselves are never fitted.
#
# NOTE(review): the `base_estimator` keyword used below was renamed to `estimator`
# in scikit-learn 1.2 and later removed - confirm the installed version before
# upgrading scikit-learn.

# AdaBoost Classifier
# Boosting over decision trees that use the 'random' splitter.
adaboost = AdaBoostClassifier(
    base_estimator = DecisionTreeClassifier(
        splitter = 'random',
        random_state = RANDOM_SEED,
    ),
    random_state = RANDOM_SEED,
)

# ExtraTrees Classifier
# max_features = None: every feature is considered at each split.
# n_jobs = -1: use all available processors.
extratrees = ExtraTreesClassifier(
    n_jobs = -1,
    random_state = RANDOM_SEED,
    max_features = None,
)

# Bagging Classifier
# Bagging over the same random-splitter decision tree configuration as AdaBoost.
bagging = BaggingClassifier(
    base_estimator = DecisionTreeClassifier(
        splitter = 'random',
        random_state = RANDOM_SEED,
    ),
    n_jobs = -1,
    random_state = RANDOM_SEED
)

# Random Forest Classifier
# Library defaults apart from the seed and parallelism.
randomforest = RandomForestClassifier(
    n_jobs = -1,
    random_state = RANDOM_SEED,
)

# Baselines

In [7]:
# Baseline: score every model on the unmodified feature set.
# Order matters only for the printed log: AdaBoost, ExtraTrees, Bagging, Random Forest.
for estimator, score_log in (
    (adaboost, adaboost_scores),
    (extratrees, extratrees_scores),
    (bagging, bagging_scores),
    (randomforest, random_scores),
):
    cv_score, oof_preds, test_score = train_original(estimator)

    # Row layout: label, mean CV accuracy, holdout accuracy, then one recall
    # value per class computed from the out-of-fold predictions.
    score_log.append((
        'Baseline', cv_score, test_score,
        *recall_score(y_train, oof_preds, average = None)
    ))


AdaBoostClassifier
Train Accuracy: 0.78596
Test Accuracy: 0.76229
Training Time: 3.63s

ExtraTreesClassifier
Train Accuracy: 0.88538
Test Accuracy: 0.78206
Training Time: 38.97s

BaggingClassifier
Train Accuracy: 0.85006
Test Accuracy: 0.75982
Training Time: 19.56s

RandomForestClassifier
Train Accuracy: 0.86454
Test Accuracy: 0.74822
Training Time: 37.17s


# Feature Engineering

1. Water Distance
2. Shade Features
3. Distance Interactions
4. Binary Features
5. Elevation Interactions

## 1. Water Distance Features

Combine the horizontal/vertical distances into 2D distances:

1. Euclidean Distance
2. Manhattan Distance

In [8]:
def water_distance_features(data):
    """Add taxicab and Euclidean distance-to-hydrology features.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain 'Horizontal_Distance_To_Hydrology' and
        'Vertical_Distance_To_Hydrology'. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy with 'Hydro_Taxicab' (|horizontal| + |vertical|) and
        'Hydro_Euclid' (square root of the summed squares) appended. Both new
        columns and both source distance columns are float32 in the result.
    """
    horizontal_name = "Horizontal_Distance_To_Hydrology"
    vertical_name = "Vertical_Distance_To_Hydrology"
    out = data.copy()

    # Do the arithmetic in float64 so squaring cannot overflow a narrower dtype
    horizontal = out[horizontal_name].astype('float64')
    vertical = out[vertical_name].astype('float64')

    taxicab = horizontal.abs() + vertical.abs()
    euclid = (horizontal**2 + vertical.abs()**2)**0.5

    # Store everything as float32, including the two source columns
    out[horizontal_name] = horizontal.astype('float32')
    out[vertical_name] = vertical.astype('float32')
    out["Hydro_Taxicab"] = taxicab.astype('float32')
    out["Hydro_Euclid"] = euclid.astype('float32')

    return out

In [9]:
# Water distance features: re-score every model with water_distance_features applied.
# Order matters only for the printed log: AdaBoost, ExtraTrees, Bagging, Random Forest.
for estimator, score_log in (
    (adaboost, adaboost_scores),
    (extratrees, extratrees_scores),
    (bagging, bagging_scores),
    (randomforest, random_scores),
):
    cv_score, oof_preds, test_score = train_original(estimator, water_distance_features)

    # Row layout: label, mean CV accuracy, holdout accuracy, then one recall
    # value per class computed from the out-of-fold predictions.
    score_log.append((
        'Water_Dist', cv_score, test_score,
        *recall_score(original['Cover_Type'].iloc[:15119], oof_preds, average = None)
    ))


AdaBoostClassifier
Train Accuracy: 0.78848
Test Accuracy: 0.75383
Training Time: 3.92s

ExtraTreesClassifier
Train Accuracy: 0.8826
Test Accuracy: 0.7796
Training Time: 39.9s

BaggingClassifier
Train Accuracy: 0.85151
Test Accuracy: 0.75368
Training Time: 20.24s

RandomForestClassifier
Train Accuracy: 0.86408
Test Accuracy: 0.74689
Training Time: 37.58s


## 2. Shade Features

1. Average Hillshade
2. Hillshade Range

In [10]:
def new_shade_features(data):
    """Add the row-wise mean and range of the three hillshade columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain 'Hillshade_9am', 'Hillshade_Noon' and 'Hillshade_3pm'.
        It is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy with 'Hillshade_Avg' (mean of the three) and 'Hillshade_Range'
        (max minus min of the three) appended.
    """
    out = data.copy()
    # Select the three source columns once; the new columns are derived only from these
    shades = out[['Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm']]
    out["Hillshade_Avg"] = shades.mean(axis=1)
    out['Hillshade_Range'] = shades.max(axis=1) - shades.min(axis=1)
    return out

In [11]:
# Shade features: re-score every model with new_shade_features applied.
# Order matters only for the printed log: AdaBoost, ExtraTrees, Bagging, Random Forest.
for estimator, score_log in (
    (adaboost, adaboost_scores),
    (extratrees, extratrees_scores),
    (bagging, bagging_scores),
    (randomforest, random_scores),
):
    cv_score, oof_preds, test_score = train_original(estimator, new_shade_features)

    # Row layout: label, mean CV accuracy, holdout accuracy, then one recall
    # value per class computed from the out-of-fold predictions.
    score_log.append((
        'Shade_Features', cv_score, test_score,
        *recall_score(original['Cover_Type'].iloc[:15119], oof_preds, average = None)
    ))


AdaBoostClassifier
Train Accuracy: 0.78034
Test Accuracy: 0.74709
Training Time: 3.83s

ExtraTreesClassifier
Train Accuracy: 0.88154
Test Accuracy: 0.77561
Training Time: 38.75s

BaggingClassifier
Train Accuracy: 0.84926
Test Accuracy: 0.74928
Training Time: 20.7s

RandomForestClassifier
Train Accuracy: 0.86018
Test Accuracy: 0.74274
Training Time: 36.07s


## 3. Distance Interactions

Various features created by adding and subtracting the distance features from each other.

In [12]:
def distance_interactions(data):
    """Add pairwise sums and absolute differences of the horizontal distance columns.

    For each pair among hydrology, fire points and roadways, a '<pair>_1' column
    holds the sum and a '<pair>_2' column holds the absolute difference.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain 'Horizontal_Distance_To_Hydrology',
        'Horizontal_Distance_To_Fire_Points' and
        'Horizontal_Distance_To_Roadways'. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy with the six interaction columns appended.
    """
    out = data.copy()
    hydro = out['Horizontal_Distance_To_Hydrology']
    fire = out['Horizontal_Distance_To_Fire_Points']
    road = out['Horizontal_Distance_To_Roadways']

    # Hydro_Fire_1 is the plain sum; the other two sums take an absolute value.
    # The two forms differ only when a sum is negative.
    out['Hydro_Fire_1'] = hydro + fire
    out['Hydro_Fire_2'] = (hydro - fire).abs()
    out['Hydro_Road_1'] = (hydro + road).abs()
    out['Hydro_Road_2'] = (hydro - road).abs()
    out['Fire_Road_1'] = (fire + road).abs()
    out['Fire_Road_2'] = (fire - road).abs()
    return out

In [13]:
# Distance interactions: re-score every model with distance_interactions applied.
# Order matters only for the printed log: AdaBoost, ExtraTrees, Bagging, Random Forest.
for estimator, score_log in (
    (adaboost, adaboost_scores),
    (extratrees, extratrees_scores),
    (bagging, bagging_scores),
    (randomforest, random_scores),
):
    cv_score, oof_preds, test_score = train_original(estimator, distance_interactions)

    # Row layout: label, mean CV accuracy, holdout accuracy, then one recall
    # value per class computed from the out-of-fold predictions.
    score_log.append((
        'Dist_Interactions', cv_score, test_score,
        *recall_score(original['Cover_Type'].iloc[:15119], oof_preds, average = None)
    ))


AdaBoostClassifier
Train Accuracy: 0.80739
Test Accuracy: 0.7873
Training Time: 3.6s

ExtraTreesClassifier
Train Accuracy: 0.89933
Test Accuracy: 0.80702
Training Time: 38.28s

BaggingClassifier
Train Accuracy: 0.87003
Test Accuracy: 0.78387
Training Time: 21.93s

RandomForestClassifier
Train Accuracy: 0.88663
Test Accuracy: 0.7814
Training Time: 40.5s


## 4. Binary Features

In [14]:
def binary_features(data):
    """Add two 0/1 indicator columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain 'Vertical_Distance_To_Hydrology' and 'Hillshade_3pm'.
        It is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy with 'Highwater' (1 where Vertical_Distance_To_Hydrology is
        negative) and 'Hillshade_3pm_is_zero' (1 where Hillshade_3pm equals 0)
        appended.
    """
    out = data.copy()
    negative_vertical = out['Vertical_Distance_To_Hydrology'].lt(0)
    zero_shade = out['Hillshade_3pm'].eq(0)
    # Cast the boolean masks to integers
    out['Highwater'] = negative_vertical.astype(int)
    out['Hillshade_3pm_is_zero'] = zero_shade.astype(int)
    return out

In [15]:
# Binary features: re-score every model with binary_features applied.
# Order matters only for the printed log: AdaBoost, ExtraTrees, Bagging, Random Forest.
for estimator, score_log in (
    (adaboost, adaboost_scores),
    (extratrees, extratrees_scores),
    (bagging, bagging_scores),
    (randomforest, random_scores),
):
    cv_score, oof_preds, test_score = train_original(estimator, binary_features)

    # Row layout: label, mean CV accuracy, holdout accuracy, then one recall
    # value per class computed from the out-of-fold predictions.
    score_log.append((
        'Binary', cv_score, test_score,
        *recall_score(original['Cover_Type'].iloc[:15119], oof_preds, average = None)
    ))


AdaBoostClassifier
Train Accuracy: 0.78286
Test Accuracy: 0.75505
Training Time: 3.53s

ExtraTreesClassifier
Train Accuracy: 0.88372
Test Accuracy: 0.78193
Training Time: 35.45s

BaggingClassifier
Train Accuracy: 0.85336
Test Accuracy: 0.75683
Training Time: 21.54s

RandomForestClassifier
Train Accuracy: 0.86659
Test Accuracy: 0.75107
Training Time: 35.96s


## 5. Elevation Interactions

Interaction features created by combining various numerical features.

In [16]:
def elevation_interactions(data):
    """Add features that combine Elevation with the distance columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain 'Elevation', 'Horizontal_Distance_To_Roadways',
        'Vertical_Distance_To_Hydrology' and 'Horizontal_Distance_To_Hydrology'.
        It is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy with four columns appended:
        'EHiElv' = Horizontal_Distance_To_Roadways * Elevation (float64),
        'EViElv' = Vertical_Distance_To_Hydrology * Elevation (float64),
        'EVDtH'  = Elevation - Vertical_Distance_To_Hydrology,
        'EHDtH'  = Elevation - 0.2 * Horizontal_Distance_To_Hydrology.
    """
    df = data.copy()

    # Multiply in float64: if the columns are stored in a narrow integer dtype
    # (e.g. int16), the elementwise product wraps around silently instead of
    # raising, which would corrupt the two product features.
    elevation = df['Elevation'].astype('float64')
    df['EHiElv'] = df['Horizontal_Distance_To_Roadways'].astype('float64') * elevation
    df['EViElv'] = df['Vertical_Distance_To_Hydrology'].astype('float64') * elevation

    # The differences are left in the columns' own dtypes, as before
    df['EVDtH'] = df.Elevation - df.Vertical_Distance_To_Hydrology
    df['EHDtH'] = df.Elevation - df.Horizontal_Distance_To_Hydrology * 0.2
    return df

In [17]:
# Elevation interactions: re-score every model with elevation_interactions applied.
# Order matters only for the printed log: AdaBoost, ExtraTrees, Bagging, Random Forest.
for estimator, score_log in (
    (adaboost, adaboost_scores),
    (extratrees, extratrees_scores),
    (bagging, bagging_scores),
    (randomforest, random_scores),
):
    cv_score, oof_preds, test_score = train_original(estimator, elevation_interactions)

    # Row layout: label, mean CV accuracy, holdout accuracy, then one recall
    # value per class computed from the out-of-fold predictions.
    score_log.append((
        'Elev_Interactions', cv_score, test_score,
        *recall_score(original['Cover_Type'].iloc[:15119], oof_preds, average = None)
    ))


AdaBoostClassifier
Train Accuracy: 0.78438
Test Accuracy: 0.75217
Training Time: 3.66s

ExtraTreesClassifier
Train Accuracy: 0.88617
Test Accuracy: 0.7785
Training Time: 36.41s

BaggingClassifier
Train Accuracy: 0.84827
Test Accuracy: 0.74915
Training Time: 21.89s

RandomForestClassifier
Train Accuracy: 0.86117
Test Accuracy: 0.73893
Training Time: 41.65s


## Summary 

The distance interaction features are the only ones that seem particularly promising.

In [22]:
# AdaBoost: one row per experiment, ordered from lowest to highest CV score
pd.DataFrame.from_records(
    adaboost_scores,
    columns = ['model', 'cv_score', 'holdout', *(f'recall_{label}' for label in range(7))],
).sort_values(by = 'cv_score')

Unnamed: 0,model,cv_score,holdout,recall_0,recall_1,recall_2,recall_3,recall_4,recall_5,recall_6
2,Shade_Features,0.780344,0.747088,0.641204,0.625,0.735526,0.911111,0.880093,0.750463,0.918981
4,Binary,0.782856,0.755046,0.666204,0.615741,0.71283,0.915278,0.887963,0.758796,0.923148
5,Elev_Interactions,0.784377,0.752169,0.648611,0.618519,0.730431,0.918981,0.884259,0.759722,0.930093
0,Baseline,0.785965,0.762291,0.674537,0.6375,0.72302,0.918519,0.878704,0.74537,0.924074
1,Water_Dist,0.788479,0.753833,0.665741,0.601852,0.747568,0.92037,0.876852,0.77037,0.936574
3,Dist_Interactions,0.807395,0.787304,0.686111,0.65,0.769338,0.916667,0.90463,0.776852,0.948148


In [23]:
# ExtraTrees: one row per experiment, ordered from lowest to highest CV score
pd.DataFrame.from_records(
    extratrees_scores,
    columns = ['model', 'cv_score', 'holdout', *(f'recall_{label}' for label in range(7))],
).sort_values(by = 'cv_score')

Unnamed: 0,model,cv_score,holdout,recall_0,recall_1,recall_2,recall_3,recall_4,recall_5,recall_6
2,Shade_Features,0.88154,0.775611,0.783333,0.725,0.855489,0.972222,0.958333,0.901389,0.975
1,Water_Dist,0.882599,0.779601,0.786574,0.730556,0.853173,0.971759,0.960648,0.901852,0.973611
4,Binary,0.883723,0.781931,0.78287,0.740278,0.855025,0.973611,0.959722,0.901852,0.972685
0,Baseline,0.885377,0.78206,0.7875,0.7375,0.86151,0.971759,0.961111,0.905556,0.972685
5,Elev_Interactions,0.88617,0.778504,0.783333,0.736574,0.857805,0.975463,0.962037,0.910648,0.977315
3,Dist_Interactions,0.899333,0.80702,0.806481,0.768519,0.877258,0.977315,0.96713,0.921296,0.977315


In [24]:
# Bagging: one row per experiment, ordered from lowest to highest CV score
pd.DataFrame.from_records(
    bagging_scores,
    columns = ['model', 'cv_score', 'holdout', *(f'recall_{label}' for label in range(7))],
).sort_values(by = 'cv_score')

Unnamed: 0,model,cv_score,holdout,recall_0,recall_1,recall_2,recall_3,recall_4,recall_5,recall_6
5,Elev_Interactions,0.848271,0.74915,0.768056,0.639352,0.833256,0.967593,0.941667,0.82963,0.958333
2,Shade_Features,0.849263,0.749276,0.781944,0.650463,0.826772,0.964352,0.940278,0.82963,0.951389
0,Baseline,0.850057,0.75982,0.775,0.672222,0.819824,0.958333,0.939815,0.830556,0.95463
1,Water_Dist,0.851512,0.753678,0.768519,0.663426,0.834646,0.959259,0.947685,0.831019,0.956019
4,Binary,0.853365,0.756829,0.769907,0.664815,0.830477,0.96713,0.943981,0.839815,0.957407
3,Dist_Interactions,0.870032,0.783867,0.799537,0.688889,0.86012,0.968056,0.952315,0.853704,0.967593


In [25]:
# Random Forest: one row per experiment, ordered from lowest to highest CV score
pd.DataFrame.from_records(
    random_scores,
    columns = ['model', 'cv_score', 'holdout', *(f'recall_{label}' for label in range(7))],
).sort_values(by = 'cv_score')

Unnamed: 0,model,cv_score,holdout,recall_0,recall_1,recall_2,recall_3,recall_4,recall_5,recall_6
2,Shade_Features,0.860177,0.742739,0.759722,0.685648,0.825845,0.968519,0.953241,0.865278,0.962963
5,Elev_Interactions,0.861168,0.738926,0.756019,0.67037,0.837425,0.970833,0.956944,0.86713,0.969444
1,Water_Dist,0.864079,0.746887,0.771296,0.693981,0.825382,0.968056,0.954167,0.868056,0.967593
0,Baseline,0.864542,0.748216,0.765741,0.69537,0.824456,0.970833,0.955556,0.871759,0.968056
4,Binary,0.866592,0.751071,0.774537,0.702778,0.824919,0.971296,0.95463,0.875,0.962963
3,Dist_Interactions,0.886633,0.781395,0.802315,0.725463,0.856878,0.975463,0.964815,0.903241,0.978241
