# Notebook 3 - Feature Engineering

In this notebook we consider various feature engineering techniques. Some of these were suggested in the forums for TPS 12 as well as in the original forest cover type competition.

In [1]:
# Global variables for testing changes to this notebook quickly
RANDOM_SEED = 0  # passed as random_state to the CV splitter and to every model defined below
NUM_FOLDS = 12  # number of StratifiedKFold splits used by train_original

In [2]:
import numpy as np
import pandas as pd
import time
import pyarrow
import gc

# Model evaluation
from functools import partial
from sklearn.base import clone
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import accuracy_score, recall_score
from sklearn.inspection import partial_dependence, permutation_importance
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier, RandomForestClassifier

# Plotting
import matplotlib
import seaborn as sns
from matplotlib import pyplot as plt

# Hide warnings
import warnings
warnings.filterwarnings('ignore')

# Load Data

In [3]:
%%time

# Read the original dataset from its feather file
original = pd.read_feather('../data/original.feather')

# Replace the raw Cover_Type labels with the integer codes produced by LabelEncoder
old_encoder = LabelEncoder()
original["Cover_Type"] = old_encoder.fit_transform(original["Cover_Type"])

# Encoded labels: the first 15119 rows form the training portion, the remainder the holdout
y_train, y_test = original['Cover_Type'].iloc[:15119], original['Cover_Type'].iloc[15119:]

# Model inputs are every column other than 'Id' and the 'Cover_Type' target
features = [col for col in original.columns if col not in ('Id', 'Cover_Type')]

# Per-model accumulators for the summary tables (one tuple is appended per experiment)
bagging_scores, extratrees_scores, adaboost_scores, random_scores = [], [], [], []

Wall time: 52 ms


# Scoring Function

In [4]:
def train_original(sklearn_model, processing = None):
    """Cross-validate a model on the training rows and score it on the holdout rows.

    Relies on the notebook-level ``original``, ``features``, ``NUM_FOLDS`` and
    ``RANDOM_SEED``.

    Parameters
    ----------
    sklearn_model : estimator
        Unfitted scikit-learn classifier; a fresh clone is fitted for every fold.
    processing : callable, optional
        Feature-engineering function taking a DataFrame and returning a DataFrame.
        It is applied separately to the training rows and to the holdout rows.

    Returns
    -------
    tuple
        ``(mean fold accuracy, out-of-fold predictions, holdout accuracy)``.
        The holdout prediction is the argmax of the class probabilities summed
        over all fold models.
    """
    # Original Training/Test Split: first 15119 rows train, the remainder holdout
    X_temp = original[features].iloc[:15119]
    X_test = original[features].iloc[15119:]
    y_temp = original['Cover_Type'].iloc[:15119]
    y_test = original['Cover_Type'].iloc[15119:]

    # Feature Engineering
    if processing:
        X_temp = processing(X_temp)
        X_test = processing(X_test)

    # Size the probability accumulator from the labels instead of hard-coding 7 classes.
    # Assumes every training fold contains all classes, so predict_proba has one
    # column per class.
    n_classes = y_temp.nunique()
    test_preds = np.zeros((X_test.shape[0], n_classes))
    oof_preds = np.zeros((X_temp.shape[0],))
    scores, times = np.zeros(NUM_FOLDS), np.zeros(NUM_FOLDS)

    # Stratified k-fold cross-validation
    skf = StratifiedKFold(n_splits = NUM_FOLDS, shuffle = True, random_state = RANDOM_SEED)
    for fold, (train_idx, valid_idx) in enumerate(skf.split(X_temp, y_temp)):

        # Training and Validation Sets
        X_train, X_valid = X_temp.iloc[train_idx], X_temp.iloc[valid_idx]
        y_train, y_valid = y_temp.iloc[train_idx], y_temp.iloc[valid_idx]

        # Create model; the timed span covers fitting plus both prediction passes
        start = time.time()
        model = clone(sklearn_model)
        model.fit(X_train, y_train)

        # validation and test predictions
        valid_preds = np.ravel(model.predict(X_valid))
        oof_preds[valid_idx] = valid_preds
        test_preds += model.predict_proba(X_test)

        # Save scores and times
        scores[fold] = accuracy_score(y_valid, valid_preds)
        end = time.time()
        times[fold] = end - start
        # Pause between folds; happens after `end`, so it is not part of the recorded time
        time.sleep(0.5)

    # The column index is used directly as the predicted (encoded) label
    test_preds = np.argmax(test_preds, axis = 1)
    test_score = accuracy_score(y_test, test_preds)

    # Take the class name from the argument rather than from the loop variable
    print('\n' + sklearn_model.__class__.__name__)
    print("Train Accuracy:", round(scores.mean(), 5))
    print('Test Accuracy:', round(test_score, 5))
    print(f'Training Time: {round(times.sum(), 2)}s')

    return scores.mean(), oof_preds, test_score

# Models

We use the following 4 models from the scikit-learn library:

1. AdaBoost 
2. ExtraTrees
3. Bagging
4. Random Forest

In [5]:
# NOTE: the wrapped tree is passed positionally to AdaBoost and Bagging. The keyword
# for that first argument was renamed from `base_estimator` to `estimator` in
# scikit-learn 1.2 (the old name was removed in 1.4); the positional form works on both.

# AdaBoost Classifier
adaboost = AdaBoostClassifier(
    DecisionTreeClassifier(
        splitter = 'random',
        random_state = RANDOM_SEED,
    ),
    random_state = RANDOM_SEED,
)

# ExtraTrees Classifier
extratrees = ExtraTreesClassifier(
    n_jobs = -1,
    random_state = RANDOM_SEED,
    max_features = None,
)

# Bagging Classifier
bagging = BaggingClassifier(
    DecisionTreeClassifier(
        splitter = 'random',
        random_state = RANDOM_SEED,
    ),
    n_jobs = -1,
    random_state = RANDOM_SEED
)

# Random Forest Classifier
randomforest = RandomForestClassifier(
    n_jobs = -1,
    random_state = RANDOM_SEED,
)

# Baselines

In [6]:
# Run every model on the untouched features, in the order
# AdaBoost -> ExtraTrees -> Bagging -> Random Forest, and log one row per model:
# (label, mean CV accuracy, holdout accuracy, per-class recall of the out-of-fold predictions)
for estimator, score_log in (
    (adaboost, adaboost_scores),
    (extratrees, extratrees_scores),
    (bagging, bagging_scores),
    (randomforest, random_scores),
):
    cv_score, oof_preds, test_score = train_original(estimator)
    score_log.append((
        'Baseline', cv_score, test_score,
        *recall_score(y_train, oof_preds, average = None)
    ))


AdaBoostClassifier
Train Accuracy: 0.80356
Test Accuracy: 0.75373
Training Time: 4.02s

ExtraTreesClassifier
Train Accuracy: 0.88491
Test Accuracy: 0.77808
Training Time: 35.99s

BaggingClassifier
Train Accuracy: 0.85581
Test Accuracy: 0.75372
Training Time: 20.71s

RandomForestClassifier
Train Accuracy: 0.86395
Test Accuracy: 0.74895
Training Time: 34.48s


# Feature Engineering

1. Water Distance
2. Shade Features
3. Distance Interactions
4. Binary Features
5. Elevation Interactions

## 1. Water Distance Features

Combine the horizontal/vertical distances to hydrology into 2D distances:

1. Euclidean Distance
2. Manhattan Distance

In [7]:
def water_distance_features(data):
    """Return a copy of ``data`` with two combined distance-to-hydrology features.

    Adds ``Hydro_Taxicab`` (|horizontal| + |vertical|) and ``Hydro_Euclid``
    (square root of the sum of squares). The input frame is not modified. In the
    returned frame both new columns and both source columns are float32.
    """
    horiz_col = "Horizontal_Distance_To_Hydrology"
    vert_col = "Vertical_Distance_To_Hydrology"
    out = data.copy()

    # Do the arithmetic in float64 before storing anything
    horiz = out[horiz_col].astype('float64')
    vert = out[vert_col].astype('float64')

    taxicab = horiz.abs() + vert.abs()
    euclid = (horiz**2 + vert.abs()**2)**0.5

    # Store everything as float32, including the two source columns
    out[horiz_col] = horiz.astype('float32')
    out[vert_col] = vert.astype('float32')
    out["Hydro_Taxicab"] = taxicab.astype('float32')
    out["Hydro_Euclid"] = euclid.astype('float32')

    return out

In [8]:
# Evaluate the water-distance features with every model, in the order
# AdaBoost -> Extra Trees -> Bagging -> Random Forest, logging one row per model
for estimator, score_log in (
    (adaboost, adaboost_scores),
    (extratrees, extratrees_scores),
    (bagging, bagging_scores),
    (randomforest, random_scores),
):
    cv_score, oof_preds, test_score = train_original(estimator, water_distance_features)
    score_log.append((
        'Water_Dist', cv_score, test_score,
        *recall_score(original['Cover_Type'].iloc[:15119], oof_preds, average = None)
    ))


AdaBoostClassifier
Train Accuracy: 0.79714
Test Accuracy: 0.75943
Training Time: 4.01s

ExtraTreesClassifier
Train Accuracy: 0.88445
Test Accuracy: 0.7734
Training Time: 36.1s

BaggingClassifier
Train Accuracy: 0.84999
Test Accuracy: 0.7522
Training Time: 18.06s

RandomForestClassifier
Train Accuracy: 0.8615
Test Accuracy: 0.74562
Training Time: 34.9s


## 2. Shade Features

1. Average Hillshade
2. Hillshade Range

In [9]:
def new_shade_features(data):
    """Return a copy of ``data`` with row-wise hillshade summaries added.

    ``Hillshade_Avg`` is the mean and ``Hillshade_Range`` the max minus the min
    of the 9am, Noon and 3pm hillshade columns. The input frame is not modified.
    """
    out = data.copy()
    shade = out[['Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm']]
    out["Hillshade_Avg"] = shade.mean(axis=1)
    out['Hillshade_Range'] = shade.max(axis=1) - shade.min(axis=1)
    return out

In [10]:
# Evaluate the hillshade summary features with every model, in the order
# AdaBoost -> Extra Trees -> Bagging -> Random Forest, logging one row per model
for estimator, score_log in (
    (adaboost, adaboost_scores),
    (extratrees, extratrees_scores),
    (bagging, bagging_scores),
    (randomforest, random_scores),
):
    cv_score, oof_preds, test_score = train_original(estimator, new_shade_features)
    score_log.append((
        'Shade_Features', cv_score, test_score,
        *recall_score(original['Cover_Type'].iloc[:15119], oof_preds, average = None)
    ))


AdaBoostClassifier
Train Accuracy: 0.7937
Test Accuracy: 0.75256
Training Time: 5.06s

ExtraTreesClassifier
Train Accuracy: 0.883
Test Accuracy: 0.77191
Training Time: 36.86s

BaggingClassifier
Train Accuracy: 0.85297
Test Accuracy: 0.7513
Training Time: 27.04s

RandomForestClassifier
Train Accuracy: 0.85707
Test Accuracy: 0.73824
Training Time: 35.89s


## 3. Distance Interactions

Various features created by adding and subtracting the distance features from each other.

In [11]:
def distance_interactions(data):
    """Return a copy of ``data`` with pairwise sums and differences of the
    horizontal distances to hydrology, fire points and roadways.

    For each pair, ``*_1`` is built from the sum and ``*_2`` from the difference.
    All of them take the absolute value except ``Hydro_Fire_1``, which is the
    plain sum. The input frame is not modified.
    """
    out = data.copy()
    hydro = out['Horizontal_Distance_To_Hydrology']
    fire = out['Horizontal_Distance_To_Fire_Points']
    road = out['Horizontal_Distance_To_Roadways']

    out['Hydro_Fire_1'] = hydro + fire  # no abs() here, unlike the other sums
    out['Hydro_Fire_2'] = (hydro - fire).abs()
    out['Hydro_Road_1'] = (hydro + road).abs()
    out['Hydro_Road_2'] = (hydro - road).abs()
    out['Fire_Road_1'] = (fire + road).abs()
    out['Fire_Road_2'] = (fire - road).abs()
    return out

In [12]:
# Evaluate the distance interaction features with every model, in the order
# AdaBoost -> Extra Trees -> Bagging -> Random Forest, logging one row per model
for estimator, score_log in (
    (adaboost, adaboost_scores),
    (extratrees, extratrees_scores),
    (bagging, bagging_scores),
    (randomforest, random_scores),
):
    cv_score, oof_preds, test_score = train_original(estimator, distance_interactions)
    score_log.append((
        'Dist_Interactions', cv_score, test_score,
        *recall_score(original['Cover_Type'].iloc[:15119], oof_preds, average = None)
    ))


AdaBoostClassifier
Train Accuracy: 0.81692
Test Accuracy: 0.77843
Training Time: 4.03s

ExtraTreesClassifier
Train Accuracy: 0.90066
Test Accuracy: 0.80163
Training Time: 37.23s

BaggingClassifier
Train Accuracy: 0.87307
Test Accuracy: 0.78025
Training Time: 21.09s

RandomForestClassifier
Train Accuracy: 0.8908
Test Accuracy: 0.78466
Training Time: 34.94s


## 4. Binary Features

In [13]:
def binary_features(data):
    """Return a copy of ``data`` with two 0/1 indicator columns added.

    ``Highwater`` is 1 where ``Vertical_Distance_To_Hydrology`` is negative and
    ``Hillshade_3pm_is_zero`` is 1 where ``Hillshade_3pm`` equals 0. The input
    frame is not modified.
    """
    out = data.copy()
    out['Highwater'] = out['Vertical_Distance_To_Hydrology'].lt(0).astype(int)
    out['Hillshade_3pm_is_zero'] = out['Hillshade_3pm'].eq(0).astype(int)
    return out

In [14]:
# Evaluate the binary indicator features with every model, in the order
# AdaBoost -> Extra Trees -> Bagging -> Random Forest, logging one row per model
for estimator, score_log in (
    (adaboost, adaboost_scores),
    (extratrees, extratrees_scores),
    (bagging, bagging_scores),
    (randomforest, random_scores),
):
    cv_score, oof_preds, test_score = train_original(estimator, binary_features)
    score_log.append((
        'Binary', cv_score, test_score,
        *recall_score(original['Cover_Type'].iloc[:15119], oof_preds, average = None)
    ))


AdaBoostClassifier
Train Accuracy: 0.79979
Test Accuracy: 0.75834
Training Time: 4.13s

ExtraTreesClassifier
Train Accuracy: 0.88432
Test Accuracy: 0.77729
Training Time: 36.02s

BaggingClassifier
Train Accuracy: 0.85548
Test Accuracy: 0.75269
Training Time: 22.25s

RandomForestClassifier
Train Accuracy: 0.86269
Test Accuracy: 0.7494
Training Time: 35.16s


## 5. Elevation Interactions

Interaction features created by combining various numerical features.

In [15]:
def elevation_interactions(data):
    """Return a copy of ``data`` with four features combining Elevation with distances.

    - ``EHiElv``: Horizontal_Distance_To_Roadways * Elevation
    - ``EViElv``: Vertical_Distance_To_Hydrology * Elevation
    - ``EVDtH``:  Elevation - Vertical_Distance_To_Hydrology
    - ``EHDtH``:  Elevation - 0.2 * Horizontal_Distance_To_Hydrology

    The input frame is not modified.
    """
    out = data.copy()
    elevation = out['Elevation']
    vert_hydro = out['Vertical_Distance_To_Hydrology']

    out['EHiElv'] = out['Horizontal_Distance_To_Roadways'] * elevation
    out['EViElv'] = vert_hydro * elevation
    out['EVDtH'] = elevation - vert_hydro
    out['EHDtH'] = elevation - out['Horizontal_Distance_To_Hydrology'] * 0.2
    return out

In [16]:
# Evaluate the elevation interaction features with every model, in the order
# AdaBoost -> Extra Trees -> Bagging -> Random Forest, logging one row per model
for estimator, score_log in (
    (adaboost, adaboost_scores),
    (extratrees, extratrees_scores),
    (bagging, bagging_scores),
    (randomforest, random_scores),
):
    cv_score, oof_preds, test_score = train_original(estimator, elevation_interactions)
    score_log.append((
        'Elev_Interactions', cv_score, test_score,
        *recall_score(original['Cover_Type'].iloc[:15119], oof_preds, average = None)
    ))


AdaBoostClassifier
Train Accuracy: 0.79979
Test Accuracy: 0.75642
Training Time: 5.23s

ExtraTreesClassifier
Train Accuracy: 0.88538
Test Accuracy: 0.77656
Training Time: 37.43s

BaggingClassifier
Train Accuracy: 0.85469
Test Accuracy: 0.75069
Training Time: 26.85s

RandomForestClassifier
Train Accuracy: 0.86388
Test Accuracy: 0.73719
Training Time: 35.68s


## Summary 

In [17]:
# AdaBoost: one row per logged experiment, sorted by ascending holdout accuracy
pd.DataFrame.from_records(
    adaboost_scores,
    columns = ['model', 'cv_score', 'holdout'] + [f'recall_{k}' for k in range(7)],
).sort_values(by = 'holdout')

Unnamed: 0,model,cv_score,holdout,recall_0,recall_1,recall_2,recall_3,recall_4,recall_5,recall_6
2,Shade_Features,0.793703,0.752559,0.668519,0.63287,0.728578,0.925,0.89213,0.780093,0.928704
0,Baseline,0.803559,0.753727,0.681944,0.650463,0.745716,0.926389,0.893519,0.791204,0.935648
5,Elev_Interactions,0.799787,0.75642,0.665741,0.630556,0.762853,0.930556,0.8875,0.784259,0.937037
4,Binary,0.799788,0.758339,0.671296,0.628704,0.752663,0.933796,0.893056,0.788889,0.930093
1,Water_Dist,0.797143,0.759428,0.67037,0.637963,0.746642,0.936574,0.889815,0.768519,0.930093
3,Dist_Interactions,0.81692,0.77843,0.687037,0.666667,0.771654,0.933333,0.9125,0.808796,0.938426


In [18]:
# ExtraTrees: one row per logged experiment, sorted by ascending holdout accuracy
pd.DataFrame.from_records(
    extratrees_scores,
    columns = ['model', 'cv_score', 'holdout'] + [f'recall_{k}' for k in range(7)],
).sort_values(by = 'holdout')

Unnamed: 0,model,cv_score,holdout,recall_0,recall_1,recall_2,recall_3,recall_4,recall_5,recall_6
2,Shade_Features,0.882996,0.771911,0.781481,0.733333,0.859657,0.972222,0.958333,0.90463,0.971296
1,Water_Dist,0.884451,0.773404,0.77963,0.738889,0.860584,0.973148,0.960185,0.907407,0.971296
5,Elev_Interactions,0.885377,0.776562,0.780556,0.736574,0.859194,0.972222,0.960648,0.914352,0.974074
4,Binary,0.884318,0.777292,0.778241,0.739815,0.861973,0.970833,0.959722,0.909259,0.97037
0,Baseline,0.884914,0.778078,0.786574,0.734259,0.866142,0.971759,0.961111,0.903704,0.970833
3,Dist_Interactions,0.900655,0.801627,0.803704,0.769444,0.887448,0.975463,0.966667,0.922685,0.979167


In [19]:
# Bagging: one row per logged experiment, sorted by ascending holdout accuracy
pd.DataFrame.from_records(
    bagging_scores,
    columns = ['model', 'cv_score', 'holdout'] + [f'recall_{k}' for k in range(7)],
).sort_values(by = 'holdout')

Unnamed: 0,model,cv_score,holdout,recall_0,recall_1,recall_2,recall_3,recall_4,recall_5,recall_6
5,Elev_Interactions,0.854686,0.750693,0.767593,0.661111,0.84113,0.962037,0.944444,0.846759,0.959722
2,Shade_Features,0.852967,0.751299,0.771759,0.651389,0.836035,0.961111,0.941204,0.853704,0.955556
1,Water_Dist,0.849991,0.752204,0.766204,0.650463,0.827235,0.961574,0.940278,0.846759,0.957407
4,Binary,0.855481,0.752688,0.774074,0.664815,0.837888,0.960185,0.943519,0.848611,0.959259
0,Baseline,0.855812,0.753724,0.769907,0.670833,0.837888,0.964352,0.934259,0.85,0.963426
3,Dist_Interactions,0.873074,0.780252,0.791667,0.697685,0.855952,0.965741,0.956944,0.876389,0.96713


In [20]:
# Random Forest: one row per logged experiment, sorted by ascending holdout accuracy
pd.DataFrame.from_records(
    random_scores,
    columns = ['model', 'cv_score', 'holdout'] + [f'recall_{k}' for k in range(7)],
).sort_values(by = 'holdout')

Unnamed: 0,model,cv_score,holdout,recall_0,recall_1,recall_2,recall_3,recall_4,recall_5,recall_6
5,Elev_Interactions,0.86388,0.737191,0.751852,0.685185,0.83233,0.973148,0.957407,0.875,0.972222
2,Shade_Features,0.857068,0.738237,0.761574,0.691667,0.812413,0.969444,0.945833,0.853241,0.965278
1,Water_Dist,0.861499,0.745623,0.76713,0.693056,0.809171,0.969444,0.951852,0.872222,0.967593
0,Baseline,0.863947,0.748949,0.767593,0.701852,0.819824,0.971759,0.950926,0.869444,0.966204
4,Binary,0.86269,0.7494,0.766667,0.696759,0.816119,0.971296,0.951852,0.869907,0.966204
3,Dist_Interactions,0.8908,0.784662,0.8,0.739352,0.863826,0.980093,0.965741,0.905556,0.981019
