# TPS 12/21 - Feature Engineering

In this notebook we test out some feature engineering techniques using XGBoost with default settings to see if we get any improvement over the baseline.

In [1]:
# Notebook-wide knobs; lower NUM_FOLDS / TRAIN_SIZE for a quick trial run
RANDOM_SEED = 0        # seed for the data split, CV folds, model and permutation importance
NUM_FOLDS = 3          # number of stratified cross-validation folds
TRAIN_SIZE = 500_000   # rows kept for training; the rest becomes the holdout set

In [2]:
import numpy as np
import pandas as pd
import time
import os
import pyarrow
import gc

# Model/Evaluation
from sklearn.base import clone
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import accuracy_score, recall_score
from sklearn.inspection import permutation_importance
from xgboost import XGBClassifier

# Hide warnings
import warnings
warnings.filterwarnings('ignore')

# Load and Prepare Data

In [3]:
# Read the complete training set from disk
train = pd.read_feather('../data/train.feather')

# Remove the Id column and the two low/no-variance soil indicators,
# then discard every row whose label is cover type 5
train = train.drop(columns=["Soil_Type7", "Id", "Soil_Type15"])
train = train.loc[train["Cover_Type"] != 5]

# Re-encode the remaining labels with a LabelEncoder
new_encoder = LabelEncoder()
train["Cover_Type"] = new_encoder.fit_transform(train["Cover_Type"])

# Stratified split: TRAIN_SIZE rows for training, everything else is the holdout
train, test = train_test_split(
    train,
    train_size=TRAIN_SIZE,
    random_state=RANDOM_SEED,
    stratify=train['Cover_Type'],
)
y_train = train['Cover_Type']

# Feature name lists and the accumulator for the summary table rows
features = [col for col in train.columns if col not in ('Id', 'Cover_Type')]
nonsoil = [col for col in features if not col.startswith('Soil_Type')]
new_rows = []
gc.collect()

print(f'Training Size: {train.shape[0]} rows, {train.shape[1]} cols')
print(f'Holdout Size: {test.shape[0]} rows, {test.shape[1]} cols\n')

Training Size: 500000 rows, 53 cols
Holdout Size: 3499999 rows, 53 cols



# Model and Scoring Function

In [4]:
# XGBoost Classifier
# Single-step pipeline wrapping an XGBClassifier. Only the booster, tree
# method, evaluation metric and seed are set explicitly; every other
# parameter is left at the library default.
# score_features() fits a clone of this object on each fold, so within the
# cells shown here this instance serves purely as an unfitted template.
xgb_pipeline = make_pipeline(
    XGBClassifier(
        booster = 'gbtree',
        tree_method = 'hist',
        eval_metric = 'mlogloss',
        random_state = RANDOM_SEED,
    ),
)

In [5]:
def score_features(sklearn_model, processing = None):
    """Cross-validate a model on the training split and score it on the holdout.

    Reads the notebook-level ``train`` and ``test`` frames, optionally applies
    ``processing`` to both feature sets, and fits a fresh clone of
    ``sklearn_model`` on each of ``NUM_FOLDS`` stratified folds.

    Parameters
    ----------
    sklearn_model : estimator
        Unfitted scikit-learn compatible classifier that provides ``predict``
        and ``predict_proba``. It is cloned per fold and never fitted itself.
    processing : callable, optional
        Function that takes a feature DataFrame and returns the engineered
        DataFrame. It is applied separately to the training and the holdout
        features.

    Returns
    -------
    tuple
        ``(mean_cv_accuracy, oof_preds, holdout_accuracy, fi_scores)`` where
        ``oof_preds`` holds the out-of-fold predicted labels for every
        training row and ``fi_scores`` is a Series of fold-averaged
        permutation importances for the non-soil features, sorted ascending.
    """

    # Original Training/Test Split
    feature_cols = [col for col in train.columns if col not in ['Id','Cover_Type']]
    X_temp, X_test = train[feature_cols], test[feature_cols]
    y_temp, y_test = train['Cover_Type'], test['Cover_Type']

    # Feature Engineering
    if processing is not None:
        X_temp = processing(X_temp)
        X_test = processing(X_test)
    feature_cols = list(X_temp.columns)

    # One probability column per class present in the training labels
    # (previously hard-coded to 6)
    n_classes = y_temp.nunique()

    # Accumulators: summed holdout probabilities, out-of-fold labels,
    # fold-averaged importances, per-fold accuracy and wall time
    test_preds = np.zeros((X_test.shape[0], n_classes))
    oof_preds = np.zeros((X_temp.shape[0],))
    fi_scores = np.zeros((X_temp.shape[1],))
    scores, times = np.zeros(NUM_FOLDS), np.zeros(NUM_FOLDS)

    # Stratified k-fold cross-validation
    skf = StratifiedKFold(n_splits = NUM_FOLDS, shuffle = True, random_state = RANDOM_SEED)
    for fold, (train_idx, valid_idx) in enumerate(skf.split(X_temp, y_temp)):

        # Training and validation sets for this fold. Named *_fit so they do
        # not shadow the notebook-level ``y_train``.
        X_fit, X_valid = X_temp.iloc[train_idx], X_temp.iloc[valid_idx]
        y_fit, y_valid = y_temp.iloc[train_idx], y_temp.iloc[valid_idx]

        # NOTE: the timer spans fitting, permutation importance and both
        # prediction passes, so the reported time is more than pure fit time.
        start = time.time()
        model = clone(sklearn_model)
        model.fit(X_fit, y_fit)

        # Permutation importance on the validation fold, averaged over folds
        result = permutation_importance(
            model, X_valid, y_valid,
            n_repeats=10, random_state=RANDOM_SEED
        )
        fi_scores += result.importances_mean / NUM_FOLDS

        # Validation labels go into the out-of-fold vector; holdout
        # probabilities are summed across folds (argmax is taken afterwards)
        valid_preds = np.ravel(model.predict(X_valid))
        oof_preds[valid_idx] = valid_preds
        test_preds += model.predict_proba(X_test)

        # Save scores and times
        scores[fold] = accuracy_score(y_valid, valid_preds)
        end = time.time()
        times[fold] = end-start
        print(f'Fold {fold} Accuracy:  {round(scores[fold], 5)} in {round(end-start,2)}s.')
        # NOTE(review): purpose of this pause is not evident from the code;
        # presumably it lets the printed line flush - confirm before removing.
        time.sleep(0.5)

    nonsoil_cols = [col for col in X_test.columns if not col.startswith('Soil_Type')]
    test_labels = np.argmax(test_preds, axis = 1)
    test_score = accuracy_score(y_test, test_labels)
    print("Train Accuracy:", round(scores.mean(), 5))
    print('Test Accuracy:', round(test_score, 5))
    print(f'Training Time: {round(times.sum(), 2)}s')

    # Keep only the non-soil features in the importance report
    fi_scores = pd.Series(
        data = fi_scores,
        index = feature_cols
    ).loc[nonsoil_cols].sort_values()

    return scores.mean(), oof_preds, test_score, fi_scores

# XGBoost Baseline

In [6]:

# Score the untouched feature set to establish the reference numbers
cv_score, oof_preds, test_score, fi_scores = score_features(sklearn_model=xgb_pipeline)

# Summary row: label, CV accuracy, holdout accuracy, then per-class recall of the OOF predictions
new_rows.append(('Baseline', cv_score, test_score, *recall_score(y_train, oof_preds, average=None)))

# Display the permutation importances of the non-soil features
fi_scores

Fold 0 Accuracy:  0.95743 in 154.26s.
Fold 1 Accuracy:  0.95725 in 153.69s.
Fold 2 Accuracy:  0.95721 in 156.14s.
Train Accuracy: 0.9573
Test Accuracy: 0.95816
Training Time: 464.09s


Hillshade_9am                         0.000039
Slope                                 0.000052
Aspect                                0.000066
Wilderness_Area2                      0.000083
Hillshade_3pm                         0.000094
Hillshade_Noon                        0.000512
Wilderness_Area4                      0.002685
Wilderness_Area1                      0.010470
Horizontal_Distance_To_Hydrology      0.010586
Vertical_Distance_To_Hydrology        0.018294
Wilderness_Area3                      0.019530
Horizontal_Distance_To_Fire_Points    0.031450
Horizontal_Distance_To_Roadways       0.047001
Elevation                             0.460613
dtype: float64

# Feature Engineering

These functions perform various feature cleaning and engineering tasks.

## 1. Fix Aspect Range

In [7]:
# Fix aspect
def fix_aspect(data):
    """Wrap out-of-range ``Aspect`` values back by one full turn.

    Values below 0 are increased by 360 and values above 359 are decreased
    by 360. The input frame is left untouched; a modified copy is returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature frame containing an ``Aspect`` column.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` with the adjusted ``Aspect`` column.
    """
    df = data.copy()

    # Use a single .loc assignment instead of df["Aspect"][mask] += 360:
    # the chained form writes to a temporary object under pandas
    # copy-on-write, so the fix would silently do nothing.
    df.loc[df["Aspect"] < 0, "Aspect"] += 360
    df.loc[df["Aspect"] > 359, "Aspect"] -= 360

    return df

In [8]:
# Score the pipeline with the aspect wrap-around applied
cv_score, oof_preds, test_score, fi_scores = score_features(xgb_pipeline, processing=fix_aspect)

# Summary row: label, CV accuracy, holdout accuracy, then per-class recall of the OOF predictions
new_rows.append(('Fix_Aspect', cv_score, test_score, *recall_score(y_train, oof_preds, average=None)))

# Display the permutation importances of the non-soil features
fi_scores

Fold 0 Accuracy:  0.9575 in 155.46s.
Fold 1 Accuracy:  0.95721 in 155.15s.
Fold 2 Accuracy:  0.95708 in 155.96s.
Train Accuracy: 0.95726
Test Accuracy: 0.95816
Training Time: 466.57s


Hillshade_9am                        -0.000038
Slope                                -0.000030
Hillshade_3pm                        -0.000007
Aspect                                0.000013
Wilderness_Area2                      0.000072
Hillshade_Noon                        0.000387
Wilderness_Area4                      0.002617
Wilderness_Area1                      0.010499
Horizontal_Distance_To_Hydrology      0.010533
Vertical_Distance_To_Hydrology        0.018263
Wilderness_Area3                      0.019499
Horizontal_Distance_To_Fire_Points    0.031355
Horizontal_Distance_To_Roadways       0.046910
Elevation                             0.460702
dtype: float64

## 2. Fix Hillshade Range

In [9]:
# Fix Hillshade
def fix_hillshade(data):
    """Return a copy of ``data`` with the three hillshade columns limited to 0..255.

    Entries below 0 are set to 0 and entries above 255 are set to 255; all
    other values and columns are left as they are.
    """
    df = data.copy()
    shade_columns = ("Hillshade_9am", "Hillshade_Noon", "Hillshade_3pm")

    # Raise negative readings to the lower bound first ...
    for column in shade_columns:
        df.loc[df[column] < 0, column] = 0

    # ... then cap readings above the upper bound
    for column in shade_columns:
        df.loc[df[column] > 255, column] = 255

    return df

In [10]:
# Score the pipeline with hillshade values limited to 0..255
cv_score, oof_preds, test_score, fi_scores = score_features(xgb_pipeline, processing=fix_hillshade)

# Summary row: label, CV accuracy, holdout accuracy, then per-class recall of the OOF predictions
new_rows.append(('Fix_Hillshade', cv_score, test_score, *recall_score(y_train, oof_preds, average=None)))

# Display the permutation importances of the non-soil features
fi_scores

Fold 0 Accuracy:  0.95743 in 155.3s.
Fold 1 Accuracy:  0.95679 in 155.59s.
Fold 2 Accuracy:  0.95705 in 156.2s.
Train Accuracy: 0.95709
Test Accuracy: 0.9581
Training Time: 467.09s


Hillshade_9am                        -0.000019
Hillshade_3pm                        -0.000009
Aspect                                0.000084
Wilderness_Area2                      0.000090
Slope                                 0.000090
Hillshade_Noon                        0.000410
Wilderness_Area4                      0.002709
Wilderness_Area1                      0.010068
Horizontal_Distance_To_Hydrology      0.010418
Vertical_Distance_To_Hydrology        0.018162
Wilderness_Area3                      0.019812
Horizontal_Distance_To_Fire_Points    0.031145
Horizontal_Distance_To_Roadways       0.046771
Elevation                             0.460368
dtype: float64

## 3. Water Distance Features

In [11]:
# Distance to Water
def water_distance_features(data):
    """Add taxicab and Euclidean distance-to-hydrology features.

    Works on a copy of ``data``. The two hydrology distance columns are
    converted to float32 in the result, and ``Hydro_Taxicab`` and
    ``Hydro_Euclid`` (also float32) are appended in that order.
    """
    df = data.copy()
    horizontal_col = "Horizontal_Distance_To_Hydrology"
    vertical_col = "Vertical_Distance_To_Hydrology"

    # Do the arithmetic in float64 and only narrow the results afterwards
    horizontal = df[horizontal_col].astype('float64')
    vertical = df[vertical_col].astype('float64')

    taxicab = horizontal.abs() + vertical.abs()
    euclid = (horizontal ** 2 + vertical.abs() ** 2) ** 0.5

    # Store everything as float32
    df[horizontal_col] = horizontal.astype('float32')
    df[vertical_col] = vertical.astype('float32')
    df["Hydro_Taxicab"] = taxicab.astype('float32')
    df["Hydro_Euclid"] = euclid.astype('float32')

    return df

In [12]:
# Score the pipeline with the extra distance-to-water features
cv_score, oof_preds, test_score, fi_scores = score_features(xgb_pipeline, processing=water_distance_features)

# Summary row: label, CV accuracy, holdout accuracy, then per-class recall of the OOF predictions
new_rows.append(('Water_Dist', cv_score, test_score, *recall_score(y_train, oof_preds, average=None)))

# Display the permutation importances of the non-soil features
fi_scores

Fold 0 Accuracy:  0.9573 in 171.85s.
Fold 1 Accuracy:  0.95664 in 172.12s.
Fold 2 Accuracy:  0.95698 in 169.63s.
Train Accuracy: 0.95697
Test Accuracy: 0.95808
Training Time: 513.61s


Hillshade_9am                        -0.000067
Slope                                -0.000030
Hillshade_3pm                         0.000033
Wilderness_Area2                      0.000056
Aspect                                0.000059
Hillshade_Noon                        0.000379
Horizontal_Distance_To_Hydrology      0.000436
Hydro_Euclid                          0.000669
Wilderness_Area4                      0.002630
Hydro_Taxicab                         0.006645
Wilderness_Area1                      0.010222
Vertical_Distance_To_Hydrology        0.014285
Wilderness_Area3                      0.019672
Horizontal_Distance_To_Fire_Points    0.031091
Horizontal_Distance_To_Roadways       0.046837
Elevation                             0.460220
dtype: float64

## 4. Count Features

In [13]:
# Create count features
def count_features(data):
    """Add per-row totals of the soil-type and wilderness-area indicator columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature frame whose indicator columns are named ``Soil_Type*`` and
        ``Wilderness_Area*``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` with ``Soil_Count`` and ``Wilderness_Count``
        appended, each holding the row-wise sum of the matching columns.
    """
    df = data.copy()
    soil_features = [col for col in df.columns if col.startswith("Soil_Type")]
    wilderness_features = [col for col in df.columns if col.startswith("Wilderness_Area")]

    # Vectorised row sums. The previous apply(sum, axis=1) ran a Python-level
    # sum for every row, which is very slow on frames with millions of rows.
    df["Soil_Count"] = df[soil_features].sum(axis=1)
    df["Wilderness_Count"] = df[wilderness_features].sum(axis=1)

    return df

In [14]:
# Score the pipeline with the soil / wilderness count features
cv_score, oof_preds, test_score, fi_scores = score_features(xgb_pipeline, processing=count_features)

# Summary row: label, CV accuracy, holdout accuracy, then per-class recall of the OOF predictions
new_rows.append(('Count', cv_score, test_score, *recall_score(y_train, oof_preds, average=None)))

# Display the permutation importances of the non-soil features
fi_scores

Fold 0 Accuracy:  0.95967 in 182.14s.
Fold 1 Accuracy:  0.95925 in 180.58s.
Fold 2 Accuracy:  0.95918 in 178.6s.
Train Accuracy: 0.95937
Test Accuracy: 0.96015
Training Time: 541.32s


Slope                                -0.000033
Hillshade_9am                        -0.000026
Hillshade_3pm                         0.000008
Aspect                                0.000071
Wilderness_Area2                      0.000103
Hillshade_Noon                        0.000430
Wilderness_Count                      0.000466
Wilderness_Area4                      0.004969
Horizontal_Distance_To_Hydrology      0.010961
Wilderness_Area1                      0.012118
Wilderness_Area3                      0.018808
Vertical_Distance_To_Hydrology        0.018826
Horizontal_Distance_To_Fire_Points    0.032385
Soil_Count                            0.035849
Horizontal_Distance_To_Roadways       0.048730
Elevation                             0.461500
dtype: float64

## 5. Shade Features

In [15]:
# Create Shade features
def new_shade_features(data):
    """Append the row-wise mean and spread of the three hillshade readings.

    Returns a copy of ``data`` with ``Hillshade_Avg`` (mean of the 9am, noon
    and 3pm columns) and ``Hillshade_Range`` (their maximum minus minimum).
    """
    df = data.copy()

    # Slice of the three hillshade readings used for every statistic below
    shades = df[['Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm']]
    brightest = shades.max(axis=1)
    darkest = shades.min(axis=1)

    df["Hillshade_Avg"] = shades.mean(axis=1)
    df['Hillshade_Range'] = brightest - darkest

    return df

In [16]:
# Score the pipeline with the hillshade mean / range features
cv_score, oof_preds, test_score, fi_scores = score_features(xgb_pipeline, processing=new_shade_features)

# Summary row: label, CV accuracy, holdout accuracy, then per-class recall of the OOF predictions
new_rows.append(('Shade_Features', cv_score, test_score, *recall_score(y_train, oof_preds, average=None)))

# Display the permutation importances of the non-soil features
fi_scores

Fold 0 Accuracy:  0.95715 in 181.81s.
Fold 1 Accuracy:  0.95684 in 182.05s.
Fold 2 Accuracy:  0.95713 in 179.93s.
Train Accuracy: 0.95704
Test Accuracy: 0.95813
Training Time: 543.79s


Hillshade_3pm                        -0.000052
Slope                                -0.000030
Hillshade_9am                        -0.000011
Aspect                               -0.000002
Hillshade_Avg                         0.000035
Hillshade_Range                       0.000047
Wilderness_Area2                      0.000090
Hillshade_Noon                        0.000373
Wilderness_Area4                      0.002604
Horizontal_Distance_To_Hydrology      0.010375
Wilderness_Area1                      0.010522
Vertical_Distance_To_Hydrology        0.018211
Wilderness_Area3                      0.019498
Horizontal_Distance_To_Fire_Points    0.031270
Horizontal_Distance_To_Roadways       0.046671
Elevation                             0.460469
dtype: float64

## 6. Distance Interaction Features

In [17]:
def distance_interactions(data):
    """Append pairwise sum / difference features of the horizontal distances.

    For each pair among hydrology, fire points and roadways, a ``*_1`` column
    holds the sum and a ``*_2`` column the absolute difference. Returns a
    copy of ``data``; the input is not modified.
    """
    df = data.copy()
    hydro = df['Horizontal_Distance_To_Hydrology']
    fire = df['Horizontal_Distance_To_Fire_Points']
    road = df['Horizontal_Distance_To_Roadways']

    # Hydro_Fire_1 is the only sum stored without taking the absolute value
    df['Hydro_Fire_1'] = hydro + fire
    df['Hydro_Fire_2'] = (hydro - fire).abs()
    df['Hydro_Road_1'] = (hydro + road).abs()
    df['Hydro_Road_2'] = (hydro - road).abs()
    df['Fire_Road_1'] = (fire + road).abs()
    df['Fire_Road_2'] = (fire - road).abs()
    return df

In [18]:
# Score the pipeline with the pairwise distance interaction features
cv_score, oof_preds, test_score, fi_scores = score_features(xgb_pipeline, processing=distance_interactions)

# Summary row: label, CV accuracy, holdout accuracy, then per-class recall of the OOF predictions
new_rows.append(('Dist_Interactions', cv_score, test_score, *recall_score(y_train, oof_preds, average=None)))

# Display the permutation importances of the non-soil features
fi_scores

Fold 0 Accuracy:  0.95825 in 179.99s.
Fold 1 Accuracy:  0.95807 in 179.14s.
Fold 2 Accuracy:  0.95811 in 180.59s.
Train Accuracy: 0.95814
Test Accuracy: 0.95879
Training Time: 539.72s


Slope                                -0.000006
Hillshade_3pm                         0.000001
Hillshade_9am                         0.000049
Wilderness_Area2                      0.000105
Aspect                                0.000118
Hillshade_Noon                        0.000416
Horizontal_Distance_To_Fire_Points    0.001680
Hydro_Road_1                          0.001819
Wilderness_Area4                      0.002963
Hydro_Road_2                          0.003320
Fire_Road_2                           0.003695
Hydro_Fire_1                          0.005899
Horizontal_Distance_To_Hydrology      0.006099
Hydro_Fire_2                          0.006586
Fire_Road_1                           0.006747
Wilderness_Area1                      0.010478
Vertical_Distance_To_Hydrology        0.018505
Wilderness_Area3                      0.019864
Horizontal_Distance_To_Roadways       0.023743
Elevation                             0.460564
dtype: float64

## 7. Elevation Interaction

In [19]:
def elevation_interactions(data):
    """Append features that combine ``Elevation`` with the distance columns.

    Adds, in order: ``EHiElv`` (roadway distance times elevation), ``EViElv``
    (vertical hydrology distance times elevation), ``EVDtH`` (elevation minus
    vertical hydrology distance) and ``EHDtH`` (elevation minus 0.2 times the
    horizontal hydrology distance). Returns a copy of ``data``.
    """
    df = data.copy()
    elevation = df['Elevation']
    vertical_hydro = df['Vertical_Distance_To_Hydrology']
    horizontal_hydro = df['Horizontal_Distance_To_Hydrology']

    # Products with elevation
    df['EHiElv'] = df['Horizontal_Distance_To_Roadways'] * elevation
    df['EViElv'] = vertical_hydro * elevation

    # Elevation offset by the hydrology distances
    df['EVDtH'] = elevation - vertical_hydro
    df['EHDtH'] = elevation - horizontal_hydro * 0.2
    return df

In [20]:
# Score the pipeline with the elevation interaction features
cv_score, oof_preds, test_score, fi_scores = score_features(xgb_pipeline, processing=elevation_interactions)

# Summary row: label, CV accuracy, holdout accuracy, then per-class recall of the OOF predictions
new_rows.append(('Elev_Interactions', cv_score, test_score, *recall_score(y_train, oof_preds, average=None)))

# Display the permutation importances of the non-soil features
fi_scores

Fold 0 Accuracy:  0.95806 in 207.57s.
Fold 1 Accuracy:  0.95754 in 212.56s.
Fold 2 Accuracy:  0.9583 in 194.59s.
Train Accuracy: 0.95797
Test Accuracy: 0.95882
Training Time: 614.72s


Hillshade_9am                        -0.000006
Slope                                 0.000002
Hillshade_3pm                         0.000024
EViElv                                0.000026
EHiElv                                0.000035
Wilderness_Area2                      0.000079
Aspect                                0.000111
Hillshade_Noon                        0.000404
Vertical_Distance_To_Hydrology        0.000465
Wilderness_Area4                      0.003132
Horizontal_Distance_To_Hydrology      0.004215
Wilderness_Area1                      0.010090
Wilderness_Area3                      0.019532
Horizontal_Distance_To_Fire_Points    0.031707
EHDtH                                 0.034618
Horizontal_Distance_To_Roadways       0.046862
Elevation                             0.135773
EVDtH                                 0.148534
dtype: float64

## 8. Misc Features

In [21]:
def misc_features(data):
    """Append two 0/1 indicator columns to a copy of ``data``.

    ``Highwater`` is 1 where the vertical distance to hydrology is negative;
    ``Hillshade_3pm_is_zero`` is 1 where the 3pm hillshade equals zero.
    """
    df = data.copy()

    below_water = df['Vertical_Distance_To_Hydrology'].lt(0)
    no_afternoon_shade = df['Hillshade_3pm'].eq(0)

    df['Highwater'] = below_water.astype(int)
    df['Hillshade_3pm_is_zero'] = no_afternoon_shade.astype(int)
    return df

In [22]:
# Score the pipeline with the two extra indicator features
cv_score, oof_preds, test_score, fi_scores = score_features(xgb_pipeline, processing=misc_features)

# Summary row: label, CV accuracy, holdout accuracy, then per-class recall of the OOF predictions
new_rows.append(('Misc_Features', cv_score, test_score, *recall_score(y_train, oof_preds, average=None)))

# Display the permutation importances of the non-soil features
fi_scores

Fold 0 Accuracy:  0.95743 in 173.15s.
Fold 1 Accuracy:  0.95725 in 170.58s.
Fold 2 Accuracy:  0.95721 in 169.39s.
Train Accuracy: 0.9573
Test Accuracy: 0.95816
Training Time: 513.12s


Highwater                             0.000000
Hillshade_3pm_is_zero                 0.000000
Hillshade_9am                         0.000039
Slope                                 0.000052
Aspect                                0.000066
Wilderness_Area2                      0.000083
Hillshade_3pm                         0.000094
Hillshade_Noon                        0.000512
Wilderness_Area4                      0.002685
Wilderness_Area1                      0.010470
Horizontal_Distance_To_Hydrology      0.010586
Vertical_Distance_To_Hydrology        0.018294
Wilderness_Area3                      0.019530
Horizontal_Distance_To_Fire_Points    0.031450
Horizontal_Distance_To_Roadways       0.047001
Elevation                             0.460613
dtype: float64

# Summary

In [23]:
# One row per experiment, ordered from lowest to highest holdout accuracy
pd.DataFrame.from_records(
    new_rows,
    columns=['features', 'cv_scores', 'holdout', *(f'recall_{label}' for label in range(6))],
).sort_values(by='holdout')

Unnamed: 0,features,cv_scores,holdout,recall_0,recall_1,recall_2,recall_3,recall_4,recall_5
3,Water_Dist,0.956974,0.958081,0.963175,0.97152,0.872956,0.191489,0.393557,0.654375
2,Fix_Hillshade,0.95709,0.958101,0.963453,0.97152,0.871975,0.234043,0.408964,0.655274
5,Shade_Features,0.957042,0.958129,0.9633,0.97163,0.872507,0.12766,0.397059,0.652962
0,Baseline,0.957298,0.958156,0.963497,0.971616,0.874714,0.212766,0.39986,0.65733
8,Misc_Features,0.957298,0.958156,0.963497,0.971616,0.874714,0.212766,0.39986,0.65733
1,Fix_Aspect,0.957264,0.958158,0.963529,0.971686,0.872956,0.234043,0.398459,0.657459
6,Dist_Interactions,0.958142,0.958786,0.964565,0.972111,0.875981,0.170213,0.434874,0.658229
7,Elev_Interactions,0.957968,0.958823,0.96372,0.972344,0.877657,0.212766,0.406863,0.658101
4,Count,0.959366,0.960151,0.964445,0.971746,0.885464,0.191489,0.445378,0.721059
