# TPS12 - Feature Engineering Baselines

In this notebook we take the feature engineering techniques from the previous notebooks, apply them together for each model, and compare the results against a baseline trained on the original features.

In [1]:
# Notebook-wide knobs: lower these to test changes to this notebook quickly
RANDOM_SEED = 0         # seed passed to the data split, the CV folds and the boosted models
NUM_FOLDS = 10          # number of stratified cross-validation folds
TRAIN_SIZE = 500_000    # rows kept for training; the remainder becomes the holdout set

In [2]:
import numpy as np
import pandas as pd
import time
import os
import pyarrow
import gc

# Hide warnings
import warnings
warnings.filterwarnings('ignore')
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 

# Model/Evaluation
from functools import partial
from sklearn.base import clone
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder, RobustScaler
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import accuracy_score, recall_score
from sklearn.inspection import permutation_importance

# Tensorflow/Keras
import tensorflow as tf
from tensorflow import keras

# Keras imports
from tensorflow.keras import layers
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier

# Gradient Boosting
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# Load and Prepare Data

In [3]:
# Load full training data and clean it as one non-mutating chain
full_data = (
    pd.read_feather('../data/train.feather')
    # Drop the Id column and the soil indicators flagged as low/no variance
    .drop(columns=["Soil_Type7", "Id", "Soil_Type15"])
    # Exclude every row labelled Cover_Type 5
    .loc[lambda df: df["Cover_Type"] != 5]
)

# Label Encoding: re-map the remaining class labels onto consecutive integers.
# `assign` returns a new frame, so the encoded column is never written into a
# filtered slice (chained assignment that only stayed silent because warnings
# are suppressed in the import cell).
new_encoder = LabelEncoder()
full_data = full_data.assign(
    Cover_Type = new_encoder.fit_transform(full_data["Cover_Type"])
)

# Split synthetic data: TRAIN_SIZE stratified rows for training, rest for holdout
train, test = train_test_split(
    full_data,
    train_size = TRAIN_SIZE,
    random_state = RANDOM_SEED,
    stratify = full_data['Cover_Type'],
)
y_train = train['Cover_Type']

# features, data structure for summary scores
features = [x for x in train.columns if x not in ['Id','Cover_Type']]
nonsoil = [x for x in features if not x.startswith('Soil_Type')]
new_rows = list()

# Release the pre-split frame before collecting garbage
del full_data
gc.collect()

print(f'Training Size: {train.shape[0]} rows, {train.shape[1]} cols')
print(f'Holdout Size: {test.shape[0]} rows, {test.shape[1]} cols\n')

Training Size: 500000 rows, 53 cols
Holdout Size: 3499999 rows, 53 cols



# Models

1. Neural Network (Keras)
2. XGBoost
3. LightGBM
4. CatBoost

## 1. Keras

In [4]:
# Keras Parameters shared by every neural-network run
BATCH_SIZE, EPOCHS = 512, 100
EARLY_STOP = 10    # early-stopping patience, in epochs
VERBOSE = 0        # verbosity level handed to the Keras wrapper
NUM_CLASSES = train['Cover_Type'].nunique()    # width of the softmax output layer

In [5]:
def build_model(input_size):
    """Build and compile the dense classifier used for the Keras runs.

    The network is an input layer of width ``input_size``, three SELU hidden
    layers (``input_size``, ``3*input_size//4`` and ``input_size//2`` units,
    all LeCun-normal initialised) and a ``NUM_CLASSES``-way softmax output.

    Parameters
    ----------
    input_size : int
        Number of feature columns fed to the network.

    Returns
    -------
    keras.Sequential
        Model compiled with sparse categorical cross-entropy, Adam and an
        accuracy metric.
    """
    # NOTE(review): no TensorFlow seed is set in this function, so weight
    # initialisation is not tied to RANDOM_SEED - confirm that is intended.
    hidden_widths = (input_size, 3*input_size//4, input_size//2)

    stack = [layers.InputLayer(input_shape = (input_size,))]
    for width in hidden_widths:
        stack.append(
            layers.Dense(width, kernel_initializer="lecun_normal", activation="selu")
        )
    stack.append(layers.Dense(NUM_CLASSES, activation="softmax"))

    model = keras.Sequential(stack)
    model.compile(
        loss = tf.keras.losses.SparseCategoricalCrossentropy(),
        optimizer = tf.keras.optimizers.Adam(),
        metrics=['accuracy'],
    )
    return model

In [6]:
# Need to adjust input size depending on # of features
def get_pipeline(input_size = len(features)):
    """Return an unfitted RobustScaler -> KerasClassifier pipeline.

    Parameters
    ----------
    input_size : int, optional
        Number of input columns the network should expect. The default is
        ``len(features)`` as evaluated once, when this function is defined.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Scaler followed by the wrapped Keras model, which trains with a 10%
        validation split and early stopping on ``val_loss``.
    """
    stopper = EarlyStopping(
        monitor='val_loss',
        patience=EARLY_STOP,
        restore_best_weights=True,
    )
    network = KerasClassifier(
        partial(build_model, input_size = input_size),
        batch_size = BATCH_SIZE,
        epochs = EPOCHS,
        verbose = VERBOSE,
        shuffle = True,
        validation_split = 0.1,
        callbacks = [stopper],
    )
    return make_pipeline(RobustScaler(), network)

## 2. XGBoost

In [7]:
# XGBoost Classifier: histogram tree method, multiclass log-loss as eval metric,
# wrapped in a one-step pipeline so it is handled like the other models
xgb_pipeline = make_pipeline(XGBClassifier(
    random_state = RANDOM_SEED,
    booster = 'gbtree',
    tree_method = 'hist',
    eval_metric = 'mlogloss',
))

## 3. LightGBM

In [8]:
# LightGBM Classifier: multiclass log-loss metric, 4 worker threads,
# wrapped in a one-step pipeline so it is handled like the other models
# NOTE(review): `unbalanced_sets` looks like an alias of LightGBM's `is_unbalance`,
# which the docs describe as applying to binary/multiclassova objectives only -
# confirm it has any effect on this multiclass problem.
lgbm_pipeline = make_pipeline(LGBMClassifier(
    random_state = RANDOM_SEED,
    n_jobs = 4,
    metric = 'multi_logloss',
    unbalanced_sets = True,
))

## 4. CatBoost

In [9]:
# CatBoost Classifier: plain boosting with the MultiClass metric, training log
# silenced, wrapped in a one-step pipeline so it is handled like the other models
catboost_pipeline = make_pipeline(CatBoostClassifier(
    random_state = RANDOM_SEED,
    verbose = False,
    boosting_type = 'Plain',
    eval_metric = 'MultiClass',
))

# Scoring Function

In [10]:
def score_features(sklearn_model, processing = None):
    """Cross-validate a model on the training split and score it on the holdout.

    A fresh clone of ``sklearn_model`` is fit on every stratified fold. Each
    fold contributes out-of-fold label predictions, holdout class
    probabilities (summed over folds, then arg-maxed) and permutation
    importances measured on its validation part.

    Parameters
    ----------
    sklearn_model : estimator
        Unfitted scikit-learn compatible estimator or pipeline exposing
        ``fit``, ``predict`` and ``predict_proba``.
    processing : callable, optional
        Feature-engineering function mapping a feature DataFrame to a new
        DataFrame. It is applied to the training and holdout features
        separately.

    Returns
    -------
    tuple
        ``(mean CV accuracy, out-of-fold predicted labels, holdout accuracy,
        permutation importances of the non-soil columns sorted ascending)``.
    """
    # Original Training/Test Split
    base_columns = [x for x in train.columns if x not in ['Id','Cover_Type']]
    X_temp, X_test = train[base_columns], test[base_columns]
    y_temp, y_test = train['Cover_Type'], test['Cover_Type']

    # Feature Engineering
    # NOTE(review): the two frames are processed independently, so any statistic
    # `processing` derives from its input (e.g. a column minimum) is computed per
    # frame - confirm that train and holdout features stay comparable.
    if processing is not None:
        X_temp = processing(X_temp)
        X_test = processing(X_test)

    # Accumulators; the probability buffer is sized from the labels actually
    # present instead of a hard-coded class count
    n_classes = y_temp.nunique()
    test_preds = np.zeros((X_test.shape[0], n_classes))
    oof_preds = np.zeros((X_temp.shape[0],))
    fi_scores = np.zeros((X_temp.shape[1],))
    scores, times = np.zeros(NUM_FOLDS), np.zeros(NUM_FOLDS)

    # Stratified k-fold cross-validation
    skf = StratifiedKFold(n_splits = NUM_FOLDS, shuffle = True, random_state = RANDOM_SEED)
    for fold, (train_idx, valid_idx) in enumerate(skf.split(X_temp, y_temp)):

        # Training and Validation Sets
        X_fit, X_valid = X_temp.iloc[train_idx], X_temp.iloc[valid_idx]
        y_fit, y_valid = y_temp.iloc[train_idx], y_temp.iloc[valid_idx]

        # Create model. The timer starts here and stops after the holdout
        # prediction, so the reported time covers fitting, permutation
        # importance and prediction together.
        start = time.time()
        model = clone(sklearn_model)
        model.fit(X_fit, y_fit)

        # Permutation Importance, averaged over the folds
        result = permutation_importance(
            model, X_valid, y_valid,
            random_state=RANDOM_SEED
        )
        fi_scores += result.importances_mean / NUM_FOLDS

        # validation/holdout predictions
        valid_preds = np.ravel(model.predict(X_valid))
        oof_preds[valid_idx] = valid_preds
        test_preds += model.predict_proba(X_test)

        # Save scores and times
        scores[fold] = accuracy_score(y_valid, valid_preds)
        end = time.time()
        times[fold] = end-start
        print(f'Fold {fold} Accuracy:  {round(scores[fold], 5)} in {round(end-start,2)}s.')
        time.sleep(0.5)

    # Holdout label = class with the largest summed probability across folds
    test_preds = np.argmax(test_preds, axis = 1)
    test_score = accuracy_score(y_test, test_preds)
    print("Train Accuracy:", round(scores.mean(), 5))
    print('Test Accuracy:', round(test_score, 5))
    print(f'Training Time: {round(times.sum(), 2)}s\n')

    # Report importances for the non-soil columns only, least important first
    model_columns = list(X_temp.columns)
    nonsoil_columns = [x for x in model_columns if not x.startswith('Soil_Type')]
    fi_scores = pd.Series(
        data = fi_scores,
        index = model_columns
    ).loc[nonsoil_columns].sort_values()

    return scores.mean(), oof_preds, test_score, fi_scores

# Baselines

## 1. Keras

In [11]:
# Baseline: Keras network on the original feature set
cv_score, oof_preds, test_score, fi_scores = score_features(get_pipeline())

# Summary row: label, CV accuracy, holdout accuracy, then one recall per class
new_rows.append(
    ('Keras', cv_score, test_score)
    + tuple(recall_score(y_train, oof_preds, average = None))
)

fi_scores

Fold 0 Accuracy:  0.95726 in 111.86s.
Fold 1 Accuracy:  0.95838 in 99.95s.
Fold 2 Accuracy:  0.95442 in 80.36s.
Fold 3 Accuracy:  0.95822 in 104.74s.
Fold 4 Accuracy:  0.95662 in 89.48s.
Fold 5 Accuracy:  0.95836 in 90.57s.
Fold 6 Accuracy:  0.95616 in 94.05s.
Fold 7 Accuracy:  0.95618 in 81.73s.
Fold 8 Accuracy:  0.95796 in 92.29s.
Fold 9 Accuracy:  0.95594 in 99.98s.
Train Accuracy: 0.95695
Test Accuracy: 0.95955
Training Time: 945.02s



Hillshade_3pm                        -0.000068
Slope                                -0.000045
Aspect                               -0.000010
Wilderness_Area2                      0.000042
Hillshade_9am                         0.000153
Hillshade_Noon                        0.000491
Wilderness_Area1                      0.004206
Horizontal_Distance_To_Hydrology      0.010933
Wilderness_Area4                      0.011107
Vertical_Distance_To_Hydrology        0.018869
Wilderness_Area3                      0.024657
Horizontal_Distance_To_Fire_Points    0.031888
Horizontal_Distance_To_Roadways       0.049890
Elevation                             0.462528
dtype: float64

## 2. XGBoost

In [12]:
# Baseline: XGBoost on the original feature set
cv_score, oof_preds, test_score, fi_scores = score_features(xgb_pipeline)

# Summary row: label, CV accuracy, holdout accuracy, then one recall per class
new_rows.append(
    ('XGBoost', cv_score, test_score)
    + tuple(recall_score(y_train, oof_preds, average = None))
)

fi_scores

Fold 0 Accuracy:  0.95862 in 38.99s.
Fold 1 Accuracy:  0.95792 in 39.86s.
Fold 2 Accuracy:  0.95696 in 42.81s.
Fold 3 Accuracy:  0.95772 in 46.24s.
Fold 4 Accuracy:  0.95616 in 39.27s.
Fold 5 Accuracy:  0.9593 in 39.78s.
Fold 6 Accuracy:  0.95668 in 39.26s.
Fold 7 Accuracy:  0.95776 in 39.36s.
Fold 8 Accuracy:  0.95814 in 39.29s.
Fold 9 Accuracy:  0.95734 in 39.74s.
Train Accuracy: 0.95766
Test Accuracy: 0.95833
Training Time: 404.62s



Hillshade_3pm                        -0.000010
Hillshade_9am                         0.000017
Slope                                 0.000034
Wilderness_Area2                      0.000062
Aspect                                0.000184
Hillshade_Noon                        0.000486
Wilderness_Area4                      0.002694
Wilderness_Area1                      0.010030
Horizontal_Distance_To_Hydrology      0.010533
Vertical_Distance_To_Hydrology        0.018364
Wilderness_Area3                      0.019716
Horizontal_Distance_To_Fire_Points    0.031628
Horizontal_Distance_To_Roadways       0.047089
Elevation                             0.460872
dtype: float64

## 3. LightGBM

In [13]:
# Baseline: LightGBM on the original feature set
cv_score, oof_preds, test_score, fi_scores = score_features(lgbm_pipeline)

# Summary row: label, CV accuracy, holdout accuracy, then one recall per class
new_rows.append(
    ('LightGBM', cv_score, test_score)
    + tuple(recall_score(y_train, oof_preds, average = None))
)

fi_scores

Fold 0 Accuracy:  0.94612 in 179.9s.
Fold 1 Accuracy:  0.94622 in 174.65s.
Fold 2 Accuracy:  0.94598 in 181.38s.
Fold 3 Accuracy:  0.95146 in 190.01s.
Fold 4 Accuracy:  0.933 in 164.22s.
Fold 5 Accuracy:  0.94682 in 168.87s.
Fold 6 Accuracy:  0.94716 in 185.65s.
Fold 7 Accuracy:  0.94874 in 188.35s.
Fold 8 Accuracy:  0.94918 in 181.41s.
Fold 9 Accuracy:  0.94716 in 187.85s.
Train Accuracy: 0.94618
Test Accuracy: 0.95539
Training Time: 1802.28s



Slope                                -0.000078
Aspect                                0.000046
Wilderness_Area2                      0.000048
Hillshade_9am                         0.000096
Hillshade_3pm                         0.000103
Hillshade_Noon                        0.000410
Wilderness_Area4                      0.003531
Horizontal_Distance_To_Hydrology      0.009818
Wilderness_Area1                      0.015300
Vertical_Distance_To_Hydrology        0.016540
Wilderness_Area3                      0.020757
Horizontal_Distance_To_Fire_Points    0.028108
Horizontal_Distance_To_Roadways       0.043208
Elevation                             0.454769
dtype: float64

## 4. CatBoost

In [14]:
# Baseline: CatBoost on the original feature set
cv_score, oof_preds, test_score, fi_scores = score_features(catboost_pipeline)

# Summary row: label, CV accuracy, holdout accuracy, then one recall per class
new_rows.append(
    ('CatBoost', cv_score, test_score)
    + tuple(recall_score(y_train, oof_preds, average = None))
)

fi_scores

Fold 0 Accuracy:  0.95832 in 179.41s.
Fold 1 Accuracy:  0.9595 in 182.46s.
Fold 2 Accuracy:  0.95632 in 178.38s.
Fold 3 Accuracy:  0.95848 in 170.9s.
Fold 4 Accuracy:  0.95648 in 177.64s.
Fold 5 Accuracy:  0.95926 in 182.95s.
Fold 6 Accuracy:  0.95704 in 176.75s.
Fold 7 Accuracy:  0.95814 in 181.73s.
Fold 8 Accuracy:  0.95884 in 183.72s.
Fold 9 Accuracy:  0.95724 in 169.34s.
Train Accuracy: 0.95796
Test Accuracy: 0.95835
Training Time: 1783.29s



Hillshade_3pm                        -0.000010
Slope                                -0.000010
Hillshade_9am                        -0.000006
Aspect                                0.000063
Wilderness_Area2                      0.000095
Hillshade_Noon                        0.000571
Wilderness_Area4                      0.001954
Wilderness_Area1                      0.005708
Horizontal_Distance_To_Hydrology      0.010645
Vertical_Distance_To_Hydrology        0.018152
Wilderness_Area3                      0.023757
Horizontal_Distance_To_Fire_Points    0.031412
Horizontal_Distance_To_Roadways       0.047918
Elevation                             0.461982
dtype: float64

# Feature Engineering 

## 1. Keras

In [17]:
# Helper function
def start_at_eps(series, eps=1e-10):
    """Shift ``series`` so that its smallest value becomes ``eps``.

    The shift is taken from the minimum of the series passed in, so the
    offset depends on the data supplied in that call.
    """
    shifted = series - series.min()
    return shifted + eps


def keras_features(data):
    """Return a copy of ``data`` with the engineered columns used for Keras.

    Adds 14 columns, in this order: ``Aspect_360``, the three clipped
    hillshades, ``Hillshade_Sum``, ``Hillshade_Range``, ``Soil_Count``,
    ``Wilderness_Count``, the four hydrology/fire interactions and the two
    elevation differences. The input frame is not modified.

    Other candidates that were tried in this cell (sine/alternate aspect,
    hillshade mean, hydrology distance norms, roadway interactions and
    elevation products) are not produced for this model.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature frame containing the original terrain columns plus the
        ``Soil_Type*`` and ``Wilderness_Area*`` indicator columns.

    Returns
    -------
    pandas.DataFrame
        New frame: the original columns followed by the engineered ones.
    """
    df = data.copy()
    shade_features = ['Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm']
    soil_features = [x for x in df.columns if x.startswith("Soil_Type")]
    wilderness_features = [x for x in df.columns if x.startswith("Wilderness_Area")]

    # Aspect Features: wrap the angle into [0, 360)
    df['Aspect_360'] = df['Aspect'] % 360

    # Hillshade Features: clipped copies limited to 0..255; the sum and range
    # below are taken over the raw (unclipped) hillshade columns
    for shade in shade_features:
        df[f"{shade}_Clipped"] = df[shade].clip(lower=0, upper=255)
    df["Hillshade_Sum"] = df[shade_features].sum(axis=1)
    df['Hillshade_Range'] = df[shade_features].max(axis=1) - df[shade_features].min(axis=1)

    # Count Features: vectorised row sums (the previous `apply(sum, axis=1)`
    # ran a Python-level sum once per row)
    df["Soil_Count"] = df[soil_features].sum(axis=1)
    df["Wilderness_Count"] = df[wilderness_features].sum(axis=1)

    # Water/Fire Interactions
    hydro = df['Horizontal_Distance_To_Hydrology']
    fire = df['Horizontal_Distance_To_Fire_Points']
    df['Hydro_Fire_Sum'] = hydro + fire
    df['Hydro_Fire_AbsDiff'] = abs(hydro - fire)
    df['Hydro_Fire_EpsSum'] = start_at_eps(hydro) + start_at_eps(fire)
    df['Hydro_Fire_Diff'] = hydro - fire

    # Elevation Interactions
    df['Elev_VHydro_Diff'] = df.Elevation - df.Vertical_Distance_To_Hydrology
    df['Elev_HHydro_Diff'] = df.Elevation - df.Horizontal_Distance_To_Hydrology * 0.2

    return df

In [19]:
# Keras with engineered features. The network's input width is read from the
# engineered frame itself instead of a hand-counted "+14", so it cannot drift
# out of sync with what keras_features produces.
cv_score, oof_preds, test_score, fi_scores = score_features(
    get_pipeline(keras_features(train[features].head()).shape[1]),
    keras_features
)

# Summary row: label, CV accuracy, holdout accuracy, then one recall per class
new_rows.append((
    'Keras_Features', cv_score, test_score,
     *recall_score(y_train, oof_preds, average = None)
))

fi_scores

Fold 0 Accuracy:  0.95798 in 124.67s.
Fold 1 Accuracy:  0.95954 in 121.11s.
Fold 2 Accuracy:  0.95608 in 111.81s.
Fold 3 Accuracy:  0.95684 in 98.59s.
Fold 4 Accuracy:  0.95806 in 126.82s.
Fold 5 Accuracy:  0.95812 in 105.63s.
Fold 6 Accuracy:  0.95688 in 90.0s.
Fold 7 Accuracy:  0.95636 in 92.24s.
Fold 8 Accuracy:  0.9601 in 118.46s.
Fold 9 Accuracy:  0.95768 in 119.72s.
Train Accuracy: 0.95776
Test Accuracy: 0.96004
Training Time: 1109.04s



Slope                                 0.000011
Aspect_360                            0.000111
Aspect                                0.000113
Hillshade_Range                       0.000352
Wilderness_Area2                      0.000862
Hillshade_9am_Clipped                 0.001662
Wilderness_Count                      0.001950
Hillshade_3pm_Clipped                 0.001976
Hillshade_Noon_Clipped                0.002128
Hillshade_Noon                        0.002226
Hillshade_3pm                         0.002516
Hillshade_9am                         0.004184
Wilderness_Area4                      0.005388
Horizontal_Distance_To_Hydrology      0.005443
Hydro_Fire_AbsDiff                    0.006127
Horizontal_Distance_To_Fire_Points    0.006463
Hillshade_Sum                         0.006603
Hydro_Fire_Sum                        0.007320
Hydro_Fire_EpsSum                     0.007698
Hydro_Fire_Diff                       0.008280
Vertical_Distance_To_Hydrology        0.008613
Wilderness_Ar

## 2. XGBoost

In [20]:
def xgboost_features(data):
    """Return a copy of ``data`` with the engineered columns used for XGBoost.

    Adds, in this order: ``Aspect_360``, the hydrology distance features
    (``Hydro_Taxicab``, ``Hydro_Taxicab_Pos``, ``Hydro_Euclid``,
    ``Water Elevation``), ``Soil_Count``, ``Wilderness_Count``, three
    hydrology/fire interactions, four roadway interactions and
    ``Elev_HHydro_Diff``. The two hydrology distance columns are also cast to
    float64 in the returned copy. The input frame is not modified.

    Other candidates that were tried in this cell (sine/alternate aspect, all
    hillshade features, ``Hydro_Euclid_Pos``, ``Water_Direction``,
    ``Hydro_Fire_EpsSum``, elevation products and ``Elev_VHydro_Diff``) are
    not produced for this model.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature frame containing the original terrain columns plus the
        ``Soil_Type*`` and ``Wilderness_Area*`` indicator columns.

    Returns
    -------
    pandas.DataFrame
        New frame: the original columns followed by the engineered ones.
    """
    df = data.copy()
    soil_features = [x for x in df.columns if x.startswith("Soil_Type")]
    wilderness_features = [x for x in df.columns if x.startswith("Wilderness_Area")]

    # Aspect Features: wrap the angle into [0, 360)
    df['Aspect_360'] = df['Aspect'] % 360

    # Water Features
    df["Horizontal_Distance_To_Hydrology"] = df["Horizontal_Distance_To_Hydrology"].astype('float64')
    df["Vertical_Distance_To_Hydrology"] = df["Vertical_Distance_To_Hydrology"].astype('float64')
    pos_h_hydrology = start_at_eps(df["Horizontal_Distance_To_Hydrology"])
    pos_v_hydrology = start_at_eps(df['Vertical_Distance_To_Hydrology'])
    df["Hydro_Taxicab"] = np.abs(df["Horizontal_Distance_To_Hydrology"]) + np.abs(df["Vertical_Distance_To_Hydrology"])
    # Despite its name this column is the Euclidean norm of the shifted distances
    df['Hydro_Taxicab_Pos'] = np.sqrt(pos_h_hydrology ** 2 + pos_v_hydrology ** 2).astype(np.float32)
    df["Hydro_Euclid"] = (df["Horizontal_Distance_To_Hydrology"]**2 + np.abs(df["Vertical_Distance_To_Hydrology"])**2)**0.5
    df['Water Elevation'] = df['Elevation'] - df['Vertical_Distance_To_Hydrology']

    # Count Features: vectorised row sums (the previous `apply(sum, axis=1)`
    # ran a Python-level sum once per row)
    df["Soil_Count"] = df[soil_features].sum(axis=1)
    df["Wilderness_Count"] = df[wilderness_features].sum(axis=1)

    # Water/Fire Interactions
    df['Hydro_Fire_Sum'] = df['Horizontal_Distance_To_Hydrology'] + df['Horizontal_Distance_To_Fire_Points']
    df['Hydro_Fire_AbsDiff'] = abs(df['Horizontal_Distance_To_Hydrology'] - df['Horizontal_Distance_To_Fire_Points'])
    df['Hydro_Fire_Diff'] = df['Horizontal_Distance_To_Hydrology'] - df['Horizontal_Distance_To_Fire_Points']

    # Roadway Interactions
    df['Hydro_Road_1'] = abs(df['Horizontal_Distance_To_Hydrology'] + df['Horizontal_Distance_To_Roadways'])
    df['Hydro_Road_2'] = abs(df['Horizontal_Distance_To_Hydrology'] - df['Horizontal_Distance_To_Roadways'])
    df['Fire_Road_1'] = abs(df['Horizontal_Distance_To_Fire_Points'] + df['Horizontal_Distance_To_Roadways'])
    df['Fire_Road_2'] = abs(df['Horizontal_Distance_To_Fire_Points'] - df['Horizontal_Distance_To_Roadways'])

    # Elevation Interactions
    df['Elev_HHydro_Diff'] = df.Elevation - df.Horizontal_Distance_To_Hydrology * 0.2

    return df

In [22]:
# XGBoost with its engineered feature set
cv_score, oof_preds, test_score, fi_scores = score_features(xgb_pipeline, xgboost_features)

# Summary row: label, CV accuracy, holdout accuracy, then one recall per class
new_rows.append(
    ('XGBoost_Features', cv_score, test_score)
    + tuple(recall_score(y_train, oof_preds, average = None))
)

fi_scores

Fold 0 Accuracy:  0.96076 in 55.34s.
Fold 1 Accuracy:  0.9616 in 54.57s.
Fold 2 Accuracy:  0.95944 in 54.54s.
Fold 3 Accuracy:  0.96142 in 55.31s.
Fold 4 Accuracy:  0.96066 in 54.85s.
Fold 5 Accuracy:  0.96166 in 55.24s.
Fold 6 Accuracy:  0.95854 in 55.26s.
Fold 7 Accuracy:  0.96096 in 55.52s.
Fold 8 Accuracy:  0.96166 in 56.26s.
Fold 9 Accuracy:  0.95966 in 56.23s.
Train Accuracy: 0.96064
Test Accuracy: 0.96122
Training Time: 553.14s



Hillshade_3pm                        -5.560000e-05
Aspect                               -2.800000e-05
Horizontal_Distance_To_Hydrology     -2.240000e-05
Hillshade_9am                        -1.880000e-05
Hillshade_Sum                        -2.000000e-06
Slope                                -8.000000e-07
Elev_VHydro_Diff                      0.000000e+00
Aspect_360                            2.920000e-05
Wilderness_Area2                      8.680000e-05
Hydro_Euclid                          9.920000e-05
Vertical_Distance_To_Hydrology        1.936000e-04
Hillshade_Noon                        3.036000e-04
Hydro_Taxicab                         3.088000e-04
Hydro_Taxicab_Pos                     5.980000e-04
Wilderness_Count                      6.488000e-04
Hydro_Fire_AbsDiff                    1.242000e-03
Hydro_Road_1                          1.294000e-03
Horizontal_Distance_To_Fire_Points    2.037200e-03
Hydro_Fire_Diff                       2.116400e-03
Hydro_Road_2                   

## 3. LightGBM

In [23]:
def lightgbm_features(data):
    """Return a copy of ``data`` with the engineered columns used for LightGBM.

    Adds, in this order: the hydrology distance features (``Hydro_Taxicab``,
    ``Hydro_Taxicab_Pos``, ``Hydro_Euclid``, ``Water Elevation``),
    ``Soil_Count``, ``Wilderness_Count``, four hydrology/fire interactions and
    ``Elev_HHydro_Diff``. The two hydrology distance columns are also cast to
    float64 in the returned copy. The input frame is not modified.

    Other candidates that were tried in this cell (all aspect and hillshade
    features, ``Hydro_Euclid_Pos``, ``Water_Direction``, roadway interactions,
    elevation products and ``Elev_VHydro_Diff``) are not produced for this
    model.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature frame containing the original terrain columns plus the
        ``Soil_Type*`` and ``Wilderness_Area*`` indicator columns.

    Returns
    -------
    pandas.DataFrame
        New frame: the original columns followed by the engineered ones.
    """
    df = data.copy()
    soil_features = [x for x in df.columns if x.startswith("Soil_Type")]
    wilderness_features = [x for x in df.columns if x.startswith("Wilderness_Area")]

    # Water Features
    df["Horizontal_Distance_To_Hydrology"] = df["Horizontal_Distance_To_Hydrology"].astype('float64')
    df["Vertical_Distance_To_Hydrology"] = df["Vertical_Distance_To_Hydrology"].astype('float64')
    pos_h_hydrology = start_at_eps(df["Horizontal_Distance_To_Hydrology"])
    pos_v_hydrology = start_at_eps(df['Vertical_Distance_To_Hydrology'])
    df["Hydro_Taxicab"] = np.abs(df["Horizontal_Distance_To_Hydrology"]) + np.abs(df["Vertical_Distance_To_Hydrology"])
    # Despite its name this column is the Euclidean norm of the shifted distances
    df['Hydro_Taxicab_Pos'] = np.sqrt(pos_h_hydrology ** 2 + pos_v_hydrology ** 2).astype(np.float32)
    df["Hydro_Euclid"] = (df["Horizontal_Distance_To_Hydrology"]**2 + np.abs(df["Vertical_Distance_To_Hydrology"])**2)**0.5
    df['Water Elevation'] = df['Elevation'] - df['Vertical_Distance_To_Hydrology']

    # Count Features: vectorised row sums (the previous `apply(sum, axis=1)`
    # ran a Python-level sum once per row)
    df["Soil_Count"] = df[soil_features].sum(axis=1)
    df["Wilderness_Count"] = df[wilderness_features].sum(axis=1)

    # Water/Fire Interactions
    df['Hydro_Fire_Sum'] = df['Horizontal_Distance_To_Hydrology'] + df['Horizontal_Distance_To_Fire_Points']
    df['Hydro_Fire_AbsDiff'] = abs(df['Horizontal_Distance_To_Hydrology'] - df['Horizontal_Distance_To_Fire_Points'])
    df['Hydro_Fire_EpsSum'] = start_at_eps(df['Horizontal_Distance_To_Hydrology']) + start_at_eps(df['Horizontal_Distance_To_Fire_Points'])
    df['Hydro_Fire_Diff'] = df['Horizontal_Distance_To_Hydrology'] - df['Horizontal_Distance_To_Fire_Points']

    # Elevation Interactions
    df['Elev_HHydro_Diff'] = df.Elevation - df.Horizontal_Distance_To_Hydrology * 0.2

    return df

In [24]:
# LightGBM with its engineered feature set
cv_score, oof_preds, test_score, fi_scores = score_features(lgbm_pipeline, lightgbm_features)

# Summary row: label, CV accuracy, holdout accuracy, then one recall per class
new_rows.append(
    ('LightGBM_Features', cv_score, test_score)
    + tuple(recall_score(y_train, oof_preds, average = None))
)

fi_scores

Fold 0 Accuracy:  0.94934 in 211.08s.
Fold 1 Accuracy:  0.94828 in 205.6s.
Fold 2 Accuracy:  0.95022 in 203.56s.
Fold 3 Accuracy:  0.9504 in 204.44s.
Fold 4 Accuracy:  0.94954 in 211.66s.
Fold 5 Accuracy:  0.95028 in 208.24s.
Fold 6 Accuracy:  0.94498 in 205.18s.
Fold 7 Accuracy:  0.94992 in 214.62s.
Fold 8 Accuracy:  0.95406 in 183.85s.
Fold 9 Accuracy:  0.95288 in 213.91s.
Train Accuracy: 0.94999
Test Accuracy: 0.95844
Training Time: 2062.14s



Slope                                -0.000045
Elev_VHydro_Diff                      0.000000
Aspect                                0.000036
Hillshade_9am                         0.000069
Wilderness_Area2                      0.000118
Hillshade_3pm                         0.000163
Hillshade_Noon                        0.000459
Wilderness_Count                      0.000693
Wilderness_Area4                      0.002885
Vertical_Distance_To_Hydrology        0.003821
Hydro_Taxicab_Pos                     0.003905
Hydro_Euclid                          0.004550
Hydro_Fire_AbsDiff                    0.005680
Hydro_Taxicab                         0.005954
Hydro_Fire_Sum                        0.007794
Hydro_Fire_EpsSum                     0.008380
Hydro_Fire_Diff                       0.010002
Horizontal_Distance_To_Hydrology      0.010099
Horizontal_Distance_To_Fire_Points    0.014570
Wilderness_Area1                      0.017055
Wilderness_Area3                      0.021079
Soil_Count   

## 4. CatBoost

In [25]:
def catboost_features(data):
    """Return a copy of ``data`` with the engineered columns used for CatBoost.

    Adds, in this order: the hydrology distance features (``Hydro_Taxicab``,
    ``Hydro_Taxicab_Pos``, ``Hydro_Euclid``, ``Hydro_Euclid_Pos``,
    ``Water Elevation``), ``Soil_Count``, four hydrology/fire interactions,
    four roadway interactions and the two elevation differences. The two
    hydrology distance columns are also cast to float64 in the returned copy.
    The input frame is not modified.

    Other candidates that were tried in this cell (all aspect and hillshade
    features, ``Water_Direction``, ``Wilderness_Count`` and elevation
    products) are not produced for this model.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature frame containing the original terrain columns plus the
        ``Soil_Type*`` indicator columns.

    Returns
    -------
    pandas.DataFrame
        New frame: the original columns followed by the engineered ones.
    """
    df = data.copy()
    soil_features = [x for x in df.columns if x.startswith("Soil_Type")]

    # Water Features
    df["Horizontal_Distance_To_Hydrology"] = df["Horizontal_Distance_To_Hydrology"].astype('float64')
    df["Vertical_Distance_To_Hydrology"] = df["Vertical_Distance_To_Hydrology"].astype('float64')
    pos_h_hydrology = start_at_eps(df["Horizontal_Distance_To_Hydrology"])
    pos_v_hydrology = start_at_eps(df['Vertical_Distance_To_Hydrology'])
    # Euclidean norm of the shifted distances; stored twice below, once as
    # float32 (`Hydro_Taxicab_Pos`, despite its name) and once as computed
    shifted_norm = np.sqrt(pos_h_hydrology ** 2 + pos_v_hydrology ** 2)
    df["Hydro_Taxicab"] = np.abs(df["Horizontal_Distance_To_Hydrology"]) + np.abs(df["Vertical_Distance_To_Hydrology"])
    df['Hydro_Taxicab_Pos'] = shifted_norm.astype(np.float32)
    df["Hydro_Euclid"] = (df["Horizontal_Distance_To_Hydrology"]**2 + np.abs(df["Vertical_Distance_To_Hydrology"])**2)**0.5
    df['Hydro_Euclid_Pos'] = shifted_norm
    df['Water Elevation'] = df['Elevation'] - df['Vertical_Distance_To_Hydrology']

    # Count Features: vectorised row sum (the previous `apply(sum, axis=1)`
    # ran a Python-level sum once per row)
    df["Soil_Count"] = df[soil_features].sum(axis=1)

    # Water/Fire Interactions
    df['Hydro_Fire_Sum'] = df['Horizontal_Distance_To_Hydrology'] + df['Horizontal_Distance_To_Fire_Points']
    df['Hydro_Fire_AbsDiff'] = abs(df['Horizontal_Distance_To_Hydrology'] - df['Horizontal_Distance_To_Fire_Points'])
    df['Hydro_Fire_EpsSum'] = start_at_eps(df['Horizontal_Distance_To_Hydrology']) + start_at_eps(df['Horizontal_Distance_To_Fire_Points'])
    df['Hydro_Fire_Diff'] = df['Horizontal_Distance_To_Hydrology'] - df['Horizontal_Distance_To_Fire_Points']

    # Roadway Interactions
    df['Hydro_Road_1'] = abs(df['Horizontal_Distance_To_Hydrology'] + df['Horizontal_Distance_To_Roadways'])
    df['Hydro_Road_2'] = abs(df['Horizontal_Distance_To_Hydrology'] - df['Horizontal_Distance_To_Roadways'])
    df['Fire_Road_1'] = abs(df['Horizontal_Distance_To_Fire_Points'] + df['Horizontal_Distance_To_Roadways'])
    df['Fire_Road_2'] = abs(df['Horizontal_Distance_To_Fire_Points'] - df['Horizontal_Distance_To_Roadways'])

    # Elevation Interactions (Elev_VHydro_Diff uses the same formula as
    # 'Water Elevation' above)
    df['Elev_VHydro_Diff'] = df.Elevation - df.Vertical_Distance_To_Hydrology
    df['Elev_HHydro_Diff'] = df.Elevation - df.Horizontal_Distance_To_Hydrology * 0.2

    return df

In [26]:
# CatBoost with its engineered feature set
cv_score, oof_preds, test_score, fi_scores = score_features(catboost_pipeline, catboost_features)

# Summary row: label, CV accuracy, holdout accuracy, then one recall per class
new_rows.append(
    ('CatBoost_Features', cv_score, test_score)
    + tuple(recall_score(y_train, oof_preds, average = None))
)

fi_scores

Fold 0 Accuracy:  0.96084 in 181.91s.
Fold 1 Accuracy:  0.96124 in 182.56s.
Fold 2 Accuracy:  0.95948 in 183.62s.
Fold 3 Accuracy:  0.96142 in 186.29s.
Fold 4 Accuracy:  0.95962 in 182.08s.
Fold 5 Accuracy:  0.96204 in 180.43s.
Fold 6 Accuracy:  0.95916 in 181.09s.
Fold 7 Accuracy:  0.9606 in 180.96s.
Fold 8 Accuracy:  0.96222 in 181.19s.
Fold 9 Accuracy:  0.9598 in 181.33s.
Train Accuracy: 0.96064
Test Accuracy: 0.96082
Training Time: 1821.46s



Slope                                -0.000029
Hillshade_9am                         0.000012
Hillshade_3pm                         0.000024
Wilderness_Area2                      0.000079
Aspect                                0.000100
Horizontal_Distance_To_Hydrology      0.000160
Hydro_Euclid                          0.000206
Hydro_Taxicab_Pos                     0.000270
Hydro_Euclid_Pos                      0.000288
Vertical_Distance_To_Hydrology        0.000294
Hydro_Taxicab                         0.000330
Hillshade_Noon                        0.000427
Hydro_Fire_EpsSum                     0.000716
Hydro_Road_1                          0.000888
Hydro_Fire_AbsDiff                    0.001514
Hydro_Fire_Sum                        0.001530
Hydro_Fire_Diff                       0.001596
Wilderness_Area4                      0.001720
Horizontal_Distance_To_Fire_Points    0.002580
Hydro_Road_2                          0.002973
Fire_Road_2                           0.003645
Wilderness_Ar

# Summary

In [27]:
# One row per experiment, ordered from lowest to highest holdout accuracy
pd.DataFrame.from_records(
    data = new_rows,
    columns = ['features', 'cv_scores', 'holdout'] + [f'recall_{i}' for i in range(6)],
).sort_values(by = 'holdout')

Unnamed: 0,features,cv_scores,holdout,recall_0,recall_1,recall_2,recall_3,recall_4,recall_5
2,LightGBM,0.946184,0.955393,0.9551,0.961865,0.834778,0.0,0.415266,0.619555
1,XGBoost,0.95766,0.958325,0.963682,0.971842,0.875,0.170213,0.422969,0.663112
3,CatBoost,0.957962,0.958354,0.964499,0.972132,0.876758,0.106383,0.420168,0.648079
6,LightGBM_Features,0.94999,0.958444,0.955726,0.964058,0.853336,0.0,0.453782,0.704227
0,Keras,0.95695,0.959554,0.960445,0.970017,0.881622,0.106383,0.432073,0.738019
4,Keras_Features,0.957764,0.960043,0.960963,0.971177,0.885015,0.06383,0.39986,0.731466
7,CatBoost_Features,0.960642,0.960821,0.965567,0.973257,0.885096,0.148936,0.490196,0.714891
5,XGBoost_Features,0.960636,0.96122,0.965251,0.972238,0.888326,0.170213,0.52381,0.742516
