# TPS12 - Feature Engineering

In [1]:
# Quick-iteration knobs: shrink these to smoke-test changes to the notebook
RANDOM_SEED = 0        # seed passed to the data split, the CV folds and permutation importance
NUM_FOLDS = 3          # number of stratified cross-validation folds
TRAIN_SIZE = 500_000   # rows kept for training; the remaining rows form the holdout

In [16]:
import numpy as np
import pandas as pd
import time
import os
import pyarrow
import gc

# Hide warnings
import warnings
warnings.filterwarnings('ignore')
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 

# Model/Evaluation
from functools import partial
from sklearn.base import clone
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder, RobustScaler
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import accuracy_score, recall_score
from sklearn.inspection import permutation_importance

# Tensorflow/Keras
import tensorflow as tf
from tensorflow import keras

# Keras imports
from tensorflow.keras import layers
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier

# Load and Prepare Data

In [3]:
# Read the complete training table
train = pd.read_feather('../data/train.feather')

# Drop Id plus the Soil_Type7 / Soil_Type15 columns (low/no variance),
# then discard every row whose Cover_Type is 5
train = train.drop(columns=["Soil_Type7", "Id", "Soil_Type15"])
train = train[train.Cover_Type != 5]

# Re-map the remaining class labels to consecutive integers
new_encoder = LabelEncoder()
train["Cover_Type"] = new_encoder.fit_transform(train["Cover_Type"])

# Stratified split: TRAIN_SIZE rows for training, everything else is held out
train, test = train_test_split(
    train,
    stratify=train['Cover_Type'],
    train_size=TRAIN_SIZE,
    random_state=RANDOM_SEED,
)
y_train = train['Cover_Type']


# Feature-name lists and the accumulator for per-experiment summary rows
features = [col for col in train.columns if col not in ('Id', 'Cover_Type')]
nonsoil = [col for col in features if not col.startswith('Soil_Type')]
new_rows = []
gc.collect()

print(f'Training Size: {train.shape[0]} rows, {train.shape[1]} cols')
print(f'Holdout Size: {test.shape[0]} rows, {test.shape[1]} cols\n')

Training Size: 500000 rows, 53 cols
Holdout Size: 3499999 rows, 53 cols



# Model 

In [4]:
# Training settings shared by every Keras pipeline in this notebook
BATCH_SIZE = 512
EPOCHS = 100      # upper bound on epochs per fit
EARLY_STOP = 5    # patience handed to the EarlyStopping callback
VERBOSE = 0
# Number of distinct encoded classes in the training split
NUM_CLASSES = train.Cover_Type.nunique()

In [20]:
def build_model(input_size):
    """Build and compile a SELU feed-forward classifier.

    The network stacks three Dense/SELU layers of widths ``input_size``,
    ``3*input_size//4`` and ``input_size//2`` (LeCun-normal initialised)
    and ends in a softmax layer with ``NUM_CLASSES`` units.

    Parameters
    ----------
    input_size : int
        Number of input columns the model expects.

    Returns
    -------
    keras.Sequential
        Model compiled with sparse categorical cross-entropy, the Adam
        optimizer and the accuracy metric.
    """
    hidden_widths = (input_size, 3 * input_size // 4, input_size // 2)

    model = keras.Sequential()
    model.add(layers.InputLayer(input_shape=(input_size,)))
    for width in hidden_widths:
        model.add(
            layers.Dense(width, kernel_initializer="lecun_normal", activation="selu")
        )
    model.add(layers.Dense(NUM_CLASSES, activation="softmax"))

    model.compile(
        optimizer=tf.keras.optimizers.Adam(),
        loss=tf.keras.losses.SparseCategoricalCrossentropy(),
        metrics=['accuracy'],
    )
    return model

In [6]:
# Fit parameters for Keras model
# Pipeline = RobustScaler followed by the network from build_model.
# input_size is fixed to len(features) here, so this pipeline only fits
# frames with exactly as many columns as the base feature list.
keras_pipeline = make_pipeline(
    RobustScaler(),
    KerasClassifier(
        partial(build_model, input_size = len(features)),
        batch_size = BATCH_SIZE,
        epochs = EPOCHS,
        verbose = VERBOSE,
        shuffle = True,
        # 10% of each fit's data is set aside to produce val_loss
        validation_split = 0.1,
        callbacks = [
            # Stop after EARLY_STOP epochs without val_loss improvement and
            # roll back to the best weights seen
            EarlyStopping(
                patience=EARLY_STOP,
                monitor='val_loss',
                restore_best_weights=True,
            ),
        ],
    )
)

# Scoring Function

In [7]:
def score_features(sklearn_model, processing = None):
    """Cross-validate a model on the training split and score it on the holdout.

    Reads the notebook-level ``train`` and ``test`` frames together with
    ``NUM_FOLDS`` and ``RANDOM_SEED``. Prints the accuracy and elapsed time of
    every fold, then the mean CV accuracy, the holdout accuracy and the total
    elapsed time.

    Parameters
    ----------
    sklearn_model : estimator
        Unfitted scikit-learn compatible estimator providing ``fit``,
        ``predict`` and ``predict_proba``. A fresh clone is fitted per fold.
    processing : callable, optional
        Function taking a feature DataFrame and returning a new DataFrame.
        It is applied to the training and holdout features separately
        before cross-validation.

    Returns
    -------
    tuple
        ``(mean_cv_accuracy, oof_preds, test_score, fi_scores)``:
        the mean fold accuracy, the out-of-fold class prediction for every
        training row, the holdout accuracy obtained from the fold-summed
        class probabilities, and a Series of fold-averaged permutation
        importances restricted to the non-``Soil_Type`` columns, sorted
        ascending.
    """

    # Original Training/Test Split
    base_cols = [x for x in train.columns if x not in ['Id','Cover_Type']]
    X_temp, X_test = train[base_cols], test[base_cols]
    y_temp, y_test = train['Cover_Type'], test['Cover_Type']

    # Feature Engineering
    if processing:
        X_temp = processing(X_temp)
        X_test = processing(X_test)
    features = [x for x in X_temp.columns]

    # Accumulators. The holdout probability matrix is created from the first
    # fold's predict_proba output, so its width always matches what the model
    # emits instead of relying on a hard-coded class count.
    test_proba = None
    oof_preds = np.zeros((X_temp.shape[0],))
    fi_scores = np.zeros((X_temp.shape[1],))
    scores, times = np.zeros(NUM_FOLDS), np.zeros(NUM_FOLDS)

    # Stratified k-fold cross-validation
    skf = StratifiedKFold(n_splits = NUM_FOLDS, shuffle = True, random_state = RANDOM_SEED)
    for fold, (train_idx, valid_idx) in enumerate(skf.split(X_temp,y_temp)):

        # Training and Validation Sets
        X_fit, X_valid = X_temp.iloc[train_idx], X_temp.iloc[valid_idx]
        y_fit, y_valid = y_temp.iloc[train_idx], y_temp.iloc[valid_idx]

        # Fit a fresh clone so no state carries over between folds.
        # NOTE: the timer started here also spans the permutation importance
        # and both prediction passes below, not just the fit.
        start = time.time()
        model = clone(sklearn_model)
        model.fit(X_fit, y_fit)

        # Permutation Importance on the validation fold, averaged over folds
        result = permutation_importance(
            model, X_valid, y_valid,
            n_repeats=10, random_state=RANDOM_SEED
        )
        fi_scores += result.importances_mean / NUM_FOLDS

        # validation/holdout predictions
        valid_preds = np.ravel(model.predict(X_valid))
        oof_preds[valid_idx] = valid_preds
        fold_proba = np.asarray(model.predict_proba(X_test), dtype=float)
        test_proba = fold_proba if test_proba is None else test_proba + fold_proba

        # Save scores and times
        scores[fold] = accuracy_score(y_valid, valid_preds)
        end = time.time()
        times[fold] = end-start
        print(f'Fold {fold} Accuracy:  {round(scores[fold], 5)} in {round(end-start,2)}s.')
        time.sleep(0.5)  # short pause between folds (presumably to let output flush -- confirm)

    # Holdout class = argmax of the probabilities summed over all folds
    nonsoil = [x for x in X_test.columns if not x.startswith('Soil_Type')]
    test_preds = np.argmax(test_proba, axis = 1)
    test_score = accuracy_score(y_test, test_preds)
    print("Train Accuracy:", round(scores.mean(), 5))
    print('Test Accuracy:', round(test_score, 5))
    print(f'Training Time: {round(times.sum(), 2)}s')

    fi_scores = pd.Series(
        data = fi_scores,
        index = features
    ).loc[nonsoil].sort_values()

    return scores.mean(), oof_preds, test_score, fi_scores

# Baseline

In [8]:
# Score the unmodified feature set as the reference point
cv_score, oof_preds, test_score, fi_scores = score_features(keras_pipeline)

# Summary row: label, CV accuracy, holdout accuracy, then one recall per class
new_rows.append((
    'Baseline',
    cv_score,
    test_score,
    *recall_score(y_train, oof_preds, average=None),
))

# Display the permutation importances returned by score_features
fi_scores

Fold 0 Accuracy:  0.95626 in 202.62s.
Fold 1 Accuracy:  0.95452 in 194.71s.
Fold 2 Accuracy:  0.9542 in 193.17s.
Train Accuracy: 0.95499
Test Accuracy: 0.95672
Training Time: 590.5s


Hillshade_9am                        -0.000017
Slope                                 0.000040
Hillshade_3pm                         0.000077
Aspect                                0.000087
Wilderness_Area2                      0.000095
Hillshade_Noon                        0.000465
Wilderness_Area1                      0.005630
Horizontal_Distance_To_Hydrology      0.010718
Wilderness_Area4                      0.012379
Vertical_Distance_To_Hydrology        0.018118
Wilderness_Area3                      0.022362
Horizontal_Distance_To_Fire_Points    0.031108
Horizontal_Distance_To_Roadways       0.048320
Elevation                             0.458086
dtype: float64

# Feature Engineering

## 1. Fix Aspect and Hillshade Values

In [9]:
# Fix aspect
def fix_ranges(data):
    """Return a copy of ``data`` with Aspect and Hillshade forced into range.

    ``Aspect`` is wrapped into ``[0, 360)`` and the three Hillshade columns
    are clipped to ``[0, 255]``. The input frame is not modified.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``Aspect``, ``Hillshade_9am``, ``Hillshade_Noon``
        and ``Hillshade_3pm`` columns.

    Returns
    -------
    pandas.DataFrame
        Corrected copy of ``data``.
    """
    df = data.copy()

    # Fix Aspect: assign the whole column rather than using chained indexing
    # (df["Aspect"][mask] += ...), which does not write back to the frame
    # under pandas copy-on-write. Modulo also wraps values that are more
    # than one full turn out of range.
    df["Aspect"] = df["Aspect"] % 360

    # Fix Hillshade: clamp all three columns in a single vectorised call
    shade_cols = ["Hillshade_9am", "Hillshade_Noon", "Hillshade_3pm"]
    df[shade_cols] = df[shade_cols].clip(lower=0, upper=255)

    return df

In [10]:
# Re-score with Aspect/Hillshade values forced into range
cv_score, oof_preds, test_score, fi_scores = score_features(
    keras_pipeline, processing=fix_ranges
)

# Summary row: label, CV accuracy, holdout accuracy, then one recall per class
new_rows.append((
    'Fix_Range',
    cv_score,
    test_score,
    *recall_score(y_train, oof_preds, average=None),
))

# Display the permutation importances returned by score_features
fi_scores

Fold 0 Accuracy:  0.95573 in 195.3s.
Fold 1 Accuracy:  0.95607 in 206.05s.
Fold 2 Accuracy:  0.95613 in 204.97s.
Train Accuracy: 0.95597
Test Accuracy: 0.95771
Training Time: 606.32s


Hillshade_3pm                         0.000032
Aspect                                0.000036
Slope                                 0.000037
Hillshade_9am                         0.000081
Wilderness_Area2                      0.000117
Hillshade_Noon                        0.000395
Wilderness_Area1                      0.004932
Wilderness_Area4                      0.008946
Horizontal_Distance_To_Hydrology      0.011241
Vertical_Distance_To_Hydrology        0.018506
Wilderness_Area3                      0.024273
Horizontal_Distance_To_Fire_Points    0.031478
Horizontal_Distance_To_Roadways       0.049205
Elevation                             0.461040
dtype: float64

## 2. Water Distance Features

In [21]:
# Fit parameters for Keras model
# Same pipeline as before, but the network input is two columns wider than
# the base feature list: water_distance_features, count_features and
# new_shade_features each append two columns.
keras_pipeline = make_pipeline(
    RobustScaler(),
    KerasClassifier(
        partial(build_model, input_size = len(features)+2),
        batch_size = BATCH_SIZE,
        epochs = EPOCHS,
        verbose = VERBOSE,
        shuffle = True,
        # 10% of each fit's data is set aside to produce val_loss
        validation_split = 0.1,
        callbacks = [
            # Stop after EARLY_STOP epochs without val_loss improvement and
            # roll back to the best weights seen
            EarlyStopping(
                patience=EARLY_STOP,
                monitor='val_loss',
                restore_best_weights=True,
            ),
        ],
    )
)

In [22]:
# Distance to Water
def water_distance_features(data):
    """Append taxicab and Euclidean distance-to-hydrology features.

    Works on a copy of ``data``. The two hydrology distance columns are
    converted to float64 for the arithmetic; they and the two new columns
    (``Hydro_Taxicab``, ``Hydro_Euclid``) are stored as float32.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with ``Horizontal_Distance_To_Hydrology`` and
        ``Vertical_Distance_To_Hydrology`` columns.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` with the two extra columns appended.
    """
    df = data.copy()

    # Widen to float64 before squaring
    horiz = df["Horizontal_Distance_To_Hydrology"].astype('float64')
    vert = df["Vertical_Distance_To_Hydrology"].astype('float64')

    # |h| + |v| and sqrt(h^2 + v^2)
    taxicab = horiz.abs() + vert.abs()
    euclid = (horiz**2 + vert.abs()**2)**0.5

    # Store the originals and the new metrics as float32
    df["Horizontal_Distance_To_Hydrology"] = horiz.astype('float32')
    df["Vertical_Distance_To_Hydrology"] = vert.astype('float32')
    df["Hydro_Taxicab"] = taxicab.astype('float32')
    df["Hydro_Euclid"] = euclid.astype('float32')

    return df

In [23]:
# Re-score with the two distance-to-water features added
cv_score, oof_preds, test_score, fi_scores = score_features(
    keras_pipeline, processing=water_distance_features
)

# Summary row: label, CV accuracy, holdout accuracy, then one recall per class
new_rows.append((
    'Water_Dist',
    cv_score,
    test_score,
    *recall_score(y_train, oof_preds, average=None),
))

# Display the permutation importances returned by score_features
fi_scores

Fold 0 Accuracy:  0.95579 in 201.75s.
Fold 1 Accuracy:  0.9564 in 236.53s.
Fold 2 Accuracy:  0.95517 in 212.3s.
Train Accuracy: 0.95579
Test Accuracy: 0.95777
Training Time: 650.58s


Hillshade_3pm                        -0.000086
Hillshade_9am                        -0.000034
Wilderness_Area2                      0.000001
Slope                                 0.000005
Aspect                                0.000020
Hillshade_Noon                        0.000359
Wilderness_Area1                      0.005319
Wilderness_Area4                      0.010468
Vertical_Distance_To_Hydrology        0.012476
Hydro_Taxicab                         0.013185
Hydro_Euclid                          0.018479
Horizontal_Distance_To_Hydrology      0.019509
Wilderness_Area3                      0.023161
Horizontal_Distance_To_Fire_Points    0.032168
Horizontal_Distance_To_Roadways       0.047550
Elevation                             0.461135
dtype: float64

## 3. Count Features

In [24]:
# Create count features
def count_features(data):
    """Append row-wise totals of the soil and wilderness indicator columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose soil columns start with ``Soil_Type`` and whose
        wilderness columns start with ``Wilderness_Area``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` with ``Soil_Count`` and ``Wilderness_Count``
        appended; the input frame is not modified.
    """
    df = data.copy()
    soil_features = [x for x in df.columns if x.startswith("Soil_Type")]
    wilderness_features = [x for x in df.columns if x.startswith("Wilderness_Area")]

    # Count features: vectorised row sums instead of apply(sum, axis=1),
    # which runs a Python-level loop over every row
    df["Soil_Count"] = df[soil_features].sum(axis=1)
    df["Wilderness_Count"] = df[wilderness_features].sum(axis=1)

    return df

In [25]:
# Re-score with the soil / wilderness count features added
cv_score, oof_preds, test_score, fi_scores = score_features(
    keras_pipeline, processing=count_features
)

# Summary row: label, CV accuracy, holdout accuracy, then one recall per class
new_rows.append((
    'Count',
    cv_score,
    test_score,
    *recall_score(y_train, oof_preds, average=None),
))

# Display the permutation importances returned by score_features
fi_scores

Fold 0 Accuracy:  0.95646 in 239.44s.
Fold 1 Accuracy:  0.95551 in 240.48s.
Fold 2 Accuracy:  0.95607 in 233.46s.
Train Accuracy: 0.95601
Test Accuracy: 0.95785
Training Time: 713.38s


Hillshade_3pm                        -0.000052
Hillshade_9am                        -0.000029
Aspect                                0.000006
Slope                                 0.000061
Hillshade_Noon                        0.000371
Wilderness_Area2                      0.001008
Wilderness_Count                      0.002128
Wilderness_Area4                      0.007962
Horizontal_Distance_To_Hydrology      0.010908
Wilderness_Area1                      0.011653
Wilderness_Area3                      0.017273
Vertical_Distance_To_Hydrology        0.019399
Horizontal_Distance_To_Fire_Points    0.030980
Soil_Count                            0.046622
Horizontal_Distance_To_Roadways       0.047309
Elevation                             0.462201
dtype: float64

## 4. Shade Features

In [26]:
# Create Shade features
def new_shade_features(data):
    """Append the mean and the range of the three Hillshade columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with ``Hillshade_9am``, ``Hillshade_Noon`` and
        ``Hillshade_3pm`` columns.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` with ``Hillshade_Avg`` (row mean) and
        ``Hillshade_Range`` (row max minus row min) appended.
    """
    df = data.copy()

    # Select the three readings once and derive both statistics from them
    shades = df[['Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm']]
    row_low = shades.min(axis=1)
    row_high = shades.max(axis=1)

    df["Hillshade_Avg"] = shades.mean(axis=1)
    df['Hillshade_Range'] = row_high - row_low

    return df

In [27]:
# Re-score with the Hillshade mean / range features added
cv_score, oof_preds, test_score, fi_scores = score_features(
    keras_pipeline, processing=new_shade_features
)

# Summary row: label, CV accuracy, holdout accuracy, then one recall per class
new_rows.append((
    'Shade_Features',
    cv_score,
    test_score,
    *recall_score(y_train, oof_preds, average=None),
))

# Display the permutation importances returned by score_features
fi_scores

Fold 0 Accuracy:  0.95721 in 236.64s.
Fold 1 Accuracy:  0.95554 in 218.33s.
Fold 2 Accuracy:  0.9535 in 209.59s.
Train Accuracy: 0.95542
Test Accuracy: 0.95734
Training Time: 664.56s


Slope                                -0.000042
Aspect                               -0.000014
Wilderness_Area2                     -0.000004
Hillshade_Range                       0.000525
Hillshade_Noon                        0.001885
Hillshade_9am                         0.003416
Wilderness_Area1                      0.005463
Hillshade_3pm                         0.006396
Hillshade_Avg                         0.009258
Horizontal_Distance_To_Hydrology      0.010001
Wilderness_Area4                      0.011735
Vertical_Distance_To_Hydrology        0.018436
Wilderness_Area3                      0.022883
Horizontal_Distance_To_Fire_Points    0.031208
Horizontal_Distance_To_Roadways       0.047122
Elevation                             0.460704
dtype: float64

## 5. Distance Interactions

In [28]:
# Fit parameters for Keras model
keras_pipeline = make_pipeline(
    RobustScaler(),
    KerasClassifier(
        partial(build_model, input_size = len(features)+6),
        batch_size = BATCH_SIZE,
        epochs = EPOCHS,
        verbose = VERBOSE,
        shuffle = True,
        validation_split = 0.1,
        callbacks = [
            EarlyStopping(
                patience=EARLY_STOP,
                monitor='val_loss',
                restore_best_weights=True,
            ),
        ],
    )
)

In [29]:
def distance_interactions(data):
    """Append pairwise sum/difference features of the horizontal distances.

    For each pair among hydrology, fire points and roadways two columns are
    added: ``<pair>_1`` built from the sum and ``<pair>_2`` from the
    absolute difference.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with ``Horizontal_Distance_To_Hydrology``,
        ``Horizontal_Distance_To_Fire_Points`` and
        ``Horizontal_Distance_To_Roadways`` columns.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` with six interaction columns appended.
    """
    df = data.copy()

    hydro = df['Horizontal_Distance_To_Hydrology']
    fire = df['Horizontal_Distance_To_Fire_Points']
    road = df['Horizontal_Distance_To_Roadways']

    # NOTE(review): Hydro_Fire_1 is the raw sum, while the other two "_1"
    # features take the absolute value of the sum -- confirm this is intended.
    df['Hydro_Fire_1'] = hydro + fire
    df['Hydro_Fire_2'] = (hydro - fire).abs()
    df['Hydro_Road_1'] = (hydro + road).abs()
    df['Hydro_Road_2'] = (hydro - road).abs()
    df['Fire_Road_1'] = (fire + road).abs()
    df['Fire_Road_2'] = (fire - road).abs()

    return df

In [30]:
# Re-score with the six pairwise distance interaction features added
cv_score, oof_preds, test_score, fi_scores = score_features(
    keras_pipeline, processing=distance_interactions
)

# Summary row: label, CV accuracy, holdout accuracy, then one recall per class
new_rows.append((
    'Dist_Interactions',
    cv_score,
    test_score,
    *recall_score(y_train, oof_preds, average=None),
))

# Display the permutation importances returned by score_features
fi_scores

Fold 0 Accuracy:  0.95562 in 238.7s.
Fold 1 Accuracy:  0.95645 in 242.24s.
Fold 2 Accuracy:  0.95642 in 251.27s.
Train Accuracy: 0.95616
Test Accuracy: 0.95796
Training Time: 732.21s


Hillshade_3pm                        -0.000037
Slope                                -0.000027
Hillshade_9am                         0.000059
Aspect                                0.000061
Wilderness_Area2                      0.000114
Hillshade_Noon                        0.000448
Wilderness_Area1                      0.004179
Wilderness_Area4                      0.009221
Fire_Road_1                           0.009596
Hydro_Road_2                          0.010647
Fire_Road_2                           0.012690
Horizontal_Distance_To_Hydrology      0.013540
Hydro_Fire_2                          0.013722
Hydro_Fire_1                          0.014504
Horizontal_Distance_To_Fire_Points    0.014574
Horizontal_Distance_To_Roadways       0.016397
Vertical_Distance_To_Hydrology        0.018744
Hydro_Road_1                          0.020072
Wilderness_Area3                      0.024378
Elevation                             0.462099
dtype: float64

## 6. Elevation Interactions

In [31]:
def elevation_interactions(data):
    """Append elevation-based interaction features and two indicator flags.

    Added columns:
    ``EHiElv`` (roadway distance * elevation), ``EViElv`` (vertical
    hydrology distance * elevation), ``EVDtH`` (elevation minus vertical
    hydrology distance), ``EHDtH`` (elevation minus 0.2 * horizontal
    hydrology distance), ``Highwater`` (1 where the vertical hydrology
    distance is negative) and ``Hillshade_3pm_is_zero`` (1 where
    ``Hillshade_3pm`` equals 0).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with ``Elevation``, ``Horizontal_Distance_To_Roadways``,
        ``Vertical_Distance_To_Hydrology``,
        ``Horizontal_Distance_To_Hydrology`` and ``Hillshade_3pm`` columns.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` with the six columns appended.
    """
    df = data.copy()

    elevation = df['Elevation']
    vert_hydro = df['Vertical_Distance_To_Hydrology']
    horiz_hydro = df['Horizontal_Distance_To_Hydrology']

    # Products and differences with elevation
    df['EHiElv'] = df['Horizontal_Distance_To_Roadways'] * elevation
    df['EViElv'] = vert_hydro * elevation
    df['EVDtH'] = elevation - vert_hydro
    df['EHDtH'] = elevation - horiz_hydro * 0.2

    # Throwaway Features most likely
    df['Highwater'] = (vert_hydro < 0).astype(int)
    df['Hillshade_3pm_is_zero'] = (df['Hillshade_3pm'] == 0).astype(int)

    return df

In [32]:
# Re-score with the elevation interaction features added
cv_score, oof_preds, test_score, fi_scores = score_features(
    keras_pipeline, processing=elevation_interactions
)

# Summary row: label, CV accuracy, holdout accuracy, then one recall per class
new_rows.append((
    'Elev_Interactions',
    cv_score,
    test_score,
    *recall_score(y_train, oof_preds, average=None),
))

# Display the permutation importances returned by score_features
fi_scores

Fold 0 Accuracy:  0.95643 in 250.11s.
Fold 1 Accuracy:  0.95657 in 252.33s.
Fold 2 Accuracy:  0.95526 in 243.23s.
Train Accuracy: 0.95608
Test Accuracy: 0.9581
Training Time: 745.68s


Slope                                -0.000056
Hillshade_3pm_is_zero                -0.000003
EHiElv                                0.000009
Hillshade_9am                         0.000026
Hillshade_3pm                         0.000047
Aspect                                0.000051
Highwater                             0.000059
EViElv                                0.000059
Wilderness_Area2                      0.000135
Hillshade_Noon                        0.000564
Wilderness_Area1                      0.004612
Horizontal_Distance_To_Hydrology      0.007111
Vertical_Distance_To_Hydrology        0.007137
Wilderness_Area4                      0.007697
Wilderness_Area3                      0.024546
Horizontal_Distance_To_Fire_Points    0.031732
Horizontal_Distance_To_Roadways       0.048176
EHDtH                                 0.126353
EVDtH                                 0.128327
Elevation                             0.135859
dtype: float64

## Summary

In [33]:
# Tabulate every experiment, worst holdout accuracy first. The recall column
# names are generated from NUM_CLASSES so the header keeps matching the
# per-class recalls stored in each row if the number of classes changes.
pd.DataFrame.from_records(
    data = new_rows,
    columns = [
        'features', 'cv_scores', 'holdout',
        *(f'recall_{i}' for i in range(NUM_CLASSES)),
    ]
).sort_values('holdout')

Unnamed: 0,features,cv_scores,holdout,recall_0,recall_1,recall_2,recall_3,recall_4,recall_5
0,Baseline,0.954992,0.956718,0.953786,0.972348,0.870585,0.0,0.303922,0.743415
4,Shade_Features,0.955418,0.957339,0.9602,0.968801,0.879251,0.0,0.302521,0.721444
1,Fix_Range,0.955974,0.957713,0.961551,0.969713,0.883012,0.0,0.284314,0.68367
2,Water_Dist,0.955786,0.95777,0.958451,0.96955,0.879333,0.021277,0.368347,0.746627
3,Count,0.956012,0.957847,0.960363,0.969327,0.873896,0.021277,0.39986,0.735449
5,Dist_Interactions,0.95616,0.957956,0.959573,0.96885,0.901815,0.0,0.231092,0.724271
6,Elev_Interactions,0.956084,0.958098,0.961927,0.969483,0.873896,0.0,0.282213,0.71926
