# Notebook 3b - Soil Type Engineering

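In this notebook, we engineer new features by interacting attributes derived from the soil type (climatic zone, geologic zone, surface cover, and rock size) with the wilderness area indicators, and compare each set of interactions against a baseline.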

In [1]:
# Global variables for testing changes to this notebook quickly
# RANDOM_SEED is passed as random_state to every model and to the StratifiedKFold splitter.
RANDOM_SEED = 0
# NUM_FOLDS is the number of cross-validation splits used inside train_original.
NUM_FOLDS = 12

In [2]:
import numpy as np
import pandas as pd
import time
import pyarrow
import gc

# Model evaluation
from functools import partial
from sklearn.base import clone
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import accuracy_score, recall_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier, RandomForestClassifier

# Plotting
import matplotlib
import seaborn as sns
from matplotlib import pyplot as plt

# Hide warnings
import warnings
warnings.filterwarnings('ignore')

# Prepare Data

In [3]:
# Encode soil type
def categorical_encoding(input_df):
    """Collapse the 40 one-hot ``Soil_Type{i}`` columns into one ``Soil_Type`` column.

    The new column holds the sum of ``i * Soil_Type{i}`` over i = 1..40, so a row
    with exactly one indicator set receives that indicator's number and a row
    with none set receives 0. The caller's frame is not modified.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Frame containing the columns ``Soil_Type1`` .. ``Soil_Type40``.

    Returns
    -------
    pandas.DataFrame
        Copy of the input without the 40 indicator columns and with
        ``Soil_Type`` appended as the last column.
    """
    indicator_cols = [f'Soil_Type{k}' for k in range(1, 41)]

    encoded = input_df.copy()
    encoded['Soil_Type'] = 0
    # Weight each indicator by its soil number and accumulate
    for k, col in enumerate(indicator_cols, start = 1):
        encoded['Soil_Type'] += k * encoded[col]

    kept_cols = [col for col in encoded.columns if col not in indicator_cols]
    return encoded[kept_cols]

In [4]:
%%time

# Load original data
original = categorical_encoding(pd.read_feather('../data/original.feather'))

# Label Encode
old_encoder = LabelEncoder()
original["Cover_Type"] = old_encoder.fit_transform(original["Cover_Type"])
y_train = original['Cover_Type'].iloc[:15119]
y_test = original['Cover_Type'].iloc[15119:]

# Get feature columns
features = [x for x in original.columns if x not in ['Id','Cover_Type']]

# Data structures for summary scores
bagging_scores = list()
extratrees_scores = list()
adaboost_scores = list()
random_scores = list()

Wall time: 239 ms


# Scoring Function

In [5]:
def train_original(sklearn_model, processing = None):
    """Cross-validate a classifier on the training rows and score it on the holdout rows.

    Reads the module-level ``original`` frame and ``features`` list, and uses
    ``NUM_FOLDS`` and ``RANDOM_SEED`` for the stratified split. Prints the
    model's class name, the mean fold accuracy, the holdout accuracy and the
    summed per-fold time.

    Parameters
    ----------
    sklearn_model : estimator
        Unfitted classifier. A fresh ``clone`` is fitted on every fold; the
        fitted clone must provide ``predict``, ``predict_proba`` and ``classes_``.
    processing : callable, optional
        Feature-engineering function applied separately to the training and
        holdout feature frames (called with one DataFrame, its return value
        replaces that frame).

    Returns
    -------
    tuple
        ``(mean fold accuracy, out-of-fold predictions, holdout accuracy)``.
        The out-of-fold predictions are a float array with one entry per
        training row.
    """

    # Original Training/Test Split
    # NOTE(review): the same 15119 boundary is hard-coded where the module-level
    # y_train is built; the two must stay in sync.
    X_temp = original[features].iloc[:15119]
    X_test = original[features].iloc[15119:]
    y_temp = original['Cover_Type'].iloc[:15119]
    y_test = original['Cover_Type'].iloc[15119:]

    # Feature Engineering
    if processing:
        X_temp = processing(X_temp)
        X_test = processing(X_test)

    # One probability column per label seen in training, instead of a
    # hard-coded class count
    classes = np.unique(y_temp)
    test_proba = np.zeros((X_test.shape[0], len(classes)))

    # Store the out-of-fold predictions
    oof_preds = np.zeros((X_temp.shape[0],))
    scores, times = np.zeros(NUM_FOLDS), np.zeros(NUM_FOLDS)

    # Stratified k-fold cross-validation
    skf = StratifiedKFold(n_splits = NUM_FOLDS, shuffle = True, random_state = RANDOM_SEED)
    for fold, (train_idx, valid_idx) in enumerate(skf.split(X_temp,y_temp)):

        # Training and Validation Sets
        X_train, X_valid = X_temp.iloc[train_idx], X_temp.iloc[valid_idx]
        y_train, y_valid = y_temp.iloc[train_idx], y_temp.iloc[valid_idx]

        # Create model
        start = time.time()
        model = clone(sklearn_model)
        model.fit(X_train, y_train)

        # validation and test predictions
        valid_preds = np.ravel(model.predict(X_valid))
        oof_preds[valid_idx] = valid_preds
        # predict_proba columns follow model.classes_; place them in the
        # matching accumulator columns rather than assuming column j is label j
        proba_cols = np.searchsorted(classes, model.classes_)
        test_proba[:, proba_cols] += model.predict_proba(X_test)

        # Save scores and times (the timing covers fitting and both predictions)
        scores[fold] = accuracy_score(y_valid, valid_preds)
        end = time.time()
        times[fold] = end-start
        # Pause between folds; it falls outside the timed interval
        time.sleep(0.5)

    # Summed probabilities -> most likely label for every holdout row
    test_preds = classes[np.argmax(test_proba, axis = 1)]
    test_score = accuracy_score(y_test, test_preds)
    print('\n'+sklearn_model.__class__.__name__)
    # "Train Accuracy" is the mean accuracy over the validation folds
    print("Train Accuracy:", round(scores.mean(), 5))
    print('Test Accuracy:', round(test_score, 5))
    print(f'Training Time: {round(times.sum(), 2)}s')

    return scores.mean(), oof_preds, test_score

# Models

We use the following 4 models from the scikit-learn library:

1. AdaBoost 
2. ExtraTrees
3. Bagging
4. Random Forest

In [6]:
# The wrapped tree is passed positionally to AdaBoost and Bagging: the keyword
# was renamed from `base_estimator` to `estimator` in newer scikit-learn
# releases, while the first positional parameter is the wrapped estimator
# under either name.

# AdaBoost Classifier (boosts randomly-split decision trees)
adaboost = AdaBoostClassifier(
    DecisionTreeClassifier(
        splitter = 'random',
        random_state = RANDOM_SEED,
    ),
    random_state = RANDOM_SEED,
)

# ExtraTrees Classifier (max_features = None: every feature is considered at each split)
extratrees = ExtraTreesClassifier(
    n_jobs = -1,
    random_state = RANDOM_SEED,
    max_features = None,
)

# Bagging Classifier (bags randomly-split decision trees)
bagging = BaggingClassifier(
    DecisionTreeClassifier(
        splitter = 'random',
        random_state = RANDOM_SEED,
    ),
    n_jobs = -1,
    random_state = RANDOM_SEED
)

# Random Forest Classifier
randomforest = RandomForestClassifier(
    n_jobs = -1,
    random_state = RANDOM_SEED,
)

# Baselines

In [7]:
# Score each model on the unmodified features, in the order
# AdaBoost, ExtraTrees, Bagging, RandomForest, and log one record per model
for clf, score_log in (
    (adaboost, adaboost_scores),
    (extratrees, extratrees_scores),
    (bagging, bagging_scores),
    (randomforest, random_scores),
):
    cv_score, oof_preds, test_score = train_original(clf)

    # Record: label, CV accuracy, holdout accuracy, then one recall per class
    score_log.append((
        'Baseline', cv_score, test_score,
        *recall_score(y_train, oof_preds, average = None)
    ))


AdaBoostClassifier
Train Accuracy: 0.78596
Test Accuracy: 0.76229
Training Time: 3.68s

ExtraTreesClassifier
Train Accuracy: 0.88538
Test Accuracy: 0.78206
Training Time: 37.48s

BaggingClassifier
Train Accuracy: 0.85006
Test Accuracy: 0.75982
Training Time: 21.03s

RandomForestClassifier
Train Accuracy: 0.86454
Test Accuracy: 0.74822
Training Time: 35.33s


# Categorical Feature Interactions

We test out the following interactions:

1. Climatic Zone and Wilderness Area
2. Geologic Zone and Wilderness Area
3. Surface Cover and Wilderness Area
4. Rock Size and Wilderness Area

## 1. Climatic Zone and Wilderness Area

In [8]:
def climatic_zone_original(input_df):
    """Add a ``Climatic_Zone`` column derived from ``Soil_Type``.

    Each soil type 1..40 is associated with a four-digit code; the zone is the
    leading digit of that code. A ``Soil_Type`` value outside 1..40 raises
    ``KeyError``. The caller's frame is not modified.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Frame with an integer ``Soil_Type`` column.

    Returns
    -------
    pandas.DataFrame
        Copy of the input with ``Climatic_Zone`` added.
    """
    # Four-digit code of soil types 1..40, listed in soil type order
    codes = (
        2702, 2703, 2704, 2705, 2706, 2717, 3501, 3502,
        4201, 4703, 4704, 4744, 4758, 5101, 5151, 6101,
        6102, 6731, 7101, 7102, 7103, 7201, 7202, 7700,
        7701, 7702, 7709, 7710, 7745, 7746, 7755, 7756,
        7757, 7790, 8703, 8707, 8708, 8771, 8772, 8776,
    )
    # The leading digit of a four-digit number is its thousands digit
    zone_by_soil = {soil: value // 1000 for soil, value in enumerate(codes, start = 1)}

    out = input_df.copy()
    out['Climatic_Zone'] = input_df['Soil_Type'].apply(lambda soil: zone_by_soil[soil])
    return out

def wilderness_climatic(input_df, drop = False):
    """Add the four ``Climate_Area{k}`` interaction columns.

    ``Climate_Area{k}`` is ``Wilderness_Area{k}`` multiplied by the climatic
    zone obtained from ``climatic_zone_original``. The caller's frame is not
    modified and the ``Climatic_Zone`` column itself is not included in the
    result.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Frame with ``Soil_Type`` and ``Wilderness_Area1`` .. ``Wilderness_Area4``.
    drop : bool, optional
        Accepted but not used by this function.

    Returns
    -------
    pandas.DataFrame
        Copy of the input with the four interaction columns appended.
    """
    zone = climatic_zone_original(input_df)['Climatic_Zone']
    out = input_df.copy()
    for area in range(1, 5):
        out[f'Climate_Area{area}'] = out[f'Wilderness_Area{area}'] * zone
    return out

In [9]:
# Score each model with the climatic zone x wilderness area interactions, in the
# order AdaBoost, ExtraTrees, Bagging, RandomForest, and log one record per model
for clf, score_log in (
    (adaboost, adaboost_scores),
    (extratrees, extratrees_scores),
    (bagging, bagging_scores),
    (randomforest, random_scores),
):
    cv_score, oof_preds, test_score = train_original(clf, wilderness_climatic)

    # Record: label, CV accuracy, holdout accuracy, then one recall per class
    score_log.append((
        'Wild_Clim', cv_score, test_score,
        *recall_score(y_train, oof_preds, average = None)
    ))


AdaBoostClassifier
Train Accuracy: 0.78868
Test Accuracy: 0.75762
Training Time: 3.43s

ExtraTreesClassifier
Train Accuracy: 0.88524
Test Accuracy: 0.78313
Training Time: 36.32s

BaggingClassifier
Train Accuracy: 0.85297
Test Accuracy: 0.76035
Training Time: 21.58s

RandomForestClassifier
Train Accuracy: 0.86441
Test Accuracy: 0.7501
Training Time: 35.42s


## 2. Geologic Zone and Wilderness Area

In [10]:
def geologic_zone_original(input_df):
    """Add a ``Geologic_Zone`` column derived from ``Soil_Type``.

    Each soil type 1..40 is associated with a four-digit code; the zone is the
    second digit of that code. A ``Soil_Type`` value outside 1..40 raises
    ``KeyError``. The caller's frame is not modified.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Frame with an integer ``Soil_Type`` column.

    Returns
    -------
    pandas.DataFrame
        Copy of the input with ``Geologic_Zone`` added.
    """
    # Four-digit code of soil types 1..40, listed in soil type order
    codes = (
        2702, 2703, 2704, 2705, 2706, 2717, 3501, 3502,
        4201, 4703, 4704, 4744, 4758, 5101, 5151, 6101,
        6102, 6731, 7101, 7102, 7103, 7201, 7202, 7700,
        7701, 7702, 7709, 7710, 7745, 7746, 7755, 7756,
        7757, 7790, 8703, 8707, 8708, 8771, 8772, 8776,
    )
    # The second digit of a four-digit number is its hundreds digit
    zone_by_soil = {soil: (value // 100) % 10 for soil, value in enumerate(codes, start = 1)}

    out = input_df.copy()
    out['Geologic_Zone'] = input_df['Soil_Type'].apply(lambda soil: zone_by_soil[soil])
    return out

def wilderness_geologic(input_df, drop = False):
    """Add the four ``Geologic_Area{k}`` interaction columns.

    ``Geologic_Area{k}`` is ``Wilderness_Area{k}`` multiplied by the geologic
    zone obtained from ``geologic_zone_original``. The caller's frame is not
    modified and the ``Geologic_Zone`` column itself is not included in the
    result.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Frame with ``Soil_Type`` and ``Wilderness_Area1`` .. ``Wilderness_Area4``.
    drop : bool, optional
        Accepted but not used by this function.

    Returns
    -------
    pandas.DataFrame
        Copy of the input with the four interaction columns appended.
    """
    zone = geologic_zone_original(input_df)['Geologic_Zone']
    out = input_df.copy()
    for area in range(1, 5):
        out[f'Geologic_Area{area}'] = out[f'Wilderness_Area{area}'] * zone
    return out

In [11]:
# Score each model with the geologic zone x wilderness area interactions, in the
# order AdaBoost, ExtraTrees, Bagging, RandomForest, and log one record per model
for clf, score_log in (
    (adaboost, adaboost_scores),
    (extratrees, extratrees_scores),
    (bagging, bagging_scores),
    (randomforest, random_scores),
):
    cv_score, oof_preds, test_score = train_original(clf, wilderness_geologic)

    # Record: label, CV accuracy, holdout accuracy, then one recall per class
    score_log.append((
        'Wild_Geo', cv_score, test_score,
        *recall_score(y_train, oof_preds, average = None)
    ))


AdaBoostClassifier
Train Accuracy: 0.7906
Test Accuracy: 0.76019
Training Time: 3.52s

ExtraTreesClassifier
Train Accuracy: 0.88339
Test Accuracy: 0.78244
Training Time: 35.59s

BaggingClassifier
Train Accuracy: 0.85336
Test Accuracy: 0.75781
Training Time: 19.68s

RandomForestClassifier
Train Accuracy: 0.8654
Test Accuracy: 0.75015
Training Time: 34.45s


## 3. Surface Cover and Wilderness Area

In [12]:
def surface_cover_original(input_df):
    """Add a ``Surface_Cover`` column that groups ``Soil_Type`` into five levels.

    Levels: 0 = no description, 1 = stony, 2 = very stony, 3 = extremely stony,
    4 = rubbly. A ``Soil_Type`` value outside 1..40 raises ``KeyError``. The
    caller's frame is not modified.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Frame with an integer ``Soil_Type`` column.

    Returns
    -------
    pandas.DataFrame
        Copy of the input with ``Surface_Cover`` added.
    """
    # Soil type ids per level; the position in this tuple is the level value
    groups = (
        [7,8,14,15,16,17,19,20,21,23,35],                           # no description
        [6,12],                                                     # stony
        [2,9,18,26],                                                # very stony
        [1,22,24,25,27,28,29,30,31,32,33,34,36,37,38,39,40],        # extremely stony
        [3,4,5,10,11,13],                                           # rubbly
    )
    surface_cover = {
        soil: level
        for level, members in enumerate(groups)
        for soil in members
    }

    out = input_df.copy()
    out['Surface_Cover'] = input_df['Soil_Type'].apply(lambda soil: surface_cover[soil])
    return out

def wilderness_surface(input_df, drop = False):
    """Add the four ``Surface_Area{k}`` interaction columns.

    ``Surface_Area{k}`` is ``Wilderness_Area{k}`` multiplied by the surface
    cover level obtained from ``surface_cover_original``. The caller's frame is
    not modified and the ``Surface_Cover`` column itself is not included in
    the result.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Frame with ``Soil_Type`` and ``Wilderness_Area1`` .. ``Wilderness_Area4``.
    drop : bool, optional
        Accepted but not used by this function.

    Returns
    -------
    pandas.DataFrame
        Copy of the input with the four interaction columns appended.
    """
    cover = surface_cover_original(input_df)['Surface_Cover']
    out = input_df.copy()
    for area in range(1, 5):
        out[f'Surface_Area{area}'] = out[f'Wilderness_Area{area}'] * cover
    return out

In [13]:
# Score each model with the surface cover x wilderness area interactions, in the
# order AdaBoost, ExtraTrees, Bagging, RandomForest, and log one record per model
for clf, score_log in (
    (adaboost, adaboost_scores),
    (extratrees, extratrees_scores),
    (bagging, bagging_scores),
    (randomforest, random_scores),
):
    cv_score, oof_preds, test_score = train_original(clf, wilderness_surface)

    # Record: label, CV accuracy, holdout accuracy, then one recall per class
    score_log.append((
        'Wild_Surf', cv_score, test_score,
        *recall_score(y_train, oof_preds, average = None)
    ))


AdaBoostClassifier
Train Accuracy: 0.78623
Test Accuracy: 0.75803
Training Time: 3.56s

ExtraTreesClassifier
Train Accuracy: 0.88432
Test Accuracy: 0.78333
Training Time: 36.76s

BaggingClassifier
Train Accuracy: 0.85336
Test Accuracy: 0.75816
Training Time: 20.64s

RandomForestClassifier
Train Accuracy: 0.86487
Test Accuracy: 0.75105
Training Time: 37.42s


## 4. Rock Size and Wilderness Area

In [14]:
def rock_size_original(input_df):
    """Add a ``Rock_Size`` column that groups ``Soil_Type`` into four levels.

    Levels: 0 = no description, 1 = stones, 2 = boulders, 3 = rubble. A
    ``Soil_Type`` value outside 1..40 raises ``KeyError``. The caller's frame
    is not modified.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Frame with an integer ``Soil_Type`` column.

    Returns
    -------
    pandas.DataFrame
        Copy of the input with ``Rock_Size`` added.
    """
    # Soil type ids per level; the position in this tuple is the level value
    groups = (
        [7,8,14,15,16,17,19,20,21,23,35],                                           # no description
        [1,2,6,9,12,18,24,25,26,27,28,29,30,31,32,33,34,36,37,38,39,40],            # stones
        [22],                                                                       # boulders
        [3,4,5,10,11,13],                                                           # rubble
    )
    rock_size = {
        soil: level
        for level, members in enumerate(groups)
        for soil in members
    }

    out = input_df.copy()
    out['Rock_Size'] = input_df['Soil_Type'].apply(lambda soil: rock_size[soil])
    return out

def wilderness_rocksize(input_df, drop = False):
    """Add the four ``Rock_Area{k}`` interaction columns.

    ``Rock_Area{k}`` is ``Wilderness_Area{k}`` multiplied by the rock size
    level obtained from ``rock_size_original``. The caller's frame is not
    modified and the ``Rock_Size`` column itself is not included in the result.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Frame with ``Soil_Type`` and ``Wilderness_Area1`` .. ``Wilderness_Area4``.
    drop : bool, optional
        Accepted but not used by this function.

    Returns
    -------
    pandas.DataFrame
        Copy of the input with the four interaction columns appended.
    """
    size = rock_size_original(input_df)['Rock_Size']
    out = input_df.copy()
    for area in range(1, 5):
        out[f'Rock_Area{area}'] = out[f'Wilderness_Area{area}'] * size
    return out

In [15]:
# Score each model with the rock size x wilderness area interactions, in the
# order AdaBoost, ExtraTrees, Bagging, RandomForest, and log one record per model
for clf, score_log in (
    (adaboost, adaboost_scores),
    (extratrees, extratrees_scores),
    (bagging, bagging_scores),
    (randomforest, random_scores),
):
    cv_score, oof_preds, test_score = train_original(clf, wilderness_rocksize)

    # Record: label, CV accuracy, holdout accuracy, then one recall per class
    score_log.append((
        'Wild_Rock', cv_score, test_score,
        *recall_score(y_train, oof_preds, average = None)
    ))


AdaBoostClassifier
Train Accuracy: 0.78729
Test Accuracy: 0.75985
Training Time: 3.67s

ExtraTreesClassifier
Train Accuracy: 0.88591
Test Accuracy: 0.78314
Training Time: 39.85s

BaggingClassifier
Train Accuracy: 0.85145
Test Accuracy: 0.75864
Training Time: 23.36s

RandomForestClassifier
Train Accuracy: 0.86606
Test Accuracy: 0.75107
Training Time: 37.14s


# Summary

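The differences from the baseline are small (well under one percentage point of cross-validation accuracy for every model), so these features probably require more testing (permutation importance, etc.) before drawing conclusions.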

In [22]:
# AdaBoost: one row per feature set, ordered by ascending CV accuracy
pd.DataFrame.from_records(
    data = adaboost_scores,
    columns = ['features', 'cv_score', 'holdout', *(f'recall_{k}' for k in range(7))],
).sort_values('cv_score')

Unnamed: 0,features,cv_score,holdout,recall_0,recall_1,recall_2,recall_3,recall_4,recall_5,recall_6
0,Baseline,0.785965,0.762291,0.674537,0.6375,0.72302,0.918519,0.878704,0.74537,0.924074
3,Wild_Surf,0.786228,0.758028,0.674074,0.615278,0.73321,0.92037,0.881019,0.752315,0.927315
4,Wild_Rock,0.787288,0.759849,0.675463,0.60787,0.732747,0.909722,0.890278,0.75787,0.937037
1,Wild_Clim,0.788677,0.757618,0.684259,0.613889,0.738768,0.90787,0.887963,0.757407,0.930556
2,Wild_Geo,0.790596,0.76019,0.659722,0.613426,0.736915,0.930556,0.896296,0.765741,0.931481


In [23]:
# Extra Trees Classifier: one row per feature set, ordered by ascending CV accuracy
pd.DataFrame.from_records(
    data = extratrees_scores,
    columns = ['features', 'cv_score', 'holdout', *(f'recall_{k}' for k in range(7))],
).sort_values('cv_score')

Unnamed: 0,features,cv_score,holdout,recall_0,recall_1,recall_2,recall_3,recall_4,recall_5,recall_6
2,Wild_Geo,0.883392,0.782445,0.785648,0.738889,0.856878,0.97037,0.959259,0.900926,0.971759
3,Wild_Surf,0.884319,0.783328,0.784722,0.737037,0.855952,0.973611,0.958796,0.90463,0.975463
1,Wild_Clim,0.885244,0.78313,0.791204,0.740278,0.856878,0.970833,0.961574,0.902315,0.973611
0,Baseline,0.885377,0.78206,0.7875,0.7375,0.86151,0.971759,0.961111,0.905556,0.972685
4,Wild_Rock,0.885906,0.783145,0.790278,0.742593,0.861047,0.971296,0.959259,0.903704,0.973148


In [24]:
# Bagging Classifier: one row per feature set, ordered by ascending CV accuracy
pd.DataFrame.from_records(
    data = bagging_scores,
    columns = ['features', 'cv_score', 'holdout', *(f'recall_{k}' for k in range(7))],
).sort_values('cv_score')

Unnamed: 0,features,cv_score,holdout,recall_0,recall_1,recall_2,recall_3,recall_4,recall_5,recall_6
0,Baseline,0.850057,0.75982,0.775,0.672222,0.819824,0.958333,0.939815,0.830556,0.95463
4,Wild_Rock,0.851446,0.758642,0.768519,0.65787,0.834182,0.964352,0.937963,0.839352,0.95787
1,Wild_Clim,0.852967,0.760347,0.769444,0.664815,0.836035,0.964352,0.941667,0.83287,0.961574
2,Wild_Geo,0.853363,0.757806,0.766204,0.668981,0.841593,0.966667,0.9375,0.833333,0.959259
3,Wild_Surf,0.853364,0.758156,0.781481,0.661111,0.842057,0.964352,0.936111,0.831481,0.956944


In [25]:
# Random Forest: one row per feature set, ordered by ascending CV accuracy
pd.DataFrame.from_records(
    data = random_scores,
    columns = ['features', 'cv_score', 'holdout', *(f'recall_{k}' for k in range(7))],
).sort_values('cv_score')

Unnamed: 0,features,cv_score,holdout,recall_0,recall_1,recall_2,recall_3,recall_4,recall_5,recall_6
1,Wild_Clim,0.864409,0.750103,0.766667,0.695833,0.830014,0.972222,0.95463,0.86713,0.964352
0,Baseline,0.864542,0.748216,0.765741,0.69537,0.824456,0.970833,0.955556,0.871759,0.968056
3,Wild_Surf,0.864873,0.751052,0.775,0.690278,0.826772,0.969444,0.952315,0.876852,0.963426
2,Wild_Geo,0.865402,0.750151,0.769444,0.701852,0.827698,0.970833,0.954167,0.868519,0.965278
4,Wild_Rock,0.866063,0.751075,0.775926,0.697222,0.825845,0.973148,0.954167,0.871296,0.964815
