# Notebook 3 - One-Hot vs Ordinal Encoding

From the previous experiments we see that tree-based ensembles seem to be the most promising and we will focus on efforts there for the remaining notebooks. 

For tree-based models, it is often not necessary to one-hot encode categorical variables, and sometimes it is preferable not to. In this notebook, we will test various ways of encoding the `Wilderness_Area` and `Soil_Type` features:

1. Original (One-Hot)
2. Ordinal
3. Both
4. Binary Encoding (dimension reduction)

We will use the ideal encoding for the remaining notebooks.

In [1]:
# Global variables for testing changes to this notebook quickly
RANDOM_SEED = 0
NUM_FOLDS = 12

In [2]:
import numpy as np
import pandas as pd
import time
import pyarrow
import gc

# Model evaluation
from functools import partial
from category_encoders import BinaryEncoder
from sklearn.base import clone
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import accuracy_score, recall_score
from sklearn.inspection import partial_dependence, permutation_importance
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier, RandomForestClassifier

# Plotting
import matplotlib
import seaborn as sns
from matplotlib import pyplot as plt

# Hide warnings
import warnings
warnings.filterwarnings('ignore')

# Load Data

In [3]:
%%time

# Load original data
original = pd.read_feather('../data/original.feather')

# Label Encode
old_encoder = LabelEncoder()
original["Cover_Type"] = old_encoder.fit_transform(original["Cover_Type"])
y_train = original['Cover_Type'].iloc[:15119]
y_test = original['Cover_Type'].iloc[15119:]

# Get feature columns
features = [x for x in original.columns if x not in ['Id','Cover_Type']]

# Data structures for summary scores
bagging_scores = list()
extratrees_scores = list()
adaboost_scores = list()
random_scores = list()

Wall time: 72 ms


# Scoring Function

In [4]:
def train_original(sklearn_model, processing = None):
    
    # Original Training/Test Split
    X_temp = original[features].iloc[:15119]
    X_test = original[features].iloc[15119:]
    y_temp = original['Cover_Type'].iloc[:15119]
    y_test = original['Cover_Type'].iloc[15119:]
    
    # Feature Engineering
    if processing:
        X_temp = processing(X_temp)
        X_test = processing(X_test)
        
    # Store the out-of-fold predictions
    test_preds = np.zeros((X_test.shape[0],7))
    oof_preds = np.zeros((X_temp.shape[0],))
    scores, times = np.zeros(NUM_FOLDS), np.zeros(NUM_FOLDS)
    
    # Stratified k-fold cross-validation
    skf = StratifiedKFold(n_splits = NUM_FOLDS, shuffle = True, random_state = RANDOM_SEED)
    for fold, (train_idx, valid_idx) in enumerate(skf.split(X_temp,y_temp)):
       
        # Training and Validation Sets
        X_train, X_valid = X_temp.iloc[train_idx], X_temp.iloc[valid_idx]
        y_train, y_valid = y_temp.iloc[train_idx], y_temp.iloc[valid_idx]
        
        # Create model
        start = time.time()
        model = clone(sklearn_model)
        model.fit(X_train, y_train)

        # validation and test predictions
        valid_preds = np.ravel(model.predict(X_valid))
        oof_preds[valid_idx] = valid_preds
        test_preds += model.predict_proba(X_test)
        
        # Save scores and times
        scores[fold] = accuracy_score(y_valid, valid_preds)
        end = time.time()
        times[fold] = end-start
        time.sleep(0.5)
    
    test_preds = np.argmax(test_preds, axis = 1)
    test_score = accuracy_score(y_test, test_preds)
    print('\n'+model.__class__.__name__)
    print("Train Accuracy:", round(scores.mean(), 5))
    print('Test Accuracy:', round(test_score, 5))
    print(f'Training Time: {round(times.sum(), 2)}s')
    
    return scores.mean(), oof_preds, test_score, times.sum()

# Models

We use the following 4 models from the scikit-learn library:

1. AdaBoost 
2. ExtraTrees
3. Bagging
4. Random Forest

In [5]:
# AdaBoost Classifier
adaboost = AdaBoostClassifier(
    base_estimator = DecisionTreeClassifier(
        splitter = 'random',
        random_state = RANDOM_SEED,
    ),
    random_state = RANDOM_SEED,
)

# ExtraTrees Classifier
extratrees = ExtraTreesClassifier(
    n_jobs = -1,
    random_state = RANDOM_SEED,
    max_features = None,
)

# Bagging Classifier
bagging = BaggingClassifier(
    base_estimator = DecisionTreeClassifier(
        splitter = 'random',
        random_state = RANDOM_SEED,
    ),
    n_jobs = -1,
    random_state = RANDOM_SEED
)

# Random Forest Classifier
randomforest = RandomForestClassifier(
    n_jobs = -1,
    random_state = RANDOM_SEED,
)

# Encoding Functions

In [6]:
def consolidate_soil_types(input_df, drop = True):
    data = input_df.copy()
    data['Soil_Type'] = 0
    soil_features = list()
    for i in range(1,41):
        data['Soil_Type'] += i*data[f'Soil_Type{i}']
        soil_features.append(f'Soil_Type{i}')
    if drop:
        nonsoil_features = [x for x in data.columns if x not in soil_features]
        return data[nonsoil_features]
    return data

def consolidate_wilderness(input_df, drop = True):
    data = input_df.copy()
    data['Wilderness'] = 0
    wild_features = list()
    for i in range(1,4):
        data['Wilderness'] += i*data[f'Wilderness_Area{i}']
        wild_features.append(f'Wilderness_Area{i}')
    if drop:
        nonwild_features = [x for x in data.columns if x not in wild_features]
        return data[nonwild_features]
    return data

def ordinal_both(input_df, drop = True):
    data = consolidate_soil_types(input_df, drop)
    return consolidate_wilderness(data, drop)

def binary_soil_type(input_df):
    data = consolidate_soil_types(input_df)
    return BinaryEncoder(cols = ['Soil_Type']).fit_transform(data)

def binary_wilderness(input_df):
    data = consolidate_wilderness(input_df)
    return BinaryEncoder(cols = ['Wilderness']).fit_transform(data)

def binary_both(input_df):
    data = binary_soil_type(input_df)
    return binary_wilderness(data)

# One Hot Encoding (Baseline)

In [7]:
# AdaBoost
cv_score, oof_preds, test_score, times = train_original(adaboost)

adaboost_scores.append((
    'OH_Both', cv_score, test_score, round(times, 2)
))

# ExtraTrees
cv_score, oof_preds, test_score, times = train_original(extratrees)

extratrees_scores.append((
    'OH_Both', cv_score, test_score, round(times, 2)
))

# Bagging
cv_score, oof_preds, test_score, times = train_original(bagging)

bagging_scores.append((
    'OH_Both', cv_score, test_score, round(times, 2)
))

# Random Forest
cv_score, oof_preds, test_score, times = train_original(randomforest)

random_scores.append((
    'OH_Both', cv_score, test_score, round(times, 2)
))


AdaBoostClassifier
Train Accuracy: 0.80356
Test Accuracy: 0.75373
Training Time: 4.27s

ExtraTreesClassifier
Train Accuracy: 0.88491
Test Accuracy: 0.77808
Training Time: 43.35s

BaggingClassifier
Train Accuracy: 0.85581
Test Accuracy: 0.75372
Training Time: 27.71s

RandomForestClassifier
Train Accuracy: 0.86395
Test Accuracy: 0.74895
Training Time: 44.11s


# Ordinal Soil Encoding

In [8]:
# AdaBoost
cv_score, oof_preds, test_score, times = train_original(adaboost, consolidate_soil_types)

adaboost_scores.append((
    'Ord_Soil', cv_score, test_score, round(times, 2)
))

# ExtraTrees
cv_score, oof_preds, test_score, times = train_original(extratrees, consolidate_soil_types)

extratrees_scores.append((
    'Ord_Soil', cv_score, test_score, round(times, 2)
))

# Bagging
cv_score, oof_preds, test_score, times = train_original(bagging, consolidate_soil_types)

bagging_scores.append((
    'Ord_Soil', cv_score, test_score, round(times, 2)
))

# Random Forest
cv_score, oof_preds, test_score, times = train_original(randomforest, consolidate_soil_types)

random_scores.append((
    'Ord_Soil', cv_score, test_score, round(times, 2)
))


AdaBoostClassifier
Train Accuracy: 0.78596
Test Accuracy: 0.76229
Training Time: 3.63s

ExtraTreesClassifier
Train Accuracy: 0.88538
Test Accuracy: 0.78206
Training Time: 38.67s

BaggingClassifier
Train Accuracy: 0.85006
Test Accuracy: 0.75982
Training Time: 19.37s

RandomForestClassifier
Train Accuracy: 0.86454
Test Accuracy: 0.74822
Training Time: 38.6s


# Ordinal Wilderness Area

In [9]:
# AdaBoost
cv_score, oof_preds, test_score, times = train_original(adaboost, consolidate_wilderness)

adaboost_scores.append((
    'Ord_Wild', cv_score, test_score, round(times, 2)
))

# ExtraTrees
cv_score, oof_preds, test_score, times = train_original(extratrees, consolidate_wilderness)

extratrees_scores.append((
    'Ord_Wild', cv_score, test_score, round(times, 2)
))

# Bagging
cv_score, oof_preds, test_score, times = train_original(bagging, consolidate_wilderness)

bagging_scores.append((
    'Ord_Wild', cv_score, test_score, round(times, 2)
))

# Random Forest
cv_score, oof_preds, test_score, times = train_original(randomforest, consolidate_wilderness)

random_scores.append((
    'Ord_Wild', cv_score, test_score, round(times, 2)
))


AdaBoostClassifier
Train Accuracy: 0.79873
Test Accuracy: 0.75473
Training Time: 4.76s

ExtraTreesClassifier
Train Accuracy: 0.88518
Test Accuracy: 0.77761
Training Time: 44.63s

BaggingClassifier
Train Accuracy: 0.85376
Test Accuracy: 0.7537
Training Time: 28.58s

RandomForestClassifier
Train Accuracy: 0.86408
Test Accuracy: 0.74631
Training Time: 44.24s


# Ordinal Encoding

In [10]:
# AdaBoost
cv_score, oof_preds, test_score, times = train_original(adaboost, ordinal_both)

adaboost_scores.append((
    'Ord_Both', cv_score, test_score, round(times, 2)
))

# ExtraTrees
cv_score, oof_preds, test_score, times = train_original(extratrees, ordinal_both)

extratrees_scores.append((
    'Ord_Both', cv_score, test_score, round(times, 2)
))

# Bagging
cv_score, oof_preds, test_score, times = train_original(bagging, ordinal_both)

bagging_scores.append((
    'Ord_Both', cv_score, test_score, round(times, 2)
))

# Random Forest
cv_score, oof_preds, test_score, times = train_original(randomforest, ordinal_both)

random_scores.append((
    'Ord_Both', cv_score, test_score, round(times, 2)
))


AdaBoostClassifier
Train Accuracy: 0.79185
Test Accuracy: 0.75746
Training Time: 3.68s

ExtraTreesClassifier
Train Accuracy: 0.88491
Test Accuracy: 0.78202
Training Time: 39.03s

BaggingClassifier
Train Accuracy: 0.85277
Test Accuracy: 0.7581
Training Time: 18.31s

RandomForestClassifier
Train Accuracy: 0.86401
Test Accuracy: 0.74686
Training Time: 40.52s


# Binary Soil

In [11]:
# AdaBoost
cv_score, oof_preds, test_score, times = train_original(adaboost, binary_soil_type)

adaboost_scores.append((
    'Bin_Soil', cv_score, test_score, round(times, 2)
))

# ExtraTrees
cv_score, oof_preds, test_score, times = train_original(extratrees, binary_soil_type)

extratrees_scores.append((
    'Bin_Soil', cv_score, test_score, round(times, 2)
))

# Bagging
cv_score, oof_preds, test_score, times = train_original(bagging, binary_soil_type)

bagging_scores.append((
    'Bin_Soil', cv_score, test_score, round(times, 2)
))

# Random Forest
cv_score, oof_preds, test_score, times = train_original(randomforest, binary_soil_type)

random_scores.append((
    'Bin_Soil', cv_score, test_score, round(times, 2)
))


AdaBoostClassifier
Train Accuracy: 0.79509
Test Accuracy: 0.66774
Training Time: 3.97s

ExtraTreesClassifier
Train Accuracy: 0.88677
Test Accuracy: 0.69762
Training Time: 41.0s

BaggingClassifier
Train Accuracy: 0.85548
Test Accuracy: 0.68524
Training Time: 22.58s

RandomForestClassifier
Train Accuracy: 0.86924
Test Accuracy: 0.70905
Training Time: 37.21s


# Keep Dummies

In [12]:
# AdaBoost
cv_score, oof_preds, test_score, times = train_original(adaboost, partial(consolidate_soil_types, drop = False))

adaboost_scores.append((
    'Keep_Soil', cv_score, test_score, round(times, 2)
))

# ExtraTrees
cv_score, oof_preds, test_score, times = train_original(extratrees, partial(consolidate_soil_types, drop = False))

extratrees_scores.append((
    'Keep_Soil', cv_score, test_score, round(times, 2)
))

# Bagging
cv_score, oof_preds, test_score, times = train_original(bagging, partial(consolidate_soil_types, drop = False))

bagging_scores.append((
    'Keep_Soil', cv_score, test_score, round(times, 2)
))

# Random Forest
cv_score, oof_preds, test_score, times = train_original(randomforest, partial(consolidate_soil_types, drop = False))

random_scores.append((
    'Keep_Soil', cv_score, test_score, round(times, 2)
))


AdaBoostClassifier
Train Accuracy: 0.799
Test Accuracy: 0.76088
Training Time: 4.96s

ExtraTreesClassifier
Train Accuracy: 0.88769
Test Accuracy: 0.78102
Training Time: 39.81s

BaggingClassifier
Train Accuracy: 0.85594
Test Accuracy: 0.76007
Training Time: 33.84s

RandomForestClassifier
Train Accuracy: 0.86276
Test Accuracy: 0.7494
Training Time: 44.92s


In [13]:
# AdaBoost
cv_score, oof_preds, test_score, times = train_original(adaboost, partial(ordinal_both, drop = False))

adaboost_scores.append((
    'Keep_Both', cv_score, test_score, round(times, 2)
))

# ExtraTrees
cv_score, oof_preds, test_score, times = train_original(extratrees, partial(ordinal_both, drop = False))

extratrees_scores.append((
    'Keep_Both', cv_score, test_score, round(times, 2)
))

# Bagging
cv_score, oof_preds, test_score, times = train_original(bagging, partial(ordinal_both, drop = False))

bagging_scores.append((
    'Keep_Both', cv_score, test_score, round(times, 2)
))

# Random Forest
cv_score, oof_preds, test_score, times = train_original(randomforest, partial(ordinal_both, drop = False))

random_scores.append((
    'Keep_Both', cv_score, test_score, round(times, 2)
))


AdaBoostClassifier
Train Accuracy: 0.79575
Test Accuracy: 0.76192
Training Time: 6.11s

ExtraTreesClassifier
Train Accuracy: 0.88723
Test Accuracy: 0.78129
Training Time: 42.63s

BaggingClassifier
Train Accuracy: 0.85217
Test Accuracy: 0.75965
Training Time: 29.8s

RandomForestClassifier
Train Accuracy: 0.86487
Test Accuracy: 0.7491
Training Time: 43.04s


# Summary

In [14]:
# AdaBoost
pd.DataFrame.from_records(
    data = adaboost_scores,
    columns = ['model','cv_score','holdout','time']
).sort_values('holdout')

Unnamed: 0,model,cv_score,holdout,time
4,Bin_Soil,0.795093,0.667736,3.97
0,OH_Both,0.803559,0.753727,4.27
2,Ord_Wild,0.79873,0.754735,4.76
3,Ord_Both,0.791852,0.757465,3.68
5,Keep_Soil,0.798996,0.760879,4.96
6,Keep_Both,0.795754,0.761916,6.11
1,Ord_Soil,0.785965,0.762291,3.63


In [15]:
# ExtraTrees
pd.DataFrame.from_records(
    data = extratrees_scores,
    columns = ['model','cv_score','holdout','time']
).sort_values('holdout')

Unnamed: 0,model,cv_score,holdout,time
4,Bin_Soil,0.886766,0.697618,41.0
2,Ord_Wild,0.885178,0.777613,44.63
0,OH_Both,0.884914,0.778078,43.35
5,Keep_Soil,0.887692,0.781015,39.81
6,Keep_Both,0.887229,0.781286,42.63
3,Ord_Both,0.884914,0.782015,39.03
1,Ord_Soil,0.885377,0.78206,38.67


In [16]:
# Bagging
pd.DataFrame.from_records(
    data = bagging_scores,
    columns = ['model','cv_score','holdout','time']
).sort_values('holdout')

Unnamed: 0,model,cv_score,holdout,time
4,Bin_Soil,0.855481,0.685236,22.58
2,Ord_Wild,0.853761,0.753695,28.58
0,OH_Both,0.855812,0.753724,27.71
3,Ord_Both,0.852769,0.758096,18.31
6,Keep_Both,0.852173,0.759654,29.8
1,Ord_Soil,0.850057,0.75982,19.37
5,Keep_Soil,0.855943,0.760069,33.84


In [17]:
# Random Forest
pd.DataFrame.from_records(
    data = random_scores,
    columns = ['model','cv_score','holdout','time']
).sort_values('holdout')

Unnamed: 0,model,cv_score,holdout,time
4,Bin_Soil,0.869238,0.709049,37.21
2,Ord_Wild,0.864079,0.746311,44.24
3,Ord_Both,0.864012,0.746857,40.52
1,Ord_Soil,0.864542,0.748216,38.6
0,OH_Both,0.863947,0.748949,44.11
6,Keep_Both,0.864873,0.749099,43.04
5,Keep_Soil,0.862756,0.749405,44.92


Henceforth, we will use ordinal encoding for the soil type and leave the dummy variables for the wilderness area.