# Notebook 3 - Feature Engineering

In this notebook we consider various feature engineering techniques. Some of these were suggested in the forums for TPS 12 as well as in the original forest cover type competition. The break down of soil type features into coarser categories based on their own description is (possibly) a novel idea.

In [1]:
# Global variables for testing changes to this notebook quickly
RANDOM_SEED = 0
NUM_FOLDS = 12

In [2]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import time
import os
import pyarrow
import gc

# Model evaluation
from functools import partial
from sklearn.base import clone
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import StandardScaler, PowerTransformer
from sklearn.preprocessing import RobustScaler, MinMaxScaler
from sklearn.metrics import accuracy_score, recall_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.feature_selection import mutual_info_classif

# Models
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier

# Plotting
import matplotlib
import seaborn as sns
from matplotlib import pyplot as plt

# Hide warnings
import warnings
warnings.filterwarnings('ignore')

# Load Data

In [3]:
def get_data():
    try:
        #
        original = pd.read_feather('../data/original.feather')
    except:
        # Get Original Data
        original = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/covtype/covtype.data.gz', header = None)

        # Fix columns
        original.columns = ['Elevation', 'Aspect', 'Slope',
               'Horizontal_Distance_To_Hydrology', 'Vertical_Distance_To_Hydrology',
               'Horizontal_Distance_To_Roadways', 'Hillshade_9am', 'Hillshade_Noon',
               'Hillshade_3pm', 'Horizontal_Distance_To_Fire_Points',
               'Wilderness_Area1', 'Wilderness_Area2', 'Wilderness_Area3',
               'Wilderness_Area4', 'Soil_Type1', 'Soil_Type2', 'Soil_Type3',
               'Soil_Type4', 'Soil_Type5', 'Soil_Type6', 'Soil_Type7', 'Soil_Type8',
               'Soil_Type9', 'Soil_Type10', 'Soil_Type11', 'Soil_Type12',
               'Soil_Type13', 'Soil_Type14', 'Soil_Type15', 'Soil_Type16',
               'Soil_Type17', 'Soil_Type18', 'Soil_Type19', 'Soil_Type20',
               'Soil_Type21', 'Soil_Type22', 'Soil_Type23', 'Soil_Type24',
               'Soil_Type25', 'Soil_Type26', 'Soil_Type27', 'Soil_Type28',
               'Soil_Type29', 'Soil_Type30', 'Soil_Type31', 'Soil_Type32',
               'Soil_Type33', 'Soil_Type34', 'Soil_Type35', 'Soil_Type36',
               'Soil_Type37', 'Soil_Type38', 'Soil_Type39', 'Soil_Type40',
               'Cover_Type']

        for col, dtype in original.dtypes.iteritems():
            if dtype.name.startswith('int'):
                original[col] = pd.to_numeric(original[col], downcast ='integer')
            elif dtype.name.startswith('float'):
                original[col] = pd.to_numeric(original[col], downcast ='float')
        
        if not os.path.isdir('../data'):
            os.mkdir('../data')
        original.to_feather('../data/original.feather')
    return original

In [4]:
%%time
original = get_data()

# Get feature columns
features = [x for x in original.columns if x not in ['Id','Cover_Type']]
wilderness_cols = [x for x in features if x.startswith('Wild')]
soil_cols = [x for x in features if x.startswith('Soil')]
binary_cols = [x for x in features if x.startswith('Soil') or x in wilderness_cols]
numerical_cols = [x for x in features if x not in binary_cols]

# Encoder
old_encoder = LabelEncoder()
original["Cover_Type"] = old_encoder.fit_transform(original["Cover_Type"])

# Data structures for summary scores
bagging_scores = list()
extratrees_scores = list()
adaboost_scores = list()
random_scores = list()

Wall time: 73 ms


# Scoring Function

In [5]:
def train_original(sklearn_model, processing = None, plot = False):
    
    # Original Training/Test Split
    X_temp = original[features].iloc[:15119]
    X_test = original[features].iloc[15119:]
    y_temp = original['Cover_Type'].iloc[:15119]
    y_test = original['Cover_Type'].iloc[15119:]
    
    # Feature Engineering
    if processing:
        X_temp = processing(X_temp)
        X_test = processing(X_test)
    
    # Store the out-of-fold predictions
    test_preds = np.zeros((X_test.shape[0],7))
    scores = np.zeros(NUM_FOLDS)
    times = np.zeros(NUM_FOLDS)
    
    # Stratified k-fold cross-validation
    skf = StratifiedKFold(n_splits = NUM_FOLDS, shuffle = True, random_state = RANDOM_SEED)
    for fold, (train_idx, valid_idx) in enumerate(skf.split(X_temp,y_temp)):
       
        # Training and Validation Sets
        X_train = X_temp.iloc[train_idx]
        X_valid = X_temp.iloc[valid_idx]
        y_train = y_temp.iloc[train_idx]
        y_valid = y_temp.iloc[valid_idx]
        
        # Create model
        start = time.time()
        model = clone(sklearn_model)
        model.fit(X_train, y_train)
        
        # validation and test predictions
        valid_preds = np.ravel(model.predict(X_valid))
        test_preds += model.predict_proba(X_test)
        
        # fold auc score
        fold_acc = accuracy_score(y_valid, valid_preds)
        end = time.time()
        print(f'Fold {fold} Accuracy: {round(fold_acc, 5)} in {round(end-start,2)}s.')
        scores[fold] = fold_acc
        times[fold] = end-start
        time.sleep(0.5)
    
    test_preds = np.argmax(test_preds, axis = 1)
    holdout = accuracy_score(y_test, test_preds)
    print("\nAverage CV Accuracy:", round(scores.mean(), 5))
    print("Worst CV Accuracy:", round(scores.min(), 5))
    print('Holdout Accuracy:', round(holdout, 5))
    print(f'Training Time: {round(times.sum(), 2)}s\n')
    
    if plot:
        # Plot confusion matrix
        fig, ax = plt.subplots(figsize = (8,6))
        cm = confusion_matrix(y_test, test_preds, normalize = 'true')
        disp = ConfusionMatrixDisplay(confusion_matrix=cm)
        disp.plot(ax = ax)
        plt.title("Confusion Matrix (% of Actual Labels)", fontsize=16)
        plt.xlabel("Predicted Labels", fontsize=14)
        plt.ylabel("Actual Labels", fontsize=14)
        plt.show()
    
    return scores.mean(), test_preds, holdout

# Test Models

We use the following 4 models from the scikit-learn library:

1. AdaBoost 
2. ExtraTrees
3. Bagging
4. Random Forest

In [6]:
# AdaBoost Classifier
adaboost = make_pipeline(
    AdaBoostClassifier(
        # Same settings as ExtraTreesClassifier
        base_estimator = DecisionTreeClassifier(
            splitter = 'random',
            random_state = RANDOM_SEED,
        ),
        random_state = RANDOM_SEED,
    ),
)

In [7]:
# ExtraTrees Classifier
extratrees = make_pipeline(
    ExtraTreesClassifier(
        n_jobs = -1,
        random_state = RANDOM_SEED,
        max_features = None,
    ),
)

In [8]:
# Bagging Classifier
bagging = make_pipeline(
    BaggingClassifier(
        base_estimator = DecisionTreeClassifier(
            splitter = 'random',
            random_state = RANDOM_SEED,
        ),
        n_jobs = -1,
        random_state = RANDOM_SEED,
    ),
)

In [9]:
# Random Forest Classifier
randomforest = make_pipeline(
    RandomForestClassifier(
        n_jobs = -1,
        random_state = RANDOM_SEED,
    ),
)

# Baselines

In [10]:
# Train/Test split
test_score, oof_preds, holdout = train_original(adaboost)

adaboost_scores.append((
    'Baseline', test_score, holdout,
     *recall_score(original['Cover_Type'].iloc[15119:], oof_preds, average = None)
))

Fold 0 Accuracy: 0.80317 in 0.42s.
Fold 1 Accuracy: 0.79206 in 0.56s.
Fold 2 Accuracy: 0.80397 in 0.39s.
Fold 3 Accuracy: 0.78968 in 0.65s.
Fold 4 Accuracy: 0.80635 in 0.36s.
Fold 5 Accuracy: 0.8246 in 0.37s.
Fold 6 Accuracy: 0.80952 in 0.35s.
Fold 7 Accuracy: 0.80556 in 0.37s.
Fold 8 Accuracy: 0.79286 in 0.35s.
Fold 9 Accuracy: 0.80079 in 0.36s.
Fold 10 Accuracy: 0.79921 in 0.35s.
Fold 11 Accuracy: 0.81493 in 0.35s.

Average CV Accuracy: 0.80356
Worst CV Accuracy: 0.78968
Holdout Accuracy: 0.75373
Training Time: 4.89s



In [11]:
test_score, oof_preds, holdout = train_original(extratrees)

extratrees_scores.append((
    'Baseline', test_score, holdout,
     *recall_score(original['Cover_Type'].iloc[15119:], oof_preds, average = None)
))

Fold 0 Accuracy: 0.89286 in 3.53s.
Fold 1 Accuracy: 0.87619 in 3.73s.
Fold 2 Accuracy: 0.88571 in 3.51s.
Fold 3 Accuracy: 0.8873 in 3.53s.
Fold 4 Accuracy: 0.88651 in 3.4s.
Fold 5 Accuracy: 0.8873 in 3.35s.
Fold 6 Accuracy: 0.88333 in 3.39s.
Fold 7 Accuracy: 0.8746 in 3.42s.
Fold 8 Accuracy: 0.86349 in 3.54s.
Fold 9 Accuracy: 0.89365 in 3.59s.
Fold 10 Accuracy: 0.88175 in 3.37s.
Fold 11 Accuracy: 0.90627 in 3.59s.

Average CV Accuracy: 0.88491
Worst CV Accuracy: 0.86349
Holdout Accuracy: 0.77808
Training Time: 41.97s



In [12]:
test_score, oof_preds, holdout = train_original(bagging)

bagging_scores.append((
    'Baseline', test_score, holdout,
     *recall_score(original['Cover_Type'].iloc[15119:], oof_preds, average = None)
))

Fold 0 Accuracy: 0.8627 in 3.26s.
Fold 1 Accuracy: 0.86111 in 2.24s.
Fold 2 Accuracy: 0.86429 in 2.01s.
Fold 3 Accuracy: 0.85159 in 2.0s.
Fold 4 Accuracy: 0.84762 in 1.85s.
Fold 5 Accuracy: 0.85238 in 2.05s.
Fold 6 Accuracy: 0.84524 in 2.19s.
Fold 7 Accuracy: 0.84841 in 2.29s.
Fold 8 Accuracy: 0.84762 in 2.36s.
Fold 9 Accuracy: 0.86825 in 2.01s.
Fold 10 Accuracy: 0.85 in 2.0s.
Fold 11 Accuracy: 0.87053 in 2.21s.

Average CV Accuracy: 0.85581
Worst CV Accuracy: 0.84524
Holdout Accuracy: 0.75372
Training Time: 26.46s



In [13]:
test_score, oof_preds, holdout = train_original(randomforest)

random_scores.append((
    'Baseline', test_score, holdout,
     *recall_score(original['Cover_Type'].iloc[15119:], oof_preds, average = None)
))

Fold 0 Accuracy: 0.87381 in 4.12s.
Fold 1 Accuracy: 0.86429 in 3.65s.
Fold 2 Accuracy: 0.86032 in 3.53s.
Fold 3 Accuracy: 0.8627 in 3.58s.
Fold 4 Accuracy: 0.87222 in 3.39s.
Fold 5 Accuracy: 0.86032 in 3.52s.
Fold 6 Accuracy: 0.86349 in 3.55s.
Fold 7 Accuracy: 0.85317 in 3.5s.
Fold 8 Accuracy: 0.85397 in 3.4s.
Fold 9 Accuracy: 0.87222 in 3.39s.
Fold 10 Accuracy: 0.85714 in 3.36s.
Fold 11 Accuracy: 0.87371 in 3.35s.

Average CV Accuracy: 0.86395
Worst CV Accuracy: 0.85317
Holdout Accuracy: 0.74895
Training Time: 42.34s



# Feature Engineering

## Fix Aspect Range

In [14]:
# Fix aspect
def fix_aspect(data):
    df = data.copy()
    df["Aspect"][df["Aspect"] < 0] += 360
    df["Aspect"][df["Aspect"] > 359] -= 360
    return df

In [15]:
test_score, oof_preds, holdout = train_original(adaboost, fix_aspect)

adaboost_scores.append((
    'Fix_Aspect', test_score, holdout,
     *recall_score(original['Cover_Type'].iloc[15119:], oof_preds, average = None)
))

Fold 0 Accuracy: 0.80317 in 0.36s.
Fold 1 Accuracy: 0.80159 in 0.36s.
Fold 2 Accuracy: 0.80397 in 0.36s.
Fold 3 Accuracy: 0.78968 in 0.35s.
Fold 4 Accuracy: 0.80476 in 0.38s.
Fold 5 Accuracy: 0.79762 in 0.35s.
Fold 6 Accuracy: 0.80952 in 0.35s.
Fold 7 Accuracy: 0.80556 in 0.35s.
Fold 8 Accuracy: 0.79286 in 0.38s.
Fold 9 Accuracy: 0.77698 in 0.38s.
Fold 10 Accuracy: 0.79921 in 0.36s.
Fold 11 Accuracy: 0.8054 in 0.36s.

Average CV Accuracy: 0.79919
Worst CV Accuracy: 0.77698
Holdout Accuracy: 0.75214
Training Time: 4.33s



In [16]:
test_score, oof_preds, holdout = train_original(extratrees, fix_aspect)

extratrees_scores.append((
    'Fix_Aspect', test_score, holdout,
     *recall_score(original['Cover_Type'].iloc[15119:], oof_preds, average = None)
))

Fold 0 Accuracy: 0.89603 in 3.57s.
Fold 1 Accuracy: 0.87937 in 3.61s.
Fold 2 Accuracy: 0.88095 in 3.62s.
Fold 3 Accuracy: 0.89048 in 3.49s.
Fold 4 Accuracy: 0.89048 in 3.43s.
Fold 5 Accuracy: 0.8873 in 3.45s.
Fold 6 Accuracy: 0.88333 in 3.62s.
Fold 7 Accuracy: 0.8746 in 3.43s.
Fold 8 Accuracy: 0.86349 in 3.41s.
Fold 9 Accuracy: 0.89206 in 3.55s.
Fold 10 Accuracy: 0.88254 in 3.42s.
Fold 11 Accuracy: 0.90151 in 3.42s.

Average CV Accuracy: 0.88518
Worst CV Accuracy: 0.86349
Holdout Accuracy: 0.77804
Training Time: 42.03s



In [17]:
test_score, oof_preds, holdout = train_original(bagging, fix_aspect)

bagging_scores.append((
    'Fix_Aspect', test_score, holdout,
     *recall_score(original['Cover_Type'].iloc[15119:], oof_preds, average = None)
))

Fold 0 Accuracy: 0.86111 in 2.1s.
Fold 1 Accuracy: 0.8627 in 1.99s.
Fold 2 Accuracy: 0.85635 in 2.06s.
Fold 3 Accuracy: 0.85159 in 2.32s.
Fold 4 Accuracy: 0.85397 in 2.02s.
Fold 5 Accuracy: 0.86508 in 2.1s.
Fold 6 Accuracy: 0.84524 in 2.1s.
Fold 7 Accuracy: 0.84921 in 2.14s.
Fold 8 Accuracy: 0.84603 in 2.25s.
Fold 9 Accuracy: 0.85794 in 2.06s.
Fold 10 Accuracy: 0.85 in 2.06s.
Fold 11 Accuracy: 0.87133 in 2.06s.

Average CV Accuracy: 0.85588
Worst CV Accuracy: 0.84524
Holdout Accuracy: 0.75361
Training Time: 25.25s



In [18]:
test_score, oof_preds, holdout = train_original(randomforest, fix_aspect)

random_scores.append((
    'Fix_Aspect', test_score, holdout,
     *recall_score(original['Cover_Type'].iloc[15119:], oof_preds, average = None)
))

Fold 0 Accuracy: 0.87937 in 3.73s.
Fold 1 Accuracy: 0.8627 in 4.0s.
Fold 2 Accuracy: 0.85794 in 3.37s.
Fold 3 Accuracy: 0.86508 in 3.16s.
Fold 4 Accuracy: 0.86825 in 3.47s.
Fold 5 Accuracy: 0.86032 in 3.29s.
Fold 6 Accuracy: 0.86349 in 3.61s.
Fold 7 Accuracy: 0.85794 in 3.2s.
Fold 8 Accuracy: 0.85159 in 3.5s.
Fold 9 Accuracy: 0.87381 in 3.64s.
Fold 10 Accuracy: 0.86032 in 3.65s.
Fold 11 Accuracy: 0.87371 in 3.41s.

Average CV Accuracy: 0.86454
Worst CV Accuracy: 0.85159
Holdout Accuracy: 0.74964
Training Time: 42.04s



## Water Distance Features

In [19]:
def water_distance_features(data):
    df = data.copy()
    
    # use float64 for squaring
    df["Horizontal_Distance_To_Hydrology"] = df["Horizontal_Distance_To_Hydrology"].astype('float64')
    df["Vertical_Distance_To_Hydrology"] = df["Vertical_Distance_To_Hydrology"].astype('float64')
    
    # compute metrics
    df["Hydro_Taxicab"] = np.abs(df["Horizontal_Distance_To_Hydrology"]) + np.abs(df["Vertical_Distance_To_Hydrology"])
    df["Hydro_Euclid"] = (df["Horizontal_Distance_To_Hydrology"]**2 + np.abs(df["Vertical_Distance_To_Hydrology"])**2)**0.5
    
    # convert back
    df["Horizontal_Distance_To_Hydrology"] = df["Horizontal_Distance_To_Hydrology"].astype('float32')
    df["Vertical_Distance_To_Hydrology"] = df["Vertical_Distance_To_Hydrology"].astype('float32')
    df["Hydro_Taxicab"] = df["Hydro_Taxicab"].astype('float32')
    df["Hydro_Euclid"] = df["Hydro_Euclid"].astype('float32')
    
    return df

In [20]:
test_score, oof_preds, holdout = train_original(adaboost, water_distance_features)

adaboost_scores.append((
    'Water_Dist', test_score, holdout,
     *recall_score(original['Cover_Type'].iloc[15119:], oof_preds, average = None)
))

Fold 0 Accuracy: 0.80079 in 0.38s.
Fold 1 Accuracy: 0.8 in 0.35s.
Fold 2 Accuracy: 0.78651 in 0.38s.
Fold 3 Accuracy: 0.80238 in 0.39s.
Fold 4 Accuracy: 0.79683 in 0.4s.
Fold 5 Accuracy: 0.80238 in 0.37s.
Fold 6 Accuracy: 0.81429 in 0.39s.
Fold 7 Accuracy: 0.79921 in 0.37s.
Fold 8 Accuracy: 0.77778 in 0.4s.
Fold 9 Accuracy: 0.79841 in 0.4s.
Fold 10 Accuracy: 0.7873 in 0.39s.
Fold 11 Accuracy: 0.79984 in 0.37s.

Average CV Accuracy: 0.79714
Worst CV Accuracy: 0.77778
Holdout Accuracy: 0.75943
Training Time: 4.59s



In [21]:
test_score, oof_preds, holdout = train_original(extratrees, water_distance_features)

extratrees_scores.append((
    'Water_Dist', test_score, holdout,
     *recall_score(original['Cover_Type'].iloc[15119:], oof_preds, average = None)
))

Fold 0 Accuracy: 0.88889 in 3.86s.
Fold 1 Accuracy: 0.87778 in 3.65s.
Fold 2 Accuracy: 0.8746 in 3.65s.
Fold 3 Accuracy: 0.88571 in 3.48s.
Fold 4 Accuracy: 0.88968 in 3.29s.
Fold 5 Accuracy: 0.88651 in 3.43s.
Fold 6 Accuracy: 0.88492 in 3.34s.
Fold 7 Accuracy: 0.87937 in 3.63s.
Fold 8 Accuracy: 0.86984 in 3.31s.
Fold 9 Accuracy: 0.89603 in 3.35s.
Fold 10 Accuracy: 0.88175 in 3.2s.
Fold 11 Accuracy: 0.89833 in 3.5s.

Average CV Accuracy: 0.88445
Worst CV Accuracy: 0.86984
Holdout Accuracy: 0.7734
Training Time: 41.68s



In [22]:
test_score, oof_preds, holdout = train_original(bagging, water_distance_features)

bagging_scores.append((
    'Water_Dist', test_score, holdout,
     *recall_score(original['Cover_Type'].iloc[15119:], oof_preds, average = None)
))

Fold 0 Accuracy: 0.84286 in 1.77s.
Fold 1 Accuracy: 0.83968 in 2.25s.
Fold 2 Accuracy: 0.84206 in 1.92s.
Fold 3 Accuracy: 0.84048 in 1.81s.
Fold 4 Accuracy: 0.86429 in 1.86s.
Fold 5 Accuracy: 0.85 in 1.8s.
Fold 6 Accuracy: 0.85397 in 1.78s.
Fold 7 Accuracy: 0.84206 in 1.68s.
Fold 8 Accuracy: 0.83413 in 1.74s.
Fold 9 Accuracy: 0.85952 in 1.79s.
Fold 10 Accuracy: 0.8627 in 1.76s.
Fold 11 Accuracy: 0.86815 in 1.76s.

Average CV Accuracy: 0.84999
Worst CV Accuracy: 0.83413
Holdout Accuracy: 0.7522
Training Time: 21.91s



In [23]:
test_score, oof_preds, holdout = train_original(randomforest, water_distance_features)

random_scores.append((
    'Water_Dist', test_score, holdout,
     *recall_score(original['Cover_Type'].iloc[15119:], oof_preds, average = None)
))

Fold 0 Accuracy: 0.8746 in 3.32s.
Fold 1 Accuracy: 0.8627 in 3.21s.
Fold 2 Accuracy: 0.85794 in 3.32s.
Fold 3 Accuracy: 0.85635 in 3.19s.
Fold 4 Accuracy: 0.86667 in 3.21s.
Fold 5 Accuracy: 0.85476 in 3.25s.
Fold 6 Accuracy: 0.8619 in 3.17s.
Fold 7 Accuracy: 0.85476 in 3.17s.
Fold 8 Accuracy: 0.84762 in 3.25s.
Fold 9 Accuracy: 0.87222 in 3.39s.
Fold 10 Accuracy: 0.85952 in 3.43s.
Fold 11 Accuracy: 0.86894 in 3.63s.

Average CV Accuracy: 0.8615
Worst CV Accuracy: 0.84762
Holdout Accuracy: 0.74562
Training Time: 39.55s



## Shade Features

In [24]:
def new_shade_features(data):
    df = data.copy()
    shade_features = ['Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm']
    df["Hillshade_Avg"] = df[shade_features].mean(axis=1)
    df['Hillshade_Range'] = df[shade_features].max(axis=1) - df[shade_features].min(axis=1)
    return df

In [25]:
test_score, oof_preds, holdout = train_original(adaboost, new_shade_features)

adaboost_scores.append((
    'Shade_Features', test_score, holdout,
     *recall_score(original['Cover_Type'].iloc[15119:], oof_preds, average = None)
))

Fold 0 Accuracy: 0.79524 in 0.46s.
Fold 1 Accuracy: 0.79603 in 0.52s.
Fold 2 Accuracy: 0.79286 in 0.45s.
Fold 3 Accuracy: 0.78889 in 0.45s.
Fold 4 Accuracy: 0.80397 in 0.44s.
Fold 5 Accuracy: 0.7873 in 0.48s.
Fold 6 Accuracy: 0.78016 in 0.47s.
Fold 7 Accuracy: 0.80476 in 0.44s.
Fold 8 Accuracy: 0.79444 in 0.44s.
Fold 9 Accuracy: 0.79048 in 0.44s.
Fold 10 Accuracy: 0.80159 in 0.44s.
Fold 11 Accuracy: 0.78872 in 0.44s.

Average CV Accuracy: 0.7937
Worst CV Accuracy: 0.78016
Holdout Accuracy: 0.75256
Training Time: 5.47s



In [26]:
test_score, oof_preds, holdout = train_original(extratrees, new_shade_features)

extratrees_scores.append((
    'Shade_Features', test_score, holdout,
     *recall_score(original['Cover_Type'].iloc[15119:], oof_preds, average = None)
))

Fold 0 Accuracy: 0.8881 in 3.46s.
Fold 1 Accuracy: 0.88095 in 3.51s.
Fold 2 Accuracy: 0.88095 in 3.54s.
Fold 3 Accuracy: 0.88413 in 3.55s.
Fold 4 Accuracy: 0.89127 in 3.66s.
Fold 5 Accuracy: 0.88016 in 3.37s.
Fold 6 Accuracy: 0.88175 in 3.29s.
Fold 7 Accuracy: 0.87143 in 3.31s.
Fold 8 Accuracy: 0.8627 in 3.32s.
Fold 9 Accuracy: 0.89206 in 3.5s.
Fold 10 Accuracy: 0.88254 in 3.33s.
Fold 11 Accuracy: 0.89992 in 3.41s.

Average CV Accuracy: 0.883
Worst CV Accuracy: 0.8627
Holdout Accuracy: 0.77191
Training Time: 41.25s



In [27]:
test_score, oof_preds, holdout = train_original(bagging, new_shade_features)

bagging_scores.append((
    'Shade_Features', test_score, holdout,
     *recall_score(original['Cover_Type'].iloc[15119:], oof_preds, average = None)
))

Fold 0 Accuracy: 0.85873 in 2.89s.
Fold 1 Accuracy: 0.85238 in 2.68s.
Fold 2 Accuracy: 0.85159 in 2.44s.
Fold 3 Accuracy: 0.86587 in 2.52s.
Fold 4 Accuracy: 0.85079 in 2.38s.
Fold 5 Accuracy: 0.85476 in 2.65s.
Fold 6 Accuracy: 0.85714 in 2.35s.
Fold 7 Accuracy: 0.84762 in 2.66s.
Fold 8 Accuracy: 0.84048 in 2.64s.
Fold 9 Accuracy: 0.85 in 2.57s.
Fold 10 Accuracy: 0.85 in 2.69s.
Fold 11 Accuracy: 0.85624 in 2.68s.

Average CV Accuracy: 0.85297
Worst CV Accuracy: 0.84048
Holdout Accuracy: 0.7513
Training Time: 31.15s



In [28]:
test_score, oof_preds, holdout = train_original(randomforest, new_shade_features)

random_scores.append((
    'Shade_Features', test_score, holdout,
     *recall_score(original['Cover_Type'].iloc[15119:], oof_preds, average = None)
))

Fold 0 Accuracy: 0.86746 in 3.49s.
Fold 1 Accuracy: 0.85794 in 3.45s.
Fold 2 Accuracy: 0.84841 in 3.35s.
Fold 3 Accuracy: 0.86746 in 3.32s.
Fold 4 Accuracy: 0.85873 in 3.35s.
Fold 5 Accuracy: 0.85397 in 3.55s.
Fold 6 Accuracy: 0.85635 in 3.37s.
Fold 7 Accuracy: 0.85 in 3.48s.
Fold 8 Accuracy: 0.83968 in 3.36s.
Fold 9 Accuracy: 0.85714 in 3.52s.
Fold 10 Accuracy: 0.85714 in 3.35s.
Fold 11 Accuracy: 0.87053 in 3.4s.

Average CV Accuracy: 0.85707
Worst CV Accuracy: 0.83968
Holdout Accuracy: 0.73824
Training Time: 40.99s



## Distance Interactions

In [29]:
def distance_interactions(data):
    df = data.copy()
    df['Hydro_Fire_1'] = df['Horizontal_Distance_To_Hydrology'] + df['Horizontal_Distance_To_Fire_Points']
    df['Hydro_Fire_2'] = abs(df['Horizontal_Distance_To_Hydrology'] - df['Horizontal_Distance_To_Fire_Points'])
    df['Hydro_Road_1'] = abs(df['Horizontal_Distance_To_Hydrology'] + df['Horizontal_Distance_To_Roadways'])
    df['Hydro_Road_2'] = abs(df['Horizontal_Distance_To_Hydrology'] - df['Horizontal_Distance_To_Roadways'])
    df['Fire_Road_1'] = abs(df['Horizontal_Distance_To_Fire_Points'] + df['Horizontal_Distance_To_Roadways'])
    df['Fire_Road_2'] = abs(df['Horizontal_Distance_To_Fire_Points'] - df['Horizontal_Distance_To_Roadways'])
    return df

In [30]:
test_score, oof_preds, holdout = train_original(adaboost, distance_interactions)

adaboost_scores.append((
    'Dist_Interactions', test_score, holdout,
     *recall_score(original['Cover_Type'].iloc[15119:], oof_preds, average = None)
))

Fold 0 Accuracy: 0.80635 in 0.42s.
Fold 1 Accuracy: 0.8127 in 0.36s.
Fold 2 Accuracy: 0.80317 in 0.35s.
Fold 3 Accuracy: 0.81905 in 0.35s.
Fold 4 Accuracy: 0.81825 in 0.36s.
Fold 5 Accuracy: 0.82302 in 0.35s.
Fold 6 Accuracy: 0.8246 in 0.36s.
Fold 7 Accuracy: 0.82063 in 0.37s.
Fold 8 Accuracy: 0.81111 in 0.37s.
Fold 9 Accuracy: 0.81905 in 0.4s.
Fold 10 Accuracy: 0.81984 in 0.38s.
Fold 11 Accuracy: 0.82526 in 0.36s.

Average CV Accuracy: 0.81692
Worst CV Accuracy: 0.80317
Holdout Accuracy: 0.77843
Training Time: 4.43s



In [31]:
test_score, oof_preds, holdout = train_original(extratrees, distance_interactions)

extratrees_scores.append((
    'Dist_Interactions', test_score, holdout,
     *recall_score(original['Cover_Type'].iloc[15119:], oof_preds, average = None)
))

Fold 0 Accuracy: 0.90556 in 3.74s.
Fold 1 Accuracy: 0.89841 in 3.54s.
Fold 2 Accuracy: 0.89841 in 3.54s.
Fold 3 Accuracy: 0.90476 in 3.49s.
Fold 4 Accuracy: 0.90556 in 3.51s.
Fold 5 Accuracy: 0.89921 in 3.43s.
Fold 6 Accuracy: 0.9 in 3.38s.
Fold 7 Accuracy: 0.89286 in 3.38s.
Fold 8 Accuracy: 0.88016 in 3.46s.
Fold 9 Accuracy: 0.9119 in 3.57s.
Fold 10 Accuracy: 0.90635 in 3.41s.
Fold 11 Accuracy: 0.90469 in 3.36s.

Average CV Accuracy: 0.90066
Worst CV Accuracy: 0.88016
Holdout Accuracy: 0.80163
Training Time: 41.81s



In [32]:
test_score, oof_preds, holdout = train_original(bagging, distance_interactions)

bagging_scores.append((
    'Dist_Interactions', test_score, holdout,
     *recall_score(original['Cover_Type'].iloc[15119:], oof_preds, average = None)
))

Fold 0 Accuracy: 0.88333 in 1.93s.
Fold 1 Accuracy: 0.85873 in 2.22s.
Fold 2 Accuracy: 0.87857 in 1.94s.
Fold 3 Accuracy: 0.87698 in 2.19s.
Fold 4 Accuracy: 0.88254 in 1.95s.
Fold 5 Accuracy: 0.86825 in 2.01s.
Fold 6 Accuracy: 0.87063 in 1.84s.
Fold 7 Accuracy: 0.8627 in 2.05s.
Fold 8 Accuracy: 0.86587 in 1.89s.
Fold 9 Accuracy: 0.87381 in 1.96s.
Fold 10 Accuracy: 0.87619 in 1.88s.
Fold 11 Accuracy: 0.87927 in 1.91s.

Average CV Accuracy: 0.87307
Worst CV Accuracy: 0.85873
Holdout Accuracy: 0.78025
Training Time: 23.78s



In [33]:
test_score, oof_preds, holdout = train_original(randomforest, distance_interactions)

random_scores.append((
    'Dist_Interactions', test_score, holdout,
     *recall_score(original['Cover_Type'].iloc[15119:], oof_preds, average = None)
))

Fold 0 Accuracy: 0.89683 in 3.36s.
Fold 1 Accuracy: 0.89048 in 3.48s.
Fold 2 Accuracy: 0.88413 in 3.13s.
Fold 3 Accuracy: 0.9 in 3.3s.
Fold 4 Accuracy: 0.8873 in 3.19s.
Fold 5 Accuracy: 0.88651 in 3.37s.
Fold 6 Accuracy: 0.89524 in 3.33s.
Fold 7 Accuracy: 0.88968 in 3.28s.
Fold 8 Accuracy: 0.87381 in 3.28s.
Fold 9 Accuracy: 0.90635 in 3.22s.
Fold 10 Accuracy: 0.88571 in 3.45s.
Fold 11 Accuracy: 0.89357 in 3.21s.

Average CV Accuracy: 0.8908
Worst CV Accuracy: 0.87381
Holdout Accuracy: 0.78466
Training Time: 39.6s



## Misc. Interactions

In [34]:
def various_interactions(data):
    df = data.copy()
    df['EHiElv'] = df['Horizontal_Distance_To_Roadways'] * df['Elevation']
    df['EViElv'] = df['Vertical_Distance_To_Hydrology'] * df['Elevation']
    df['Highwater'] = (df.Vertical_Distance_To_Hydrology < 0).astype(int)
    df['EVDtH'] = df.Elevation - df.Vertical_Distance_To_Hydrology
    df['EHDtH'] = df.Elevation - df.Horizontal_Distance_To_Hydrology * 0.2
    df['Hillshade_3pm_is_zero'] = (df.Hillshade_3pm == 0).astype(int)
    return df

In [35]:
test_score, oof_preds, holdout = train_original(adaboost, various_interactions)

adaboost_scores.append((
    'Misc_Interactions', test_score, holdout,
     *recall_score(original['Cover_Type'].iloc[15119:], oof_preds, average = None)
))

Fold 0 Accuracy: 0.78889 in 0.47s.
Fold 1 Accuracy: 0.80397 in 0.5s.
Fold 2 Accuracy: 0.8 in 0.48s.
Fold 3 Accuracy: 0.78333 in 0.48s.
Fold 4 Accuracy: 0.80159 in 0.48s.
Fold 5 Accuracy: 0.78175 in 0.48s.
Fold 6 Accuracy: 0.79286 in 0.49s.
Fold 7 Accuracy: 0.78571 in 0.58s.
Fold 8 Accuracy: 0.78413 in 0.46s.
Fold 9 Accuracy: 0.80238 in 0.52s.
Fold 10 Accuracy: 0.79762 in 0.48s.
Fold 11 Accuracy: 0.77284 in 0.48s.

Average CV Accuracy: 0.79125
Worst CV Accuracy: 0.77284
Holdout Accuracy: 0.75407
Training Time: 5.88s



In [36]:
test_score, oof_preds, holdout = train_original(extratrees, various_interactions)

extratrees_scores.append((
    'Misc_Interactions', test_score, holdout,
     *recall_score(original['Cover_Type'].iloc[15119:], oof_preds, average = None)
))

Fold 0 Accuracy: 0.89286 in 3.59s.
Fold 1 Accuracy: 0.88651 in 3.57s.
Fold 2 Accuracy: 0.88333 in 3.44s.
Fold 3 Accuracy: 0.89048 in 3.37s.
Fold 4 Accuracy: 0.89048 in 3.4s.
Fold 5 Accuracy: 0.88968 in 3.38s.
Fold 6 Accuracy: 0.88175 in 3.41s.
Fold 7 Accuracy: 0.8746 in 3.49s.
Fold 8 Accuracy: 0.87222 in 3.35s.
Fold 9 Accuracy: 0.9 in 3.48s.
Fold 10 Accuracy: 0.88968 in 3.39s.
Fold 11 Accuracy: 0.89515 in 3.48s.

Average CV Accuracy: 0.88723
Worst CV Accuracy: 0.87222
Holdout Accuracy: 0.77675
Training Time: 41.37s



In [37]:
test_score, oof_preds, holdout = train_original(bagging, various_interactions)

bagging_scores.append((
    'Misc_Interactions', test_score, holdout,
     *recall_score(original['Cover_Type'].iloc[15119:], oof_preds, average = None)
))

Fold 0 Accuracy: 0.85556 in 2.72s.
Fold 1 Accuracy: 0.86032 in 2.5s.
Fold 2 Accuracy: 0.85714 in 2.48s.
Fold 3 Accuracy: 0.83413 in 2.52s.
Fold 4 Accuracy: 0.86032 in 2.42s.
Fold 5 Accuracy: 0.85952 in 2.56s.
Fold 6 Accuracy: 0.85397 in 2.43s.
Fold 7 Accuracy: 0.84762 in 2.7s.
Fold 8 Accuracy: 0.84683 in 2.55s.
Fold 9 Accuracy: 0.86587 in 2.74s.
Fold 10 Accuracy: 0.85159 in 2.48s.
Fold 11 Accuracy: 0.84035 in 2.56s.

Average CV Accuracy: 0.85277
Worst CV Accuracy: 0.83413
Holdout Accuracy: 0.75134
Training Time: 30.67s



In [38]:
test_score, oof_preds, holdout = train_original(randomforest, various_interactions)

random_scores.append((
    'Misc_Interactions', test_score, holdout,
     *recall_score(original['Cover_Type'].iloc[15119:], oof_preds, average = None)
))

Fold 0 Accuracy: 0.87143 in 3.42s.
Fold 1 Accuracy: 0.85635 in 3.48s.
Fold 2 Accuracy: 0.85079 in 3.28s.
Fold 3 Accuracy: 0.87063 in 3.18s.
Fold 4 Accuracy: 0.86508 in 3.29s.
Fold 5 Accuracy: 0.86349 in 3.37s.
Fold 6 Accuracy: 0.86667 in 3.31s.
Fold 7 Accuracy: 0.86349 in 3.29s.
Fold 8 Accuracy: 0.84683 in 3.35s.
Fold 9 Accuracy: 0.86349 in 3.27s.
Fold 10 Accuracy: 0.8619 in 3.25s.
Fold 11 Accuracy: 0.86974 in 3.32s.

Average CV Accuracy: 0.86249
Worst CV Accuracy: 0.84683
Holdout Accuracy: 0.73641
Training Time: 39.81s



## Summary 

In [39]:
pd.DataFrame.from_records(
    data = adaboost_scores,
    columns = ['model','cv_score','holdout','recall_0','recall_1','recall_2','recall_3','recall_4','recall_5','recall_6']
).sort_values('holdout')

Unnamed: 0,model,cv_score,holdout,recall_0,recall_1,recall_2,recall_3,recall_4,recall_5,recall_6
1,Fix_Aspect,0.799193,0.75214,0.784576,0.688658,0.866141,0.969336,0.954316,0.867824,0.961798
3,Shade_Features,0.793703,0.752559,0.7847,0.689277,0.864117,0.97615,0.951861,0.872887,0.964087
0,Baseline,0.803559,0.753727,0.786217,0.689899,0.868165,0.964225,0.957453,0.872953,0.963924
5,Misc_Interactions,0.791255,0.754074,0.778119,0.696608,0.866528,0.969336,0.95568,0.875912,0.96545
2,Water_Dist,0.797143,0.759428,0.785168,0.702598,0.862182,0.971039,0.955816,0.877425,0.96485
4,Dist_Interactions,0.81692,0.77843,0.807483,0.71998,0.884417,0.967632,0.963589,0.895311,0.971008


In [40]:
pd.DataFrame.from_records(
    data = extratrees_scores,
    columns = ['model','cv_score','holdout','recall_0','recall_1','recall_2','recall_3','recall_4','recall_5','recall_6']
).sort_values('holdout')

Unnamed: 0,model,cv_score,holdout,recall_0,recall_1,recall_2,recall_3,recall_4,recall_5,recall_6
3,Shade_Features,0.882996,0.771911,0.784729,0.724402,0.876142,0.974446,0.965498,0.901098,0.971608
2,Water_Dist,0.884451,0.773404,0.787696,0.725042,0.874475,0.974446,0.968635,0.905373,0.972207
5,Misc_Interactions,0.887229,0.776755,0.787047,0.731142,0.881232,0.974446,0.967135,0.910239,0.973678
1,Fix_Aspect,0.885179,0.778045,0.790571,0.731732,0.878672,0.972743,0.967271,0.90557,0.972698
0,Baseline,0.884914,0.778078,0.790614,0.7318,0.878553,0.971039,0.967271,0.905767,0.972316
4,Dist_Interactions,0.900655,0.801627,0.812552,0.759217,0.896145,0.972743,0.975726,0.922338,0.97842


In [41]:
pd.DataFrame.from_records(
    data = bagging_scores,
    columns = ['model','cv_score','holdout','recall_0','recall_1','recall_2','recall_3','recall_4','recall_5','recall_6']
).sort_values('holdout')

Unnamed: 0,model,cv_score,holdout,recall_0,recall_1,recall_2,recall_3,recall_4,recall_5,recall_6
3,Shade_Features,0.852967,0.751299,0.763754,0.701402,0.857776,0.972743,0.962226,0.893076,0.969646
5,Misc_Interactions,0.852767,0.751338,0.762104,0.70174,0.862926,0.97615,0.962089,0.898336,0.970681
2,Water_Dist,0.849991,0.752204,0.768194,0.699862,0.856139,0.97615,0.964953,0.892747,0.97248
1,Fix_Aspect,0.855878,0.753612,0.764522,0.704678,0.860366,0.97615,0.963862,0.897481,0.972861
0,Baseline,0.855812,0.753724,0.765505,0.704309,0.859652,0.974446,0.963998,0.896035,0.973243
4,Dist_Interactions,0.873074,0.780252,0.794597,0.73201,0.881083,0.974446,0.973135,0.91379,0.976894


In [42]:
pd.DataFrame.from_records(
    data = random_scores,
    columns = ['model','cv_score','holdout','recall_0','recall_1','recall_2','recall_3','recall_4','recall_5','recall_6']
).sort_values('holdout')

Unnamed: 0,model,cv_score,holdout,recall_0,recall_1,recall_2,recall_3,recall_4,recall_5,recall_6
5,Misc_Interactions,0.862491,0.736408,0.754011,0.680182,0.850335,0.974446,0.95568,0.879989,0.973896
3,Shade_Features,0.857068,0.738237,0.76094,0.682761,0.827921,0.965928,0.947225,0.864273,0.969319
2,Water_Dist,0.861499,0.745623,0.77067,0.688665,0.835839,0.97615,0.952543,0.873874,0.970572
0,Baseline,0.863947,0.748949,0.769587,0.695494,0.839768,0.97615,0.951861,0.876044,0.972153
1,Fix_Aspect,0.864542,0.749643,0.771104,0.695697,0.839827,0.974446,0.951589,0.877096,0.972316
4,Dist_Interactions,0.8908,0.784662,0.804679,0.733443,0.879625,0.977853,0.971362,0.90866,0.98327


# Soil Type Features

Next, were going to attempt to leverage domain knowledge to derive further categorical features using the soil types. From the description of the dataset, we have the following:
```
     ID    Code     Description

     1     2702     Cathedral family - Rock outcrop complex, extremely stony.
     2     2703     Vanet - Ratake families complex, very stony.
     3     2704     Haploborolis - Rock outcrop complex, rubbly.
     4     2705     Ratake family - Rock outcrop complex, rubbly.
     5     2706     Vanet family - Rock outcrop complex complex, rubbly.
     6     2717     Vanet - Wetmore families - Rock outcrop complex, stony.
     7     3501     Gothic family.
     8     3502     Supervisor - Limber families complex.
     9     4201     Troutville family, very stony.
    10     4703     Bullwark - Catamount families - Rock outcrop complex, rubbly.
    11     4704     Bullwark - Catamount families - Rock land complex, rubbly.
    12     4744     Legault family - Rock land complex, stony.
    13     4758     Catamount family - Rock land - Bullwark family complex, rubbly.
    14     5101     Pachic Argiborolis - Aquolis complex.
    15     5151     unspecified in the USFS Soil and ELU Survey.
    16     6101     Cryaquolis - Cryoborolis complex.
    17     6102     Gateview family - Cryaquolis complex.
    18     6731     Rogert family, very stony.
    19     7101     Typic Cryaquolis - Borohemists complex.
    20     7102     Typic Cryaquepts - Typic Cryaquolls complex.
    21     7103     Typic Cryaquolls - Leighcan family, till substratum complex.
    22     7201     Leighcan family, till substratum, extremely bouldery.
    23     7202     Leighcan family, till substratum - Typic Cryaquolls complex.
    24     7700     Leighcan family, extremely stony.
    25     7701     Leighcan family, warm, extremely stony.
    26     7702     Granile - Catamount families complex, very stony.
    27     7709     Leighcan family, warm - Rock outcrop complex, extremely stony.
    28     7710     Leighcan family - Rock outcrop complex, extremely stony.
    29     7745     Como - Legault families complex, extremely stony.
    30     7746     Como family - Rock land - Legault family complex, extremely stony.
    31     7755     Leighcan - Catamount families complex, extremely stony.
    32     7756     Catamount family - Rock outcrop - Leighcan family complex, extremely stony.
    33     7757     Leighcan - Catamount families - Rock outcrop complex, extremely stony.
    34     7790     Cryorthents - Rock land complex, extremely stony.
    35     8703     Cryumbrepts - Rock outcrop - Cryaquepts complex.
    36     8707     Bross family - Rock land - Cryumbrepts complex, extremely stony.
    37     8708     Rock outcrop - Cryumbrepts - Cryorthents complex, extremely stony.
    38     8771     Leighcan - Moran families - Cryaquolls complex, extremely stony.
    39     8772     Moran family - Cryorthents - Leighcan family complex, extremely stony.
    40     8776     Moran family - Cryorthents - Rock land complex, extremely stony.

        Note:   First digit:  climatic zone             Second digit:  geologic zones
                1.  lower montane dry                   1.  alluvium
                2.  lower montane                       2.  glacial
                3.  montane dry                         3.  shale
                4.  montane                             4.  sandstone
                5.  montane dry and montane             5.  mixed sedimentary
                6.  montane and subalpine               6.  unspecified in the USFS ELU Survey
                7.  subalpine                           7.  igneous and metamorphic
                8.  alpine                              8.  volcanic

        The third and fourth ELU digits are unique to the mapping unit 
        and have no special meaning to the climatic or geologic zones.
```

In [43]:
# Dictionary for mapping to ELU Code
code = {
    1:2702,2:2703,3:2704,4:2705,5:2706,
    6:2717,7:3501,8:3502,9:4201,10:4703,
    11:4704,12:4744,13:4758,14:5101,15:5151,
    16:6101,17:6102,18:6731,19:7101,20:7102,
    21:7103,22:7201,23:7202,24:7700,25:7701,
    26:7702,27:7709,28:7710,29:7745,30:7746,
    31:7755,32:7756,33:7757,34:7790,35:8703,
    36:8707,37:8708,38:8771,39:8772,40:8776
}

## 1. Undo One-Hot Encoding

In [44]:
def consolidate_soil_types(input_df, drop = True):
    data = input_df.copy()
    soil_features = sorted([x for x in data.columns if x.startswith("Soil_Type")])
    soil_type = list()
    for index, row in data[soil_features].iterrows():
        found = False
        for i, val in enumerate(row, start = 1):
            if val != 0:
                soil_type.append(i)
                found = True
                break
        if not found: soil_type.append(None)
    data['Soil_Type'] = soil_type
    if drop:
        nonsoil_features = [x for x in data.columns if x not in soil_features]
        return data[nonsoil_features]
    return data
    

In [45]:
test_score, oof_preds, holdout = train_original(adaboost, consolidate_soil_types)

adaboost_scores.append((
    'Ordinal_Drop', test_score, holdout,
     *recall_score(original['Cover_Type'].iloc[15119:], oof_preds, average = None)
))

Fold 0 Accuracy: 0.78413 in 0.29s.
Fold 1 Accuracy: 0.78889 in 0.29s.
Fold 2 Accuracy: 0.8 in 0.28s.
Fold 3 Accuracy: 0.79524 in 0.28s.
Fold 4 Accuracy: 0.79841 in 0.29s.
Fold 5 Accuracy: 0.80635 in 0.28s.
Fold 6 Accuracy: 0.78968 in 0.29s.
Fold 7 Accuracy: 0.80079 in 0.28s.
Fold 8 Accuracy: 0.76984 in 0.28s.
Fold 9 Accuracy: 0.78333 in 0.29s.
Fold 10 Accuracy: 0.8 in 0.29s.
Fold 11 Accuracy: 0.76807 in 0.28s.

Average CV Accuracy: 0.79039
Worst CV Accuracy: 0.76807
Holdout Accuracy: 0.76229
Training Time: 3.42s



In [46]:
test_score, oof_preds, holdout = train_original(extratrees, consolidate_soil_types)

extratrees_scores.append((
    'Ordinal_Drop', test_score, holdout,
     *recall_score(original['Cover_Type'].iloc[15119:], oof_preds, average = None)
))

Fold 0 Accuracy: 0.89683 in 3.0s.
Fold 1 Accuracy: 0.87857 in 2.93s.
Fold 2 Accuracy: 0.88492 in 2.97s.
Fold 3 Accuracy: 0.88095 in 2.91s.
Fold 4 Accuracy: 0.89365 in 2.84s.
Fold 5 Accuracy: 0.8881 in 3.13s.
Fold 6 Accuracy: 0.88254 in 3.16s.
Fold 7 Accuracy: 0.88175 in 2.91s.
Fold 8 Accuracy: 0.86905 in 3.08s.
Fold 9 Accuracy: 0.89127 in 3.02s.
Fold 10 Accuracy: 0.88889 in 2.95s.
Fold 11 Accuracy: 0.90389 in 2.85s.

Average CV Accuracy: 0.8867
Worst CV Accuracy: 0.86905
Holdout Accuracy: 0.78229
Training Time: 35.75s



In [47]:
test_score, oof_preds, holdout = train_original(bagging, consolidate_soil_types)

bagging_scores.append((
    'Ordinal_Drop', test_score, holdout,
     *recall_score(original['Cover_Type'].iloc[15119:], oof_preds, average = None)
))

Fold 0 Accuracy: 0.84603 in 1.72s.
Fold 1 Accuracy: 0.84286 in 1.56s.
Fold 2 Accuracy: 0.85794 in 1.49s.
Fold 3 Accuracy: 0.85159 in 1.53s.
Fold 4 Accuracy: 0.85714 in 1.55s.
Fold 5 Accuracy: 0.85317 in 1.62s.
Fold 6 Accuracy: 0.85476 in 1.52s.
Fold 7 Accuracy: 0.85794 in 1.46s.
Fold 8 Accuracy: 0.84048 in 1.79s.
Fold 9 Accuracy: 0.86111 in 1.67s.
Fold 10 Accuracy: 0.86508 in 1.64s.
Fold 11 Accuracy: 0.8618 in 1.6s.

Average CV Accuracy: 0.85416
Worst CV Accuracy: 0.84048
Holdout Accuracy: 0.76073
Training Time: 19.15s



In [48]:
test_score, oof_preds, holdout = train_original(randomforest, consolidate_soil_types)

random_scores.append((
    'Ordinal_Drop', test_score, holdout,
     *recall_score(original['Cover_Type'].iloc[15119:], oof_preds, average = None)
))

Fold 0 Accuracy: 0.86825 in 3.13s.
Fold 1 Accuracy: 0.85556 in 3.16s.
Fold 2 Accuracy: 0.85873 in 3.01s.
Fold 3 Accuracy: 0.86746 in 3.0s.
Fold 4 Accuracy: 0.8754 in 3.18s.
Fold 5 Accuracy: 0.86825 in 2.95s.
Fold 6 Accuracy: 0.86667 in 2.96s.
Fold 7 Accuracy: 0.8619 in 3.14s.
Fold 8 Accuracy: 0.84921 in 2.93s.
Fold 9 Accuracy: 0.88175 in 3.03s.
Fold 10 Accuracy: 0.85873 in 3.15s.
Fold 11 Accuracy: 0.86577 in 2.94s.

Average CV Accuracy: 0.86481
Worst CV Accuracy: 0.84921
Holdout Accuracy: 0.74847
Training Time: 36.6s



In [49]:
test_score, oof_preds, holdout = train_original(adaboost, partial(consolidate_soil_types, drop = False))

adaboost_scores.append((
    'Ordinal_Keep', test_score, holdout,
     *recall_score(original['Cover_Type'].iloc[15119:], oof_preds, average = None)
))

Fold 0 Accuracy: 0.79524 in 0.42s.
Fold 1 Accuracy: 0.80873 in 0.43s.
Fold 2 Accuracy: 0.79206 in 0.41s.
Fold 3 Accuracy: 0.78413 in 0.4s.
Fold 4 Accuracy: 0.78333 in 0.4s.
Fold 5 Accuracy: 0.80238 in 0.39s.
Fold 6 Accuracy: 0.78413 in 0.4s.
Fold 7 Accuracy: 0.80556 in 0.4s.
Fold 8 Accuracy: 0.78254 in 0.39s.
Fold 9 Accuracy: 0.79762 in 0.43s.
Fold 10 Accuracy: 0.77778 in 0.43s.
Fold 11 Accuracy: 0.8054 in 0.44s.

Average CV Accuracy: 0.79324
Worst CV Accuracy: 0.77778
Holdout Accuracy: 0.75612
Training Time: 4.94s



In [50]:
test_score, oof_preds, holdout = train_original(extratrees,  partial(consolidate_soil_types, drop = False))

extratrees_scores.append((
    'Ordinal_Keep', test_score, holdout,
     *recall_score(original['Cover_Type'].iloc[15119:], oof_preds, average = None)
))

Fold 0 Accuracy: 0.89206 in 3.51s.
Fold 1 Accuracy: 0.88333 in 3.42s.
Fold 2 Accuracy: 0.88413 in 3.39s.
Fold 3 Accuracy: 0.88889 in 3.3s.
Fold 4 Accuracy: 0.89048 in 3.31s.
Fold 5 Accuracy: 0.88492 in 3.51s.
Fold 6 Accuracy: 0.8881 in 3.77s.
Fold 7 Accuracy: 0.88333 in 3.47s.
Fold 8 Accuracy: 0.8627 in 3.23s.
Fold 9 Accuracy: 0.89127 in 3.43s.
Fold 10 Accuracy: 0.88651 in 3.43s.
Fold 11 Accuracy: 0.89515 in 3.34s.

Average CV Accuracy: 0.88591
Worst CV Accuracy: 0.8627
Holdout Accuracy: 0.77965
Training Time: 41.12s



In [51]:
test_score, oof_preds, holdout = train_original(bagging,  partial(consolidate_soil_types, drop = False))

bagging_scores.append((
    'Ordinal_Keep', test_score, holdout,
     *recall_score(original['Cover_Type'].iloc[15119:], oof_preds, average = None)
))

Fold 0 Accuracy: 0.86984 in 2.32s.
Fold 1 Accuracy: 0.86508 in 2.54s.
Fold 2 Accuracy: 0.84841 in 2.42s.
Fold 3 Accuracy: 0.85317 in 2.43s.
Fold 4 Accuracy: 0.85873 in 2.4s.
Fold 5 Accuracy: 0.85159 in 2.45s.
Fold 6 Accuracy: 0.85 in 2.51s.
Fold 7 Accuracy: 0.85476 in 2.52s.
Fold 8 Accuracy: 0.85 in 2.45s.
Fold 9 Accuracy: 0.85635 in 2.45s.
Fold 10 Accuracy: 0.8619 in 2.49s.
Fold 11 Accuracy: 0.85703 in 2.65s.

Average CV Accuracy: 0.85641
Worst CV Accuracy: 0.84841
Holdout Accuracy: 0.75935
Training Time: 29.62s



In [52]:
test_score, oof_preds, holdout = train_original(randomforest,  partial(consolidate_soil_types, drop = False))

random_scores.append((
    'Ordinal_Keep', test_score, holdout,
     *recall_score(original['Cover_Type'].iloc[15119:], oof_preds, average = None)
))

Fold 0 Accuracy: 0.87857 in 3.56s.
Fold 1 Accuracy: 0.86032 in 3.45s.
Fold 2 Accuracy: 0.85794 in 3.41s.
Fold 3 Accuracy: 0.86667 in 3.33s.
Fold 4 Accuracy: 0.86825 in 3.35s.
Fold 5 Accuracy: 0.85794 in 3.3s.
Fold 6 Accuracy: 0.86746 in 3.3s.
Fold 7 Accuracy: 0.85794 in 3.36s.
Fold 8 Accuracy: 0.84762 in 3.3s.
Fold 9 Accuracy: 0.86746 in 3.19s.
Fold 10 Accuracy: 0.8627 in 3.16s.
Fold 11 Accuracy: 0.87768 in 3.27s.

Average CV Accuracy: 0.86421
Worst CV Accuracy: 0.84762
Holdout Accuracy: 0.7502
Training Time: 39.99s



## 2. Climatic Zone

We create a feature based on the climatic zone of the soil, which has a natural ordering:

1. lower montane dry
2. lower montane
3. montane dry
4. montane
5. montane dry and montane
6. montane and subalpine
7. subalpine
8. alpine

However, the ordering of the soil type labels roughly follows the ordering of their respectively climatic zones, so there's a chance this feature won't be particularly informative.

In [53]:
def climatic_zone_original(input_df):
    data = input_df.copy()
    data = consolidate_soil_types(data, drop = False)
    data['Climatic Zone'] = data['Soil_Type'].apply(lambda x: int(str(code[x])[0]))
    return data

In [54]:
test_score, oof_preds, holdout = train_original(adaboost, climatic_zone_original)

adaboost_scores.append((
    'Climatic_Zone', test_score, holdout,
     *recall_score(original['Cover_Type'].iloc[15119:], oof_preds, average = None)
))

Fold 0 Accuracy: 0.80794 in 0.44s.
Fold 1 Accuracy: 0.79444 in 0.42s.
Fold 2 Accuracy: 0.80476 in 0.45s.
Fold 3 Accuracy: 0.79444 in 0.42s.
Fold 4 Accuracy: 0.80873 in 0.39s.
Fold 5 Accuracy: 0.79683 in 0.4s.
Fold 6 Accuracy: 0.81508 in 0.39s.
Fold 7 Accuracy: 0.80714 in 0.4s.
Fold 8 Accuracy: 0.77698 in 0.42s.
Fold 9 Accuracy: 0.80397 in 0.46s.
Fold 10 Accuracy: 0.79286 in 0.41s.
Fold 11 Accuracy: 0.80937 in 0.42s.

Average CV Accuracy: 0.80105
Worst CV Accuracy: 0.77698
Holdout Accuracy: 0.76152
Training Time: 5.03s



In [55]:
test_score, oof_preds, holdout = train_original(extratrees, climatic_zone_original)

extratrees_scores.append((
    'Climatic_Zone', test_score, holdout,
     *recall_score(original['Cover_Type'].iloc[15119:], oof_preds, average = None)
))

Fold 0 Accuracy: 0.89524 in 3.33s.
Fold 1 Accuracy: 0.88016 in 3.62s.
Fold 2 Accuracy: 0.88968 in 3.51s.
Fold 3 Accuracy: 0.88968 in 3.25s.
Fold 4 Accuracy: 0.89286 in 3.4s.
Fold 5 Accuracy: 0.88175 in 3.24s.
Fold 6 Accuracy: 0.88254 in 3.27s.
Fold 7 Accuracy: 0.88175 in 3.31s.
Fold 8 Accuracy: 0.8627 in 3.25s.
Fold 9 Accuracy: 0.89127 in 3.39s.
Fold 10 Accuracy: 0.89286 in 3.46s.
Fold 11 Accuracy: 0.89754 in 3.36s.

Average CV Accuracy: 0.8865
Worst CV Accuracy: 0.8627
Holdout Accuracy: 0.78027
Training Time: 40.39s



In [56]:
test_score, oof_preds, holdout = train_original(bagging, climatic_zone_original)

bagging_scores.append((
    'Climatic_Zone', test_score, holdout,
     *recall_score(original['Cover_Type'].iloc[15119:], oof_preds, average = None)
))

Fold 0 Accuracy: 0.86349 in 2.44s.
Fold 1 Accuracy: 0.85397 in 2.46s.
Fold 2 Accuracy: 0.85556 in 2.4s.
Fold 3 Accuracy: 0.85238 in 2.47s.
Fold 4 Accuracy: 0.86587 in 2.42s.
Fold 5 Accuracy: 0.86111 in 2.38s.
Fold 6 Accuracy: 0.85952 in 2.35s.
Fold 7 Accuracy: 0.85556 in 2.52s.
Fold 8 Accuracy: 0.84444 in 2.37s.
Fold 9 Accuracy: 0.8627 in 2.38s.
Fold 10 Accuracy: 0.87302 in 2.49s.
Fold 11 Accuracy: 0.85862 in 2.84s.

Average CV Accuracy: 0.85885
Worst CV Accuracy: 0.84444
Holdout Accuracy: 0.75868
Training Time: 29.53s



In [57]:
test_score, oof_preds, holdout = train_original(randomforest, climatic_zone_original)

random_scores.append((
    'Climatic_Zone', test_score, holdout,
     *recall_score(original['Cover_Type'].iloc[15119:], oof_preds, average = None)
))

Fold 0 Accuracy: 0.87222 in 3.39s.
Fold 1 Accuracy: 0.8627 in 3.18s.
Fold 2 Accuracy: 0.86032 in 3.3s.
Fold 3 Accuracy: 0.86984 in 3.28s.
Fold 4 Accuracy: 0.86349 in 3.21s.
Fold 5 Accuracy: 0.86111 in 3.43s.
Fold 6 Accuracy: 0.86587 in 3.45s.
Fold 7 Accuracy: 0.86508 in 3.53s.
Fold 8 Accuracy: 0.84603 in 3.52s.
Fold 9 Accuracy: 0.86667 in 3.44s.
Fold 10 Accuracy: 0.8619 in 3.36s.
Fold 11 Accuracy: 0.87689 in 3.18s.

Average CV Accuracy: 0.86434
Worst CV Accuracy: 0.84603
Holdout Accuracy: 0.74791
Training Time: 40.28s



## 3. Geologic Zones

1. alluvium
2. glacial
3. shale
4. sandstone
5. mixed sedimentary
6. unspecified in the USFS ELU Survey
7. igneous and metamorphic
8. volcanic

This is another feature which is based on the soil type codes.

In [58]:
def geologic_zone_original(input_df):
    data = input_df.copy()
    data = consolidate_soil_types(data, drop = False)
    data['Geologic_Zone'] = data['Soil_Type'].apply(lambda x: int(str(code[x])[1]))
    return data

In [59]:
test_score, oof_preds, holdout = train_original(adaboost, geologic_zone_original)

adaboost_scores.append((
    'Geologic_Zone', test_score, holdout,
     *recall_score(original['Cover_Type'].iloc[15119:], oof_preds, average = None)
))

Fold 0 Accuracy: 0.7881 in 0.43s.
Fold 1 Accuracy: 0.81429 in 0.39s.
Fold 2 Accuracy: 0.81349 in 0.4s.
Fold 3 Accuracy: 0.79762 in 0.42s.
Fold 4 Accuracy: 0.79921 in 0.43s.
Fold 5 Accuracy: 0.80873 in 0.4s.
Fold 6 Accuracy: 0.79127 in 0.43s.
Fold 7 Accuracy: 0.8127 in 0.42s.
Fold 8 Accuracy: 0.79206 in 0.41s.
Fold 9 Accuracy: 0.80794 in 0.41s.
Fold 10 Accuracy: 0.80635 in 0.4s.
Fold 11 Accuracy: 0.80699 in 0.4s.

Average CV Accuracy: 0.80323
Worst CV Accuracy: 0.7881
Holdout Accuracy: 0.75718
Training Time: 4.92s



In [60]:
test_score, oof_preds, holdout = train_original(extratrees, geologic_zone_original)

extratrees_scores.append((
    'Geologic_Zone', test_score, holdout,
     *recall_score(original['Cover_Type'].iloc[15119:], oof_preds, average = None)
))

Fold 0 Accuracy: 0.89286 in 3.39s.
Fold 1 Accuracy: 0.88254 in 3.38s.
Fold 2 Accuracy: 0.88571 in 3.4s.
Fold 3 Accuracy: 0.89048 in 3.38s.
Fold 4 Accuracy: 0.89444 in 3.2s.
Fold 5 Accuracy: 0.88492 in 3.33s.
Fold 6 Accuracy: 0.88254 in 3.27s.
Fold 7 Accuracy: 0.87619 in 3.29s.
Fold 8 Accuracy: 0.86825 in 3.43s.
Fold 9 Accuracy: 0.89683 in 3.29s.
Fold 10 Accuracy: 0.89048 in 3.33s.
Fold 11 Accuracy: 0.89595 in 3.28s.

Average CV Accuracy: 0.88677
Worst CV Accuracy: 0.86825
Holdout Accuracy: 0.78087
Training Time: 39.98s



In [61]:
test_score, oof_preds, holdout = train_original(bagging, geologic_zone_original)

bagging_scores.append((
    'Geologic_Zone', test_score, holdout,
     *recall_score(original['Cover_Type'].iloc[15119:], oof_preds, average = None)
))

Fold 0 Accuracy: 0.85794 in 2.6s.
Fold 1 Accuracy: 0.8627 in 2.43s.
Fold 2 Accuracy: 0.84921 in 2.45s.
Fold 3 Accuracy: 0.86667 in 2.58s.
Fold 4 Accuracy: 0.85952 in 2.57s.
Fold 5 Accuracy: 0.85952 in 2.71s.
Fold 6 Accuracy: 0.85794 in 2.65s.
Fold 7 Accuracy: 0.85159 in 2.5s.
Fold 8 Accuracy: 0.85079 in 2.45s.
Fold 9 Accuracy: 0.85635 in 2.58s.
Fold 10 Accuracy: 0.8627 in 2.65s.
Fold 11 Accuracy: 0.86656 in 2.51s.

Average CV Accuracy: 0.85846
Worst CV Accuracy: 0.84921
Holdout Accuracy: 0.75896
Training Time: 30.69s



In [62]:
test_score, oof_preds, holdout = train_original(randomforest, geologic_zone_original)

random_scores.append((
    'Geologic_Zone', test_score, holdout,
     *recall_score(original['Cover_Type'].iloc[15119:], oof_preds, average = None)
))

Fold 0 Accuracy: 0.87381 in 3.43s.
Fold 1 Accuracy: 0.86984 in 3.37s.
Fold 2 Accuracy: 0.8627 in 3.28s.
Fold 3 Accuracy: 0.8619 in 3.41s.
Fold 4 Accuracy: 0.8627 in 3.43s.
Fold 5 Accuracy: 0.85317 in 3.32s.
Fold 6 Accuracy: 0.87063 in 3.23s.
Fold 7 Accuracy: 0.85317 in 3.2s.
Fold 8 Accuracy: 0.84206 in 3.31s.
Fold 9 Accuracy: 0.86746 in 3.28s.
Fold 10 Accuracy: 0.85952 in 3.41s.
Fold 11 Accuracy: 0.87053 in 3.44s.

Average CV Accuracy: 0.86229
Worst CV Accuracy: 0.84206
Holdout Accuracy: 0.74884
Training Time: 40.11s



## 4. Surface Cover

According to the [USDA reference](https://www.nrcs.usda.gov/wps/portal/nrcs/detail/soils/ref/?cid=nrcs142p2_054253#surface_fragments) on soil profiling:

1. **(Stony/Bouldery)** — Stones or boulders cover 0.01 to less than 0.1 percent of the surface. The smallest stones are at least 8 meters apart; the smallest boulders are at least 20 meters apart (fig. 3-9).

2. **(Very Stony/Very Bouldery)** — Stones or boulders cover 0.1 to less than 3 percent of the surface. The smallest stones are not less than 1 meter apart; the smallest boulders are not less than 3 meters apart (fig. 3-10).

3. **(Extremely Stony/Extremely Bouldery)** — Stones or boulders cover 3 to less than 15 percent of the surface. The smallest stones are as little as 0.5 meter apart; the smallest boulders are as little as 1 meter apart (fig. 3-11).

4. **(Rubbly)** — Stones or boulders cover 15 to less than 50 percent of the surface. The smallest stones are as little as 0.3 meter apart; the smallest boulders are as little as 0.5 meter apart. In most places it is possible to step from stone to stone or jump from boulder to boulder without touching the soil (fig. 3-12).

5. **(Very Rubbly)** — Stones or boulders appear to be nearly continuous and cover 50 percent or more of the surface. The smallest stones are less than 0.03 meter apart; the smallest boulders are less than 0.05 meter apart. Classifiable soil is among the rock fragments, and plant growth is possible (fig. 3-13).

In [63]:
# Group IDs
no_desc = [7,8,14,15,16,17,19,20,21,23,35]
stony = [6,12]
very_stony = [2,9,18,26]
extremely_stony = [1,22,24,25,27,28,29,30,31,32,33,34,36,37,38,39,40]
rubbly = [3,4,5,10,11,13]

# Create dictionary
surface_cover = {i:0 for i in no_desc}
surface_cover.update({i:1 for i in stony})
surface_cover.update({i:2 for i in very_stony})
surface_cover.update({i:3 for i in extremely_stony})
surface_cover.update({i:4 for i in rubbly})

# Preprocessing function
def surface_cover_original(input_df):
    data = input_df.copy()
    data = consolidate_soil_types(data, drop = False)
    data['Surface_Cover'] = data['Soil_Type'].apply(lambda x: surface_cover[x])
    return data

In [64]:
test_score, oof_preds, holdout = train_original(adaboost, surface_cover_original)

adaboost_scores.append((
    'Surface_Cover', test_score, holdout,
     *recall_score(original['Cover_Type'].iloc[15119:], oof_preds, average = None)
))

Fold 0 Accuracy: 0.79841 in 0.44s.
Fold 1 Accuracy: 0.80159 in 0.42s.
Fold 2 Accuracy: 0.79683 in 0.47s.
Fold 3 Accuracy: 0.7754 in 0.44s.
Fold 4 Accuracy: 0.77698 in 0.44s.
Fold 5 Accuracy: 0.79365 in 0.43s.
Fold 6 Accuracy: 0.80317 in 0.41s.
Fold 7 Accuracy: 0.79921 in 0.42s.
Fold 8 Accuracy: 0.80556 in 0.42s.
Fold 9 Accuracy: 0.81111 in 0.4s.
Fold 10 Accuracy: 0.79048 in 0.41s.
Fold 11 Accuracy: 0.80778 in 0.39s.

Average CV Accuracy: 0.79668
Worst CV Accuracy: 0.7754
Holdout Accuracy: 0.75642
Training Time: 5.08s



In [65]:
test_score, oof_preds, holdout = train_original(extratrees, surface_cover_original)

extratrees_scores.append((
    'Surface_Cover', test_score, holdout,
     *recall_score(original['Cover_Type'].iloc[15119:], oof_preds, average = None)
))

Fold 0 Accuracy: 0.89206 in 3.35s.
Fold 1 Accuracy: 0.88254 in 3.54s.
Fold 2 Accuracy: 0.88651 in 3.49s.
Fold 3 Accuracy: 0.89365 in 3.58s.
Fold 4 Accuracy: 0.89762 in 3.39s.
Fold 5 Accuracy: 0.87857 in 3.49s.
Fold 6 Accuracy: 0.88571 in 3.35s.
Fold 7 Accuracy: 0.87698 in 3.39s.
Fold 8 Accuracy: 0.86587 in 3.32s.
Fold 9 Accuracy: 0.89048 in 3.38s.
Fold 10 Accuracy: 0.89286 in 3.36s.
Fold 11 Accuracy: 0.9023 in 3.34s.

Average CV Accuracy: 0.8871
Worst CV Accuracy: 0.86587
Holdout Accuracy: 0.78026
Training Time: 40.98s



In [66]:
test_score, oof_preds, holdout = train_original(bagging, surface_cover_original)

bagging_scores.append((
    'Surface_Cover', test_score, holdout,
     *recall_score(original['Cover_Type'].iloc[15119:], oof_preds, average = None)
))

Fold 0 Accuracy: 0.86429 in 2.44s.
Fold 1 Accuracy: 0.85238 in 2.64s.
Fold 2 Accuracy: 0.85079 in 2.35s.
Fold 3 Accuracy: 0.86746 in 2.53s.
Fold 4 Accuracy: 0.86032 in 2.53s.
Fold 5 Accuracy: 0.86111 in 2.53s.
Fold 6 Accuracy: 0.86032 in 2.61s.
Fold 7 Accuracy: 0.85556 in 2.42s.
Fold 8 Accuracy: 0.83016 in 2.86s.
Fold 9 Accuracy: 0.85635 in 2.46s.
Fold 10 Accuracy: 0.84921 in 2.44s.
Fold 11 Accuracy: 0.8467 in 2.54s.

Average CV Accuracy: 0.85455
Worst CV Accuracy: 0.83016
Holdout Accuracy: 0.75774
Training Time: 30.36s



In [67]:
test_score, oof_preds, holdout = train_original(randomforest, surface_cover_original)

random_scores.append((
    'Surface_Cover', test_score, holdout,
     *recall_score(original['Cover_Type'].iloc[15119:], oof_preds, average = None)
))

Fold 0 Accuracy: 0.8754 in 3.48s.
Fold 1 Accuracy: 0.86349 in 3.35s.
Fold 2 Accuracy: 0.86429 in 3.37s.
Fold 3 Accuracy: 0.86587 in 3.37s.
Fold 4 Accuracy: 0.86429 in 3.16s.
Fold 5 Accuracy: 0.85476 in 3.3s.
Fold 6 Accuracy: 0.8627 in 3.28s.
Fold 7 Accuracy: 0.85317 in 3.27s.
Fold 8 Accuracy: 0.84841 in 3.58s.
Fold 9 Accuracy: 0.8754 in 3.3s.
Fold 10 Accuracy: 0.86508 in 3.34s.
Fold 11 Accuracy: 0.87927 in 3.17s.

Average CV Accuracy: 0.86434
Worst CV Accuracy: 0.84841
Holdout Accuracy: 0.74861
Training Time: 39.95s



## 5. Rock Size (Ordinal)

According to the [USDA reference](https://www.nrcs.usda.gov/wps/portal/nrcs/detail/soils/ref/?cid=nrcs142p2_054253#fragments) on soil profile:

* Stones/Stony - 250–600 mm diameter
* Boulders/Bouldery - > 600 mm diameter

We can use these to perhaps derive useful features

In [68]:
# Group IDs
no_desc = [7,8,14,15,16,17,19,20,21,23,35]
stones = [1,2,6,9,12,18,24,25,26,27,28,29,30,31,32,33,34,36,37,38,39,40]
boulders = [22]
rubble = [3,4,5,10,11,13]

# Create dictionary
rock_size = {i:0 for i in no_desc}
rock_size.update({i:1 for i in stones})
rock_size.update({i:2 for i in boulders})
rock_size.update({i:3 for i in rubble})

# Preprocessing function
def rock_size_original(input_df):
    data = input_df.copy()
    data = consolidate_soil_types(data, drop = False)
    data['Rock_Size'] = data['Soil_Type'].apply(lambda x: rock_size[x])
    return data

In [69]:
test_score, oof_preds, holdout = train_original(adaboost, rock_size_original)

adaboost_scores.append((
    'Rock_Size', test_score, holdout,
     *recall_score(original['Cover_Type'].iloc[15119:], oof_preds, average = None)
))

Fold 0 Accuracy: 0.79365 in 0.43s.
Fold 1 Accuracy: 0.79841 in 0.4s.
Fold 2 Accuracy: 0.80159 in 0.41s.
Fold 3 Accuracy: 0.81032 in 0.39s.
Fold 4 Accuracy: 0.79683 in 0.42s.
Fold 5 Accuracy: 0.80397 in 0.4s.
Fold 6 Accuracy: 0.79524 in 0.4s.
Fold 7 Accuracy: 0.79683 in 0.41s.
Fold 8 Accuracy: 0.79127 in 0.43s.
Fold 9 Accuracy: 0.80317 in 0.43s.
Fold 10 Accuracy: 0.79444 in 0.42s.
Fold 11 Accuracy: 0.80302 in 0.43s.

Average CV Accuracy: 0.79906
Worst CV Accuracy: 0.79127
Holdout Accuracy: 0.76041
Training Time: 4.98s



In [70]:
test_score, oof_preds, holdout = train_original(extratrees, rock_size_original)

extratrees_scores.append((
    'Rock_Size', test_score, holdout,
     *recall_score(original['Cover_Type'].iloc[15119:], oof_preds, average = None)
))

Fold 0 Accuracy: 0.89365 in 3.57s.
Fold 1 Accuracy: 0.88254 in 3.52s.
Fold 2 Accuracy: 0.88651 in 3.57s.
Fold 3 Accuracy: 0.88968 in 3.38s.
Fold 4 Accuracy: 0.89444 in 3.4s.
Fold 5 Accuracy: 0.88095 in 3.41s.
Fold 6 Accuracy: 0.8881 in 3.17s.
Fold 7 Accuracy: 0.8746 in 3.55s.
Fold 8 Accuracy: 0.86349 in 3.33s.
Fold 9 Accuracy: 0.88968 in 3.32s.
Fold 10 Accuracy: 0.89127 in 3.42s.
Fold 11 Accuracy: 0.89913 in 3.37s.

Average CV Accuracy: 0.88617
Worst CV Accuracy: 0.86349
Holdout Accuracy: 0.78025
Training Time: 41.01s



In [71]:
test_score, oof_preds, holdout = train_original(bagging, rock_size_original)

bagging_scores.append((
    'Rock_Size', test_score, holdout,
     *recall_score(original['Cover_Type'].iloc[15119:], oof_preds, average = None)
))

Fold 0 Accuracy: 0.85635 in 2.38s.
Fold 1 Accuracy: 0.85635 in 2.46s.
Fold 2 Accuracy: 0.85476 in 2.53s.
Fold 3 Accuracy: 0.84524 in 2.6s.
Fold 4 Accuracy: 0.85794 in 2.53s.
Fold 5 Accuracy: 0.85 in 2.51s.
Fold 6 Accuracy: 0.84524 in 2.5s.
Fold 7 Accuracy: 0.84841 in 2.53s.
Fold 8 Accuracy: 0.84603 in 2.41s.
Fold 9 Accuracy: 0.84524 in 2.53s.
Fold 10 Accuracy: 0.85476 in 2.48s.
Fold 11 Accuracy: 0.87053 in 2.47s.

Average CV Accuracy: 0.85257
Worst CV Accuracy: 0.84524
Holdout Accuracy: 0.7592
Training Time: 29.92s



In [72]:
test_score, oof_preds, holdout = train_original(randomforest, rock_size_original)

random_scores.append((
    'Rock_Size', test_score, holdout,
     *recall_score(original['Cover_Type'].iloc[15119:], oof_preds, average = None)
))

Fold 0 Accuracy: 0.88016 in 3.45s.
Fold 1 Accuracy: 0.85952 in 3.42s.
Fold 2 Accuracy: 0.86111 in 3.37s.
Fold 3 Accuracy: 0.87143 in 3.4s.
Fold 4 Accuracy: 0.86587 in 3.31s.
Fold 5 Accuracy: 0.85952 in 3.44s.
Fold 6 Accuracy: 0.86667 in 3.36s.
Fold 7 Accuracy: 0.85714 in 3.25s.
Fold 8 Accuracy: 0.8373 in 3.3s.
Fold 9 Accuracy: 0.87222 in 3.39s.
Fold 10 Accuracy: 0.85952 in 3.28s.
Fold 11 Accuracy: 0.8745 in 3.33s.

Average CV Accuracy: 0.86375
Worst CV Accuracy: 0.8373
Holdout Accuracy: 0.74763
Training Time: 40.29s



## 6. More Description Features

I may decide to do something with this in the future, for now I just grouped each soil type by their families from the descriptions.

In [73]:
families = dict(
    cathedral = [1],
    rock_outcrop = [1,3,4,5,6,10,27,28,32,33,35,37],
    rock_land = [11,12,13,30,34,40],
    vanet = [2,5,6],
    ratake = [2,4],
    wetmore = [6],
    haploborolis = [3],
    gothic = [7],
    supervisor = [8],
    limber = [8],
    troutville = [9],
    bullwark = [10,11,13],
    legault = [12],
    pachic_argiborolis = [14],
    aquolis = [14],
    unspecified = [15],
    cryoborolis = [16],
    cryaquolis = [16,17,19],
    borohmists = [19],
    cryaquepts = [20,35],
    cryaquolls = [20,21,23,38],
    cryorthents = [34,37,39,40],
    cryumbrepts = [35,36,37],
    gateview = [17],
    rogert = [18],
    till_substratum = [21,22,23],
    granile = [26],
    catamount = [26,10,11,13,26,31,32,33],
    leighcan = [21,22,23,24,25,27,28,29,31,32,33,38,39],
    como = [29,30],
    bross = [36],
    moran = [38,39,40],
)

# Summary

In [74]:
# AdaBoost
pd.DataFrame.from_records(
    data = adaboost_scores,
    columns = ['features','cv_score','holdout','recall_0', 'recall_1','recall_2','recall_3','recall_4','recall_5','recall_6']
).sort_values('holdout')

Unnamed: 0,features,cv_score,holdout,recall_0,recall_1,recall_2,recall_3,recall_4,recall_5,recall_6
1,Fix_Aspect,0.799193,0.75214,0.784576,0.688658,0.866141,0.969336,0.954316,0.867824,0.961798
3,Shade_Features,0.793703,0.752559,0.7847,0.689277,0.864117,0.97615,0.951861,0.872887,0.964087
0,Baseline,0.803559,0.753727,0.786217,0.689899,0.868165,0.964225,0.957453,0.872953,0.963924
5,Misc_Interactions,0.791255,0.754074,0.778119,0.696608,0.866528,0.969336,0.95568,0.875912,0.96545
7,Ordinal_Keep,0.793241,0.756122,0.787696,0.692603,0.871052,0.955707,0.955816,0.882028,0.967575
10,Surface_Cover,0.79668,0.756422,0.783379,0.697255,0.865813,0.972743,0.956634,0.87611,0.968501
9,Geologic_Zone,0.803228,0.757182,0.785821,0.697241,0.865039,0.965928,0.958271,0.87703,0.964469
2,Water_Dist,0.797143,0.759428,0.785168,0.702598,0.862182,0.971039,0.955816,0.877425,0.96485
11,Rock_Size,0.799061,0.760409,0.785931,0.703597,0.865426,0.965928,0.95268,0.875058,0.968501
8,Climatic_Zone,0.801046,0.761524,0.795918,0.697881,0.867897,0.960818,0.951725,0.876241,0.97139


In [75]:
# Extra Trees Classifier
pd.DataFrame.from_records(
    data = extratrees_scores,
    columns = ['features','cv_score','holdout','recall_0', 'recall_1','recall_2','recall_3','recall_4','recall_5','recall_6']
).sort_values('holdout')

Unnamed: 0,features,cv_score,holdout,recall_0,recall_1,recall_2,recall_3,recall_4,recall_5,recall_6
3,Shade_Features,0.882996,0.771911,0.784729,0.724402,0.876142,0.974446,0.965498,0.901098,0.971608
2,Water_Dist,0.884451,0.773404,0.787696,0.725042,0.874475,0.974446,0.968635,0.905373,0.972207
5,Misc_Interactions,0.887229,0.776755,0.787047,0.731142,0.881232,0.974446,0.967135,0.910239,0.973678
1,Fix_Aspect,0.885179,0.778045,0.790571,0.731732,0.878672,0.972743,0.967271,0.90557,0.972698
0,Baseline,0.884914,0.778078,0.790614,0.7318,0.878553,0.971039,0.967271,0.905767,0.972316
7,Ordinal_Keep,0.885906,0.779647,0.793032,0.733095,0.879149,0.972743,0.966453,0.905044,0.973025
11,Rock_Size,0.886171,0.78025,0.792355,0.734873,0.878553,0.972743,0.966589,0.904978,0.973188
10,Surface_Cover,0.887097,0.780264,0.792021,0.735033,0.87903,0.972743,0.967408,0.905373,0.97346
8,Climatic_Zone,0.886501,0.780275,0.794048,0.733529,0.879298,0.972743,0.966453,0.905833,0.973188
9,Geologic_Zone,0.886766,0.780867,0.793037,0.735364,0.879565,0.971039,0.967271,0.906753,0.973351


In [76]:
# Bagging Classifier
pd.DataFrame.from_records(
    data = bagging_scores,
    columns = ['features','cv_score','holdout','recall_0', 'recall_1','recall_2','recall_3','recall_4','recall_5','recall_6']
).sort_values('holdout')

Unnamed: 0,features,cv_score,holdout,recall_0,recall_1,recall_2,recall_3,recall_4,recall_5,recall_6
3,Shade_Features,0.852967,0.751299,0.763754,0.701402,0.857776,0.972743,0.962226,0.893076,0.969646
5,Misc_Interactions,0.852767,0.751338,0.762104,0.70174,0.862926,0.97615,0.962089,0.898336,0.970681
2,Water_Dist,0.849991,0.752204,0.768194,0.699862,0.856139,0.97615,0.964953,0.892747,0.97248
1,Fix_Aspect,0.855878,0.753612,0.764522,0.704678,0.860366,0.97615,0.963862,0.897481,0.972861
0,Baseline,0.855812,0.753724,0.765505,0.704309,0.859652,0.974446,0.963998,0.896035,0.973243
10,Surface_Cover,0.854553,0.757742,0.771457,0.707737,0.862271,0.97615,0.964135,0.893799,0.973569
8,Climatic_Zone,0.858853,0.758682,0.771895,0.709274,0.862063,0.97615,0.964271,0.894983,0.973351
9,Geologic_Zone,0.858457,0.758963,0.771504,0.709719,0.865337,0.97615,0.964135,0.896166,0.972752
11,Rock_Size,0.852571,0.759197,0.775458,0.707951,0.860455,0.971039,0.963453,0.89472,0.972425
7,Ordinal_Keep,0.856406,0.759352,0.772868,0.71011,0.862212,0.972743,0.965089,0.893339,0.970954


In [77]:
# Random Forest
pd.DataFrame.from_records(
    data = random_scores,
    columns = ['features','cv_score','holdout','recall_0', 'recall_1','recall_2','recall_3','recall_4','recall_5','recall_6']
).sort_values('holdout')

Unnamed: 0,features,cv_score,holdout,recall_0,recall_1,recall_2,recall_3,recall_4,recall_5,recall_6
5,Misc_Interactions,0.862491,0.736408,0.754011,0.680182,0.850335,0.974446,0.95568,0.879989,0.973896
3,Shade_Features,0.857068,0.738237,0.76094,0.682761,0.827921,0.965928,0.947225,0.864273,0.969319
2,Water_Dist,0.861499,0.745623,0.77067,0.688665,0.835839,0.97615,0.952543,0.873874,0.970572
11,Rock_Size,0.863748,0.747631,0.7628,0.69829,0.840482,0.97615,0.951861,0.8744,0.966267
8,Climatic_Zone,0.864344,0.747913,0.763626,0.698468,0.838934,0.974446,0.950498,0.872624,0.967738
6,Ordinal_Drop,0.864806,0.748467,0.774142,0.690572,0.845275,0.974446,0.955543,0.876307,0.968937
10,Surface_Cover,0.864344,0.748606,0.76385,0.699539,0.839589,0.972743,0.950907,0.874334,0.967411
9,Geologic_Zone,0.862293,0.748836,0.764393,0.699396,0.841048,0.972743,0.95268,0.874597,0.966866
0,Baseline,0.863947,0.748949,0.769587,0.695494,0.839768,0.97615,0.951861,0.876044,0.972153
1,Fix_Aspect,0.864542,0.749643,0.771104,0.695697,0.839827,0.974446,0.951589,0.877096,0.972316
