# Notebook 3c - Soil Type Engineering

In this notebook, we engineer new features from interactions between attributes derived from the soil type and the wilderness area indicators.

In [None]:
# Global variables for testing changes to this notebook quickly
RANDOM_SEED = 0  # seed handed to every model and to the stratified CV splitter
NUM_FOLDS = 12  # number of stratified cross-validation folds used by train_original

In [None]:
import numpy as np
import pandas as pd
import time
import pyarrow
import gc

# Model evaluation
from functools import partial
from sklearn.base import clone
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import accuracy_score, recall_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier, RandomForestClassifier

# Plotting
import matplotlib
import seaborn as sns
from matplotlib import pyplot as plt

# Hide warnings
import warnings
warnings.filterwarnings('ignore')

# Load Data

In [None]:
%%time

# Read the original dataset from the feather file
original = pd.read_feather('../data/original.feather')

# Replace the raw target labels with integer codes
old_encoder = LabelEncoder()
original["Cover_Type"] = old_encoder.fit_transform(original["Cover_Type"])

# Rows before position 15119 form the training portion, the rest the holdout.
# NOTE(review): train_original hard-codes the same boundary and recomputes its
# own holdout labels, so y_test appears unused outside this cell -- confirm.
encoded_target = original['Cover_Type']
y_train, y_test = encoded_target.iloc[:15119], encoded_target.iloc[15119:]
del encoded_target  # keep the notebook namespace identical to before

# Every column except the identifier and the target is a model input
features = [col for col in original.columns if col not in ('Id', 'Cover_Type')]

# One list of result rows per model, filled in by the evaluation cells below
bagging_scores = []
extratrees_scores = []
adaboost_scores = []
random_scores = []

# Scoring Function

In [None]:
def train_original(sklearn_model, processing = None):
    """Cross-validate a classifier on the training rows and score it on the holdout rows.

    The first 15119 rows of the global ``original`` frame are used for
    stratified k-fold cross-validation (``NUM_FOLDS`` folds, shuffled with
    ``RANDOM_SEED``); the remaining rows are the holdout set. A fresh clone of
    ``sklearn_model`` is fitted per fold. Holdout class probabilities are
    summed over the folds and the arg-max is taken as the holdout prediction.

    Parameters
    ----------
    sklearn_model : estimator
        Unfitted scikit-learn classifier; it must provide ``predict`` and
        ``predict_proba``. It is cloned for every fold and never fitted itself.
    processing : callable, optional
        Feature engineering function applied separately to the training frame
        and to the holdout frame. Skipped when falsy.

    Returns
    -------
    tuple
        ``(mean fold accuracy, out-of-fold predictions, holdout accuracy)``.

    Notes
    -----
    NOTE(review): the probability accumulator is hard-coded to 7 columns, so
    every fold's training split must contain all 7 classes -- confirm.
    NOTE(review): the 15119 split position is hard-coded here and in the data
    loading cell -- confirm it matches the intended train/test boundary.
    """
    # Original training/holdout split
    all_features = original[features]
    all_labels = original['Cover_Type']
    X_temp, X_test = all_features.iloc[:15119], all_features.iloc[15119:]
    y_temp, y_test = all_labels.iloc[:15119], all_labels.iloc[15119:]

    # Optional feature engineering, applied to each part independently
    if processing:
        X_temp, X_test = processing(X_temp), processing(X_test)

    # Accumulators: summed holdout probabilities, out-of-fold labels, per-fold stats
    test_preds = np.zeros((X_test.shape[0], 7))
    oof_preds = np.zeros((X_temp.shape[0],))
    scores = np.zeros(NUM_FOLDS)
    times = np.zeros(NUM_FOLDS)

    splitter = StratifiedKFold(n_splits = NUM_FOLDS, shuffle = True, random_state = RANDOM_SEED)
    for fold, (train_idx, valid_idx) in enumerate(splitter.split(X_temp, y_temp)):

        # Rows for this fold
        X_train = X_temp.iloc[train_idx]
        y_train = y_temp.iloc[train_idx]
        X_valid = X_temp.iloc[valid_idx]
        y_valid = y_temp.iloc[valid_idx]

        # The timed section covers fitting, both predictions and the scoring
        start = time.time()
        model = clone(sklearn_model)
        model.fit(X_train, y_train)

        valid_preds = np.ravel(model.predict(X_valid))
        oof_preds[valid_idx] = valid_preds
        test_preds += model.predict_proba(X_test)

        scores[fold] = accuracy_score(y_valid, valid_preds)
        times[fold] = time.time() - start

        # Short pause between folds (excluded from the recorded time)
        time.sleep(0.5)

    # Collapse the summed probabilities into one predicted class per holdout row
    test_preds = np.argmax(test_preds, axis = 1)
    test_score = accuracy_score(y_test, test_preds)
    cv_score = scores.mean()

    # The "Train Accuracy" line reports the mean validation-fold accuracy
    print('\n'+model.__class__.__name__)
    print("Train Accuracy:", round(cv_score, 5))
    print('Test Accuracy:', round(test_score, 5))
    print(f'Training Time: {round(times.sum(), 2)}s')

    return cv_score, oof_preds, test_score

# Models

We use the following 4 models from the scikit-learn library:

1. AdaBoost 
2. ExtraTrees
3. Bagging
4. Random Forest

In [None]:
# NOTE(review): scikit-learn 1.2 renamed the `base_estimator` argument of
# AdaBoostClassifier and BaggingClassifier to `estimator`, and 1.4 removed the
# old name. The keyword is left unchanged here; confirm the installed version.

# AdaBoost over decision trees that pick their split thresholds at random
adaboost = AdaBoostClassifier(
    base_estimator = DecisionTreeClassifier(splitter = 'random', random_state = RANDOM_SEED),
    random_state = RANDOM_SEED,
)

# ExtraTrees with max_features = None, running on all cores
extratrees = ExtraTreesClassifier(
    max_features = None,
    random_state = RANDOM_SEED,
    n_jobs = -1,
)

# Bagging over the same kind of random-split decision tree, running on all cores
bagging = BaggingClassifier(
    base_estimator = DecisionTreeClassifier(splitter = 'random', random_state = RANDOM_SEED),
    random_state = RANDOM_SEED,
    n_jobs = -1,
)

# Random Forest with library defaults apart from the seed and parallelism
randomforest = RandomForestClassifier(random_state = RANDOM_SEED, n_jobs = -1)

# Baselines

In [None]:
# Baseline: evaluate every model on the unmodified features, in the order
# AdaBoost, ExtraTrees, Bagging, Random Forest
for estimator, score_log in (
    (adaboost, adaboost_scores),
    (extratrees, extratrees_scores),
    (bagging, bagging_scores),
    (randomforest, random_scores),
):
    cv_score, oof_preds, test_score = train_original(estimator)

    # Row layout: label, CV accuracy, holdout accuracy, then per-class recall
    # of the out-of-fold predictions against the training labels
    score_log.append((
        'Baseline', cv_score, test_score,
        *recall_score(y_train, oof_preds, average = None)
    ))

# Categorical Feature Interactions

We test out the following interactions:

1. Climatic Zone and Wilderness Area
2. Geologic Zone and Wilderness Area
3. Surface Cover and Wilderness Area
4. Rock Size and Wilderness Area

In [None]:
def consolidate_soil_types(input_df, drop = True):
    """Collapse the 40 ``Soil_Type{i}`` indicator columns into one ``Soil_Type`` column.

    The new column is the sum of ``i * Soil_Type{i}`` over ``i = 1..40``, so a
    row with exactly one indicator set gets that indicator's number, and a row
    with none set gets 0. The input frame is not modified.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Frame containing the columns ``Soil_Type1`` .. ``Soil_Type40``.
    drop : bool, default True
        When true, the 40 indicator columns are left out of the result.

    Returns
    -------
    pandas.DataFrame
        Copy of the input with ``Soil_Type`` appended, with or without the
        original indicator columns.
    """
    soil_features = [f'Soil_Type{i}' for i in range(1, 41)]

    data = input_df.copy()
    data['Soil_Type'] = 0
    for soil_id, column in enumerate(soil_features, start = 1):
        data['Soil_Type'] += soil_id * data[column]

    if not drop:
        return data

    # Keep everything that is not one of the indicator columns, in original order
    kept = [name for name in data.columns if name not in soil_features]
    return data[kept]

## 1. Climatic Zone and Wilderness Area

In [None]:
def climatic_zone_original(input_df):
    """Add a ``Climatic_Zone`` column derived from each row's soil type.

    Each soil type number (1..40) is mapped to a four-digit code, and the
    leading digit of that code is stored as the climatic zone.
    NOTE(review): the codes are presumably the soil codes from the dataset
    description -- confirm against the data documentation.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Frame containing the columns ``Soil_Type1`` .. ``Soil_Type40``.

    Returns
    -------
    pandas.DataFrame
        Copy of the input with ``Climatic_Zone`` appended.

    Raises
    ------
    KeyError
        If a row's consolidated soil type is not in 1..40 (for example when no
        indicator is set).
    """
    # Soil type number -> four-digit code
    code = {
        1: 2702, 2: 2703, 3: 2704, 4: 2705, 5: 2706,
        6: 2717, 7: 3501, 8: 3502, 9: 4201, 10: 4703,
        11: 4704, 12: 4744, 13: 4758, 14: 5101, 15: 5151,
        16: 6101, 17: 6102, 18: 6731, 19: 7101, 20: 7102,
        21: 7103, 22: 7201, 23: 7202, 24: 7700, 25: 7701,
        26: 7702, 27: 7709, 28: 7710, 29: 7745, 30: 7746,
        31: 7755, 32: 7756, 33: 7757, 34: 7790, 35: 8703,
        36: 8707, 37: 8708, 38: 8771, 39: 8772, 40: 8776,
    }
    # Work out the first digit once per soil type instead of once per row
    zone_of = {soil_id: int(str(value)[0]) for soil_id, value in code.items()}

    soil_ids = consolidate_soil_types(input_df, drop = False)['Soil_Type']
    df = input_df.copy()
    df['Climatic_Zone'] = soil_ids.apply(lambda soil_id: zone_of[soil_id])
    return df

def wilderness_climatic(input_df, drop = False):
    """Add climatic zone x wilderness area interaction columns.

    For each ``k`` in 1..4, ``Climate_Area{k}`` is ``Wilderness_Area{k}``
    multiplied by the row's climatic zone. The ``Climatic_Zone`` column itself
    is not part of the result.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Frame containing the ``Wilderness_Area1..4`` and ``Soil_Type1..40`` columns.
    drop : bool, default False
        Accepted but not used by this function.

    Returns
    -------
    pandas.DataFrame
        Copy of the input with the four ``Climate_Area{k}`` columns appended.
    """
    zone = climatic_zone_original(input_df)['Climatic_Zone']
    df = input_df.copy()
    for area in range(1, 5):
        df[f'Climate_Area{area}'] = df[f'Wilderness_Area{area}'] * zone
    return df

In [None]:
# Climatic zone x wilderness area: evaluate every model, in the order
# AdaBoost, ExtraTrees, Bagging, Random Forest
for estimator, score_log in (
    (adaboost, adaboost_scores),
    (extratrees, extratrees_scores),
    (bagging, bagging_scores),
    (randomforest, random_scores),
):
    cv_score, oof_preds, test_score = train_original(estimator, wilderness_climatic)

    # Row layout: label, CV accuracy, holdout accuracy, then per-class recall
    # of the out-of-fold predictions against the training labels
    score_log.append((
        'Wild_Clim', cv_score, test_score,
        *recall_score(y_train, oof_preds, average = None)
    ))

## 2. Geologic Zone and Wilderness Area

In [None]:
def geologic_zone_original(input_df):
    """Add a ``Geologic_Zone`` column derived from each row's soil type.

    Each soil type number (1..40) is mapped to a four-digit code, and the
    second digit of that code is stored as the geologic zone.
    NOTE(review): the codes are presumably the soil codes from the dataset
    description -- confirm against the data documentation.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Frame containing the columns ``Soil_Type1`` .. ``Soil_Type40``.

    Returns
    -------
    pandas.DataFrame
        Copy of the input with ``Geologic_Zone`` appended.

    Raises
    ------
    KeyError
        If a row's consolidated soil type is not in 1..40 (for example when no
        indicator is set).
    """
    # Soil type number -> four-digit code
    code = {
        1: 2702, 2: 2703, 3: 2704, 4: 2705, 5: 2706,
        6: 2717, 7: 3501, 8: 3502, 9: 4201, 10: 4703,
        11: 4704, 12: 4744, 13: 4758, 14: 5101, 15: 5151,
        16: 6101, 17: 6102, 18: 6731, 19: 7101, 20: 7102,
        21: 7103, 22: 7201, 23: 7202, 24: 7700, 25: 7701,
        26: 7702, 27: 7709, 28: 7710, 29: 7745, 30: 7746,
        31: 7755, 32: 7756, 33: 7757, 34: 7790, 35: 8703,
        36: 8707, 37: 8708, 38: 8771, 39: 8772, 40: 8776,
    }
    # Work out the second digit once per soil type instead of once per row
    zone_of = {soil_id: int(str(value)[1]) for soil_id, value in code.items()}

    soil_ids = consolidate_soil_types(input_df, drop = False)['Soil_Type']
    df = input_df.copy()
    df['Geologic_Zone'] = soil_ids.apply(lambda soil_id: zone_of[soil_id])
    return df

def wilderness_geologic(input_df, drop = False):
    """Add geologic zone x wilderness area interaction columns.

    For each ``k`` in 1..4, ``Geologic_Area{k}`` is ``Wilderness_Area{k}``
    multiplied by the row's geologic zone. The ``Geologic_Zone`` column itself
    is not part of the result.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Frame containing the ``Wilderness_Area1..4`` and ``Soil_Type1..40`` columns.
    drop : bool, default False
        Accepted but not used by this function.

    Returns
    -------
    pandas.DataFrame
        Copy of the input with the four ``Geologic_Area{k}`` columns appended.
    """
    zone = geologic_zone_original(input_df)['Geologic_Zone']
    df = input_df.copy()
    for area in range(1, 5):
        df[f'Geologic_Area{area}'] = df[f'Wilderness_Area{area}'] * zone
    return df

In [None]:
# Geologic zone x wilderness area: evaluate every model, in the order
# AdaBoost, ExtraTrees, Bagging, Random Forest
for estimator, score_log in (
    (adaboost, adaboost_scores),
    (extratrees, extratrees_scores),
    (bagging, bagging_scores),
    (randomforest, random_scores),
):
    cv_score, oof_preds, test_score = train_original(estimator, wilderness_geologic)

    # Row layout: label, CV accuracy, holdout accuracy, then per-class recall
    # of the out-of-fold predictions against the training labels
    score_log.append((
        'Wild_Geo', cv_score, test_score,
        *recall_score(y_train, oof_preds, average = None)
    ))

## 3. Surface Cover and Wilderness Area

In [None]:
def surface_cover_original(input_df):
    """Add a ``Surface_Cover`` column that encodes each row's soil type as 0..4.

    Soil types are grouped by surface cover and the group's position is the
    encoded value: 0 no description, 1 stony, 2 very stony, 3 extremely stony,
    4 rubbly.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Frame containing the columns ``Soil_Type1`` .. ``Soil_Type40``.

    Returns
    -------
    pandas.DataFrame
        Copy of the input with ``Surface_Cover`` appended.

    Raises
    ------
    KeyError
        If a row's consolidated soil type is not in 1..40 (for example when no
        indicator is set).
    """
    # Soil type ids per group; the tuple index is the value written to the column
    groups = (
        [7,8,14,15,16,17,19,20,21,23,35],                        # 0: no description
        [6,12],                                                  # 1: stony
        [2,9,18,26],                                             # 2: very stony
        [1,22,24,25,27,28,29,30,31,32,33,34,36,37,38,39,40],     # 3: extremely stony
        [3,4,5,10,11,13],                                        # 4: rubbly
    )
    surface_cover = {
        soil_id: level
        for level, members in enumerate(groups)
        for soil_id in members
    }

    soil_ids = consolidate_soil_types(input_df, drop = False)['Soil_Type']
    df = input_df.copy()
    df['Surface_Cover'] = soil_ids.apply(lambda soil_id: surface_cover[soil_id])
    return df

def wilderness_surface(input_df, drop = False):
    """Add surface cover x wilderness area interaction columns.

    For each ``k`` in 1..4, ``Surface_Area{k}`` is ``Wilderness_Area{k}``
    multiplied by the row's surface cover value. The ``Surface_Cover`` column
    itself is not part of the result.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Frame containing the ``Wilderness_Area1..4`` and ``Soil_Type1..40`` columns.
    drop : bool, default False
        Accepted but not used by this function.

    Returns
    -------
    pandas.DataFrame
        Copy of the input with the four ``Surface_Area{k}`` columns appended.
    """
    cover = surface_cover_original(input_df)['Surface_Cover']
    df = input_df.copy()
    for area in range(1, 5):
        df[f'Surface_Area{area}'] = df[f'Wilderness_Area{area}'] * cover
    return df

In [None]:
# Surface cover x wilderness area: evaluate every model, in the order
# AdaBoost, ExtraTrees, Bagging, Random Forest
for estimator, score_log in (
    (adaboost, adaboost_scores),
    (extratrees, extratrees_scores),
    (bagging, bagging_scores),
    (randomforest, random_scores),
):
    cv_score, oof_preds, test_score = train_original(estimator, wilderness_surface)

    # Row layout: label, CV accuracy, holdout accuracy, then per-class recall
    # of the out-of-fold predictions against the training labels
    score_log.append((
        'Wild_Surf', cv_score, test_score,
        *recall_score(y_train, oof_preds, average = None)
    ))

## 4. Rock Size and Wilderness Area

In [None]:
def rock_size_original(input_df):
    """Add a ``Rock_Size`` column that encodes each row's soil type as 0..3.

    Soil types are grouped by rock size and the group's position is the
    encoded value: 0 no description, 1 stones, 2 boulders, 3 rubble.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Frame containing the columns ``Soil_Type1`` .. ``Soil_Type40``.

    Returns
    -------
    pandas.DataFrame
        Copy of the input with ``Rock_Size`` appended.

    Raises
    ------
    KeyError
        If a row's consolidated soil type is not in 1..40 (for example when no
        indicator is set).
    """
    # Soil type ids per group; the tuple index is the value written to the column
    groups = (
        [7,8,14,15,16,17,19,20,21,23,35],                                       # 0: no description
        [1,2,6,9,12,18,24,25,26,27,28,29,30,31,32,33,34,36,37,38,39,40],        # 1: stones
        [22],                                                                   # 2: boulders
        [3,4,5,10,11,13],                                                       # 3: rubble
    )
    rock_size = {
        soil_id: level
        for level, members in enumerate(groups)
        for soil_id in members
    }

    soil_ids = consolidate_soil_types(input_df, drop = False)['Soil_Type']
    df = input_df.copy()
    df['Rock_Size'] = soil_ids.apply(lambda soil_id: rock_size[soil_id])
    return df

def wilderness_rocksize(input_df, drop = False):
    """Add rock size x wilderness area interaction columns.

    For each ``k`` in 1..4, ``Rock_Area{k}`` is ``Wilderness_Area{k}``
    multiplied by the row's rock size value. The ``Rock_Size`` column itself
    is not part of the result.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Frame containing the ``Wilderness_Area1..4`` and ``Soil_Type1..40`` columns.
    drop : bool, default False
        Accepted but not used by this function.

    Returns
    -------
    pandas.DataFrame
        Copy of the input with the four ``Rock_Area{k}`` columns appended.
    """
    size = rock_size_original(input_df)['Rock_Size']
    df = input_df.copy()
    for area in range(1, 5):
        df[f'Rock_Area{area}'] = df[f'Wilderness_Area{area}'] * size
    return df

In [None]:
# Rock size x wilderness area: evaluate every model, in the order
# AdaBoost, ExtraTrees, Bagging, Random Forest
for estimator, score_log in (
    (adaboost, adaboost_scores),
    (extratrees, extratrees_scores),
    (bagging, bagging_scores),
    (randomforest, random_scores),
):
    cv_score, oof_preds, test_score = train_original(estimator, wilderness_rocksize)

    # Row layout: label, CV accuracy, holdout accuracy, then per-class recall
    # of the out-of-fold predictions against the training labels
    score_log.append((
        'Wild_Rock', cv_score, test_score,
        *recall_score(y_train, oof_preds, average = None)
    ))

# Summary

In [None]:
# AdaBoost: one row per feature set, ordered by ascending holdout accuracy
pd.DataFrame.from_records(
    adaboost_scores,
    columns = ['features', 'cv_score', 'holdout'] + [f'recall_{k}' for k in range(7)],
).sort_values(by = 'holdout')

In [None]:
# Extra Trees Classifier: one row per feature set, ordered by ascending holdout accuracy
pd.DataFrame.from_records(
    extratrees_scores,
    columns = ['features', 'cv_score', 'holdout'] + [f'recall_{k}' for k in range(7)],
).sort_values(by = 'holdout')

In [None]:
# Bagging Classifier: one row per feature set, ordered by ascending holdout accuracy
pd.DataFrame.from_records(
    bagging_scores,
    columns = ['features', 'cv_score', 'holdout'] + [f'recall_{k}' for k in range(7)],
).sort_values(by = 'holdout')

In [None]:
# Random Forest: one row per feature set, ordered by ascending holdout accuracy
pd.DataFrame.from_records(
    random_scores,
    columns = ['features', 'cv_score', 'holdout'] + [f'recall_{k}' for k in range(7)],
).sort_values(by = 'holdout')