# Notebook 4 - Soil Type Encoding 

In this notebook, we test out encoding the soil type variables in various ways. Our motivation is that the soil type was originally a one-hot encoded categorical variable, and gradient boosting machines generally do better with ordinally-encoded categorical variables. 

One issue is that our synthetic data no longer preserves the one-hot property (a row may have no soil type set, or several at once), so we have to be clever to reduce the total number of soil type variables. We will try to use properties of the original data to help with our encoding, but the synthetic data may not have the same properties.

In [1]:
# Notebook-wide settings; shrink these to iterate on changes to this notebook quickly
RANDOM_SEED = 0        # seed reused for the data split, the CV folds and the models
NUM_FOLDS = 10         # number of stratified cross-validation folds
TRAIN_SIZE = 500_000   # rows kept for training; the remainder becomes the holdout set

In [2]:
import numpy as np
import pandas as pd
import time
import os
import pyarrow
import gc

# Hide warnings
import warnings
warnings.filterwarnings('ignore')
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 

# display options
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# Model/Evaluation
from functools import partial
from sklearn.base import clone
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder, RobustScaler
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import accuracy_score, recall_score
from sklearn.inspection import permutation_importance

# Gradient Boosting
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# Load and Prepare Data

In [3]:
# Load full training data
train = pd.read_feather('../data/train.feather')

# Drop the rows labelled Cover_Type 5.
# NOTE(review): presumably this class is too rare to stratify on -- confirm.
# An explicit copy is taken so that the label column can be overwritten below
# without assigning into a slice of the frame that was just read.
train = train[train.Cover_Type != 5].copy()

# Label Encoding: replace the remaining Cover_Type values with consecutive integer codes
new_encoder = LabelEncoder()
train["Cover_Type"] = new_encoder.fit_transform(train["Cover_Type"])

# Split synthetic data: TRAIN_SIZE rows are used for cross-validation,
# everything else is kept aside as the holdout set
train, test = train_test_split(
    train, 
    train_size = TRAIN_SIZE, 
    random_state = RANDOM_SEED,
    stratify = train['Cover_Type'],
)
y_train = train['Cover_Type']


# features, data structure for summary scores
features = [x for x in train.columns if x not in ['Id','Cover_Type']]
nonsoil = [x for x in features if not x.startswith('Soil_Type')]
new_rows = []   # one (label, cv score, holdout score, mean fold time) tuple per experiment
gc.collect()

print(f'Training Size: {train.shape[0]} rows, {train.shape[1]} cols')
print(f'Holdout Size: {test.shape[0]} rows, {test.shape[1]} cols\n')

Training Size: 500000 rows, 56 cols
Holdout Size: 3499999 rows, 56 cols



# Models

1. XGBoost
2. LightGBM
3. CatBoost

## 1. XGBoost

In [4]:
# XGBoost Classifier, wrapped in a one-step pipeline so every model in this
# notebook is cloned and fitted through the same interface
xgb_pipeline = make_pipeline(
    XGBClassifier(
        random_state=RANDOM_SEED,
        tree_method='hist',
        booster='gbtree',
        eval_metric='mlogloss',
    ),
)

## 2. LightGBM

In [5]:
# LightGBM Classifier, wrapped in a one-step pipeline so every model in this
# notebook is cloned and fitted through the same interface
lgbm_pipeline = make_pipeline(
    LGBMClassifier(
        random_state=RANDOM_SEED,
        n_jobs=4,
        metric='multi_logloss',
        # NOTE(review): the LightGBM parameter reference lists `unbalanced_sets`
        # as an alias of `is_unbalance`, described as used only by the binary and
        # multiclassova objectives -- confirm it has any effect for this
        # multi-class target before relying on it.
        unbalanced_sets=True,
    ),
)

## 3. CatBoost

In [6]:
# CatBoost Classifier, wrapped in a one-step pipeline so every model in this
# notebook is cloned and fitted through the same interface
catboost_pipeline = make_pipeline(
    CatBoostClassifier(
        random_state=RANDOM_SEED,
        verbose=False,
        boosting_type='Plain',
        eval_metric='MultiClass',
    ),
)

# Scoring Function

In [7]:
def score_features(sklearn_model, processing = None):
    """Cross-validate a model on the training split and score it on the holdout.

    Reads the notebook-level ``train`` and ``test`` frames as well as
    ``NUM_FOLDS`` and ``RANDOM_SEED``.

    Parameters
    ----------
    sklearn_model : estimator
        Unfitted scikit-learn compatible classifier (or pipeline). A fresh
        clone is fitted for every fold.
    processing : callable, optional
        Feature-engineering function taking a feature DataFrame and returning
        a new one. It is applied separately to the training features and to
        the holdout features.

    Returns
    -------
    tuple
        ``(mean CV accuracy, out-of-fold predictions, holdout accuracy,
        permutation importances, per-fold times)``. The importances are a
        Series indexed by feature name, averaged over folds and sorted
        ascending. Each per-fold time covers fitting, permutation importance
        and the validation/holdout predictions.
    """
    # Original Training/Test Split
    features = [x for x in train.columns if x not in ['Id','Cover_Type']]
    X_temp, X_test = train[features], test[features]
    y_temp, y_test = train['Cover_Type'], test['Cover_Type']

    # Feature Engineering
    start = time.time()
    if processing:
        X_temp = processing(X_temp)
        gc.collect()
        X_test = processing(X_test)
        gc.collect()
    end = time.time()
    print(f'Data Preprocessing: {round(end-start,2)}s.')

    # Sorted class labels; scikit-learn style classifiers order the columns of
    # predict_proba the same way, so the number of classes no longer needs to
    # be hard-coded.
    classes = np.unique(y_temp)

    # Store the out-of-fold predictions
    test_preds = np.zeros((X_test.shape[0], len(classes)))
    oof_preds = np.zeros((X_temp.shape[0],))
    fi_scores = np.zeros((X_temp.shape[1],))
    scores, times = np.zeros(NUM_FOLDS), np.zeros(NUM_FOLDS)

    # Stratified k-fold cross-validation
    skf = StratifiedKFold(n_splits = NUM_FOLDS, shuffle = True, random_state = RANDOM_SEED)
    for fold, (train_idx, valid_idx) in enumerate(skf.split(X_temp, y_temp)):

        # Training and Validation Sets
        X_fit, X_valid = X_temp.iloc[train_idx], X_temp.iloc[valid_idx]
        y_fit, y_valid = y_temp.iloc[train_idx], y_temp.iloc[valid_idx]

        # Create model
        start = time.time()
        model = clone(sklearn_model)
        model.fit(X_fit, y_fit)

        # Permutation Importance (averaged over the folds)
        result = permutation_importance(
            model, X_valid, y_valid, 
            random_state=RANDOM_SEED
        )
        fi_scores += result.importances_mean / NUM_FOLDS

        # validation/holdout predictions; holdout probabilities are summed
        # across folds and turned into labels after the loop
        valid_preds = np.ravel(model.predict(X_valid))
        oof_preds[valid_idx] = valid_preds
        test_preds += model.predict_proba(X_test)

        # Save scores and times
        scores[fold] = accuracy_score(y_valid, valid_preds)
        end = time.time()
        times[fold] = end-start
        print(f'Fold {fold} Accuracy:  {round(scores[fold], 5)} in {round(end-start,2)}s.')
        time.sleep(0.5)

    # Holdout label = class with the largest summed probability
    test_preds = classes[np.argmax(test_preds, axis = 1)]
    test_score = accuracy_score(y_test, test_preds)
    print("Train Accuracy:", round(scores.mean(), 5))
    print('Test Accuracy:', round(test_score, 5))
    print(f'Training Time: {round(times.sum(), 2)}s\n')

    fi_scores = pd.Series(
        data = fi_scores, 
        index = list(X_temp.columns)
    ).sort_values()

    return scores.mean(), oof_preds, test_score, fi_scores, times

# Baselines

## XGBoost

In [8]:
# Helper function
def start_at_eps(series, eps=1e-10): 
    """Shift ``series`` so that its smallest value becomes ``eps``."""
    lowest = series.min()
    shifted = series - lowest
    return shifted + eps

def xgboost_features(data):
    """Return a copy of ``data`` with the engineered features used for XGBoost.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature frame containing the Elevation, Aspect, distance,
        ``Soil_Type*`` and ``Wilderness_Area*`` columns.

    Returns
    -------
    pandas.DataFrame
        New frame (the input is not modified) with the extra columns appended
        and numeric columns downcast to the smallest fitting dtype.
    """
    df = data.copy()

    # Use float64 for calculations
    # (`.items()` replaces `.iteritems()`, which pandas 2.0 removed)
    for col, dtype in df.dtypes.items():
        if dtype.name.startswith('float'):
            df[col] = df[col].astype('float64')

    soil_features = [x for x in df.columns if x.startswith("Soil_Type")]
    wilderness_features = [x for x in df.columns if x.startswith("Wilderness_Area")]

    h_hydro = df['Horizontal_Distance_To_Hydrology']
    v_hydro = df['Vertical_Distance_To_Hydrology']
    fire = df['Horizontal_Distance_To_Fire_Points']
    road = df['Horizontal_Distance_To_Roadways']

    # XGBoost Features (column order is kept as in the original notebook)
    df['Aspect_360'] = df['Aspect'] % 360
    df["Hydro_Taxicab"] = h_hydro.abs() + v_hydro.abs()
    df["Hydro_Euclid"] = (h_hydro**2 + v_hydro**2)**0.5
    df['Water Elevation'] = df['Elevation'] - v_hydro
    # Vectorised row sums instead of a Python-level apply(sum, axis=1)
    df["Soil_Count"] = df[soil_features].sum(axis=1)
    df["Wilderness_Count"] = df[wilderness_features].sum(axis=1)
    df['Hydro_Fire_Sum'] = h_hydro + fire
    df['Hydro_Fire_AbsDiff'] = (h_hydro - fire).abs()
    df['Hydro_Fire_Diff'] = h_hydro - fire
    df['Hydro_Road_1'] = (h_hydro + road).abs()
    df['Hydro_Road_2'] = (h_hydro - road).abs()
    df['Fire_Road_1'] = (fire + road).abs()
    df['Fire_Road_2'] = (fire - road).abs()
    df['Elev_HHydro_Diff'] = df['Elevation'] - h_hydro * 0.2

    # Save Memory
    for col, dtype in df.dtypes.items():
        if dtype.name.startswith('int'):
            df[col] = pd.to_numeric(df[col], downcast ='integer')
        elif dtype.name.startswith('float'):
            df[col] = pd.to_numeric(df[col], downcast ='float')

    return df

In [9]:
# Baseline run: XGBoost on its own engineered features, soil columns untouched
cv_score, oof_preds, test_score, fi_scores, times = score_features(
    sklearn_model = xgb_pipeline,
    processing = xgboost_features,
)

# Record (label, CV accuracy, holdout accuracy, mean fold time) for the summary
new_rows.append(('XGBoost', cv_score, test_score, times.mean()))

# Permutation importances, least important first
fi_scores

Data Preprocessing: 53.63s.
Fold 0 Accuracy:  0.96144 in 74.3s.
Fold 1 Accuracy:  0.9617 in 75.96s.
Fold 2 Accuracy:  0.9596 in 71.37s.
Fold 3 Accuracy:  0.96156 in 65.03s.
Fold 4 Accuracy:  0.96004 in 65.47s.
Fold 5 Accuracy:  0.96248 in 65.04s.
Fold 6 Accuracy:  0.95838 in 66.87s.
Fold 7 Accuracy:  0.96112 in 65.97s.
Fold 8 Accuracy:  0.9622 in 65.43s.
Fold 9 Accuracy:  0.96006 in 65.63s.
Train Accuracy: 0.96086
Test Accuracy: 0.96121
Training Time: 681.06s



Soil_Type15                           0.000000
Soil_Type7                            0.000000
Soil_Type19                           0.000001
Soil_Type26                           0.000001
Soil_Type18                           0.000002
Soil_Type14                           0.000002
Soil_Type8                            0.000002
Soil_Type27                           0.000004
Soil_Type28                           0.000004
Soil_Type25                           0.000005
Soil_Type20                           0.000006
Soil_Type17                           0.000009
Soil_Type16                           0.000011
Soil_Type21                           0.000017
Slope                                 0.000018
Aspect                                0.000021
Soil_Type29                           0.000032
Soil_Type34                           0.000050
Soil_Type30                           0.000064
Aspect_360                            0.000064
Soil_Type13                           0.000084
Hillshade_3pm

## LightGBM

In [10]:
def lightgbm_features(data):
    """Return a copy of ``data`` with the engineered features used for LightGBM.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature frame containing the Elevation, distance, ``Soil_Type*`` and
        ``Wilderness_Area*`` columns.

    Returns
    -------
    pandas.DataFrame
        New frame (the input is not modified) with the extra columns appended
        and numeric columns downcast to the smallest fitting dtype.
    """
    df = data.copy()

    # Use float64 for calculations
    # (`.items()` replaces `.iteritems()`, which pandas 2.0 removed)
    for col, dtype in df.dtypes.items():
        if dtype.name.startswith('float'):
            df[col] = df[col].astype('float64')

    soil_features = [x for x in df.columns if x.startswith("Soil_Type")]
    wilderness_features = [x for x in df.columns if x.startswith("Wilderness_Area")]

    h_hydro = df['Horizontal_Distance_To_Hydrology']
    v_hydro = df['Vertical_Distance_To_Hydrology']
    fire = df['Horizontal_Distance_To_Fire_Points']

    # Distances shifted to be strictly positive. NOTE: start_at_eps subtracts
    # the minimum of whatever frame is passed in, so these values depend on
    # the data this function is called on.
    pos_h_hydrology = start_at_eps(h_hydro)
    pos_v_hydrology = start_at_eps(v_hydro)

    # LightGBM Features (column order is kept as in the original notebook)
    df["Hydro_Taxicab"] = h_hydro.abs() + v_hydro.abs()
    # Despite its name this column holds the Euclidean norm of the shifted
    # distances; the name is kept so existing results stay comparable.
    df['Hydro_Taxicab_Pos'] = np.sqrt(pos_h_hydrology ** 2 + pos_v_hydrology ** 2)
    df["Hydro_Euclid"] = (h_hydro**2 + v_hydro**2)**0.5
    df['Water Elevation'] = df['Elevation'] - v_hydro
    # Vectorised row sums instead of a Python-level apply(sum, axis=1)
    df["Soil_Count"] = df[soil_features].sum(axis=1)
    df["Wilderness_Count"] = df[wilderness_features].sum(axis=1)
    df['Hydro_Fire_Sum'] = h_hydro + fire
    df['Hydro_Fire_AbsDiff'] = (h_hydro - fire).abs()
    df['Hydro_Fire_EpsSum'] = pos_h_hydrology + start_at_eps(fire)
    df['Hydro_Fire_Diff'] = h_hydro - fire
    df['Elev_HHydro_Diff'] = df['Elevation'] - h_hydro * 0.2

    # Save Memory
    for col, dtype in df.dtypes.items():
        if dtype.name.startswith('int'):
            df[col] = pd.to_numeric(df[col], downcast ='integer')
        elif dtype.name.startswith('float'):
            df[col] = pd.to_numeric(df[col], downcast ='float')

    return df

In [11]:
# Baseline run: LightGBM on its own engineered features, soil columns untouched
cv_score, oof_preds, test_score, fi_scores, times = score_features(
    sklearn_model = lgbm_pipeline,
    processing = lightgbm_features,
)

# Record (label, CV accuracy, holdout accuracy, mean fold time) for the summary
new_rows.append(('LightGBM', cv_score, test_score, times.mean()))

# Permutation importances, least important first
fi_scores

Data Preprocessing: 51.61s.
Fold 0 Accuracy:  0.94866 in 213.39s.
Fold 1 Accuracy:  0.94674 in 216.2s.
Fold 2 Accuracy:  0.94312 in 190.8s.
Fold 3 Accuracy:  0.95108 in 203.07s.
Fold 4 Accuracy:  0.94662 in 202.4s.
Fold 5 Accuracy:  0.9554 in 200.53s.
Fold 6 Accuracy:  0.94808 in 212.52s.
Fold 7 Accuracy:  0.94912 in 212.72s.
Fold 8 Accuracy:  0.9529 in 216.97s.
Fold 9 Accuracy:  0.94644 in 204.08s.
Train Accuracy: 0.94882
Test Accuracy: 0.95798
Training Time: 2072.67s



Soil_Type21                          -2.400000e-06
Soil_Type15                           0.000000e+00
Soil_Type7                            0.000000e+00
Soil_Type25                           8.000000e-07
Soil_Type18                           8.000000e-07
Soil_Type8                            2.000000e-06
Soil_Type28                           2.400000e-06
Soil_Type19                           2.800000e-06
Soil_Type27                           7.200000e-06
Soil_Type14                           7.200000e-06
Soil_Type17                           1.280000e-05
Hillshade_9am                         1.760000e-05
Soil_Type26                           1.760000e-05
Soil_Type30                           2.000000e-05
Soil_Type29                           2.120000e-05
Soil_Type16                           2.320000e-05
Soil_Type9                            2.400000e-05
Soil_Type34                           2.480000e-05
Soil_Type20                           2.560000e-05
Soil_Type24                    

## CatBoost

In [12]:
def catboost_features(data):
    """Return a copy of ``data`` with the engineered features used for CatBoost.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature frame containing the Elevation, distance and ``Soil_Type*``
        columns.

    Returns
    -------
    pandas.DataFrame
        New frame (the input is not modified) with the extra columns appended
        and numeric columns downcast to the smallest fitting dtype.
    """
    df = data.copy()

    # Use float64 for calculations
    # (`.items()` replaces `.iteritems()`, which pandas 2.0 removed)
    for col, dtype in df.dtypes.items():
        if dtype.name.startswith('float'):
            df[col] = df[col].astype('float64')

    soil_features = [x for x in df.columns if x.startswith("Soil_Type")]

    h_hydro = df['Horizontal_Distance_To_Hydrology']
    v_hydro = df['Vertical_Distance_To_Hydrology']
    fire = df['Horizontal_Distance_To_Fire_Points']
    road = df['Horizontal_Distance_To_Roadways']

    # Distances shifted to be strictly positive. NOTE: start_at_eps subtracts
    # the minimum of whatever frame is passed in, so these values depend on
    # the data this function is called on.
    pos_h_hydrology = start_at_eps(h_hydro)
    pos_v_hydrology = start_at_eps(v_hydro)

    # CatBoost Features (column order is kept as in the original notebook)
    df["Hydro_Taxicab"] = h_hydro.abs() + v_hydro.abs()
    df["Hydro_Euclid"] = (h_hydro**2 + v_hydro**2)**0.5
    df['Hydro_Euclid_Pos'] = np.sqrt(pos_h_hydrology ** 2 + pos_v_hydrology ** 2)
    df['Water Elevation'] = df['Elevation'] - v_hydro
    # Vectorised row sum instead of a Python-level apply(sum, axis=1)
    df["Soil_Count"] = df[soil_features].sum(axis=1)
    df['Hydro_Fire_Sum'] = h_hydro + fire
    df['Hydro_Fire_AbsDiff'] = (h_hydro - fire).abs()
    df['Hydro_Fire_EpsSum'] = pos_h_hydrology + start_at_eps(fire)
    df['Hydro_Fire_Diff'] = h_hydro - fire
    df['Hydro_Road_1'] = (h_hydro + road).abs()
    df['Hydro_Road_2'] = (h_hydro - road).abs()
    df['Fire_Road_1'] = (fire + road).abs()
    df['Fire_Road_2'] = (fire - road).abs()
    # Same formula as 'Water Elevation'; kept so the feature set is unchanged
    df['Elev_VHydro_Diff'] = df['Elevation'] - v_hydro
    df['Elev_HHydro_Diff'] = df['Elevation'] - h_hydro * 0.2

    # Save Memory
    for col, dtype in df.dtypes.items():
        if dtype.name.startswith('int'):
            df[col] = pd.to_numeric(df[col], downcast ='integer')
        elif dtype.name.startswith('float'):
            df[col] = pd.to_numeric(df[col], downcast ='float')

    return df

In [13]:
# Baseline run: CatBoost on its own engineered features, soil columns untouched
cv_score, oof_preds, test_score, fi_scores, times = score_features(
    sklearn_model = catboost_pipeline,
    processing = catboost_features,
)

# Record (label, CV accuracy, holdout accuracy, mean fold time) for the summary
new_rows.append(('CatBoost', cv_score, test_score, times.mean()))

# Permutation importances, least important first
fi_scores

Data Preprocessing: 29.97s.
Fold 0 Accuracy:  0.96014 in 180.64s.
Fold 1 Accuracy:  0.961 in 181.3s.
Fold 2 Accuracy:  0.95968 in 180.34s.
Fold 3 Accuracy:  0.9616 in 180.66s.
Fold 4 Accuracy:  0.9598 in 181.24s.
Fold 5 Accuracy:  0.96192 in 181.66s.
Fold 6 Accuracy:  0.95898 in 180.67s.
Fold 7 Accuracy:  0.96066 in 180.82s.
Fold 8 Accuracy:  0.9621 in 179.88s.
Fold 9 Accuracy:  0.95938 in 179.3s.
Train Accuracy: 0.96053
Test Accuracy: 0.96078
Training Time: 1806.51s



Soil_Type28                          -0.000011
Soil_Type25                          -0.000007
Soil_Type21                          -0.000007
Slope                                -0.000006
Soil_Type26                          -0.000004
Soil_Type15                           0.000000
Soil_Type7                            0.000000
Soil_Type17                           0.000002
Soil_Type19                           0.000004
Soil_Type34                           0.000004
Soil_Type16                           0.000004
Soil_Type18                           0.000006
Soil_Type8                            0.000009
Soil_Type30                           0.000013
Soil_Type27                           0.000018
Hydro_Euclid                          0.000022
Soil_Type20                           0.000037
Soil_Type14                           0.000040
Soil_Type29                           0.000042
Wilderness_Area2                      0.000054
Hillshade_9am                         0.000054
Soil_Type31  

# Soil Feature Encoding

We reduce the ~40 or so soil columns to the following:

1. `Soil_Type` - For observations with exactly one soil type, the label (1-40) of that soil type; 0 if no soil type or more than one is set.
2. `Soil_Count` - Number of non-zero soil types.
3. `Avg_Climatic` - Average climatic zone (1 - lower montane dry ... 8 - alpine)
4. `Avg_SurfaceCover` - Average surface cover (1 - stony ... 4 - rubbly)
5. `Avg_RockSize` - Average rock size (1 - stones, 2 - boulders, 3 - rubble)

In [19]:
def soil_features(data, other_features, drop = False, verbose = False):
    """Add condensed soil-type features on top of another feature function.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature frame containing the ``Soil_Type1`` .. ``Soil_Type40`` columns.
    other_features : callable
        Feature-engineering function applied to ``data`` first; the soil
        features are added to the frame it returns. If that frame has no
        ``Soil_Count`` column, it is computed here as the row sum of the soil
        columns.
    drop : bool, default False
        If True, drop all forty ``Soil_Type{i}`` columns afterwards; otherwise
        only ``Soil_Type7`` and ``Soil_Type15`` are dropped.
    verbose : bool, default False
        Print the time taken by each stage.

    Returns
    -------
    pandas.DataFrame
        Frame with ``Soil_Type``, ``Avg_Climatic``, ``Avg_SurfaceCover`` and
        ``Avg_RockSize`` added. The three averages are NaN for rows whose
        ``Soil_Count`` is 0 (0 divided by 0).
    """
    start = time.time()
    df = other_features(data)
    end =  time.time()
    if verbose: print(f'Previous Features: {round(end-start, 2)}')
    gc.collect()

    soil_ids = range(1, 41)

    # The averages below divide by Soil_Count; derive it if the wrapped
    # feature function did not provide it.
    if 'Soil_Count' not in df.columns:
        df['Soil_Count'] = df[[f'Soil_Type{i}' for i in soil_ids]].sum(axis=1)

    def weighted_total(column, weights):
        # Start from a fresh integer column assigned from the literal 0 (not a
        # downcast soil column) and add weight * flag for every soil type, so
        # the running total is not held in a narrow integer dtype.
        df[column] = 0
        for i in soil_ids:
            df[column] += weights[i] * df[f'Soil_Type{i}']

    # Four-digit code per soil type; only the first digit is used below,
    # as the climatic zone
    code = {
        1:2702,2:2703,3:2704,4:2705,5:2706,6:2717,7:3501,8:3502,9:4201,
        10:4703,11:4704,12:4744,13:4758,14:5101,15:5151,16:6101,17:6102,
        18:6731,19:7101,20:7102,21:7103,22:7201,23:7202,24:7700,25:7701,
        26:7702,27:7709,28:7710,29:7745,30:7746,31:7755,32:7756,33:7757,
        34:7790,35:8703,36:8707,37:8708,38:8771,39:8772,40:8776
    }

    # Soil Type: label of the single active soil type, 0 otherwise
    start = time.time()
    weighted_total('Soil_Type', {i: i for i in soil_ids})
    # .loc instead of chained indexing, which can silently write to a copy
    df.loc[df['Soil_Count'] != 1, 'Soil_Type'] = 0
    end =  time.time()
    if verbose: print(f'Soil Type: {round(end-start, 2)}')

    # Avg Climatic Zone
    start = time.time()
    weighted_total('Avg_Climatic', {i: int(str(code[i])[0]) for i in soil_ids})
    df['Avg_Climatic'] = df['Avg_Climatic'] / df['Soil_Count']
    end =  time.time()
    if verbose: print(f'Climatic Zone: {round(end-start, 2)}')

    # Avg Surface Cover (0 = no description ... 4 = rubbly)
    start = time.time()
    no_desc = [7,8,14,15,16,17,19,20,21,23,35]
    stony = [6,12]
    very_stony = [2,9,18,26]
    extremely_stony = [1,22,24,25,27,28,29,30,31,32,33,34,36,37,38,39,40]
    rubbly = [3,4,5,10,11,13]
    surface_cover = {}
    for level, members in enumerate([no_desc, stony, very_stony, extremely_stony, rubbly]):
        surface_cover.update({i: level for i in members})

    weighted_total('Avg_SurfaceCover', surface_cover)
    df['Avg_SurfaceCover'] = df['Avg_SurfaceCover'] / df['Soil_Count']
    end =  time.time()
    if verbose: print(f'Surface Cover: {round(end-start, 2)}')

    # Avg Rock Size (0 = no description, 1 = stones, 2 = boulders, 3 = rubble)
    start = time.time()
    stones = [1,2,6,9,12,18,24,25,26,27,28,29,30,31,32,33,34,36,37,38,39,40]
    boulders = [22]
    rubble = [3,4,5,10,11,13]
    rock_size = {}
    for level, members in enumerate([no_desc, stones, boulders, rubble]):
        rock_size.update({i: level for i in members})

    weighted_total('Avg_RockSize', rock_size)
    df['Avg_RockSize'] = df['Avg_RockSize'] / df['Soil_Count']
    end =  time.time()
    if verbose: print(f'Rock Size: {round(end-start, 2)}')

    # Drop irrelevant columns. NaN averages (rows without any soil type) are
    # left as they are; no filling happens here.
    if drop:
        # drop all soil features
        df = df.drop(columns=[f"Soil_Type{i}" for i in soil_ids])
    else:
        # drop irrel soil features
        df = df.drop(columns=["Soil_Type7", "Soil_Type15"])

    # Save Memory
    # (`.items()` replaces `.iteritems()`, which pandas 2.0 removed)
    for col, dtype in df.dtypes.items():
        if dtype.name.startswith('int'):
            df[col] = pd.to_numeric(df[col], downcast ='integer')
        elif dtype.name.startswith('float'):
            df[col] = pd.to_numeric(df[col], downcast ='float')

    return df

# Soil Features (keep original)

## 1. XGBoost

In [15]:
# XGBoost with the condensed soil features added next to the original soil columns
cv_score, oof_preds, test_score, fi_scores, times = score_features(
    sklearn_model = xgb_pipeline,
    processing = partial(soil_features, other_features = xgboost_features),
)

# Record (label, CV accuracy, holdout accuracy, mean fold time) for the summary
new_rows.append(('XGBoost_Keep', cv_score, test_score, times.mean()))

# Permutation importances, least important first
fi_scores

Data Preprocessing: 77.95s.
Fold 0 Accuracy:  0.9613 in 70.86s.
Fold 1 Accuracy:  0.9616 in 70.01s.
Fold 2 Accuracy:  0.96014 in 69.79s.
Fold 3 Accuracy:  0.961 in 69.64s.
Fold 4 Accuracy:  0.96142 in 70.07s.
Fold 5 Accuracy:  0.962 in 72.17s.
Fold 6 Accuracy:  0.96042 in 69.25s.
Fold 7 Accuracy:  0.96156 in 68.48s.
Fold 8 Accuracy:  0.96286 in 69.24s.
Fold 9 Accuracy:  0.96044 in 69.85s.
Train Accuracy: 0.96127
Test Accuracy: 0.96178
Training Time: 699.36s



Soil_Type13                          -1.440000e-05
Soil_Type36                          -1.160000e-05
Aspect                               -1.040000e-05
Soil_Type30                          -9.200000e-06
Soil_Type17                          -7.600000e-06
Soil_Type20                          -7.200000e-06
Soil_Type3                           -4.800000e-06
Soil_Type28                          -3.600000e-06
Soil_Type12                          -2.800000e-06
Soil_Type37                          -2.000000e-06
Soil_Type16                          -1.600000e-06
Soil_Type8                           -8.000000e-07
Soil_Type32                           1.998659e-17
Soil_Type14                           1.600000e-06
Soil_Type25                           2.000000e-06
Soil_Type27                           2.800000e-06
Soil_Type9                            3.200000e-06
Soil_Type21                           4.800000e-06
Soil_Type5                            6.400000e-06
Soil_Type34                    

## 2. LightGBM

In [16]:
# LightGBM with the condensed soil features added next to the original soil columns
cv_score, oof_preds, test_score, fi_scores, times = score_features(
    sklearn_model = lgbm_pipeline,
    processing = partial(soil_features, other_features = lightgbm_features),
)

# Record (label, CV accuracy, holdout accuracy, mean fold time) for the summary
new_rows.append(('LightGBM_Keep', cv_score, test_score, times.mean()))

# Permutation importances, least important first
fi_scores

Data Preprocessing: 77.69s.
Fold 0 Accuracy:  0.95048 in 224.46s.
Fold 1 Accuracy:  0.9341 in 191.28s.
Fold 2 Accuracy:  0.93328 in 199.64s.
Fold 3 Accuracy:  0.94872 in 196.4s.
Fold 4 Accuracy:  0.9501 in 229.38s.
Fold 5 Accuracy:  0.95106 in 208.05s.
Fold 6 Accuracy:  0.94528 in 227.65s.
Fold 7 Accuracy:  0.95328 in 221.99s.
Fold 8 Accuracy:  0.95278 in 205.32s.
Fold 9 Accuracy:  0.94826 in 221.66s.
Train Accuracy: 0.94673
Test Accuracy: 0.95861
Training Time: 2125.84s



Soil_Type37                          -1.200000e-05
Soil_Type14                          -5.200000e-06
Soil_Type33                          -2.000000e-06
Soil_Type20                          -8.000000e-07
Soil_Type34                           2.220444e-18
Soil_Type28                           4.441047e-18
Soil_Type25                           8.000000e-07
Hillshade_3pm                         8.000000e-07
Soil_Type27                           1.600000e-06
Soil_Type21                           2.400000e-06
Soil_Type13                           3.200000e-06
Soil_Type24                           4.800000e-06
Soil_Type8                            4.800000e-06
Soil_Type36                           5.600000e-06
Soil_Type29                           6.400000e-06
Soil_Type18                           6.400000e-06
Soil_Type26                           1.080000e-05
Slope                                 1.200000e-05
Soil_Type30                           1.320000e-05
Soil_Type17                    

## 3. CatBoost

In [17]:
# CatBoost with the condensed soil features added next to the original soil columns
cv_score, oof_preds, test_score, fi_scores, times = score_features(
    sklearn_model = catboost_pipeline,
    processing = partial(soil_features, other_features = catboost_features),
)

# Record (label, CV accuracy, holdout accuracy, mean fold time) for the summary
new_rows.append(('CatBoost_Keep', cv_score, test_score, times.mean()))

# Permutation importances, least important first
fi_scores

Data Preprocessing: 58.06s.
Fold 0 Accuracy:  0.96106 in 198.33s.
Fold 1 Accuracy:  0.96096 in 194.81s.
Fold 2 Accuracy:  0.96022 in 193.41s.
Fold 3 Accuracy:  0.96142 in 194.52s.
Fold 4 Accuracy:  0.96072 in 193.58s.
Fold 5 Accuracy:  0.96312 in 195.69s.
Fold 6 Accuracy:  0.9599 in 194.5s.
Fold 7 Accuracy:  0.96124 in 192.98s.
Fold 8 Accuracy:  0.96258 in 193.68s.
Fold 9 Accuracy:  0.95956 in 194.5s.
Train Accuracy: 0.96108
Test Accuracy: 0.96131
Training Time: 1945.99s



Hillshade_3pm                        -3.760000e-05
Soil_Type21                          -1.640000e-05
Hydro_Euclid                         -1.200000e-05
Soil_Type31                          -1.200000e-05
Soil_Type16                          -9.200000e-06
Soil_Type14                          -7.200000e-06
Soil_Type36                          -4.800000e-06
Soil_Type27                          -4.400000e-06
Soil_Type28                          -2.400000e-06
Soil_Type20                          -2.400000e-06
Soil_Type17                          -2.000000e-06
Soil_Type12                          -1.200000e-06
Soil_Type6                           -8.000000e-07
Soil_Type13                          -2.221344e-18
Soil_Type8                            1.600000e-06
Soil_Type5                            2.400000e-06
Soil_Type25                           2.400000e-06
Soil_Type1                            4.000000e-06
Soil_Type37                           4.800000e-06
Soil_Type34                    

# Soil Features (drop original)

Same as above but drop the soil columns.

## 1. XGBoost

In [20]:
# XGBoost with the condensed soil features replacing the original soil columns
cv_score, oof_preds, test_score, fi_scores, times = score_features(
    sklearn_model = xgb_pipeline,
    processing = partial(soil_features, other_features = xgboost_features, drop = True),
)

# Record (label, CV accuracy, holdout accuracy, mean fold time) for the summary
new_rows.append(('XGBoost_Drop', cv_score, test_score, times.mean()))

# Permutation importances, least important first
fi_scores

Data Preprocessing: 78.25s.
Fold 0 Accuracy:  0.9614 in 38.36s.
Fold 1 Accuracy:  0.96048 in 38.4s.
Fold 2 Accuracy:  0.95956 in 39.72s.
Fold 3 Accuracy:  0.96026 in 40.01s.
Fold 4 Accuracy:  0.95976 in 38.9s.
Fold 5 Accuracy:  0.9621 in 38.71s.
Fold 6 Accuracy:  0.959 in 38.93s.
Fold 7 Accuracy:  0.96084 in 38.98s.
Fold 8 Accuracy:  0.96268 in 39.32s.
Fold 9 Accuracy:  0.9597 in 39.15s.
Train Accuracy: 0.96058
Test Accuracy: 0.9611
Training Time: 390.48s



Hydro_Euclid                          0.000006
Hillshade_3pm                         0.000008
Aspect_360                            0.000036
Slope                                 0.000052
Hillshade_9am                         0.000065
Aspect                                0.000125
Wilderness_Area2                      0.000126
Horizontal_Distance_To_Hydrology      0.000172
Vertical_Distance_To_Hydrology        0.000252
Hillshade_Noon                        0.000534
Wilderness_Count                      0.000664
Hydro_Fire_AbsDiff                    0.001290
Hydro_Road_1                          0.001406
Avg_SurfaceCover                      0.001599
Hydro_Taxicab                         0.001708
Horizontal_Distance_To_Fire_Points    0.001814
Hydro_Fire_Diff                       0.002197
Wilderness_Area4                      0.002596
Hydro_Road_2                          0.002819
Soil_Type                             0.002957
Hydro_Fire_Sum                        0.003812
Fire_Road_2  

## 2. LightGBM

In [21]:
# LightGBM with the condensed soil features replacing the original soil columns
cv_score, oof_preds, test_score, fi_scores, times = score_features(
    sklearn_model = lgbm_pipeline,
    processing = partial(soil_features, other_features = lightgbm_features, drop = True),
)

# Record (label, CV accuracy, holdout accuracy, mean fold time) for the summary
new_rows.append(('LightGBM_Drop', cv_score, test_score, times.mean()))

# Permutation importances, least important first
fi_scores

Data Preprocessing: 77.81s.
Fold 0 Accuracy:  0.94418 in 112.36s.
Fold 1 Accuracy:  0.92616 in 101.26s.
Fold 2 Accuracy:  0.94774 in 115.07s.
Fold 3 Accuracy:  0.95238 in 115.75s.
Fold 4 Accuracy:  0.94888 in 113.79s.
Fold 5 Accuracy:  0.95016 in 114.05s.
Fold 6 Accuracy:  0.95144 in 121.86s.
Fold 7 Accuracy:  0.94504 in 107.4s.
Fold 8 Accuracy:  0.95042 in 119.35s.
Fold 9 Accuracy:  0.9463 in 114.01s.
Train Accuracy: 0.94627
Test Accuracy: 0.95818
Training Time: 1134.9s



Hillshade_3pm                        -4.320000e-05
Aspect                               -8.000000e-07
Hillshade_9am                         1.280000e-05
Slope                                 1.016000e-04
Wilderness_Area2                      1.672000e-04
Hillshade_Noon                        3.668000e-04
Wilderness_Count                      6.076000e-04
Hydro_Euclid                          2.036000e-03
Soil_Type                             2.468400e-03
Avg_RockSize                          2.553200e-03
Vertical_Distance_To_Hydrology        2.669200e-03
Wilderness_Area4                      4.061200e-03
Hydro_Fire_EpsSum                     4.924800e-03
Avg_SurfaceCover                      5.408800e-03
Hydro_Taxicab                         5.784000e-03
Hydro_Taxicab_Pos                     5.885600e-03
Soil_Count                            6.258000e-03
Horizontal_Distance_To_Hydrology      6.672800e-03
Hydro_Fire_Diff                       7.404000e-03
Hydro_Fire_AbsDiff             

## 3. CatBoost

In [22]:
# CatBoost with the condensed soil features replacing the original soil columns
cv_score, oof_preds, test_score, fi_scores, times = score_features(
    sklearn_model = catboost_pipeline,
    processing = partial(soil_features, other_features = catboost_features, drop = True),
)

# Record (label, CV accuracy, holdout accuracy, mean fold time) for the summary
new_rows.append(('CatBoost_Drop', cv_score, test_score, times.mean()))

# Permutation importances, least important first
fi_scores

Data Preprocessing: 58.5s.
Fold 0 Accuracy:  0.96024 in 171.59s.
Fold 1 Accuracy:  0.96068 in 171.77s.
Fold 2 Accuracy:  0.96 in 171.55s.
Fold 3 Accuracy:  0.96016 in 172.35s.
Fold 4 Accuracy:  0.96026 in 172.13s.
Fold 5 Accuracy:  0.96178 in 173.02s.
Fold 6 Accuracy:  0.95962 in 175.53s.
Fold 7 Accuracy:  0.96036 in 171.57s.
Fold 8 Accuracy:  0.96268 in 173.64s.
Fold 9 Accuracy:  0.9589 in 171.44s.
Train Accuracy: 0.96047
Test Accuracy: 0.96051
Training Time: 1724.59s



Hillshade_3pm                         0.000015
Hydro_Euclid                          0.000038
Hillshade_9am                         0.000048
Aspect                                0.000052
Slope                                 0.000120
Wilderness_Area2                      0.000140
Horizontal_Distance_To_Hydrology      0.000302
Vertical_Distance_To_Hydrology        0.000326
Hillshade_Noon                        0.000608
Hydro_Fire_EpsSum                     0.000723
Hydro_Euclid_Pos                      0.000734
Hydro_Taxicab                         0.000748
Avg_SurfaceCover                      0.000828
Hydro_Road_1                          0.000949
Wilderness_Area4                      0.001124
Hydro_Fire_AbsDiff                    0.001294
Hydro_Fire_Sum                        0.001667
Hydro_Fire_Diff                       0.001832
Avg_RockSize                          0.002091
Soil_Type                             0.002179
Horizontal_Distance_To_Fire_Points    0.002396
Hydro_Road_2 

# Summary

In [23]:
# One row per experiment, ordered from worst to best holdout accuracy
pd.DataFrame.from_records(
    new_rows,
    columns = ['features', 'cv_scores', 'holdout', 'time'],
).sort_values(by = 'holdout')

Unnamed: 0,features,cv_scores,holdout,time
1,LightGBM,0.948816,0.957976,207.267333
7,LightGBM_Drop,0.94627,0.958176,113.489884
4,LightGBM_Keep,0.946734,0.958605,212.583639
8,CatBoost_Drop,0.960468,0.960509,172.459318
2,CatBoost,0.960526,0.960775,180.651405
6,XGBoost_Drop,0.960578,0.9611,39.047622
0,XGBoost,0.960858,0.961211,68.106018
5,CatBoost_Keep,0.961078,0.961312,194.599431
3,XGBoost_Keep,0.961274,0.961777,69.936417
