# Notebook 3b - Breakdown Soil Type

In this notebook, we test out breaking down the soil-type features using domain knowledge and their descriptions

In [1]:
# Global variables for testing changes to this notebook quickly
# RANDOM_SEED is passed as random_state to the cross-validation splitter and to every model below
RANDOM_SEED = 0
# NUM_FOLDS is the number of stratified cross-validation splits used by train_original
NUM_FOLDS = 12

In [2]:
import numpy as np
import pandas as pd
import time
import pyarrow
import gc
from collections import defaultdict

# Model evaluation
from functools import partial
from sklearn.base import clone
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import accuracy_score, recall_score
from sklearn.inspection import partial_dependence, permutation_importance
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier, RandomForestClassifier

# Hide warnings
import warnings
warnings.filterwarnings('ignore')

# Prepare Data

In [3]:
%%time

# Load original data
original = pd.read_feather('../data/original.feather')

# Label Encode
# Cover_Type is overwritten in place with the encoder's integer codes;
# old_encoder is kept so the original labels can be recovered with inverse_transform
old_encoder = LabelEncoder()
original["Cover_Type"] = old_encoder.fit_transform(original["Cover_Type"])
# The first 15119 rows are treated as the training set and the remaining rows as the holdout set.
# train_original uses the same boundary, so y_train lines up row-for-row with its out-of-fold predictions.
# NOTE(review): the recall tables at the end of the notebook are consistent with one class having
# one row fewer than the others in the training slice - confirm 15119 is the intended boundary.
y_train = original['Cover_Type'].iloc[:15119]
y_test = original['Cover_Type'].iloc[15119:]

# Get feature columns
# Everything except the row identifier and the target is used as a model input
features = [x for x in original.columns if x not in ['Id','Cover_Type']]

# Data structures for summary scores
# baseline[model_name] holds the cv_score / oof_preds / test_score of the unmodified feature set;
# each *_scores list collects one tuple per experiment for the summary tables
baseline = defaultdict(dict)
bagging_scores = list()
extratrees_scores = list()
adaboost_scores = list()
random_scores = list()

CPU times: total: 484 ms
Wall time: 76 ms


# Scoring Function

In [4]:
def train_original(sklearn_model, processing = None):
    """
    Cross-validate a classifier on the training rows and score it on the holdout rows.

    The first 15119 rows of the global `original` frame are split into NUM_FOLDS
    stratified folds. A fresh clone of `sklearn_model` is fitted on each fold; its
    validation predictions are stored as out-of-fold predictions and its class
    probabilities on the holdout rows are summed across folds (soft voting).

    Parameters
    ----------
    sklearn_model : estimator
        Unfitted scikit-learn classifier exposing `fit`, `predict` and `predict_proba`.
    processing : callable, optional
        Feature-engineering function applied separately to the training and the
        holdout feature frames; it must accept and return a DataFrame.

    Returns
    -------
    tuple
        (mean validation accuracy over the folds,
         out-of-fold predictions for the training rows,
         accuracy of the soft-voted predictions on the holdout rows)
    """

    # Original Training/Test Split
    X_temp = original[features].iloc[:15119]
    X_test = original[features].iloc[15119:]
    y_temp = original['Cover_Type'].iloc[:15119]
    y_test = original['Cover_Type'].iloc[15119:]

    # Feature Engineering
    if processing:
        X_temp = processing(X_temp)
        X_test = processing(X_test)

    # Sorted class labels; predict_proba columns follow this order as long as
    # every fold contains every class (which stratification aims for)
    class_labels = np.unique(y_temp)

    # Store the out-of-fold predictions and the accumulated holdout probabilities
    test_preds = np.zeros((X_test.shape[0], len(class_labels)))
    oof_preds = np.zeros((X_temp.shape[0],))
    scores, times = np.zeros(NUM_FOLDS), np.zeros(NUM_FOLDS)

    # Stratified k-fold cross-validation
    skf = StratifiedKFold(n_splits = NUM_FOLDS, shuffle = True, random_state = RANDOM_SEED)
    for fold, (train_idx, valid_idx) in enumerate(skf.split(X_temp, y_temp)):

        # Training and Validation Sets
        X_train, X_valid = X_temp.iloc[train_idx], X_temp.iloc[valid_idx]
        y_train, y_valid = y_temp.iloc[train_idx], y_temp.iloc[valid_idx]

        # Create model (clone so the caller's estimator is never fitted)
        start = time.time()
        model = clone(sklearn_model)
        model.fit(X_train, y_train)

        # validation and test predictions
        valid_preds = np.ravel(model.predict(X_valid))
        oof_preds[valid_idx] = valid_preds
        test_preds += model.predict_proba(X_test)

        # Save scores and times (the timing covers fitting and both prediction steps)
        scores[fold] = accuracy_score(y_valid, valid_preds)
        end = time.time()
        times[fold] = end - start
        # Pause between folds; happens after `end`, so it is not part of the reported time
        time.sleep(0.5)

    # Map the winning probability column back to its class label instead of
    # assuming the column index and the label coincide
    test_preds = class_labels[np.argmax(test_preds, axis = 1)]
    test_score = accuracy_score(y_test, test_preds)
    print('\n'+model.__class__.__name__)
    # "Train Accuracy" is the mean accuracy on the validation folds
    print("Train Accuracy:", round(scores.mean(), 5))
    print('Test Accuracy:', round(test_score, 5))
    print(f'Training Time: {round(times.sum(), 2)}s')

    return scores.mean(), oof_preds, test_score

# Models

We use the following 4 models from the scikit-learn library:

1. AdaBoost 
2. ExtraTrees
3. Bagging
4. Random Forest

In [5]:
def _wrap_random_tree(ensemble_cls, **ensemble_kwargs):
    """
    Build `ensemble_cls` around a random-splitter decision tree.

    scikit-learn renamed the `base_estimator` argument to `estimator` in 1.2 and
    removed the old name in 1.4, so the new keyword is tried first and the old one
    is used only when the installed release rejects it.
    """
    tree = DecisionTreeClassifier(
        splitter = 'random',
        random_state = RANDOM_SEED,
    )
    try:
        return ensemble_cls(estimator = tree, **ensemble_kwargs)
    except TypeError:
        # Older scikit-learn: the constructor does not know `estimator`
        return ensemble_cls(base_estimator = tree, **ensemble_kwargs)

# AdaBoost Classifier
adaboost = _wrap_random_tree(
    AdaBoostClassifier,
    random_state = RANDOM_SEED,
)

# ExtraTrees Classifier
extratrees = ExtraTreesClassifier(
    n_jobs = -1,
    random_state = RANDOM_SEED,
    max_features = None,   # consider every feature at each split
)

# Bagging Classifier
bagging = _wrap_random_tree(
    BaggingClassifier,
    n_jobs = -1,
    random_state = RANDOM_SEED,
)

# Random Forest Classifier
randomforest = RandomForestClassifier(
    n_jobs = -1,
    random_state = RANDOM_SEED,
)

# Baseline

In [6]:
# Score every model on the untouched feature set, in the order
# AdaBoost, ExtraTrees, Bagging, Random Forest
baseline_runs = [
    ('Adaboost', adaboost, adaboost_scores),
    ('ExtraTrees', extratrees, extratrees_scores),
    ('Bagging', bagging, bagging_scores),
    ('RandomForest', randomforest, random_scores),
]

for model_key, estimator, score_rows in baseline_runs:
    cv_score, oof_preds, test_score = train_original(estimator)

    # Summary row: label, CV accuracy, holdout accuracy, then one recall per class
    # computed from the out-of-fold predictions
    per_class_recall = recall_score(y_train, oof_preds, average = None)
    score_rows.append(('Baseline', cv_score, test_score, *per_class_recall))

    # Keep the reference numbers that later cells compare against
    baseline[model_key].update({
        'cv_score': cv_score,
        'oof_preds': oof_preds,
        'test_score': test_score,
    })


AdaBoostClassifier
Train Accuracy: 0.80356
Test Accuracy: 0.75373
Training Time: 4.32s

ExtraTreesClassifier
Train Accuracy: 0.88491
Test Accuracy: 0.77808
Training Time: 42.34s

BaggingClassifier
Train Accuracy: 0.85581
Test Accuracy: 0.75372
Training Time: 27.42s

RandomForestClassifier
Train Accuracy: 0.86395
Test Accuracy: 0.74895
Training Time: 48.55s


# Categorical

We first test whether our models do better with or without one-hot encoding.

In [7]:
# Encode soil type
def categorical_encoding(input_df):
    """
    Collapse the 40 one-hot `Soil_Type1` ... `Soil_Type40` columns into one column.

    The new `Soil_Type` column is the sum of i * Soil_Type{i}, i.e. the number of
    the active indicator when exactly one indicator is set per row. The input frame
    is not modified; the returned frame contains every non-indicator column plus
    `Soil_Type`.
    """
    soil_features = [f'Soil_Type{number}' for number in range(1, 41)]

    data = input_df.copy()
    data['Soil_Type'] = 0
    for number, column in enumerate(soil_features, start = 1):
        # Each indicator contributes its own number when it is switched on
        data['Soil_Type'] = data['Soil_Type'] + number * data[column]

    # Drop the indicators, keep the remaining columns in their existing order
    kept = [name for name in data.columns if name not in soil_features]
    return data[kept]

In [8]:
# Re-score every model with the soil indicators collapsed into one column,
# in the order AdaBoost, ExtraTrees, Bagging, Random Forest
categorical_runs = [
    ('Adaboost', adaboost, adaboost_scores),
    ('ExtraTrees', extratrees, extratrees_scores),
    ('Bagging', bagging, bagging_scores),
    ('RandomForest', randomforest, random_scores),
]

for model_key, estimator, score_rows in categorical_runs:
    cv_score, oof_preds, test_score = train_original(estimator, categorical_encoding)

    # Summary row: label, CV accuracy, holdout accuracy, per-class recall
    per_class_recall = recall_score(y_train, oof_preds, average = None)
    score_rows.append(('Categorical', cv_score, test_score, *per_class_recall))

    # Change relative to the same model on the untouched features
    reference = baseline[model_key]
    print('Train (+/-):', round(cv_score - reference['cv_score'], 6))
    print('Test  (+/-):', round(test_score - reference['test_score'], 6))


AdaBoostClassifier
Train Accuracy: 0.78596
Test Accuracy: 0.76229
Training Time: 3.53s
Train (+/-): -0.017594
Test  (+/-): 0.008563

ExtraTreesClassifier
Train Accuracy: 0.88538
Test Accuracy: 0.78206
Training Time: 40.39s
Train (+/-): 0.000463
Test  (+/-): 0.003981

BaggingClassifier
Train Accuracy: 0.85006
Test Accuracy: 0.75982
Training Time: 22.89s
Train (+/-): -0.005755
Test  (+/-): 0.006097

RandomForestClassifier
Train Accuracy: 0.86454
Test Accuracy: 0.74822
Training Time: 50.25s
Train (+/-): 0.000595
Test  (+/-): -0.000733


# Soil Type Features

We test the following features, which are derived from the soil type descriptions in the original data:

1. Climatic Zone (Ordinal)
2. Geologic Zone (Nominal)
3. Surface Cover (Ordinal)
4. Rock Size (Ordinal)

## Climatic Zone (Ordinal)

We create a feature based on the climatic zone of the soil, which has a natural ordering:

1. lower montane dry
2. lower montane
3. montane dry
4. montane
5. montane dry and montane
6. montane and subalpine
7. subalpine
8. alpine

However, the ordering of the soil type labels roughly follows the ordering of their respective climatic zones, so there's a chance this feature won't be particularly informative.

In [9]:
def climatic_zone_original(input_df):
    """
    Return a copy of `input_df` with an added `Climatic_Zone` column.

    Each soil type number (1-40) has a four-digit soil type code; the climatic
    zone is the first digit of that code. Rows are matched to their soil type
    number through `categorical_encoding`.
    """
    # Four-digit codes listed in soil type order: position 1 -> Soil_Type1, ...
    soil_codes = (
        2702, 2703, 2704, 2705, 2706, 2717, 3501, 3502, 4201, 4703,
        4704, 4744, 4758, 5101, 5151, 6101, 6102, 6731, 7101, 7102,
        7103, 7201, 7202, 7700, 7701, 7702, 7709, 7710, 7745, 7746,
        7755, 7756, 7757, 7790, 8703, 8707, 8708, 8771, 8772, 8776,
    )
    # Leading digit of a four-digit code is its thousands place
    zone_by_soil = {
        soil: soil_code // 1000
        for soil, soil_code in enumerate(soil_codes, start = 1)
    }

    soil_labels = categorical_encoding(input_df)['Soil_Type']
    df = input_df.copy()
    df['Climatic_Zone'] = soil_labels.apply(lambda soil: zone_by_soil[soil])
    return df

In [10]:
# Re-score every model with the Climatic_Zone feature added,
# in the order AdaBoost, ExtraTrees, Bagging, Random Forest
climatic_runs = [
    ('Adaboost', adaboost, adaboost_scores),
    ('ExtraTrees', extratrees, extratrees_scores),
    ('Bagging', bagging, bagging_scores),
    ('RandomForest', randomforest, random_scores),
]

for model_key, estimator, score_rows in climatic_runs:
    cv_score, oof_preds, test_score = train_original(estimator, climatic_zone_original)

    # Summary row: label, CV accuracy, holdout accuracy, per-class recall
    per_class_recall = recall_score(y_train, oof_preds, average = None)
    score_rows.append(('Climatic_Zone', cv_score, test_score, *per_class_recall))

    # Change relative to the same model on the untouched features
    reference = baseline[model_key]
    print('Train (+/-):', round(cv_score - reference['cv_score'], 6))
    print('Test  (+/-):', round(test_score - reference['test_score'], 6))


AdaBoostClassifier
Train Accuracy: 0.80098
Test Accuracy: 0.75969
Training Time: 5.72s
Train (+/-): -0.00258
Test  (+/-): 0.005968

ExtraTreesClassifier
Train Accuracy: 0.88683
Test Accuracy: 0.78076
Training Time: 42.9s
Train (+/-): 0.001918
Test  (+/-): 0.002677

BaggingClassifier
Train Accuracy: 0.85469
Test Accuracy: 0.75823
Training Time: 35.65s
Train (+/-): -0.001124
Test  (+/-): 0.004506

RandomForestClassifier
Train Accuracy: 0.86408
Test Accuracy: 0.74883
Training Time: 41.28s
Train (+/-): 0.000132
Test  (+/-): -0.000124


## Geologic Zones (Nominal)

This is another feature which is based on the soil type codes, but is not ordered like climatic zone.

1. alluvium
2. glacial
3. shale
4. sandstone
5. mixed sedimentary
6. unspecified in the USFS ELU Survey
7. igneous and metamorphic
8. volcanic

In [11]:
def geologic_zone_original(input_df):
    """
    Return a copy of `input_df` with an added `Geologic_Zone` column.

    Each soil type number (1-40) has a four-digit soil type code; the geologic
    zone is the second digit of that code. Rows are matched to their soil type
    number through `categorical_encoding`.
    """
    # Four-digit codes listed in soil type order: position 1 -> Soil_Type1, ...
    soil_codes = (
        2702, 2703, 2704, 2705, 2706, 2717, 3501, 3502, 4201, 4703,
        4704, 4744, 4758, 5101, 5151, 6101, 6102, 6731, 7101, 7102,
        7103, 7201, 7202, 7700, 7701, 7702, 7709, 7710, 7745, 7746,
        7755, 7756, 7757, 7790, 8703, 8707, 8708, 8771, 8772, 8776,
    )
    # Second digit of a four-digit code is its hundreds place
    zone_by_soil = {
        soil: (soil_code // 100) % 10
        for soil, soil_code in enumerate(soil_codes, start = 1)
    }

    soil_labels = categorical_encoding(input_df)['Soil_Type']
    df = input_df.copy()
    df['Geologic_Zone'] = soil_labels.apply(lambda soil: zone_by_soil[soil])
    return df

In [12]:
# Re-score every model with the Geologic_Zone feature added,
# in the order AdaBoost, ExtraTrees, Bagging, Random Forest
geologic_runs = [
    ('Adaboost', adaboost, adaboost_scores),
    ('ExtraTrees', extratrees, extratrees_scores),
    ('Bagging', bagging, bagging_scores),
    ('RandomForest', randomforest, random_scores),
]

for model_key, estimator, score_rows in geologic_runs:
    cv_score, oof_preds, test_score = train_original(estimator, geologic_zone_original)

    # Summary row: label, CV accuracy, holdout accuracy, per-class recall
    per_class_recall = recall_score(y_train, oof_preds, average = None)
    score_rows.append(('Geologic_Zone', cv_score, test_score, *per_class_recall))

    # Change relative to the same model on the untouched features
    reference = baseline[model_key]
    print('Train (+/-):', round(cv_score - reference['cv_score'], 6))
    print('Test  (+/-):', round(test_score - reference['test_score'], 6))


AdaBoostClassifier
Train Accuracy: 0.79641
Test Accuracy: 0.75359
Training Time: 4.7s
Train (+/-): -0.007145
Test  (+/-): -0.00014

ExtraTreesClassifier
Train Accuracy: 0.88763
Test Accuracy: 0.77757
Training Time: 45.34s
Train (+/-): 0.002711
Test  (+/-): -0.000507

BaggingClassifier
Train Accuracy: 0.86024
Test Accuracy: 0.75713
Training Time: 34.01s
Train (+/-): 0.004432
Test  (+/-): 0.003403

RandomForestClassifier
Train Accuracy: 0.86441
Test Accuracy: 0.75013
Training Time: 43.19s
Train (+/-): 0.000463
Test  (+/-): 0.00118


## Surface Cover (Ordinal)

According to the [USDA reference](https://www.nrcs.usda.gov/wps/portal/nrcs/detail/soils/ref/?cid=nrcs142p2_054253#surface_fragments) on soil profiling:

1. **(Stony/Bouldery)** — Stones or boulders cover 0.01 to less than 0.1 percent of the surface. The smallest stones are at least 8 meters apart; the smallest boulders are at least 20 meters apart (fig. 3-9).

2. **(Very Stony/Very Bouldery)** — Stones or boulders cover 0.1 to less than 3 percent of the surface. The smallest stones are not less than 1 meter apart; the smallest boulders are not less than 3 meters apart (fig. 3-10).

3. **(Extremely Stony/Extremely Bouldery)** — Stones or boulders cover 3 to less than 15 percent of the surface. The smallest stones are as little as 0.5 meter apart; the smallest boulders are as little as 1 meter apart (fig. 3-11).

4. **(Rubbly)** — Stones or boulders cover 15 to less than 50 percent of the surface. The smallest stones are as little as 0.3 meter apart; the smallest boulders are as little as 0.5 meter apart. In most places it is possible to step from stone to stone or jump from boulder to boulder without touching the soil (fig. 3-12).

5. **(Very Rubbly)** — Stones or boulders appear to be nearly continuous and cover 50 percent or more of the surface. The smallest stones are less than 0.03 meter apart; the smallest boulders are less than 0.05 meter apart. Classifiable soil is among the rock fragments, and plant growth is possible (fig. 3-13).

In [13]:
def surface_cover_original(input_df):
    """
    Return a copy of `input_df` with an added ordinal `Surface_Cover` column.

    Soil type numbers are grouped by the surface-cover wording of their
    description and mapped to 0 (no description) through 4 (rubbly). Rows are
    matched to their soil type number through `categorical_encoding`.
    """
    # (ordinal level, soil type numbers at that level)
    cover_levels = (
        (0, [7,8,14,15,16,17,19,20,21,23,35]),                           # no description
        (1, [6,12]),                                                     # stony
        (2, [2,9,18,26]),                                                # very stony
        (3, [1,22,24,25,27,28,29,30,31,32,33,34,36,37,38,39,40]),        # extremely stony
        (4, [3,4,5,10,11,13]),                                           # rubbly
    )
    surface_cover = {
        soil: level
        for level, soils in cover_levels
        for soil in soils
    }

    # Create Feature
    soil_labels = categorical_encoding(input_df)['Soil_Type']
    df = input_df.copy()
    df['Surface_Cover'] = soil_labels.apply(lambda soil: surface_cover[soil])
    return df

In [14]:
# Re-score every model with the Surface_Cover feature added,
# in the order AdaBoost, ExtraTrees, Bagging, Random Forest
surface_runs = [
    ('Adaboost', adaboost, adaboost_scores),
    ('ExtraTrees', extratrees, extratrees_scores),
    ('Bagging', bagging, bagging_scores),
    ('RandomForest', randomforest, random_scores),
]

for model_key, estimator, score_rows in surface_runs:
    cv_score, oof_preds, test_score = train_original(estimator, surface_cover_original)

    # Summary row: label, CV accuracy, holdout accuracy, per-class recall
    per_class_recall = recall_score(y_train, oof_preds, average = None)
    score_rows.append(('Surface_Cover', cv_score, test_score, *per_class_recall))

    # Change relative to the same model on the untouched features
    reference = baseline[model_key]
    print('Train (+/-):', round(cv_score - reference['cv_score'], 6))
    print('Test  (+/-):', round(test_score - reference['test_score'], 6))


AdaBoostClassifier
Train Accuracy: 0.7945
Test Accuracy: 0.75744
Training Time: 5.21s
Train (+/-): -0.009062
Test  (+/-): 0.003716

ExtraTreesClassifier
Train Accuracy: 0.88538
Test Accuracy: 0.7782
Training Time: 45.44s
Train (+/-): 0.000462
Test  (+/-): 0.00012

BaggingClassifier
Train Accuracy: 0.85627
Test Accuracy: 0.75621
Training Time: 29.66s
Train (+/-): 0.000463
Test  (+/-): 0.002485

RandomForestClassifier
Train Accuracy: 0.86348
Test Accuracy: 0.74902
Training Time: 38.57s
Train (+/-): -0.000463
Test  (+/-): 7.2e-05


## Rock Size (Ordinal)

In [15]:
def rock_size_original(input_df):
    """
    Return a copy of `input_df` with an added ordinal `Rock_Size` column.

    Soil type numbers are grouped by the rock wording of their description and
    mapped to 0 (no description), 1 (stones), 2 (boulders) or 3 (rubble). Rows are
    matched to their soil type number through `categorical_encoding`.
    """
    # (ordinal level, soil type numbers at that level)
    size_levels = (
        (0, [7,8,14,15,16,17,19,20,21,23,35]),                                           # no description
        (1, [1,2,6,9,12,18,24,25,26,27,28,29,30,31,32,33,34,36,37,38,39,40]),            # stones
        (2, [22]),                                                                       # boulders
        (3, [3,4,5,10,11,13]),                                                           # rubble
    )
    rock_size = {
        soil: level
        for level, soils in size_levels
        for soil in soils
    }

    soil_labels = categorical_encoding(input_df)['Soil_Type']
    df = input_df.copy()
    df['Rock_Size'] = soil_labels.apply(lambda soil: rock_size[soil])
    return df

In [16]:
# Re-score every model with the Rock_Size feature added,
# in the order AdaBoost, ExtraTrees, Bagging, Random Forest
rock_runs = [
    ('Adaboost', adaboost, adaboost_scores),
    ('ExtraTrees', extratrees, extratrees_scores),
    ('Bagging', bagging, bagging_scores),
    ('RandomForest', randomforest, random_scores),
]

for model_key, estimator, score_rows in rock_runs:
    cv_score, oof_preds, test_score = train_original(estimator, rock_size_original)

    # Summary row: label, CV accuracy, holdout accuracy, per-class recall
    per_class_recall = recall_score(y_train, oof_preds, average = None)
    score_rows.append(('Rock_Size', cv_score, test_score, *per_class_recall))

    # Change relative to the same model on the untouched features
    reference = baseline[model_key]
    print('Train (+/-):', round(cv_score - reference['cv_score'], 6))
    print('Test  (+/-):', round(test_score - reference['test_score'], 6))


AdaBoostClassifier
Train Accuracy: 0.7943
Test Accuracy: 0.75704
Training Time: 4.89s
Train (+/-): -0.00926
Test  (+/-): 0.00331

ExtraTreesClassifier
Train Accuracy: 0.88703
Test Accuracy: 0.77801
Training Time: 42.13s
Train (+/-): 0.002116
Test  (+/-): -7.1e-05

BaggingClassifier
Train Accuracy: 0.85535
Test Accuracy: 0.7568
Training Time: 30.65s
Train (+/-): -0.000463
Test  (+/-): 0.00308

RandomForestClassifier
Train Accuracy: 0.86666
Test Accuracy: 0.74992
Training Time: 37.38s
Train (+/-): 0.002712
Test  (+/-): 0.00097


# Summary

All of these features seem promising, so we won't rule any out just yet.

In [17]:
# AdaBoost: one row per experiment, highest cross-validation score first
adaboost_summary = pd.DataFrame.from_records(
    adaboost_scores,
    columns = ['features','cv_score','holdout','recall_0', 'recall_1','recall_2','recall_3','recall_4','recall_5','recall_6'],
)
adaboost_summary.sort_values(by = 'cv_score', ascending = False)

Unnamed: 0,features,cv_score,holdout,recall_0,recall_1,recall_2,recall_3,recall_4,recall_5,recall_6
0,Baseline,0.803559,0.753727,0.681944,0.650463,0.745716,0.926389,0.893519,0.791204,0.935648
2,Climatic_Zone,0.800979,0.759695,0.665741,0.622222,0.755442,0.943056,0.9,0.78287,0.9375
3,Geologic_Zone,0.796414,0.753588,0.682407,0.630093,0.732747,0.925,0.893519,0.780556,0.930556
4,Surface_Cover,0.794497,0.757444,0.674537,0.6375,0.739231,0.928704,0.890278,0.768519,0.922685
5,Rock_Size,0.794299,0.757037,0.665278,0.622222,0.753126,0.943981,0.886574,0.764815,0.924074
1,Categorical,0.785965,0.762291,0.674537,0.6375,0.72302,0.918519,0.878704,0.74537,0.924074


In [18]:
# Extra Trees Classifier: one row per experiment, highest cross-validation score first
extratrees_summary = pd.DataFrame.from_records(
    extratrees_scores,
    columns = ['features','cv_score','holdout','recall_0', 'recall_1','recall_2','recall_3','recall_4','recall_5','recall_6'],
)
extratrees_summary.sort_values(by = 'cv_score', ascending = False)

Unnamed: 0,features,cv_score,holdout,recall_0,recall_1,recall_2,recall_3,recall_4,recall_5,recall_6
3,Geologic_Zone,0.887626,0.777571,0.781481,0.74537,0.864752,0.973148,0.9625,0.9125,0.973611
5,Rock_Size,0.88703,0.778008,0.782407,0.747685,0.866605,0.973148,0.960648,0.906481,0.972222
2,Climatic_Zone,0.886832,0.780755,0.785185,0.742593,0.868458,0.972222,0.961111,0.905093,0.973148
1,Categorical,0.885377,0.78206,0.7875,0.7375,0.86151,0.971759,0.961111,0.905556,0.972685
4,Surface_Cover,0.885377,0.778198,0.785648,0.741204,0.86012,0.971296,0.959259,0.906944,0.973148
0,Baseline,0.884914,0.778078,0.786574,0.734259,0.866142,0.971759,0.961111,0.903704,0.970833


In [19]:
# Bagging Classifier: one row per experiment, highest cross-validation score first
bagging_summary = pd.DataFrame.from_records(
    bagging_scores,
    columns = ['features','cv_score','holdout','recall_0', 'recall_1','recall_2','recall_3','recall_4','recall_5','recall_6'],
)
bagging_summary.sort_values(by = 'cv_score', ascending = False)

Unnamed: 0,features,cv_score,holdout,recall_0,recall_1,recall_2,recall_3,recall_4,recall_5,recall_6
3,Geologic_Zone,0.860243,0.757127,0.781481,0.685648,0.84113,0.962963,0.938426,0.851389,0.960648
4,Surface_Cover,0.856274,0.756208,0.774537,0.674537,0.84113,0.963889,0.939815,0.844444,0.955556
0,Baseline,0.855812,0.753724,0.769907,0.670833,0.837888,0.964352,0.934259,0.85,0.963426
5,Rock_Size,0.855348,0.756804,0.769907,0.673148,0.841593,0.9625,0.941204,0.843981,0.955093
2,Climatic_Zone,0.854687,0.75823,0.774074,0.659259,0.839741,0.964352,0.940741,0.849074,0.955556
1,Categorical,0.850057,0.75982,0.775,0.672222,0.819824,0.958333,0.939815,0.830556,0.95463


In [20]:
# Random Forest: one row per experiment, highest cross-validation score first
randomforest_summary = pd.DataFrame.from_records(
    random_scores,
    columns = ['features','cv_score','holdout','recall_0', 'recall_1','recall_2','recall_3','recall_4','recall_5','recall_6'],
)
randomforest_summary.sort_values(by = 'cv_score', ascending = False)

Unnamed: 0,features,cv_score,holdout,recall_0,recall_1,recall_2,recall_3,recall_4,recall_5,recall_6
5,Rock_Size,0.866658,0.749919,0.768981,0.710185,0.821214,0.971759,0.949074,0.877315,0.968056
1,Categorical,0.864542,0.748216,0.765741,0.69537,0.824456,0.970833,0.955556,0.871759,0.968056
3,Geologic_Zone,0.864409,0.750129,0.772222,0.708333,0.817508,0.969907,0.949074,0.87037,0.963426
2,Climatic_Zone,0.864079,0.748825,0.768519,0.701852,0.823993,0.971759,0.951852,0.867593,0.962963
0,Baseline,0.863947,0.748949,0.767593,0.701852,0.819824,0.971759,0.950926,0.869444,0.966204
4,Surface_Cover,0.863484,0.749021,0.763889,0.696759,0.823066,0.972222,0.950926,0.87037,0.96713
