# Notebook 3b - Breakdown Soil Type

In this notebook, we test whether the soil-type feature can be broken down into more informative features, using domain knowledge and the soil-type descriptions.

In [1]:
# Notebook-wide settings: change them here to re-run every experiment quickly
NUM_FOLDS = 12    # number of splits used by the stratified k-fold cross-validation
RANDOM_SEED = 0   # seed passed to the cross-validation splitter and to every model

In [2]:
import numpy as np
import pandas as pd
import time
import pyarrow
import gc

# Model evaluation
from functools import partial
from sklearn.base import clone
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import accuracy_score, recall_score
from sklearn.inspection import partial_dependence, permutation_importance
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier, RandomForestClassifier

# Plotting
import matplotlib
import seaborn as sns
from matplotlib import pyplot as plt

# Hide warnings
import warnings
warnings.filterwarnings('ignore')

# Prepare Data

In [3]:
# Encode soil type
def categorical_encoding(input_df):
    """Collapse the 40 one-hot ``Soil_Type{i}`` columns into one ``Soil_Type`` column.

    Works on a copy, so ``input_df`` is left untouched. The new column is the
    sum over ``i`` of ``i * Soil_Type{i}``: a row whose only set indicator is
    ``Soil_Type{k}`` gets the value ``k`` and a row with no indicator set gets 0.
    The one-hot columns are left out of the returned frame; the other columns
    keep the order they have in the copy.
    """
    encoded = input_df.copy()
    one_hot_cols = [f'Soil_Type{k}' for k in range(1, 41)]

    # Start from an integer zero column and accumulate k * indicator per soil type
    encoded['Soil_Type'] = 0
    for k, col in enumerate(one_hot_cols, start=1):
        encoded['Soil_Type'] += k * encoded[col]

    kept_cols = [col for col in encoded.columns if col not in one_hot_cols]
    return encoded[kept_cols]

In [4]:
%%time

# Read the original dataset and collapse its one-hot soil columns into `Soil_Type`
original = categorical_encoding(pd.read_feather('../data/original.feather'))

# Replace the raw Cover_Type labels with integer codes
old_encoder = LabelEncoder()
original["Cover_Type"] = old_encoder.fit_transform(original["Cover_Type"])

# Positional split of the labels: the first 15119 rows are used for training,
# the remaining rows form the holdout set.
# NOTE(review): the per-class recalls reported later in this notebook are consistent
# with 2160 training rows for six classes but 2159 for class 2 - confirm whether the
# boundary should be 15120 (train_original hard-codes the same number).
y_train = original['Cover_Type'].iloc[:15119]
y_test = original['Cover_Type'].iloc[15119:]

# Every column except the identifier and the target is a model input
features = [col for col in original.columns if col not in ('Id', 'Cover_Type')]

# One list of result records per model, filled in by the experiment cells below
bagging_scores, extratrees_scores, adaboost_scores, random_scores = [], [], [], []

Wall time: 273 ms


# Scoring Function

In [5]:
def train_original(sklearn_model, processing = None, train_rows = 15119):
    """Cross-validate a classifier on the training rows and score it on the holdout rows.

    Reads the notebook-level ``original`` frame, ``features`` list, ``NUM_FOLDS``
    and ``RANDOM_SEED``. A fresh clone of ``sklearn_model`` is fitted on every
    fold; its class probabilities for the holdout rows are summed across folds
    and the holdout prediction is the class with the largest total.

    Parameters
    ----------
    sklearn_model : estimator
        Unfitted scikit-learn classifier providing ``predict`` and ``predict_proba``.
    processing : callable, optional
        Feature-engineering function applied separately to the training and
        holdout feature frames; its return value replaces the frame.
    train_rows : int, default 15119
        Number of leading rows of ``original`` used for cross-validation; the
        remaining rows form the holdout set.

    Returns
    -------
    tuple
        ``(mean fold accuracy, out-of-fold predictions, holdout accuracy)``.
        The mean fold accuracy is the value printed as "Train Accuracy".
    """

    # Positional train/holdout split, so it depends on the row order of `original`
    X_temp = original[features].iloc[:train_rows]
    X_test = original[features].iloc[train_rows:]
    y_temp = original['Cover_Type'].iloc[:train_rows]
    y_test = original['Cover_Type'].iloc[train_rows:]

    # Feature Engineering
    if processing:
        X_temp = processing(X_temp)
        X_test = processing(X_test)

    # Sorted class labels taken from the data instead of a hard-coded count of 7.
    # Assumes every fold sees every class, so the predict_proba columns line up
    # with these labels (scikit-learn keeps `classes_` sorted).
    class_labels = np.unique(y_temp)

    # Accumulators: summed holdout probabilities, out-of-fold predictions, per-fold stats
    test_preds = np.zeros((X_test.shape[0], len(class_labels)))
    oof_preds = np.zeros((X_temp.shape[0],))
    scores, times = np.zeros(NUM_FOLDS), np.zeros(NUM_FOLDS)

    # Stratified k-fold cross-validation
    skf = StratifiedKFold(n_splits = NUM_FOLDS, shuffle = True, random_state = RANDOM_SEED)
    for fold, (train_idx, valid_idx) in enumerate(skf.split(X_temp, y_temp)):

        # Training and Validation Sets
        X_train, X_valid = X_temp.iloc[train_idx], X_temp.iloc[valid_idx]
        y_train, y_valid = y_temp.iloc[train_idx], y_temp.iloc[valid_idx]

        # Fit an untouched copy of the model so folds never share fitted state
        start = time.time()
        model = clone(sklearn_model)
        model.fit(X_train, y_train)

        # Validation and holdout predictions
        valid_preds = np.ravel(model.predict(X_valid))
        oof_preds[valid_idx] = valid_preds
        test_preds += model.predict_proba(X_test)

        # Save scores and times (the timed span covers fitting and both predictions)
        scores[fold] = accuracy_score(y_valid, valid_preds)
        end = time.time()
        times[fold] = end - start
        # Pause between folds; it is outside the timed span.
        # NOTE(review): purpose of the pause is not evident from this notebook.
        time.sleep(0.5)

    # Map the winning probability column back to its class label
    test_preds = class_labels[np.argmax(test_preds, axis = 1)]
    test_score = accuracy_score(y_test, test_preds)
    print('\n'+model.__class__.__name__)
    print("Train Accuracy:", round(scores.mean(), 5))
    print('Test Accuracy:', round(test_score, 5))
    print(f'Training Time: {round(times.sum(), 2)}s')

    return scores.mean(), oof_preds, test_score

# Models

We use the following 4 models from the scikit-learn library:

1. AdaBoost 
2. ExtraTrees
3. Bagging
4. Random Forest

In [6]:
# NOTE(review): newer scikit-learn releases rename `base_estimator` to `estimator`;
# confirm the keyword against the installed version before re-running.

# AdaBoost over decision trees that use the 'random' splitter
adaboost = AdaBoostClassifier(
    random_state = RANDOM_SEED,
    base_estimator = DecisionTreeClassifier(
        random_state = RANDOM_SEED,
        splitter = 'random',
    ),
)

# ExtraTrees with max_features = None, running on all cores
extratrees = ExtraTreesClassifier(
    max_features = None,
    random_state = RANDOM_SEED,
    n_jobs = -1,
)

# Bagging over the same kind of random-splitter decision tree, running on all cores
bagging = BaggingClassifier(
    random_state = RANDOM_SEED,
    n_jobs = -1,
    base_estimator = DecisionTreeClassifier(
        random_state = RANDOM_SEED,
        splitter = 'random',
    ),
)

# Random Forest with library defaults apart from the seed and parallelism
randomforest = RandomForestClassifier(
    random_state = RANDOM_SEED,
    n_jobs = -1,
)

# Baselines

In [7]:
# Evaluate every model on the unmodified features and record one result row each.
# Run order (and therefore print order): AdaBoost, ExtraTrees, Bagging, Random Forest.
for estimator, score_log in (
    (adaboost, adaboost_scores),
    (extratrees, extratrees_scores),
    (bagging, bagging_scores),
    (randomforest, random_scores),
):
    cv_score, oof_preds, test_score = train_original(estimator)

    # Row layout: label, CV accuracy, holdout accuracy, then one recall per class
    score_log.append((
        'Baseline', cv_score, test_score,
        *recall_score(y_train, oof_preds, average = None)
    ))


AdaBoostClassifier
Train Accuracy: 0.78596
Test Accuracy: 0.76229
Training Time: 3.79s

ExtraTreesClassifier
Train Accuracy: 0.88538
Test Accuracy: 0.78206
Training Time: 36.41s

BaggingClassifier
Train Accuracy: 0.85006
Test Accuracy: 0.75982
Training Time: 20.23s

RandomForestClassifier
Train Accuracy: 0.86454
Test Accuracy: 0.74822
Training Time: 37.07s


# Soil Type Features

We test the following features, which are based on the soil type descriptions from the original data:

1. Climatic Zone (Ordinal)
2. Geologic Zone (Nominal)
3. Surface Cover (Ordinal)
4. Rock Size (Ordinal)

## Climatic Zone (Ordinal)

We create a feature based on the climatic zone of the soil, which has a natural ordering:

1. lower montane dry
2. lower montane
3. montane dry
4. montane
5. montane dry and montane
6. montane and subalpine
7. subalpine
8. alpine

However, the ordering of the soil type labels roughly follows the ordering of their respective climatic zones, so there's a chance this feature won't be particularly informative.

In [8]:
def climatic_zone_original(input_df):
    """Return a copy of ``input_df`` with an added integer ``Climatic_Zone`` column.

    Every ``Soil_Type`` label (1-40) is looked up in a table of four-digit soil
    codes, and the first digit of that code becomes the feature value. A label
    missing from the table raises ``KeyError``.
    """
    # Four-digit soil codes listed in label order: position 1 is soil type 1, and so on
    codes_in_label_order = (
        2702, 2703, 2704, 2705, 2706, 2717, 3501, 3502, 4201, 4703,
        4704, 4744, 4758, 5101, 5151, 6101, 6102, 6731, 7101, 7102,
        7103, 7201, 7202, 7700, 7701, 7702, 7709, 7710, 7745, 7746,
        7755, 7756, 7757, 7790, 8703, 8707, 8708, 8771, 8772, 8776,
    )

    # Soil type label -> first digit of its code
    first_digit = {
        label: int(str(soil_code)[0])
        for label, soil_code in enumerate(codes_in_label_order, start=1)
    }

    result = input_df.copy()
    result['Climatic_Zone'] = input_df['Soil_Type'].apply(lambda label: first_digit[label])
    return result

In [9]:
# Evaluate every model with the Climatic_Zone feature added and record one result row each.
# Run order (and therefore print order): AdaBoost, Extra Trees, Bagging, Random Forest.
for estimator, score_log in (
    (adaboost, adaboost_scores),
    (extratrees, extratrees_scores),
    (bagging, bagging_scores),
    (randomforest, random_scores),
):
    cv_score, oof_preds, test_score = train_original(estimator, climatic_zone_original)

    # Row layout: label, CV accuracy, holdout accuracy, then one recall per class
    score_log.append((
        'Climatic_Zone', cv_score, test_score,
        *recall_score(y_train, oof_preds, average = None)
    ))


AdaBoostClassifier
Train Accuracy: 0.79536
Test Accuracy: 0.75743
Training Time: 3.5s

ExtraTreesClassifier
Train Accuracy: 0.88511
Test Accuracy: 0.78361
Training Time: 38.33s

BaggingClassifier
Train Accuracy: 0.85396
Test Accuracy: 0.75876
Training Time: 19.07s

RandomForestClassifier
Train Accuracy: 0.8656
Test Accuracy: 0.75154
Training Time: 38.48s


## Geologic Zones (Nominal)

This is another feature which is based on the soil type codes, but is not ordered like climatic zone.

1. alluvium
2. glacial
3. shale
4. sandstone
5. mixed sedimentary
6. unspecified in the USFS ELU Survey
7. igneous and metamorphic
8. volcanic

In [10]:
def geologic_zone_original(input_df):
    """Return a copy of ``input_df`` with an added integer ``Geologic_Zone`` column.

    Every ``Soil_Type`` label (1-40) is looked up in a table of four-digit soil
    codes, and the second digit of that code becomes the feature value. A label
    missing from the table raises ``KeyError``.
    """
    # Soil type label -> four-digit soil code
    soil_codes = {
        1: 2702, 2: 2703, 3: 2704, 4: 2705, 5: 2706,
        6: 2717, 7: 3501, 8: 3502, 9: 4201, 10: 4703,
        11: 4704, 12: 4744, 13: 4758, 14: 5101, 15: 5151,
        16: 6101, 17: 6102, 18: 6731, 19: 7101, 20: 7102,
        21: 7103, 22: 7201, 23: 7202, 24: 7700, 25: 7701,
        26: 7702, 27: 7709, 28: 7710, 29: 7745, 30: 7746,
        31: 7755, 32: 7756, 33: 7757, 34: 7790, 35: 8703,
        36: 8707, 37: 8708, 38: 8771, 39: 8772, 40: 8776,
    }

    def second_digit(label):
        # Position 1 of the code string holds the digit used for this feature
        code_text = str(soil_codes[label])
        return int(code_text[1])

    result = input_df.copy()
    result['Geologic_Zone'] = input_df['Soil_Type'].apply(second_digit)
    return result

In [11]:
# Evaluate every model with the Geologic_Zone feature added and record one result row each.
# Run order (and therefore print order): AdaBoost, Extra Trees, Bagging, Random Forest.
for estimator, score_log in (
    (adaboost, adaboost_scores),
    (extratrees, extratrees_scores),
    (bagging, bagging_scores),
    (randomforest, random_scores),
):
    cv_score, oof_preds, test_score = train_original(estimator, geologic_zone_original)

    # Row layout: label, CV accuracy, holdout accuracy, then one recall per class
    score_log.append((
        'Geologic_Zone', cv_score, test_score,
        *recall_score(y_train, oof_preds, average = None)
    ))


AdaBoostClassifier
Train Accuracy: 0.79489
Test Accuracy: 0.75732
Training Time: 3.81s

ExtraTreesClassifier
Train Accuracy: 0.88551
Test Accuracy: 0.78032
Training Time: 38.91s

BaggingClassifier
Train Accuracy: 0.84946
Test Accuracy: 0.75634
Training Time: 20.04s

RandomForestClassifier
Train Accuracy: 0.86461
Test Accuracy: 0.75179
Training Time: 37.78s


## Surface Cover (Ordinal)

According to the [USDA reference](https://www.nrcs.usda.gov/wps/portal/nrcs/detail/soils/ref/?cid=nrcs142p2_054253#surface_fragments) on soil profiling:

1. **(Stony/Bouldery)** — Stones or boulders cover 0.01 to less than 0.1 percent of the surface. The smallest stones are at least 8 meters apart; the smallest boulders are at least 20 meters apart (fig. 3-9).

2. **(Very Stony/Very Bouldery)** — Stones or boulders cover 0.1 to less than 3 percent of the surface. The smallest stones are not less than 1 meter apart; the smallest boulders are not less than 3 meters apart (fig. 3-10).

3. **(Extremely Stony/Extremely Bouldery)** — Stones or boulders cover 3 to less than 15 percent of the surface. The smallest stones are as little as 0.5 meter apart; the smallest boulders are as little as 1 meter apart (fig. 3-11).

4. **(Rubbly)** — Stones or boulders cover 15 to less than 50 percent of the surface. The smallest stones are as little as 0.3 meter apart; the smallest boulders are as little as 0.5 meter apart. In most places it is possible to step from stone to stone or jump from boulder to boulder without touching the soil (fig. 3-12).

5. **(Very Rubbly)** — Stones or boulders appear to be nearly continuous and cover 50 percent or more of the surface. The smallest stones are less than 0.03 meter apart; the smallest boulders are less than 0.05 meter apart. Classifiable soil is among the rock fragments, and plant growth is possible (fig. 3-13).

In [12]:
def surface_cover_original(input_df):
    """Return a copy of ``input_df`` with an added ordinal ``Surface_Cover`` column.

    Each ``Soil_Type`` label (1-40) is mapped to a rank from 0 to 4 according to
    the group it is listed in below. A label that appears in no group raises
    ``KeyError``.
    """
    # (rank, soil type labels) pairs, from no description up to rubbly
    groups = (
        (0, (7, 8, 14, 15, 16, 17, 19, 20, 21, 23, 35)),    # no description
        (1, (6, 12)),                                       # stony
        (2, (2, 9, 18, 26)),                                # very stony
        (3, (1, 22, 24, 25, 27, 28, 29, 30, 31, 32, 33,
             34, 36, 37, 38, 39, 40)),                      # extremely stony
        (4, (3, 4, 5, 10, 11, 13)),                         # rubbly
    )

    # Flatten the groups into a soil type label -> rank lookup
    rank_of = {label: rank for rank, labels in groups for label in labels}

    result = input_df.copy()
    result['Surface_Cover'] = input_df['Soil_Type'].apply(lambda label: rank_of[label])
    return result

In [13]:
# Evaluate every model with the Surface_Cover feature added and record one result row each.
# Run order (and therefore print order): AdaBoost, Extra Trees, Bagging, Random Forest.
for estimator, score_log in (
    (adaboost, adaboost_scores),
    (extratrees, extratrees_scores),
    (bagging, bagging_scores),
    (randomforest, random_scores),
):
    cv_score, oof_preds, test_score = train_original(estimator, surface_cover_original)

    # Row layout: label, CV accuracy, holdout accuracy, then one recall per class
    score_log.append((
        'Surface_Cover', cv_score, test_score,
        *recall_score(y_train, oof_preds, average = None)
    ))


AdaBoostClassifier
Train Accuracy: 0.79132
Test Accuracy: 0.75978
Training Time: 3.93s

ExtraTreesClassifier
Train Accuracy: 0.88584
Test Accuracy: 0.78266
Training Time: 40.44s

BaggingClassifier
Train Accuracy: 0.85032
Test Accuracy: 0.75947
Training Time: 23.44s

RandomForestClassifier
Train Accuracy: 0.86937
Test Accuracy: 0.75266
Training Time: 41.04s


## Rock Size (Ordinal)

In [14]:
def rock_size_original(input_df):
    """Return a copy of ``input_df`` with an added ordinal ``Rock_Size`` column.

    Each ``Soil_Type`` label (1-40) is mapped to a rank from 0 to 3 according to
    the group it is listed in below. A label that appears in no group raises
    ``KeyError``.
    """
    # (rank, soil type labels) pairs, from no description up to rubble
    groups = (
        (0, (7, 8, 14, 15, 16, 17, 19, 20, 21, 23, 35)),    # no description
        (1, (1, 2, 6, 9, 12, 18, 24, 25, 26, 27, 28, 29,
             30, 31, 32, 33, 34, 36, 37, 38, 39, 40)),      # stones
        (2, (22,)),                                         # boulders
        (3, (3, 4, 5, 10, 11, 13)),                         # rubble
    )

    # Flatten the groups into a soil type label -> rank lookup
    rank_of = {label: rank for rank, labels in groups for label in labels}

    result = input_df.copy()
    result['Rock_Size'] = input_df['Soil_Type'].apply(lambda label: rank_of[label])
    return result

In [15]:
# Evaluate every model with the Rock_Size feature added and record one result row each.
# Run order (and therefore print order): AdaBoost, Extra Trees, Bagging, Random Forest.
for estimator, score_log in (
    (adaboost, adaboost_scores),
    (extratrees, extratrees_scores),
    (bagging, bagging_scores),
    (randomforest, random_scores),
):
    cv_score, oof_preds, test_score = train_original(estimator, rock_size_original)

    # Row layout: label, CV accuracy, holdout accuracy, then one recall per class
    score_log.append((
        'Rock_Size', cv_score, test_score,
        *recall_score(y_train, oof_preds, average = None)
    ))


AdaBoostClassifier
Train Accuracy: 0.79397
Test Accuracy: 0.75969
Training Time: 3.49s

ExtraTreesClassifier
Train Accuracy: 0.88544
Test Accuracy: 0.78247
Training Time: 41.97s

BaggingClassifier
Train Accuracy: 0.85138
Test Accuracy: 0.75762
Training Time: 20.08s

RandomForestClassifier
Train Accuracy: 0.86798
Test Accuracy: 0.75305
Training Time: 33.46s


# Summary

All of these features seem promising, so we won't rule any out just yet.

In [16]:
# AdaBoost results: one row per feature set, lowest cross-validation accuracy first
pd.DataFrame.from_records(
    adaboost_scores,
    columns = ['features', 'cv_score', 'holdout', *(f'recall_{k}' for k in range(7))],
).sort_values(by = 'cv_score')

Unnamed: 0,features,cv_score,holdout,recall_0,recall_1,recall_2,recall_3,recall_4,recall_5,recall_6
0,Baseline,0.785965,0.762291,0.674537,0.6375,0.72302,0.918519,0.878704,0.74537,0.924074
3,Surface_Cover,0.791321,0.759783,0.656019,0.647222,0.738305,0.919444,0.882407,0.765278,0.930556
4,Rock_Size,0.793968,0.75969,0.684722,0.639352,0.73321,0.910185,0.89537,0.766667,0.928241
2,Geologic_Zone,0.794894,0.757316,0.665278,0.648148,0.740157,0.919444,0.891667,0.76713,0.932407
1,Climatic_Zone,0.795357,0.757429,0.676389,0.6375,0.746642,0.913889,0.882407,0.771759,0.938889


In [17]:
# Extra Trees results: one row per feature set, lowest cross-validation accuracy first
pd.DataFrame.from_records(
    extratrees_scores,
    columns = ['features', 'cv_score', 'holdout', *(f'recall_{k}' for k in range(7))],
).sort_values(by = 'cv_score')

Unnamed: 0,features,cv_score,holdout,recall_0,recall_1,recall_2,recall_3,recall_4,recall_5,recall_6
1,Climatic_Zone,0.885112,0.783611,0.783333,0.74213,0.861047,0.972222,0.960648,0.902315,0.974074
0,Baseline,0.885377,0.78206,0.7875,0.7375,0.86151,0.971759,0.961111,0.905556,0.972685
4,Rock_Size,0.885443,0.782475,0.787037,0.74213,0.856415,0.972222,0.960185,0.906944,0.973148
2,Geologic_Zone,0.885509,0.780322,0.787037,0.743981,0.858731,0.970833,0.959722,0.90463,0.973611
3,Surface_Cover,0.88584,0.782662,0.79213,0.738426,0.858731,0.973611,0.960185,0.905556,0.972222


In [18]:
# Bagging results: one row per feature set, lowest cross-validation accuracy first
pd.DataFrame.from_records(
    bagging_scores,
    columns = ['features', 'cv_score', 'holdout', *(f'recall_{k}' for k in range(7))],
).sort_values(by = 'cv_score')

Unnamed: 0,features,cv_score,holdout,recall_0,recall_1,recall_2,recall_3,recall_4,recall_5,recall_6
2,Geologic_Zone,0.849462,0.756343,0.763889,0.665278,0.83094,0.961574,0.943981,0.822222,0.958333
0,Baseline,0.850057,0.75982,0.775,0.672222,0.819824,0.958333,0.939815,0.830556,0.95463
3,Surface_Cover,0.850321,0.759465,0.773611,0.671296,0.814729,0.963426,0.940278,0.836111,0.952778
4,Rock_Size,0.851379,0.757622,0.772222,0.654167,0.832793,0.96713,0.939352,0.836574,0.957407
1,Climatic_Zone,0.853959,0.758758,0.772685,0.664815,0.837888,0.962963,0.945833,0.834259,0.959259


In [19]:
# Random Forest results: one row per feature set, lowest cross-validation accuracy first
pd.DataFrame.from_records(
    random_scores,
    columns = ['features', 'cv_score', 'holdout', *(f'recall_{k}' for k in range(7))],
).sort_values(by = 'cv_score')

Unnamed: 0,features,cv_score,holdout,recall_0,recall_1,recall_2,recall_3,recall_4,recall_5,recall_6
0,Baseline,0.864542,0.748216,0.765741,0.69537,0.824456,0.970833,0.955556,0.871759,0.968056
2,Geologic_Zone,0.864608,0.751794,0.766667,0.694907,0.828161,0.969907,0.955093,0.871296,0.966204
1,Climatic_Zone,0.8656,0.751543,0.766204,0.698148,0.831867,0.973148,0.95463,0.872685,0.9625
4,Rock_Size,0.867981,0.753045,0.772222,0.696296,0.836498,0.972685,0.951852,0.878241,0.968056
3,Surface_Cover,0.86937,0.75266,0.768981,0.703704,0.833256,0.973611,0.957407,0.881944,0.966667
