# Notebook 4 - Soil Type Encoding 

In this notebook, we test out encoding the soil types variables in various ways. Our motivation is that originally the soil type was a one-hot encoded categorical variable,and gradient boosting machines generally do better with ordinally-encoded categorical variables. 

One issue is that our synthetic data no longer preserves this property, hence we have to be clever to reduce the total number of soil type variables. We will try to use properties of the original data to help with our encoding but the synthetic data may not have the same properties.

In [1]:
# Global variables for testing changes to this notebook quickly
RANDOM_SEED = 0
NUM_FOLDS = 10
TRAIN_SIZE = 500000

In [2]:
import numpy as np
import pandas as pd
import time
import os
import pyarrow
import gc

# Hide warnings
import warnings
warnings.filterwarnings('ignore')
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 

# Model/Evaluation
from functools import partial
from sklearn.base import clone
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder, RobustScaler
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import accuracy_score, recall_score
from sklearn.inspection import permutation_importance

# Gradient Boosting
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# Load and Prepare Data

In [3]:
# Load full training data
train = pd.read_feather('../data/train.feather')

# Drop low/no variance 
train.drop(["Soil_Type7", "Id", "Soil_Type15"], axis=1, inplace=True)
train = train[train.Cover_Type != 5]

# Label Encoding
new_encoder = LabelEncoder()
train["Cover_Type"] = new_encoder.fit_transform(train["Cover_Type"])

# Split synthetic data
train, test = train_test_split(
    train, 
    train_size = TRAIN_SIZE, 
    random_state = RANDOM_SEED,
    stratify = train['Cover_Type'],
)
y_train = train['Cover_Type']


# features, data structure for summary scores
features = [x for x in train.columns if x not in ['Id','Cover_Type']]
nonsoil = [x for x in features if not x.startswith('Soil_Type')]
new_rows = list()
gc.collect()

print(f'Training Size: {train.shape[0]} rows, {train.shape[1]} cols')
print(f'Holdout Size: {test.shape[0]} rows, {test.shape[1]} cols\n')

Training Size: 500000 rows, 53 cols
Holdout Size: 3499999 rows, 53 cols



# Models

1. XGBoost
2. LightGBM
3. CatBoost

## 1. XGBoost

In [4]:
# XGBoost Classifier
xgb_pipeline = make_pipeline(
    XGBClassifier(
        booster = 'gbtree',
        tree_method = 'hist',
        eval_metric = 'mlogloss',
        random_state = RANDOM_SEED,
    ),
)

## 2. LightGBM

In [5]:
# LightGBM Classifier
lgbm_pipeline = make_pipeline(
    LGBMClassifier(
        unbalanced_sets = True,
        metric = 'multi_logloss',
        random_state = RANDOM_SEED,
        n_jobs = 4,
    ),
)

## 3. CatBoost

In [6]:
# CatBoost Classifier
catboost_pipeline = make_pipeline(
    CatBoostClassifier(
        eval_metric = 'MultiClass',
        boosting_type = 'Plain',
        verbose = False,
        random_state = RANDOM_SEED,
    ),
)

# Scoring Function

In [7]:
def score_features(sklearn_model, processing = None):
    
    # Original Training/Test Split
    features = [x for x in train.columns if x not in ['Id','Cover_Type']]
    X_temp, X_test = train[features], test[features]
    y_temp, y_test = train['Cover_Type'], test['Cover_Type']
    
    # Feature Engineering
    if processing:
        X_temp = processing(X_temp)
        X_test = processing(X_test)
    
    # Store the out-of-fold predictions
    test_preds = np.zeros((X_test.shape[0],6))
    oof_preds = np.zeros((X_temp.shape[0],))
    fi_scores = np.zeros((X_temp.shape[1],))
    scores, times = np.zeros(NUM_FOLDS), np.zeros(NUM_FOLDS)
    
    # Stratified k-fold cross-validation
    skf = StratifiedKFold(n_splits = NUM_FOLDS, shuffle = True, random_state = RANDOM_SEED)
    for fold, (train_idx, valid_idx) in enumerate(skf.split(train[features],train['Cover_Type'])):
       
        # Training and Validation Sets
        X_train, X_valid = X_temp.iloc[train_idx], X_temp.iloc[valid_idx]
        y_train, y_valid = y_temp.iloc[train_idx], y_temp.iloc[valid_idx]
        
        # Create model
        start = time.time()
        model = clone(sklearn_model)
        model.fit(X_train, y_train)

        # Permutation Importance
        result = permutation_importance(
            model, X_valid, y_valid, 
            random_state=RANDOM_SEED
        )
        fi_scores += result.importances_mean / NUM_FOLDS

        # validation/holdout predictions
        valid_preds = np.ravel(model.predict(X_valid))
        oof_preds[valid_idx] = valid_preds
        test_preds += model.predict_proba(X_test)

        # Save scores and times
        scores[fold] = accuracy_score(y_valid, valid_preds)
        end = time.time()
        times[fold] = end-start
        print(f'Fold {fold} Accuracy:  {round(scores[fold], 5)} in {round(end-start,2)}s.')
        time.sleep(0.5)
    
    features = [x for x in X_temp.columns]
    nonsoil = [x for x in X_test.columns if not x.startswith('Soil_Type')]
    test_preds = np.argmax(test_preds, axis = 1)
    test_score = accuracy_score(y_test, test_preds)
    #print('\n'+model.__class__.__name__)
    print("Train Accuracy:", round(scores.mean(), 5))
    print('Test Accuracy:', round(test_score, 5))
    print(f'Training Time: {round(times.sum(), 2)}s\n')
    
    fi_scores = pd.Series(
        data = fi_scores, 
        index = features
    ).loc[nonsoil].sort_values()
    
    return scores.mean(), oof_preds, test_score, fi_scores

# Baselines

## XGBoost

In [8]:
# Helper function
def start_at_eps(series, eps=1e-10): 
    return series - series.min() + eps  

def xgboost_features(data):
    df = data.copy()
    shade_features = ['Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm']
    soil_features = [x for x in df.columns if x.startswith("Soil_Type")]
    wilderness_features = [x for x in df.columns if x.startswith("Wilderness_Area")]
    
    # XGBoost Features
    df['Aspect_360'] = df['Aspect'] % 360
    df["Horizontal_Distance_To_Hydrology"] = df["Horizontal_Distance_To_Hydrology"].astype('float64')
    df["Vertical_Distance_To_Hydrology"] = df["Vertical_Distance_To_Hydrology"].astype('float64')
    pos_h_hydrology = start_at_eps(df["Horizontal_Distance_To_Hydrology"])
    pos_v_hydrology = start_at_eps(df['Vertical_Distance_To_Hydrology'])
    df["Hydro_Taxicab"] = np.abs(df["Horizontal_Distance_To_Hydrology"]) + np.abs(df["Vertical_Distance_To_Hydrology"])
    df['Hydro_Taxicab_Pos'] = (pos_h_hydrology ** 2 + pos_v_hydrology ** 2).apply(np.sqrt).rename('Euclidean_positive_hydrology').astype(np.float32)
    df["Hydro_Euclid"] = (df["Horizontal_Distance_To_Hydrology"]**2 + np.abs(df["Vertical_Distance_To_Hydrology"])**2)**0.5
    df['Water Elevation'] = df['Elevation'] - df['Vertical_Distance_To_Hydrology']
    df["Soil_Count"] = df[soil_features].apply(sum, axis=1)
    df["Wilderness_Count"] = df[wilderness_features].apply(sum, axis=1)
    df['Hydro_Fire_Sum'] = df['Horizontal_Distance_To_Hydrology'] + df['Horizontal_Distance_To_Fire_Points']
    df['Hydro_Fire_AbsDiff'] = abs(df['Horizontal_Distance_To_Hydrology'] - df['Horizontal_Distance_To_Fire_Points'])
    df['Hydro_Fire_Diff'] = df['Horizontal_Distance_To_Hydrology'] - df['Horizontal_Distance_To_Fire_Points']
    df['Hydro_Road_1'] = abs(df['Horizontal_Distance_To_Hydrology'] + df['Horizontal_Distance_To_Roadways'])
    df['Hydro_Road_2'] = abs(df['Horizontal_Distance_To_Hydrology'] - df['Horizontal_Distance_To_Roadways'])
    df['Fire_Road_1'] = abs(df['Horizontal_Distance_To_Fire_Points'] + df['Horizontal_Distance_To_Roadways'])
    df['Fire_Road_2'] = abs(df['Horizontal_Distance_To_Fire_Points'] - df['Horizontal_Distance_To_Roadways'])
    df['Elev_HHydro_Diff'] = df.Elevation - df.Horizontal_Distance_To_Hydrology * 0.2
    
    return df

In [9]:
cv_score, oof_preds, test_score, fi_scores = score_features(
    xgb_pipeline, 
    xgboost_features
)

new_rows.append((
    'XGBoost', cv_score, test_score,
     *recall_score(y_train, oof_preds, average = None)
))

fi_scores

Fold 0 Accuracy:  0.96114 in 55.9s.
Fold 1 Accuracy:  0.96134 in 55.42s.
Fold 2 Accuracy:  0.95946 in 55.49s.
Fold 3 Accuracy:  0.96106 in 55.72s.
Fold 4 Accuracy:  0.96014 in 56.7s.
Fold 5 Accuracy:  0.96198 in 55.48s.
Fold 6 Accuracy:  0.9588 in 55.82s.
Fold 7 Accuracy:  0.96042 in 55.9s.
Fold 8 Accuracy:  0.96184 in 55.86s.
Fold 9 Accuracy:  0.9599 in 56.11s.
Train Accuracy: 0.96061
Test Accuracy: 0.96124
Training Time: 558.39s



Slope                                -0.000029
Hillshade_9am                         0.000020
Aspect                                0.000034
Horizontal_Distance_To_Hydrology      0.000042
Aspect_360                            0.000050
Wilderness_Area2                      0.000077
Hillshade_3pm                         0.000120
Hydro_Euclid                          0.000162
Vertical_Distance_To_Hydrology        0.000280
Hydro_Taxicab                         0.000377
Hillshade_Noon                        0.000400
Wilderness_Count                      0.000583
Hydro_Taxicab_Pos                     0.000614
Hydro_Fire_AbsDiff                    0.001227
Hydro_Road_1                          0.001275
Horizontal_Distance_To_Fire_Points    0.002006
Hydro_Fire_Diff                       0.002084
Hydro_Road_2                          0.002560
Hydro_Fire_Sum                        0.003632
Wilderness_Area4                      0.004248
Fire_Road_2                           0.004960
Fire_Road_1  

## LightGBM

In [10]:
def lightgbm_features(data):
    df = data.copy()
    shade_features = ['Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm']
    soil_features = [x for x in df.columns if x.startswith("Soil_Type")]
    wilderness_features = [x for x in df.columns if x.startswith("Wilderness_Area")]
    
    # LightGBM Features
    df["Horizontal_Distance_To_Hydrology"] = df["Horizontal_Distance_To_Hydrology"].astype('float64')
    df["Vertical_Distance_To_Hydrology"] = df["Vertical_Distance_To_Hydrology"].astype('float64')
    pos_h_hydrology = start_at_eps(df["Horizontal_Distance_To_Hydrology"])
    pos_v_hydrology = start_at_eps(df['Vertical_Distance_To_Hydrology'])
    df["Hydro_Taxicab"] = np.abs(df["Horizontal_Distance_To_Hydrology"]) + np.abs(df["Vertical_Distance_To_Hydrology"])
    df['Hydro_Taxicab_Pos'] = (pos_h_hydrology ** 2 + pos_v_hydrology ** 2).apply(np.sqrt).rename('Euclidean_positive_hydrology').astype(np.float32)
    df["Hydro_Euclid"] = (df["Horizontal_Distance_To_Hydrology"]**2 + np.abs(df["Vertical_Distance_To_Hydrology"])**2)**0.5
    df['Water Elevation'] = df['Elevation'] - df['Vertical_Distance_To_Hydrology']
    df["Soil_Count"] = df[soil_features].apply(sum, axis=1)
    df["Wilderness_Count"] = df[wilderness_features].apply(sum, axis=1)
    df['Hydro_Fire_Sum'] = df['Horizontal_Distance_To_Hydrology'] + df['Horizontal_Distance_To_Fire_Points']
    df['Hydro_Fire_AbsDiff'] = abs(df['Horizontal_Distance_To_Hydrology'] - df['Horizontal_Distance_To_Fire_Points'])
    df['Hydro_Fire_EpsSum'] = start_at_eps(df['Horizontal_Distance_To_Hydrology']) + start_at_eps(df['Horizontal_Distance_To_Fire_Points'])
    df['Hydro_Fire_Diff'] = df['Horizontal_Distance_To_Hydrology'] - df['Horizontal_Distance_To_Fire_Points']
    df['Elev_HHydro_Diff'] = df.Elevation - df.Horizontal_Distance_To_Hydrology * 0.2
    
    return df

In [11]:
cv_score, oof_preds, test_score, fi_scores = score_features(
    lgbm_pipeline, 
    lightgbm_features
)

new_rows.append((
    'LightGBM', cv_score, test_score,
     *recall_score(y_train, oof_preds, average = None)
))

fi_scores

Fold 0 Accuracy:  0.94934 in 210.96s.
Fold 1 Accuracy:  0.94828 in 206.52s.
Fold 2 Accuracy:  0.95022 in 204.6s.
Fold 3 Accuracy:  0.9504 in 203.47s.
Fold 4 Accuracy:  0.94954 in 211.2s.
Fold 5 Accuracy:  0.95028 in 208.35s.
Fold 6 Accuracy:  0.94498 in 205.22s.
Fold 7 Accuracy:  0.95038 in 214.19s.
Fold 8 Accuracy:  0.95406 in 182.91s.
Fold 9 Accuracy:  0.95288 in 212.65s.
Train Accuracy: 0.95004
Test Accuracy: 0.95845
Training Time: 2060.08s



Slope                                 0.000009
Aspect                                0.000022
Hillshade_9am                         0.000098
Wilderness_Area2                      0.000125
Hillshade_3pm                         0.000143
Hillshade_Noon                        0.000446
Wilderness_Count                      0.000697
Wilderness_Area4                      0.002302
Vertical_Distance_To_Hydrology        0.003855
Hydro_Taxicab_Pos                     0.003922
Hydro_Euclid                          0.004348
Hydro_Taxicab                         0.006063
Hydro_Fire_AbsDiff                    0.006725
Hydro_Fire_Sum                        0.007820
Hydro_Fire_EpsSum                     0.008922
Horizontal_Distance_To_Hydrology      0.010108
Hydro_Fire_Diff                       0.010675
Horizontal_Distance_To_Fire_Points    0.015822
Wilderness_Area1                      0.017188
Wilderness_Area3                      0.020926
Soil_Count                            0.035834
Horizontal_Di

## CatBoost

In [12]:
def catboost_features(data):
    df = data.copy()
    shade_features = ['Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm']
    soil_features = [x for x in df.columns if x.startswith("Soil_Type")]
    wilderness_features = [x for x in df.columns if x.startswith("Wilderness_Area")]
    
    # CatBoost Features
    df["Horizontal_Distance_To_Hydrology"] = df["Horizontal_Distance_To_Hydrology"].astype('float64')
    df["Vertical_Distance_To_Hydrology"] = df["Vertical_Distance_To_Hydrology"].astype('float64')
    pos_h_hydrology = start_at_eps(df["Horizontal_Distance_To_Hydrology"])
    pos_v_hydrology = start_at_eps(df['Vertical_Distance_To_Hydrology'])
    df["Hydro_Taxicab"] = np.abs(df["Horizontal_Distance_To_Hydrology"]) + np.abs(df["Vertical_Distance_To_Hydrology"])
    df['Hydro_Taxicab_Pos'] = (pos_h_hydrology ** 2 + pos_v_hydrology ** 2).apply(np.sqrt).rename('Euclidean_positive_hydrology').astype(np.float32)
    df["Hydro_Euclid"] = (df["Horizontal_Distance_To_Hydrology"]**2 + np.abs(df["Vertical_Distance_To_Hydrology"])**2)**0.5
    df['Hydro_Euclid_Pos'] = (pos_h_hydrology ** 2 + pos_v_hydrology ** 2).apply(np.sqrt)
    df['Water Elevation'] = df['Elevation'] - df['Vertical_Distance_To_Hydrology']
    df["Soil_Count"] = df[soil_features].apply(sum, axis=1)
    df['Hydro_Fire_Sum'] = df['Horizontal_Distance_To_Hydrology'] + df['Horizontal_Distance_To_Fire_Points']
    df['Hydro_Fire_AbsDiff'] = abs(df['Horizontal_Distance_To_Hydrology'] - df['Horizontal_Distance_To_Fire_Points'])
    df['Hydro_Fire_EpsSum'] = start_at_eps(df['Horizontal_Distance_To_Hydrology']) + start_at_eps(df['Horizontal_Distance_To_Fire_Points'])
    df['Hydro_Fire_Diff'] = df['Horizontal_Distance_To_Hydrology'] - df['Horizontal_Distance_To_Fire_Points']
    df['Hydro_Road_1'] = abs(df['Horizontal_Distance_To_Hydrology'] + df['Horizontal_Distance_To_Roadways'])
    df['Hydro_Road_2'] = abs(df['Horizontal_Distance_To_Hydrology'] - df['Horizontal_Distance_To_Roadways'])
    df['Fire_Road_1'] = abs(df['Horizontal_Distance_To_Fire_Points'] + df['Horizontal_Distance_To_Roadways'])
    df['Fire_Road_2'] = abs(df['Horizontal_Distance_To_Fire_Points'] - df['Horizontal_Distance_To_Roadways'])
    df['Elev_VHydro_Diff'] = df.Elevation - df.Vertical_Distance_To_Hydrology
    df['Elev_HHydro_Diff'] = df.Elevation - df.Horizontal_Distance_To_Hydrology * 0.2
    
    return df

In [13]:
cv_score, oof_preds, test_score, fi_scores = score_features(
    catboost_pipeline, 
    catboost_features
)

new_rows.append((
    'CatBoost', cv_score, test_score,
     *recall_score(y_train, oof_preds, average = None)
))

fi_scores

Fold 0 Accuracy:  0.96084 in 185.17s.
Fold 1 Accuracy:  0.96124 in 186.36s.
Fold 2 Accuracy:  0.95948 in 184.52s.
Fold 3 Accuracy:  0.96142 in 181.81s.
Fold 4 Accuracy:  0.95962 in 181.9s.
Fold 5 Accuracy:  0.96204 in 183.73s.
Fold 6 Accuracy:  0.95916 in 182.3s.
Fold 7 Accuracy:  0.9606 in 181.84s.
Fold 8 Accuracy:  0.96222 in 180.94s.
Fold 9 Accuracy:  0.9598 in 182.57s.
Train Accuracy: 0.96064
Test Accuracy: 0.96082
Training Time: 1831.14s



Slope                                -0.000029
Hillshade_9am                         0.000012
Hillshade_3pm                         0.000024
Wilderness_Area2                      0.000079
Aspect                                0.000100
Horizontal_Distance_To_Hydrology      0.000160
Hydro_Euclid                          0.000206
Hydro_Taxicab_Pos                     0.000270
Hydro_Euclid_Pos                      0.000288
Vertical_Distance_To_Hydrology        0.000294
Hydro_Taxicab                         0.000330
Hillshade_Noon                        0.000427
Hydro_Fire_EpsSum                     0.000716
Hydro_Road_1                          0.000888
Hydro_Fire_AbsDiff                    0.001514
Hydro_Fire_Sum                        0.001530
Hydro_Fire_Diff                       0.001596
Wilderness_Area4                      0.001720
Horizontal_Distance_To_Fire_Points    0.002580
Hydro_Road_2                          0.002973
Fire_Road_2                           0.003645
Wilderness_Ar

# Feature Encoding

We reduce the ~40 or so soil columns to the following:

1. `Soil_Type` - For observations with all 0's or exactly 1 soil type, we indicated the original label. NAs if more than one.
2. `Soil_Count` - Number of non-zero soil types.
3. `Avg_Climatic` - Average climatic zone (1-lower montane dry...8-alpine)
4. `Avg_SurfaceCover` - Average surface cover (1- stony...4-rubbly)
4. `Avg_RockSize` - Average rock size (1-stones, 2-boulders, 3-rubble)

In [24]:
def feature_encoding(data, other_features):
    df = other_features(data)
    soil_cols = [x for x in data.columns if x.startswith('Soil_Type')]
    
    code = {
        1:2702,2:2703,3:2704,4:2705,5:2706,6:2717,7:3501,8:3502,9:4201,
        10:4703,11:4704,12:4744,13:4758,14:5101,15:5151,16:6101,17:6102,
        18:6731,19:7101,20:7102,21:7103,22:7201,23:7202,24:7700,25:7701,
        26:7702,27:7709,28:7710,29:7745,30:7746,31:7755,32:7756,33:7757,
        34:7790,35:8703,36:8707,37:8708,38:8771,39:8772,40:8776
    }
    
    # Soil Type
    soil_type = list()
    for index, row in df.iterrows():
        if row.loc['Soil_Count'] > 1:
            soil_type.append(pd.NA)
        elif row.loc['Soil_Count'] == 0:
            soil_type.append(0)
        else:
            for x in soil_cols:
                if row.loc[x] == 1:
                    soil_type.append(int(x[9:]))
                    break
    df['Soil_Type'] = soil_type
    
    # Avg Climatic Zone
    df['Avg_Climatic'] = 0
    df[df.Soil_Count == 0]['Avg_Climatic'] = pd.NA
    for i, col in enumerate(soil_cols, start = 1):
        df[df.Soil_Count > 0]['Avg_Climatic'] += df[df.Soil_Count > 0][col].apply(
            lambda x: x*int(str(code[int(col[9:])])[0])
        )
    df[df.Soil_Count > 0]['Avg_Climatic'] /= df[df.Soil_Count > 0]['Soil_Count']

    # Avg Surface Cover
    no_desc = [7,8,14,15,16,17,19,20,21,23,35]
    stony = [6,12]
    very_stony = [2,9,18,26]
    extremely_stony = [1,22,24,25,27,28,29,30,31,32,33,34,36,37,38,39,40]
    rubbly = [3,4,5,10,11,13]
    surface_cover = {i:0 for i in no_desc}
    surface_cover.update({i:1 for i in stony})
    surface_cover.update({i:2 for i in very_stony})
    surface_cover.update({i:3 for i in extremely_stony})
    surface_cover.update({i:4 for i in rubbly})
    
    df['Avg_SurfaceCover'] = 0
    df[df.Soil_Count == 0]['Avg_SurfaceCover'] = pd.NA
    for i, col in enumerate(soil_cols, start = 1):
        df[df.Soil_Count > 0]['Avg_SurfaceCover'] += df[df.Soil_Count > 0][col].apply(
            lambda x: x*surface_cover[int(col[9:])]
        )
    df[df.Soil_Count > 0]['Avg_SurfaceCover'] /= df[df.Soil_Count > 0]['Soil_Count']
    df[df.Avg_SurfaceCover == 0]['Avg_SurfaceCover'] = pd.NA
    
    # Avg Rock Size
    no_desc = [7,8,14,15,16,17,19,20,21,23,35]
    stones = [1,2,6,9,12,18,24,25,26,27,28,29,30,31,32,33,34,36,37,38,39,40]
    boulders = [22]
    rubble = [3,4,5,10,11,13]
    rock_size = {i:0 for i in no_desc}
    rock_size.update({i:1 for i in stones})
    rock_size.update({i:2 for i in boulders})
    rock_size.update({i:3 for i in rubble})
    
    df['Avg_RockSize'] = 0
    df[df.Soil_Count == 0]['Avg_RockSize'] = pd.NA
    for i, col in enumerate(soil_cols, start = 1):
        df[df.Soil_Count > 0]['Avg_RockSize'] += df[df.Soil_Count > 0][col].apply(
            lambda x: x*rock_size[int(col[9:])]
        )
    df[df.Soil_Count > 0]['Avg_RockSize'] /= df[df.Soil_Count > 0]['Soil_Count']
    df[df.Avg_RockSize == 0]['Avg_RockSize'] = pd.NA

    return df

In [25]:
temp = feature_encoding(train, xgboost_features)
temp.head(25)

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,Hydro_Fire_Diff,Hydro_Road_1,Hydro_Road_2,Fire_Road_1,Fire_Road_2,Elev_HHydro_Diff,Soil_Type,Avg_Climatic,Avg_SurfaceCover,Avg_RockSize
2325872,2829,184,19,230.0,-3.0,2217,164,190,151,2544,...,-2314.0,2447.0,1987.0,4761,327,2783.0,1.0,0,0,0
3568157,3349,9,19,159.0,125.0,504,219,171,180,1907,...,-1748.0,663.0,345.0,2411,1403,3317.2,0.0,0,0,0
2290813,3274,133,18,591.0,41.0,1225,227,222,168,5152,...,-4561.0,1816.0,634.0,6377,3927,3155.8,37.0,0,0,0
1108917,2870,3,29,314.0,33.0,1278,208,216,74,1428,...,-1114.0,1592.0,964.0,2706,150,2807.2,0.0,0,0,0
3920926,3264,220,6,150.0,66.0,1625,235,225,210,289,...,-139.0,1775.0,1475.0,1914,1336,3234.0,,0,0,0
3420313,2956,-9,6,614.0,3.0,1699,224,242,194,1860,...,-1246.0,2313.0,1085.0,3559,161,2833.2,,0,0,0
2981355,2738,244,12,133.0,7.0,120,192,135,186,955,...,-822.0,253.0,13.0,1075,835,2711.4,,0,0,0
2974558,2732,297,23,674.0,5.0,1866,199,211,171,837,...,-163.0,2540.0,1192.0,2703,1029,2597.2,31.0,0,0,0
82838,3393,51,3,798.0,106.0,3372,210,231,133,1614,...,-816.0,4170.0,2574.0,4986,1758,3233.4,,0,0,0
616731,3028,6,10,481.0,1.0,1896,243,216,151,1113,...,-632.0,2377.0,1415.0,3009,783,2931.8,0.0,0,0,0


# Binary Encoding

We do the following:

1. `Soil_Type` - For observations with all 0's or exactly 1 soil type, we indicated the original label. NAs if more than one.
2. `Soil_Count` - Number of non-zero soil types.
3. `Soil_Encoding` - Five labels indicating a binary encoding of soil types grouped by climatic zones.

In [None]:
def binary_encoding(data, other_encoding):
    df = other_features(data)
    pass

# Summary

In [None]:
pd.DataFrame.from_records(
    data = new_rows,
    columns = ['features','cv_scores','holdout','recall_0', 'recall_1','recall_2','recall_3','recall_4','recall_5']
).sort_values('holdout')