# Notebook 4 - Feature Importance

In this notebook, we estimate feature importances using permutation importance, averaged over the cross-validation folds.

In [1]:
# Global variables for testing changes to this notebook quickly
# RANDOM_SEED seeds the models, the StratifiedKFold split and permutation_importance.
RANDOM_SEED = 0
# NUM_FOLDS is the number of StratifiedKFold splits used by train_original.
NUM_FOLDS = 12

In [2]:
import numpy as np
import pandas as pd
import time
import pyarrow
import gc

# Model evaluation
from functools import partial
from sklearn.base import clone
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import accuracy_score, recall_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier, RandomForestClassifier
from sklearn.inspection import partial_dependence, permutation_importance

# Plotting
import matplotlib
import seaborn as sns
from matplotlib import pyplot as plt

# Hide warnings
import warnings
warnings.filterwarnings('ignore')

# Load Data

In [3]:
def categorical_encoding(input_df):
    """Collapse the one-hot ``Soil_Type1`` .. ``Soil_Type40`` columns into one integer column.

    Works on a copy of ``input_df``. The new ``Soil_Type`` column holds the sum of
    ``i * Soil_Type{i}`` over i = 1..40, i.e. the index of the active soil column
    for a one-hot row (0 when none is set). The 40 source columns are then
    dropped; every other column is returned in its original order, followed by
    ``Soil_Type``.
    """
    soil_columns = [f'Soil_Type{number}' for number in range(1, 41)]

    encoded = input_df.copy()
    encoded['Soil_Type'] = 0
    # Weight each indicator column by its soil number and accumulate.
    for number, column in enumerate(soil_columns, start=1):
        encoded['Soil_Type'] += number * encoded[column]

    dropped = set(soil_columns)
    kept_columns = [column for column in encoded.columns if column not in dropped]
    return encoded[kept_columns]

In [17]:
%%time

# Load original data
# categorical_encoding replaces the 40 one-hot Soil_Type columns with a single integer column.
original = categorical_encoding(pd.read_feather('../data/original.feather'))

# Label Encode
# Cover_Type labels are replaced by consecutive integer codes starting at 0.
old_encoder = LabelEncoder()
original["Cover_Type"] = old_encoder.fit_transform(original["Cover_Type"])
# Positional split: rows before position 15119 are the training part, the rest the test part.
# train_original repeats this split with the same hard-coded 15119, so keep the two in sync.
# NOTE(review): if the file follows the standard Covertype layout (15,120 train+validation
# rows) this boundary may be off by one — confirm against how original.feather was built.
y_train = original['Cover_Type'].iloc[:15119]
y_test = original['Cover_Type'].iloc[15119:]

# Get feature columns
features = [x for x in original.columns if x not in ['Id','Cover_Type']]

Wall time: 222 ms


# Scoring and Plotting Functions

The plotting functions are borrowed from [this example](https://scikit-learn.org/stable/auto_examples/ensemble/plot_forest_importances.html) in the scikit-learn documentation.

In [5]:
def train_original(sklearn_model, processing = None):
    """Cross-validate a classifier on ``original`` and return its permutation importances.

    Reads the notebook-level globals ``original``, ``NUM_FOLDS`` and
    ``RANDOM_SEED``. The first 15119 rows of ``original`` are used for
    stratified k-fold cross-validation and the remaining rows as a hold-out
    test set. For each fold a fresh clone of ``sklearn_model`` is fitted,
    permutation importance is computed on the validation fold, and the
    hold-out class probabilities are accumulated. Prints the model name,
    mean fold accuracy, hold-out accuracy and total elapsed time.

    Parameters
    ----------
    sklearn_model : estimator
        Unfitted scikit-learn classifier providing ``fit``, ``predict`` and
        ``predict_proba``. It is cloned per fold and never fitted itself.
    processing : callable, optional
        Function applied separately to the training and test feature frames
        before cross-validation. It must return a DataFrame and may add or
        remove columns.

    Returns
    -------
    pandas.Series
        Permutation importance averaged over folds, indexed by the names of
        the columns actually fed to the model, sorted ascending.
    """

    # Original Training/Test Split
    features = [x for x in original.columns if x not in ['Id','Cover_Type']]
    X_temp = original[features].iloc[:15119]
    X_test = original[features].iloc[15119:]
    y_temp = original['Cover_Type'].iloc[:15119]
    y_test = original['Cover_Type'].iloc[15119:]

    # Feature Engineering
    if processing:
        X_temp = processing(X_temp)
        X_test = processing(X_test)

    # Take the names AFTER processing: it may have changed the column set, and
    # the importances below are computed per column of X_temp.
    feature_names = list(X_temp.columns)

    # Sorted unique labels; predict_proba columns follow this order as long as
    # every class is present in each training fold.
    classes = np.unique(y_temp)

    # Accumulators for hold-out probabilities, importances, scores and timings
    test_proba = np.zeros((X_test.shape[0], len(classes)))
    fi_scores = np.zeros((X_temp.shape[1],))
    scores, times = np.zeros(NUM_FOLDS), np.zeros(NUM_FOLDS)

    # Stratified k-fold cross-validation
    skf = StratifiedKFold(n_splits = NUM_FOLDS, shuffle = True, random_state = RANDOM_SEED)
    for fold, (train_idx, valid_idx) in enumerate(skf.split(X_temp,y_temp)):

        # Training and Validation Sets
        X_train, X_valid = X_temp.iloc[train_idx], X_temp.iloc[valid_idx]
        y_train, y_valid = y_temp.iloc[train_idx], y_temp.iloc[valid_idx]

        # Create model
        start = time.time()
        model = clone(sklearn_model)
        model.fit(X_train, y_train)

        # Permutation Importance (on the validation fold, averaged over folds)
        result = permutation_importance(
            model, X_valid, y_valid,
            n_repeats=10, random_state=RANDOM_SEED, n_jobs=-1
        )
        fi_scores += result.importances_mean / NUM_FOLDS

        # validation/holdout predictions
        valid_preds = np.ravel(model.predict(X_valid))
        test_proba += model.predict_proba(X_test)

        # Save scores and times
        # Note: the timed span covers fitting, permutation importance and prediction.
        scores[fold] = accuracy_score(y_valid, valid_preds)
        end = time.time()
        times[fold] = end-start
        # Brief pause between folds; its purpose is not evident from this notebook.
        time.sleep(0.5)

    # Soft-vote across folds, then map the winning column back to its class label
    test_preds = classes[np.argmax(test_proba, axis = 1)]
    test_score = accuracy_score(y_test, test_preds)
    print('\n'+sklearn_model.__class__.__name__)
    print("Train Accuracy:", round(scores.mean(), 5))
    print('Test Accuracy:', round(test_score, 5))
    print(f'Training Time: {round(times.sum(), 2)}s')

    return pd.Series(
        data = fi_scores,
        index = feature_names
    ).sort_values()

# Baselines

## 1. ExtraTrees

In [6]:
# Extra-trees baseline. Only an unfitted template: train_original clones it for every fold.
extratrees = ExtraTreesClassifier(
    n_jobs = -1,
    random_state = RANDOM_SEED,
    # None = consider all features at each split (scikit-learn's documented meaning).
    max_features = None,
)

In [7]:
# Baseline: cross-validate ExtraTrees on `original` and display its sorted permutation importances.
train_original(extratrees)


ExtraTreesClassifier
Train Accuracy: 0.88538
Test Accuracy: 0.78206
Training Time: 95.27s


Hillshade_3pm                         0.000609
Wilderness_Area2                      0.001687
Slope                                 0.002573
Aspect                                0.010326
Hillshade_Noon                        0.012018
Vertical_Distance_To_Hydrology        0.015028
Wilderness_Area3                      0.020987
Wilderness_Area1                      0.022739
Hillshade_9am                         0.028844
Horizontal_Distance_To_Hydrology      0.063417
Horizontal_Distance_To_Fire_Points    0.089490
Horizontal_Distance_To_Roadways       0.105305
Soil_Type                             0.231934
Wilderness_Area4                      0.243944
Elevation                             0.476791
dtype: float64

## 2. Bagging

In [8]:
# Bagging baseline over decision trees that use random split thresholds.
# Only an unfitted template: train_original clones it for every fold.
bagging = BaggingClassifier(
    # The base estimator is passed positionally because its keyword was renamed
    # from `base_estimator` to `estimator` in scikit-learn 1.2 and the old name
    # was removed in 1.4; the positional form works on both.
    DecisionTreeClassifier(
        splitter = 'random',
        random_state = RANDOM_SEED,
    ),
    n_jobs = -1,
    random_state = RANDOM_SEED,
)

In [9]:
# Baseline: cross-validate Bagging on `original` and display its sorted permutation importances.
train_original(bagging)


BaggingClassifier
Train Accuracy: 0.85006
Test Accuracy: 0.75982
Training Time: 36.85s


Wilderness_Area2                      0.001204
Slope                                 0.008579
Hillshade_3pm                         0.008585
Hillshade_Noon                        0.021556
Aspect                                0.021708
Vertical_Distance_To_Hydrology        0.027462
Wilderness_Area1                      0.039135
Wilderness_Area3                      0.044951
Hillshade_9am                         0.045829
Horizontal_Distance_To_Hydrology      0.063913
Horizontal_Distance_To_Fire_Points    0.083643
Horizontal_Distance_To_Roadways       0.102970
Soil_Type                             0.246215
Wilderness_Area4                      0.280245
Elevation                             0.429830
dtype: float64

## 3. Random Forest

In [10]:
# Random forest baseline with default hyperparameters apart from parallelism and seed.
# Only an unfitted template: train_original clones it for every fold.
randomforest = RandomForestClassifier(
    n_jobs = -1,
    random_state = RANDOM_SEED,
)

In [11]:
# Baseline: cross-validate RandomForest on `original` and display its sorted permutation importances.
train_original(randomforest)


RandomForestClassifier
Train Accuracy: 0.86454
Test Accuracy: 0.74822
Training Time: 78.85s


Wilderness_Area2                      0.001429
Slope                                 0.004173
Hillshade_3pm                         0.004378
Aspect                                0.011383
Wilderness_Area3                      0.013400
Hillshade_Noon                        0.016304
Hillshade_9am                         0.018241
Vertical_Distance_To_Hydrology        0.019856
Wilderness_Area4                      0.022899
Wilderness_Area1                      0.026992
Horizontal_Distance_To_Hydrology      0.046928
Horizontal_Distance_To_Fire_Points    0.080125
Horizontal_Distance_To_Roadways       0.103525
Soil_Type                             0.166313
Elevation                             0.350493
dtype: float64

# Feature Engineering

In [18]:
def feature_engineering(data):
    """Return a copy of ``data`` with distance, soil and wilderness features added.

    Expects the columns ``Horizontal_Distance_To_Hydrology``,
    ``Horizontal_Distance_To_Fire_Points``, ``Horizontal_Distance_To_Roadways``,
    ``Soil_Type`` (integer 1..40) and ``Wilderness_Area1``/``3``/``4``.

    Added columns:
      * ``Hydro_Fire_1/2``, ``Hydro_Road_1/2``, ``Fire_Road_1/2`` - sum and
        absolute difference of each pair of horizontal distances.
      * ``Climatic_Zone``, ``Geologic_Zone`` - first and second digit of the
        ELU code associated with the soil type.
      * ``Surface_Cover``, ``Rock_Size`` - ordinal codes looked up per soil type.
      * ``Rock_Area1/3/4`` - ``Rock_Size`` multiplied by the wilderness flag.

    Finally every integer/float column is downcast to the smallest dtype that
    holds its values. The input frame is not modified.

    Raises
    ------
    KeyError
        If a required column is missing or ``Soil_Type`` holds a value outside
        the 1..40 lookup tables.
    """

    df = data.copy()

    # Distance Interactions

    hydro = df['Horizontal_Distance_To_Hydrology']
    fire = df['Horizontal_Distance_To_Fire_Points']
    road = df['Horizontal_Distance_To_Roadways']

    df['Hydro_Fire_1'] = hydro + fire
    df['Hydro_Fire_2'] = abs(hydro - fire)
    df['Hydro_Road_1'] = abs(hydro + road)
    df['Hydro_Road_2'] = abs(hydro - road)
    df['Fire_Road_1'] = abs(fire + road)
    df['Fire_Road_2'] = abs(fire - road)

    # ELU soil codes
    code = {
        1:2702,2:2703,3:2704,4:2705,5:2706,6:2717,7:3501,8:3502,9:4201,
        10:4703,11:4704,12:4744,13:4758,14:5101,15:5151,16:6101,17:6102,
        18:6731,19:7101,20:7102,21:7103,22:7201,23:7202,24:7700,25:7701,
        26:7702,27:7709,28:7710,29:7745,30:7746,31:7755,32:7756,33:7757,
        34:7790,35:8703,36:8707,37:8708,38:8771,39:8772,40:8776
    }

    # Split each code into its digits once instead of re-parsing it for every row
    climatic_zone = {soil: int(str(elu)[0]) for soil, elu in code.items()}
    geologic_zone = {soil: int(str(elu)[1]) for soil, elu in code.items()}

    # Climatic Zone (first digit of the ELU code)
    df['Climatic_Zone'] = df['Soil_Type'].apply(lambda x: climatic_zone[x])

    # Geologic Zone (second digit of the ELU code)
    df['Geologic_Zone'] = df['Soil_Type'].apply(lambda x: geologic_zone[x])

    # Soil types with no stoniness description; shared by both lookups below
    no_desc = [7,8,14,15,16,17,19,20,21,23,35]

    # Surface cover by rocks
    stony = [6,12]
    very_stony = [2,9,18,26]
    extremely_stony = [1,22,24,25,27,28,29,30,31,32,33,34,36,37,38,39,40]
    rubbly = [3,4,5,10,11,13]
    surface_cover = {i:0 for i in no_desc}
    surface_cover.update({i:1 for i in stony})
    surface_cover.update({i:2 for i in very_stony})
    surface_cover.update({i:3 for i in extremely_stony})
    surface_cover.update({i:4 for i in rubbly})

    df['Surface_Cover'] = df['Soil_Type'].apply(lambda x: surface_cover[x])

    # Rock size
    stones = [1,2,6,9,12,18,24,25,26,27,28,29,30,31,32,33,34,36,37,38,39,40]
    boulders = [22]
    rubble = [3,4,5,10,11,13]
    rock_size = {i:0 for i in no_desc}
    rock_size.update({i:1 for i in stones})
    rock_size.update({i:2 for i in boulders})
    rock_size.update({i:3 for i in rubble})

    df['Rock_Size'] = df['Soil_Type'].apply(lambda x: rock_size[x])

    # Wilderness Interactions
    # Only Rock_Size x Wilderness_Area{1,3,4} are generated. The other products
    # (Climatic_Zone, Geologic_Zone and Surface_Cover with each area, and
    # Rock_Size with Wilderness_Area2) are not created.
    df['Rock_Area1'] = df['Wilderness_Area1']*df['Rock_Size']
    df['Rock_Area3'] = df['Wilderness_Area3']*df['Rock_Size']
    df['Rock_Area4'] = df['Wilderness_Area4']*df['Rock_Size']

    # Shrink numeric columns to the smallest dtype that fits their values.
    # Series.items() replaces iteritems(), which was removed in pandas 2.0.
    for col, dtype in df.dtypes.items():
        if dtype.name.startswith('int'):
            df[col] = pd.to_numeric(df[col], downcast ='integer')
        elif dtype.name.startswith('float'):
            df[col] = pd.to_numeric(df[col], downcast ='float')

    return df

In [19]:
%%time

# Overwrites the global `original` that train_original reads, so every train_original
# call after this cell uses the engineered features; the baseline cells must run before it.
original = feature_engineering(original)

Wall time: 1.03 s


# New Features

## 1. ExtraTrees

In [20]:
# Same ExtraTrees template as the baseline, now evaluated on `original` with the engineered features.
train_original(extratrees)


ExtraTreesClassifier
Train Accuracy: 0.90105
Test Accuracy: 0.8051
Training Time: 142.22s


Rock_Size                             0.000351
Rock_Area3                            0.000966
Surface_Cover                         0.001039
Rock_Area4                            0.001336
Hillshade_3pm                         0.001349
Slope                                 0.001350
Wilderness_Area3                      0.002242
Wilderness_Area2                      0.003671
Horizontal_Distance_To_Roadways       0.004861
Geologic_Zone                         0.005272
Hydro_Fire_1                          0.005543
Horizontal_Distance_To_Fire_Points    0.005602
Hydro_Road_1                          0.006786
Aspect                                0.006952
Hydro_Fire_2                          0.007170
Hillshade_Noon                        0.008843
Vertical_Distance_To_Hydrology        0.009101
Wilderness_Area1                      0.009121
Hydro_Road_2                          0.010470
Hillshade_9am                         0.015246
Fire_Road_1                           0.015610
Fire_Road_2  

## 2. Bagging

In [21]:
# Same Bagging template as the baseline, now evaluated on `original` with the engineered features.
train_original(bagging)


BaggingClassifier
Train Accuracy: 0.87188
Test Accuracy: 0.78268
Training Time: 52.2s


Wilderness_Area2                      0.002500
Rock_Area3                            0.002666
Surface_Cover                         0.003287
Hillshade_3pm                         0.004643
Rock_Size                             0.004788
Slope                                 0.005245
Rock_Area4                            0.005940
Geologic_Zone                         0.006681
Wilderness_Area3                      0.007501
Vertical_Distance_To_Hydrology        0.013877
Hillshade_Noon                        0.014181
Aspect                                0.014809
Horizontal_Distance_To_Roadways       0.015867
Hydro_Road_1                          0.015868
Horizontal_Distance_To_Fire_Points    0.016615
Hydro_Fire_1                          0.016906
Hydro_Fire_2                          0.017680
Wilderness_Area1                      0.018646
Rock_Area1                            0.024379
Hillshade_9am                         0.027132
Fire_Road_2                           0.028031
Hydro_Road_2 

## 3. Random Forest

In [22]:
# Same RandomForest template as the baseline, now evaluated on `original` with the engineered features.
train_original(randomforest)


RandomForestClassifier
Train Accuracy: 0.88928
Test Accuracy: 0.78285
Training Time: 111.52s


Wilderness_Area2                      0.000437
Geologic_Zone                         0.001045
Rock_Area1                            0.001336
Rock_Size                             0.001826
Surface_Cover                         0.001839
Rock_Area3                            0.001998
Rock_Area4                            0.002050
Slope                                 0.002176
Wilderness_Area1                      0.002355
Wilderness_Area3                      0.002461
Hillshade_3pm                         0.003506
Aspect                                0.006647
Vertical_Distance_To_Hydrology        0.006780
Wilderness_Area4                      0.006885
Horizontal_Distance_To_Fire_Points    0.006905
Climatic_Zone                         0.006938
Hydro_Fire_2                          0.007633
Horizontal_Distance_To_Roadways       0.007917
Hydro_Fire_1                          0.008526
Hillshade_Noon                        0.008770
Hillshade_9am                         0.009762
Hydro_Road_1 