In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# Notebook-wide pandas display settings: the row, column and width options are
# all set to None, and floats are rendered with four decimal places.
for display_option in ('display.max_rows', 'display.max_columns', 'display.width'):
    pd.set_option(display_option, None)
pd.set_option('display.float_format', '{:.4f}'.format)

# Load Dataset

In [2]:
# Read the raw forest cover data: '?' entries are parsed as missing values and
# the Observation_ID column becomes the row index.
df = pd.read_csv(
    'data/forestCover.csv',
    na_values='?',
    index_col='Observation_ID',
)

# Preprocessing

## Preprocessing Applied to All Model Datasets

In [3]:
from collections import Counter
from sklearn.model_selection import train_test_split

# Drop observations with missing values as they only make up 0.05% of observations
df.dropna(inplace=True)

# Encode Soil_Type1 as numeric so the SMOTE-based resampling can be applied.
# The encoding only runs while the column is still non-numeric: re-running this
# cell would otherwise push the 0/1 codes through the label mapping again and
# silently turn every value into NaN. Labels missing from the mapping raise
# instead of silently becoming NaN.
# NOTE(review): 'positive' -> 0 / 'negative' -> 1 is kept exactly as before, but it
# reads inverted relative to a "1 = present" indicator — confirm against the
# other Soil_Type columns.
soil_type1_codes = {'positive': 0, 'negative': 1}
if not pd.api.types.is_numeric_dtype(df['Soil_Type1']):
    soil_type1_encoded = df['Soil_Type1'].map(soil_type1_codes)
    unmapped_labels = df.loc[soil_type1_encoded.isna(), 'Soil_Type1'].unique().tolist()
    if unmapped_labels:
        raise ValueError(f'Unexpected Soil_Type1 labels: {unmapped_labels}')
    df['Soil_Type1'] = soil_type1_encoded

# Separate the predictors from the Cover_Type target
X = df.drop(columns='Cover_Type')
y = df['Cover_Type']

# Hold out 20% of the observations as test data; stratifying on y keeps the
# class proportions the same in both splits, and the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

### Soil_Type Sanity Check

In [4]:
# Count the observations that have more than one of the 40 Soil_Type columns
# summing past 1; a printed 0 means no row is flagged for multiple soil types.
soil_columns = [f'Soil_Type{i}' for i in range(1, 41)]
soil_df = df.loc[:, soil_columns]

# Per-observation total across the soil columns
row_sums = soil_df.sum(axis='columns')
non_exclusive_count = row_sums.gt(1).sum()

print(non_exclusive_count)

0


`Soil_Type1` needed to be converted from categorical to numerical in order to use SMOTE and Tomek links to reduce target imbalance. The sanity check confirms that the `Soil_TypeX` columns are mutually exclusive: no observation has more than one of these columns with a value of *1*.

## Classification Tree Preprocessing

In [None]:
from imblearn.over_sampling import SMOTE

# Oversample the training split for the classification tree with SMOTE; the
# fixed random_state keeps the resampled data repeatable between runs. Only the
# training data is resampled here.
# NOTE(review): SMOTE synthesises samples by interpolation, so the 0/1 indicator
# columns can presumably take fractional values afterwards — confirm this is acceptable.
tree_resampler = SMOTE(random_state=42)

X_train_tree, y_train_tree = tree_resampler.fit_resample(X_train, y_train)

In [None]:
# The tree model is evaluated on independent copies of the untouched test
# split (unbiased real world test data, never resampled).
X_test_tree, y_test_tree = X_test.copy(), y_test.copy()

## k-NN Preprocessing

In [7]:
# Correlation between Facet and Aspect, used to judge whether the two
# features are redundant with each other.
facet_values, aspect_values = df['Facet'], df['Aspect']
corr = facet_values.corr(aspect_values)
print(corr)

0.99999805373707


In [None]:
# Print the correlation of each of the two near-duplicate features with the
# target, Facet first and then Aspect.
# NOTE(review): this treats Cover_Type as a numeric quantity — confirm that is
# meaningful if the labels are nominal classes.
for feature_name in ('Facet', 'Aspect'):
    corr = df[feature_name].corr(df['Cover_Type'])
    print(corr)

0.01707189190902871
0.017068499402923768


In [None]:
# Work on independent copies of the training split so the k-NN specific
# preprocessing does not modify X_train / y_train.
X_train_knn, y_train_knn = X_train.copy(), y_train.copy()

In [None]:
# Independent copies of the test split for k-NN (unbiased real world test data).
X_test_knn, y_test_knn = X_test.copy(), y_test.copy()

In [None]:
# Remove the same two columns from both k-NN splits so they keep an identical
# feature set.
# NOTE(review): the correlation check in this notebook covers Facet vs Aspect;
# the reason for also dropping Inclination is not shown here — confirm.
knn_dropped_columns = ['Aspect', 'Inclination']
X_train_knn = X_train_knn.drop(columns=knn_dropped_columns)
X_test_knn = X_test_knn.drop(columns=knn_dropped_columns)

In [None]:
from sklearn.preprocessing import RobustScaler

# Scale the k-NN features with RobustScaler, which uses statistics resilient to
# outliers. The scaler is fitted on the training split only and the fitted
# scaler is then applied to both the training and the test split.
scaler = RobustScaler().fit(X_train_knn)

X_train_knn = scaler.transform(X_train_knn)
X_test_knn = scaler.transform(X_test_knn)

In [None]:
# Oversample the k-NN training data with SMOTE using a fixed random_state so
# the result is repeatable. Only the training data is resampled here; the test
# split is left as it is.
knn_resampler = SMOTE(random_state=42)

X_train_knn, y_train_knn = knn_resampler.fit_resample(X_train_knn, y_train_knn)