In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# Notebook display configuration: the row, column and width display options
# are set to None, and floats are rendered with four decimal places.
pd.options.display.max_rows = None
pd.options.display.max_columns = None
pd.options.display.width = None
pd.set_option('display.float_format', '{:.4f}'.format)

# Load Dataset

In [2]:
# Read the raw dataset: rows are indexed by Observation_ID and '?' is parsed as a missing value
df = pd.read_csv(
    'data/forestCover.csv',
    na_values='?',
    index_col='Observation_ID',
)

# Preprocessing

## Classification Tree

In [3]:
from sklearn.model_selection import train_test_split

# Build the tree dataset without touching `df`:
#   * drop observations with missing values (they only make up 0.05% of observations)
#   * encode Soil_Type1 as a number so SMOTE can be applied
# NOTE(review): 'positive' is encoded as 0 and 'negative' as 1 — confirm this is the intended orientation.
df_tree = df.dropna().assign(
    Soil_Type1=lambda frame: frame['Soil_Type1'].map({'positive': 0, 'negative': 1})
)

# Separate the target column from the predictors
y = df_tree['Cover_Type']
X = df_tree.drop(columns='Cover_Type')

# Stratified 80/20 split; the held-out part is stored under the *_tree names
X_train, X_test_tree, y_train, y_test_tree = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [4]:
from imblearn.over_sampling import SMOTE

# Oversample with SMOTE using the training split only; the held-out test split is not resampled
tree_resampler = SMOTE(random_state=42)
X_train_tree, y_train_tree = tree_resampler.fit_resample(
    X_train,
    y_train,
)

## k-NN

In [5]:
from sklearn.model_selection import train_test_split

# Build the k-NN dataset without touching `df`:
#   * drop observations with missing values (they only make up 0.05% of observations)
#   * encode Soil_Type1 as a number so SMOTE can be applied
#   * log1p-transform Horizontal_Distance_To_Hydrology to reduce extreme skewness
# NOTE(review): 'positive' is encoded as 0 and 'negative' as 1 — confirm this is the intended orientation.
df_knn = df.dropna().assign(
    Soil_Type1=lambda frame: frame['Soil_Type1'].map({'positive': 0, 'negative': 1}),
    Horizontal_Distance_To_Hydrology=lambda frame: np.log1p(
        frame['Horizontal_Distance_To_Hydrology']
    ),
)

# Separate the target column from the predictors
y = df_knn['Cover_Type']
X = df_knn.drop(columns='Cover_Type')

# Stratified 80/20 split (this rebinds X_train / y_train from the tree section)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [6]:
# Check how redundant Facet and Aspect are by computing their correlation
# on the cleaned k-NN frame (Series.corr with its default method).
corr = df_knn['Facet'].corr(df_knn['Aspect'])

print(f'Facet & Aspect Correlation : {corr}')

Facet & Aspect Correlation : 0.99999805373707


In [7]:
# Compare how strongly each of the two near-duplicate features tracks the target,
# so that the less informative one can be discarded.
# NOTE(review): Cover_Type is used as a class label later in this notebook, so a
# correlation coefficient against it is only a rough indication — interpret with care.
for feature in ('Facet', 'Aspect'):
    corr = df_knn[feature].corr(df_knn['Cover_Type'])
    # Same label format for both features (the original labels were misspelled)
    print(f'{feature} & Cover_Type Correlation : {corr}')

Facet & Cover_Type Correlation : 0.01707189190902871
Aspet & Cover_Type Correlaton : 0.017068499402923768


In [8]:
# Remove Aspect and Inclination from the k-NN feature set.
# Aspect is almost perfectly correlated with Facet (see the correlation printed earlier).
# NOTE(review): no analysis motivating the removal of Inclination is visible here — confirm the rationale.
# The frames are rebound rather than mutated in place, so the objects returned by
# train_test_split are never modified and the column list is stated only once.
knn_dropped_columns = ['Aspect', 'Inclination']
X_train = X_train.drop(columns=knn_dropped_columns)
X_test = X_test.drop(columns=knn_dropped_columns)

In [9]:
# Keep the k-NN feature names so they can be re-attached as DataFrame columns
# after scaling and resampling.
knn_columns = X_train.columns

In [10]:
from sklearn.preprocessing import RobustScaler

# RobustScaler scales data with statistics that are resilient to outliers.
# It is fitted on the training split only; the fitted scaler is then applied
# to both splits so the test data never influences the scaling parameters.
scaler = RobustScaler().fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [11]:
# Oversample the scaled training split with SMOTE; the scaled test split is not resampled
knn_resampler = SMOTE(random_state=42)
X_resampled_np, y_resampled_np = knn_resampler.fit_resample(
    X_train_scaled,
    y_train,
)

In [12]:
# Wrap the resampled training data in pandas objects, restoring the feature names
X_train_knn = pd.DataFrame(data=X_resampled_np, columns=knn_columns)
y_train_knn = pd.Series(y_resampled_np)

# Rebuild the test frame with the feature names AND the original row index, so that
# X_test_knn and y_test_knn stay aligned label-for-label (without `index=` the frame
# would get a fresh 0..n-1 index while y_test_knn keeps the index of y_test).
X_test_knn = pd.DataFrame(data=X_test_scaled, columns=knn_columns, index=X_test.index)
y_test_knn = y_test.copy()

In [13]:
# SMOTE treats every column as continuous when it generates synthetic rows, so the
# indicator columns (names containing 'Soil_Type' or 'Wilderness_Area') are rounded
# to the nearest integer afterwards.
# NOTE(review): these columns went through the scaler before resampling — confirm
# that rounding the scaled values still yields the intended indicator encoding.
one_hot_columns = [
    col
    for col in X_train_knn.columns
    if any(marker in col for marker in ('Soil_Type', 'Wilderness_Area'))
]
X_train_knn[one_hot_columns] = X_train_knn[one_hot_columns].round().astype(int)

# Model Training

## Classification Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, accuracy_score

# Exhaustive grid search over decision-tree hyperparameters, scored with macro-F1
# under 5-fold cross-validation.
dt_classifier = DecisionTreeClassifier(random_state=42)

param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

grid_search = GridSearchCV(
    estimator=dt_classifier,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=0,
    scoring='f1_macro'
)

# NOTE(review): X_train_tree was already oversampled with SMOTE, so every validation
# fold contains synthetic points generated from rows that may sit in the training
# folds. The CV score is therefore optimistic; resampling inside each fold (e.g. via
# an imblearn Pipeline) would give an unbiased estimate.
grid_search.fit(X_train_tree, y_train_tree)

print('Best hyperparameters found:')
print(grid_search.best_params_)
# The search optimises 'f1_macro', so best_score_ is a macro-F1 value, not an accuracy
print('\nBest CV macro-F1 score:')
print(f'{grid_search.best_score_:.4f}')

Best hyperparameters found:
{'criterion': 'entropy', 'max_depth': 40, 'min_samples_leaf': 1, 'min_samples_split': 2}

Best CV accuracy score:
0.9614


In [15]:
# Evaluate the refitted best tree on the held-out (non-resampled) test split
best_tree = grid_search.best_estimator_
y_pred_tree = best_tree.predict(X_test_tree)

# Overall accuracy first, then the per-class precision / recall / F1 breakdown
test_acc = accuracy_score(y_test_tree, y_pred_tree)
print(f"Test set Accuracy: {test_acc:.4f}\n")
print("Classification Report", classification_report(y_test_tree, y_pred_tree), sep="\n")

Test set Accuracy: 0.9372

Classification Report
              precision    recall  f1-score   support

           1       0.94      0.94      0.94     42349
           2       0.95      0.94      0.95     56629
           3       0.91      0.92      0.92      7147
           4       0.81      0.85      0.83       549
           5       0.75      0.85      0.80      1898
           6       0.84      0.87      0.86      3471
           7       0.93      0.96      0.95      4100

    accuracy                           0.94    116143
   macro avg       0.88      0.91      0.89    116143
weighted avg       0.94      0.94      0.94    116143



## k-NN

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, accuracy_score

# Exhaustive grid search over k-NN hyperparameters, scored with macro-F1
# under 5-fold cross-validation.
knn_classifier = KNeighborsClassifier()

param_grid = {
    'n_neighbors': [3, 5, 7, 9],
    'weights': ['uniform', 'distance'],
    # 'minkowski' with the default p=2 is exactly the Euclidean distance, so listing
    # both only doubled the search cost; compare Euclidean against Manhattan instead.
    'metric': ['euclidean', 'manhattan']
}

grid_search = GridSearchCV(
    estimator=knn_classifier,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=0,
    scoring='f1_macro'
)

# NOTE(review): X_train_knn was already oversampled with SMOTE, so the validation
# folds contain synthetic points derived from training-fold rows and the CV score
# is optimistic; resampling inside each fold would give an unbiased estimate.
grid_search.fit(X_train_knn, y_train_knn)

print('Best hyperparameters found:')
print(grid_search.best_params_)
# The search optimises 'f1_macro', so best_score_ is a macro-F1 value, not an accuracy
print('\nBest CV macro-F1 score:')
print(f'{grid_search.best_score_:.4f}')

In [None]:
# Evaluate the refitted best k-NN model on the scaled, non-resampled test split
best_knn = grid_search.best_estimator_

# Predict with the k-NN estimator itself; using the decision tree here would report
# the wrong model's metrics on features prepared for k-NN.
y_pred_knn = best_knn.predict(X_test_knn)

test_acc = accuracy_score(y_test_knn, y_pred_knn)

print(f"Test set Accuracy: {test_acc:.4f}\n")

print("Classification Report")
print(classification_report(y_test_knn, y_pred_knn))