In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.options.display.float_format = '{:.4f}'.format

# Load Dataset

In [2]:
df = pd.read_csv('data/forestCover.csv', index_col='Observation_ID', na_values='?')

# Preprocessing

## Classification Tree

In [22]:
from sklearn.model_selection import train_test_split

df_tree = df.copy()

# Drop observations with missing values as they only make up 0.05% of observations
df_tree.dropna(inplace=True)

# Change Soil_Type1 from categorical to numeric so SMOTE can be applied
df_tree['Soil_Type1'] = df_tree['Soil_Type1'].map({ 'positive': 0, 'negative': 1 })

X = df_tree.drop('Cover_Type', axis = 1)
y = df_tree['Cover_Type']

X_train, X_test_tree, y_train, y_test_tree = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [23]:
from imblearn.over_sampling import SMOTE

tree_resampler = SMOTE(random_state=42)

X_train_tree, y_train_tree = tree_resampler.fit_resample(X_train, y_train)

## k-NN

In [3]:
from sklearn.model_selection import train_test_split

df_knn = df.copy()

# Drop observations with missing values as they only make up 0.05% of observations
df_knn.dropna(inplace=True)

# Change Soil_Type1 from categorical to numeric so SMOTETomek can be applied
df_knn['Soil_Type1'] = df_knn['Soil_Type1'].map({ 'positive': 0, 'negative': 1 })

# Transform to reduce extreme skewness
df_knn['Horizontal_Distance_To_Hydrology'] = np.log1p(df_knn['Horizontal_Distance_To_Hydrology'])

X = df_knn.drop('Cover_Type', axis = 1)
y = df_knn['Cover_Type']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [4]:
corr = df_knn['Facet'].corr(df_knn['Aspect'])

print(f'Facet & Aspect Correlation : {corr}')

Facet & Aspect Correlation : 0.99999805373707


In [5]:
corr = df_knn['Facet'].corr(df_knn['Cover_Type'])
print(f'Facet & Cover_Type Correlation : {corr}')

corr = df_knn['Aspect'].corr(df_knn['Cover_Type'])
print(f'Aspet & Cover_Type Correlaton : {corr}')

Facet & Cover_Type Correlation : 0.01707189190902871
Aspet & Cover_Type Correlaton : 0.017068499402923768


In [6]:
X_train.drop(columns=['Aspect', 'Inclination'], inplace=True)
X_test.drop(columns=['Aspect', 'Inclination'], inplace=True)

In [7]:
knn_columns = X_train.columns

In [8]:
from sklearn.preprocessing import RobustScaler

# RobustScaler uses statistics resilient to outliers to scale data
scaler = RobustScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [10]:
from imblearn.over_sampling import SMOTE

knn_resampler = SMOTE(random_state=42)

X_resampled_np, y_resampled_np = knn_resampler.fit_resample(X_train_scaled, y_train)

In [11]:
X_train_knn = pd.DataFrame(data=X_resampled_np, columns=knn_columns)
y_train_knn = pd.Series(y_resampled_np)

X_test_knn = pd.DataFrame(data=X_test_scaled, columns=knn_columns)
y_test_knn = y_test.copy()

In [12]:
# Round the one-hot encoded columns because SMOTE assumes they are continuous when generating synthetic data
one_hot_columns = [col for col in X_train_knn.columns if 'Soil_Type' in col or 'Wilderness_Area' in col]
X_train_knn[one_hot_columns] = X_train_knn[one_hot_columns].round().astype(int)

# Model Training

## Classification Tree

In [24]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
dt_classifier = DecisionTreeClassifier(random_state=42)

param_gid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

grid_search = GridSearchCV(
    estimator=dt_classifier,
    param_grid=param_gid,
    cv=5,
    n_jobs=-1,
    verbose=0,
    scoring='f1_macro'
)

grid_search.fit(X_train_tree, y_train_tree)

print('Best hyperparameters found:')
print(grid_search.best_params_)
print('\nBest CV F1 macro score:')
print(f'{grid_search.best_score_:.4f}')

Best hyperparameters found:
{'criterion': 'entropy', 'max_depth': 40, 'min_samples_leaf': 1, 'min_samples_split': 2}

Best CV F1 macro score:
0.9614


In [25]:
best_index = grid_search.best_index_

mean_score = grid_search.cv_results_['mean_test_score'][best_index]
std_score = grid_search.cv_results_['std_test_score'][best_index]

# CV performance
print(f"Average F1 Macro: {mean_score:.4f}")
print(f"Stability (Std Dev): {std_score:.4f}")

Average F1 Macro: 0.9614
Stability (Std Dev): 0.0035


In [26]:
from sklearn.metrics import classification_report, accuracy_score, cohen_kappa_score, matthews_corrcoef

best_tree = grid_search.best_estimator_

y_pred_tree = best_tree.predict(X_test_tree)

test_acc = accuracy_score(y_test_tree, y_pred_tree)

kappa_tree = cohen_kappa_score(y_test_tree, y_pred_tree)

mcc_tree = matthews_corrcoef(y_test_tree, y_pred_tree)

print(f"Test set Accuracy: {test_acc:.4f}")
print(f"Cohen's Kappa: {kappa_tree:.4f}")
print(f"Matthews Correlation Coefficient: {mcc_tree:.4f}")

Test set Accuracy: 0.9372
Cohen's Kappa: 0.8996
Matthews Correlation Coefficient: 0.8996


In [27]:
print("Classification Report")
print(classification_report(y_test_tree, y_pred_tree))

Classification Report
              precision    recall  f1-score   support

           1       0.94      0.94      0.94     42349
           2       0.95      0.94      0.95     56629
           3       0.91      0.92      0.92      7147
           4       0.81      0.85      0.83       549
           5       0.75      0.85      0.80      1898
           6       0.84      0.87      0.86      3471
           7       0.93      0.96      0.95      4100

    accuracy                           0.94    116143
   macro avg       0.88      0.91      0.89    116143
weighted avg       0.94      0.94      0.94    116143



## k-NN

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

knn_classifier = KNeighborsClassifier()

param_grid = {
    'n_neighbors': [3, 5, 7, 9, 11],
    'weights': ['uniform', 'distance'],
    'metric': ['manhattan', 'euclidean'],
}

grid_search = GridSearchCV(
    estimator=knn_classifier,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=0,
    scoring='f1_macro'
)

# Use random sample for tuning
X_train_sample = X_train_knn.sample(n=200000, random_state=42)
y_train_sample = y_train_knn.loc[X_train_sample.index]

grid_search.fit(X_train_sample, y_train_sample)

print('Best hyperparameters found:')
print(grid_search.best_params_)
print('\nBest CV F1 macro score:')
print(f'{grid_search.best_score_:.4f}')

Best hyperparameters found:
{'metric': 'manhattan', 'n_neighbors': 3, 'weights': 'distance'}

Best CV F1 macro score:
0.9397


In [None]:
best_index = grid_search.best_index_

mean_score = grid_search.cv_results_['mean_test_score'][best_index]
std_score = grid_search.cv_results_['std_test_score'][best_index]

# CV performance
print(f"Average F1 Macro: {mean_score:.4f}")
print(f"Stability (Std Dev): {std_score:.4f}")

Average F1 Macro: 0.9397
Stability (Std Dev): 0.0018


In [None]:
# Train final kNN model using best parameters
best_params = grid_search.best_params_

final_knn = KNeighborsClassifier(**best_params, n_jobs=-1)
# final_knn = KNeighborsClassifier(n_neighbors=3, weights='distance', metric='manhattan', n_jobs=-1)

# Train on entire dataset
final_knn.fit(X_train_knn, y_train_knn)

0,1,2
,n_neighbors,3
,weights,'distance'
,algorithm,'auto'
,leaf_size,30
,p,2
,metric,'manhattan'
,metric_params,
,n_jobs,-1


In [20]:
from sklearn.metrics import classification_report, accuracy_score, cohen_kappa_score, matthews_corrcoef

y_pred_knn = final_knn.predict(X_test_knn)

test_acc = accuracy_score(y_test_knn, y_pred_knn)

kappa_knn = cohen_kappa_score(y_test_knn, y_pred_knn)

mcc_knn = matthews_corrcoef(y_test_knn, y_pred_knn)

print(f"Test set Accuracy: {test_acc:.4f}")
print(f"Cohen's Kappa: {kappa_knn:.4f}")
print(f"Matthews Correlation Coefficient: {mcc_knn:.4f}")

Test set Accuracy: 0.9288
Cohen's Kappa: 0.8870
Matthews Correlation Coefficient: 0.8873


In [21]:
print("Classification Report")
print(classification_report(y_test_knn, y_pred_knn))

Classification Report
              precision    recall  f1-score   support

           1       0.93      0.93      0.93     42349
           2       0.96      0.93      0.94     56629
           3       0.90      0.91      0.91      7147
           4       0.78      0.86      0.82       549
           5       0.67      0.92      0.77      1898
           6       0.77      0.89      0.83      3471
           7       0.90      0.98      0.94      4100

    accuracy                           0.93    116143
   macro avg       0.84      0.92      0.88    116143
weighted avg       0.93      0.93      0.93    116143



# Statistical Tests

## Wilcoxon Test

In [30]:
from sklearn.model_selection import KFold, cross_val_score
from scipy.stats import wilcoxon

sample_size = 200000

# Same sample indices
sample_indices = X_train_knn.sample(n=sample_size, random_state=42).index

X_train_knn_sample = X_train_knn.loc[sample_indices]
y_train_knn_sample = y_train_knn.loc[sample_indices]

X_train_tree_sample = X_train_tree.loc[sample_indices]
y_train_tree_sample = y_train_tree.loc[sample_indices]

cv_folds = KFold(n_splits=10, shuffle=True, random_state=42)

# Decision Tree CV
tree_scores = cross_val_score(estimator=best_tree, X=X_train_tree_sample, y=y_train_tree_sample, cv=cv_folds, scoring='accuracy', n_jobs=-1)

# KNN CV
knn_scores = cross_val_score(estimator=final_knn, X=X_train_knn_sample, y=y_train_knn_sample, cv=cv_folds, scoring='accuracy', n_jobs=-1)

print(f"Decision Tree Scores: {np.round(tree_scores, 3)}")
print(f"KNN Scores: {np.round(knn_scores, 3)}")
print(f"Decision Tree Mean Accuracy: {np.mean(tree_scores):.4f}")
print(f"KNN Mean Accuracy: {np.mean(knn_scores):.4f}")

# Wilcoxon Test
statistic, p_value = wilcoxon(tree_scores, knn_scores)
print(f"Wilcoxon Test Statistic: {statistic:.4f}")
print(f"P-value: {p_value:.4f}")

alpha = 0.05
if p_value < alpha:
    print("Result: Reject the null hypothesis.")
    print("Conclusion: The difference in model performance is statistically significant.")
else:
    print("Result: Fail to reject the null hypothesis.")
    print("Conclusion: We cannot conclude that the difference in performance is statistically significant.")

Decision Tree Scores: [0.904 0.902 0.904 0.903 0.908 0.905 0.906 0.907 0.902 0.903]
KNN Scores: [0.945 0.945 0.945 0.944 0.947 0.947 0.945 0.942 0.945 0.943]
Decision Tree Mean Accuracy: 0.9043
KNN Mean Accuracy: 0.9446
Wilcoxon Test Statistic: 0.0000
P-value: 0.0020
Result: Reject the null hypothesis.
Conclusion: The difference in model performance is statistically significant.
