In [9]:
import pandas as pd
# Read the cleaned cover-type table; the path is relative to the notebook's working directory.
df = pd.read_csv('../data/cleaned_cover_type.csv')

# The 15 predictor columns used for modelling. Per the original author's note these
# were picked from an earlier feature-importance analysis that is not shown in this cell.
top_15_features = [
    'Elevation',
    'Horizontal_Distance_To_Roadways',
    'Horizontal_Distance_To_Fire_Points',
    'Fire_Road_Ratio',
    'Hydrology_Road_Ratio',
    'Vertical_Distance_To_Hydrology',
    'Wilderness_Area_1',
    'Horizontal_Distance_To_Hydrology',
    'Noon_vs_Evening_Shade',
    'Aspect',
    'Morning_vs_Noon_Shade',
    'Hillshade_Noon',
    'Hillshade_3pm',
    'Hillshade_9am',
    'Wilderness_Area_4',
]

# Model inputs (the selected columns, in the order listed above) and the label column.
X = df.loc[:, top_15_features]
y = df.loc[:, 'Cover_Type']


In [10]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)


In [11]:
from imblearn.over_sampling import SMOTE
import pandas as pd

# Re-select the model inputs and the label column from the loaded frame.
X = df[top_15_features]
y = df['Cover_Type']

# Oversample every minority class with SMOTE (seeded for repeatability).
# NOTE(review): this runs on the whole dataset, i.e. before any train/test split.
# In this cell the result is only used to inspect the balanced class counts; if
# df_resampled were used for modelling, the synthetic rows would be derived from
# rows that also end up in a held-out set.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Bundle the resampled features and labels into a single frame.
df_resampled = pd.DataFrame(X_resampled, columns=X.columns).assign(Cover_Type=y_resampled)

# Print the per-class row counts after resampling.
print(df_resampled['Cover_Type'].value_counts())


Cover_Type
0    103071
4    103071
6    103071
3    103071
5    103071
2    103071
1    103071
Name: count, dtype: int64


In [12]:
# 1) Split first: SMOTE below is fitted on the training portion only, so the
#    20% held-out rows never contribute to the synthetic samples.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

# 2) Oversample the training portion; X_test / y_test are left untouched.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# 3) Print the per-class counts of the training labels before and after oversampling.
for caption, label_series in (
    ("Before SMOTE:", y_train),
    ("After SMOTE:", pd.Series(y_train_resampled)),
):
    print(caption, label_series.value_counts())


Before SMOTE: Cover_Type
4    82362
6    24891
0     2489
5     1758
3     1755
1     1737
2     1720
Name: count, dtype: int64
After SMOTE: Cover_Type
4    82362
6    82362
1    82362
5    82362
2    82362
0    82362
3    82362
Name: count, dtype: int64


In [13]:
! pip install xgboost




[notice] A new release of pip is available: 24.0 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [14]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


In [15]:
# Candidate classifiers, keyed by the display name used in the printed report.
models = {
    "Random Forest": RandomForestClassifier(random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    # NOTE(review): the recorded output for this cell shows the solver stopping at the
    # iteration limit with max_iter=1000 and recommends scaling the features; the
    # logistic-regression numbers below come from a model that had not converged.
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "KNN": KNeighborsClassifier(),
    # `use_label_encoder=False` was removed here: the installed XGBoost reports it as
    # an unused parameter (see the warning in the recorded output), so passing it only
    # produced noise and had no effect on the fitted model.
    "XGBoost": XGBClassifier(eval_metric='mlogloss')
}

# Test-set accuracy per model name, kept for later comparison.
results = {}

# Fit each model on the SMOTE-balanced training data, then score it on the
# untouched (not resampled) test split.
for name, model in models.items():
    model.fit(X_train_resampled, y_train_resampled)
    y_pred = model.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    results[name] = acc

    print(f"\n {name} Results:")
    print("Accuracy:", acc)
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))



 Random Forest Results:
Accuracy: 0.942422373020769
Confusion Matrix:
 [[  510     0     6     0    48    13     3]
 [    0   408     6     0     0     9     0]
 [    3    25   346     0     0    66     0]
 [    0     0     0   394     0     0    11]
 [  121     0    14    12 19781    12   769]
 [    4    30    37     0     0   331     0]
 [   20     0     4    68   398     1  5728]]
Classification Report:
               precision    recall  f1-score   support

           0       0.78      0.88      0.82       580
           1       0.88      0.96      0.92       423
           2       0.84      0.79      0.81       440
           3       0.83      0.97      0.90       405
           4       0.98      0.96      0.97     20709
           5       0.77      0.82      0.79       402
           6       0.88      0.92      0.90      6219

    accuracy                           0.94     29178
   macro avg       0.85      0.90      0.87     29178
weighted avg       0.94      0.94      0.94   

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



 Logistic Regression Results:
Accuracy: 0.6095345808485846
Confusion Matrix:
 [[  171     0    42    92   106    77    92]
 [   12   324    10    13     0    64     0]
 [   52    98   117     7     0   166     0]
 [   22     0     0   367     0     3    13]
 [ 1274     1    24   457 13403    82  5468]
 [   13    72    87    17     0   213     0]
 [  369     0     6   396  2243    15  3190]]
Classification Report:
               precision    recall  f1-score   support

           0       0.09      0.29      0.14       580
           1       0.65      0.77      0.71       423
           2       0.41      0.27      0.32       440
           3       0.27      0.91      0.42       405
           4       0.85      0.65      0.74     20709
           5       0.34      0.53      0.42       402
           6       0.36      0.51      0.43      6219

    accuracy                           0.61     29178
   macro avg       0.43      0.56      0.45     29178
weighted avg       0.71      0.61      

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



 XGBoost Results:
Accuracy: 0.9019123997532388
Confusion Matrix:
 [[  517     0     8     0    41    11     3]
 [    0   409     5     0     0     9     0]
 [    4    18   357     0     0    61     0]
 [    0     0     0   394     0     0    11]
 [  182     0    20    12 19312    13  1170]
 [    7    29    48     0     0   318     0]
 [   24     0     6    73  1105     2  5009]]
Classification Report:
               precision    recall  f1-score   support

           0       0.70      0.89      0.79       580
           1       0.90      0.97      0.93       423
           2       0.80      0.81      0.81       440
           3       0.82      0.97      0.89       405
           4       0.94      0.93      0.94     20709
           5       0.77      0.79      0.78       402
           6       0.81      0.81      0.81      6219

    accuracy                           0.90     29178
   macro avg       0.82      0.88      0.85     29178
weighted avg       0.90      0.90      0.90     291

In [16]:
from sklearn.model_selection import RandomizedSearchCV

# Candidate hyperparameter values for the random forest
# (2 * 3 * 2 * 2 = 24 possible combinations).
param_dist = {
    'n_estimators': [100, 150],
    'max_depth': [10, 15, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

# Randomized search: evaluates n_iter=10 sampled combinations with 3-fold CV on accuracy.
# random_state is set on the search itself so that the sampled combinations (and hence
# the selected best estimator) are the same on every run; seeding only the forest does
# not fix which candidates get tried.
# NOTE(review): the folds are cut from the SMOTE-resampled training data, so validation
# folds contain synthetic rows; presumably the CV accuracy is optimistic relative to
# X_test — confirm before relying on it.
search = RandomizedSearchCV(
    RandomForestClassifier(random_state=42),
    param_distributions=param_dist,
    n_iter=10,
    cv=3,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1,
    random_state=42
)

search.fit(X_train_resampled, y_train_resampled)

# Keep the estimator configured with the best-scoring sampled combination.
best_rf = search.best_estimator_


Fitting 3 folds for each of 10 candidates, totalling 30 fits


In [17]:
# Display the estimator stored in best_rf (search.best_estimator_ from the tuning
# cell) as this cell's output, so its hyperparameters can be inspected.
best_rf


0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,20
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [18]:
import pickle
# Serialise the selected random forest to best_rf_model.pkl in the current working
# directory; the context manager closes the file even if pickling fails.
with open("best_rf_model.pkl", mode="wb") as model_file:
    pickle.dump(best_rf, model_file)
