In [12]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    matthews_corrcoef,
    confusion_matrix,
    classification_report
)

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

from xgboost import XGBClassifier

import joblib
import os


In [13]:
# Load the full training table from the CSV in the working directory
# and report its (rows, columns) shape as a quick sanity check.
df = pd.read_csv("train_data.csv")
print(f"Shape: {df.shape}")


Shape: (551961, 55)


In [14]:
# Predictors: every column except the target.
X = df.drop(columns="Cover_Type")

# Target: the Cover_Type column shifted down by one.
# NOTE(review): presumably the raw labels are 1-based and this makes them
# 0-based for the classifiers below — confirm against the dataset.
y = df["Cover_Type"] - 1

print(f"Features: {X.shape}")
print(f"Target: {y.shape}")


Features: (551961, 54)
Target: (551961,)


In [15]:
# Hold out 20% of the rows for testing. The split is stratified on the
# target and seeded, so re-running the cell yields the same partition.
split_parts = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = split_parts


In [16]:
# Learn the scaling statistics from the training split only, then apply
# the same fitted transform to both splits.
scaler = StandardScaler()
scaler.fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [17]:
def evaluate_model(model, X_train, X_test, y_train, y_test, scaled=False):
    """Fit ``model`` on the training split and score it on the test split.

    The estimator is fitted in place (``model.fit`` is called on the object
    that was passed in), so the caller holds the trained model afterwards.

    Parameters
    ----------
    model
        Classifier exposing ``fit`` and ``predict``; ``predict_proba`` is
        used for the AUC when it is available.
    X_train, X_test
        Feature matrices for fitting and for evaluation.
    y_train, y_test
        Labels matching ``X_train`` and ``X_test``.
    scaled : bool, default False
        Not used by the function; kept so existing call sites stay valid.

    Returns
    -------
    dict
        Keys ``"Accuracy"``, ``"Precision"``, ``"Recall"``, ``"F1"``,
        ``"AUC"`` and ``"MCC"``. Precision, recall and F1 are
        support-weighted averages over the classes. ``"AUC"`` is the
        one-vs-rest ROC AUC, or ``None`` when the model has no
        ``predict_proba``.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # AUC needs class probabilities, which not every estimator provides.
    if hasattr(model, "predict_proba"):
        y_prob = model.predict_proba(X_test)
        auc = roc_auc_score(y_test, y_prob, multi_class="ovr")
    else:
        auc = None

    # A class that is never predicted has undefined precision. Scoring it
    # as 0 explicitly gives the same number as the default behaviour but
    # without an UndefinedMetricWarning for every such class.
    averaged = {"average": "weighted", "zero_division": 0}

    results = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred, **averaged),
        "Recall": recall_score(y_test, y_pred, **averaged),
        "F1": f1_score(y_test, y_pred, **averaged),
        "AUC": auc,
        "MCC": matthews_corrcoef(y_test, y_pred),
    }

    return results


In [18]:
# Candidate classifiers, keyed by the display name used in the results
# table and (with spaces replaced by underscores) in the saved file names.
models = {
    # No explicit multi_class argument: it is deprecated in recent
    # scikit-learn releases, and the default solver already fits a
    # multinomial model for multiclass targets.
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "KNN": KNeighborsClassifier(n_neighbors=5),
    "Naive Bayes": GaussianNB(),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    # use_label_encoder is not passed: the installed xgboost reports it as
    # an unused parameter and ignores it.
    "XGBoost": XGBClassifier(
        objective="multi:softprob",
        num_class=7,
        eval_metric="mlogloss",
    ),
}


In [19]:
# Models in this set are trained and evaluated on the scaled feature
# matrices; all others receive the unscaled features.
SCALED_MODELS = {"Logistic Regression", "KNN"}
MODEL_DIR = "model"

# Create the output directory once, up front, instead of on every iteration.
os.makedirs(MODEL_DIR, exist_ok=True)

results_list = []

for name, model in models.items():

    print(f"\nTraining {name}...")

    if name in SCALED_MODELS:
        res = evaluate_model(model, X_train_scaled, X_test_scaled, y_train, y_test)
    else:
        res = evaluate_model(model, X_train, X_test, y_train, y_test)

    # Tag the metrics with the model name so they can be indexed later.
    res["Model"] = name
    results_list.append(res)

    # evaluate_model fits the estimator in place, so `model` is now trained.
    joblib.dump(model, f"{MODEL_DIR}/{name.replace(' ', '_')}.pkl")



Training Logistic Regression...





Training Decision Tree...

Training KNN...

Training Naive Bayes...

Training Random Forest...

Training XGBoost...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [20]:
# Collect the per-model metric dicts into one table with one row per
# model, indexed by model name, and display it.
results_df = pd.DataFrame(results_list).set_index("Model")
results_df


Unnamed: 0_level_0,Accuracy,Precision,Recall,F1,AUC,MCC
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Logistic Regression,0.7227,0.711604,0.7227,0.713108,0.936383,0.54599
Decision Tree,0.93909,0.939034,0.93909,0.939057,0.942102,0.902206
KNN,0.928673,0.928487,0.928673,0.928474,0.982983,0.885302
Naive Bayes,0.460446,0.648065,0.460446,0.418512,0.887854,0.310609
Random Forest,0.953865,0.953979,0.953865,0.953619,0.99777,0.925795
XGBoost,0.869983,0.870065,0.869983,0.869315,0.986509,0.790132


In [21]:
# Make sure the output directory exists so this cell does not depend on
# the training loop having created it.
os.makedirs("model", exist_ok=True)
results_df.to_csv("model/model_results.csv")


In [22]:
# Persist the fitted scaler next to the models: Logistic Regression and KNN
# were trained on scaled features, so the same transform is needed at
# prediction time. The directory is ensured here so the cell can run alone.
os.makedirs("model", exist_ok=True)
joblib.dump(scaler, "model/scaler.pkl")


['model/scaler.pkl']