In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    matthews_corrcoef,
    confusion_matrix,
    classification_report
)

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

from xgboost import XGBClassifier

import joblib
import os


In [None]:
# Load the training data and keep a reproducible random subsample
# (presumably to keep training time manageable -- adjust SAMPLE_SIZE as needed).
SAMPLE_SIZE = 50000

df = pd.read_csv("train_data.csv")
# DataFrame.sample draws without replacement by default and raises ValueError
# when n exceeds the number of rows, so cap the request at the rows available.
df = df.sample(n=min(SAMPLE_SIZE, len(df)), random_state=42)

print("Shape:", df.shape)


In [None]:
# Separate the predictor columns from the label column.
target_column = "Cover_Type"
X = df.drop(columns=target_column)

# Shift every label down by one (presumably so classes start at 0, which
# some classifiers expect -- TODO confirm the raw labels start at 1).
y = df[target_column] - 1

print("Features:", X.shape)
print("Target:", y.shape)


In [None]:
# Hold out 20% of the rows for evaluation. Stratifying on y keeps the class
# proportions the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)


In [None]:
# Learn the scaling statistics from the training split only, then apply the
# same fitted transform to both splits so no test information leaks in.
scaler = StandardScaler().fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [None]:
def evaluate_model(model, X_train, X_test, y_train, y_test, scaled=False):
    """Fit ``model`` on the training split and score it on the test split.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit`` and ``predict``. ``predict_proba`` is used to
        compute the AUC when the estimator provides it. The estimator is
        fitted in place.
    X_train, X_test : array-like
        Feature matrices used for fitting and for evaluation respectively.
    y_train, y_test : array-like
        Labels matching ``X_train`` and ``X_test``.
    scaled : bool, default False
        Not used by this function; kept so existing calls remain valid.

    Returns
    -------
    dict
        Keys ``"Accuracy"``, ``"Precision"``, ``"Recall"``, ``"F1"``,
        ``"AUC"`` and ``"MCC"``. Precision, recall and F1 are
        support-weighted averages over the classes. ``"AUC"`` is ``None``
        when the estimator has no ``predict_proba``.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # AUC needs class probabilities, which not every estimator can produce.
    if hasattr(model, "predict_proba"):
        y_prob = model.predict_proba(X_test)
        if y_prob.shape[1] == 2:
            # Binary target: roc_auc_score expects a 1-D score for the class
            # with the greater label, not the two-column probability matrix.
            y_prob = y_prob[:, 1]
        # No ``average`` is passed, so the library default applies here even
        # though the other metrics below use weighted averaging.
        auc = roc_auc_score(y_test, y_prob, multi_class="ovr")
    else:
        auc = None

    # zero_division=0 yields the same scores as the default setting but
    # without a warning each time a class is never predicted.
    results = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(
            y_test, y_pred, average="weighted", zero_division=0
        ),
        "Recall": recall_score(
            y_test, y_pred, average="weighted", zero_division=0
        ),
        "F1": f1_score(y_test, y_pred, average="weighted", zero_division=0),
        "AUC": auc,
        "MCC": matthews_corrcoef(y_test, y_pred),
    }

    return results


In [None]:
# Candidate classifiers, keyed by the display name used in the results table
# and (with spaces replaced by underscores) in the saved model file names.
models = {
    # ``multi_class`` is deliberately left at its default: the parameter is
    # deprecated in recent scikit-learn releases, and with the default solver
    # a multiclass target is already fitted with the multinomial loss.
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "KNN": KNeighborsClassifier(n_neighbors=5),
    "Naive Bayes": GaussianNB(),
    "Random Forest": RandomForestClassifier(
        n_estimators=100,
        max_depth=20,
        random_state=42,
        n_jobs=-1,
    ),
    "XGBoost": XGBClassifier(
        objective="multi:softprob",
        # Assumes the target has exactly 7 distinct classes -- TODO confirm
        # against the labels actually present in y.
        num_class=7,
        n_estimators=150,
        max_depth=6,
        learning_rate=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        eval_metric="mlogloss",
        random_state=42,
    ),
}


In [None]:
# Models that are trained and evaluated on the standardized features; every
# other model receives the unscaled feature matrices.
SCALED_MODELS = {"Logistic Regression", "KNN"}

results_list = []

# Create the output directory once up front instead of on every iteration.
os.makedirs("model", exist_ok=True)

for name, model in models.items():

    print(f"\nTraining {name}...")

    if name in SCALED_MODELS:
        res = evaluate_model(model, X_train_scaled, X_test_scaled, y_train, y_test)
    else:
        res = evaluate_model(model, X_train, X_test, y_train, y_test)

    # Tag the metrics with the model name so the rows can be indexed later.
    res["Model"] = name
    results_list.append(res)

    # Persist the fitted estimator. The scaled models expect inputs that
    # have been transformed with the same scaler used during training.
    joblib.dump(model, f"model/{name.replace(' ', '_')}.pkl")


In [None]:
# One row per model, indexed by model name, one column per metric.
results_df = pd.DataFrame(results_list).set_index("Model")

results_df


In [None]:
# Write the metrics table to disk. to_csv does not create missing
# directories, so "model/" must already exist when this cell runs.
results_df.to_csv(path_or_buf="model/model_results.csv")


In [None]:
# Save the fitted scaler so the same transform can be applied to new data
# before it is passed to the models trained on scaled features.
# Ensure the target directory exists so this cell does not depend on another
# cell having created it.
os.makedirs("model", exist_ok=True)
joblib.dump(scaler, "model/scaler.pkl")
