In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    matthews_corrcoef,
    confusion_matrix,
    classification_report
)

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

from xgboost import XGBClassifier

import joblib
import os


In [None]:
# Read the training data and keep a reproducible 50,000-row random subset
# (fixed seed) so the model comparison below stays fast.
df = pd.read_csv("../train_data.csv").sample(n=50000, random_state=42)

print("Shape:", df.shape)


Shape: (50000, 55)


In [3]:
# Everything except the target column is a feature.
X = df.drop(columns="Cover_Type")

# Shift the labels down by one in a single step.
# NOTE(review): presumably Cover_Type is coded 1..7 and this makes it 0-based
# (XGBoost is configured with num_class=7) - confirm against the data.
y = df["Cover_Type"] - 1

print("Features:", X.shape)
print("Target:", y.shape)


Features: (50000, 54)
Target: (50000,)


In [4]:
# Hold out 20% of the rows for testing; stratify on the target so the
# class proportions match in both splits, and fix the seed for repeatability.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}

X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


In [5]:
# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits so no test information leaks in.
scaler = StandardScaler()
scaler.fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [6]:
def evaluate_model(model, X_train, X_test, y_train, y_test, scaled=False):
    """Fit ``model`` on the training split and score it on the test split.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit`` and ``predict`` (and optionally
        ``predict_proba``). It is fitted in place.
    X_train, X_test : array-like
        Feature matrices for the training and test splits.
    y_train, y_test : array-like
        Target labels for the training and test splits.
    scaled : bool, default False
        Currently unused; kept so existing callers keep working.

    Returns
    -------
    dict
        Keys ``"Accuracy"``, ``"Precision"``, ``"Recall"``, ``"F1"``,
        ``"AUC"`` and ``"MCC"``. Precision, recall and F1 are
        support-weighted averages. ``"AUC"`` is the one-vs-rest ROC AUC, or
        ``None`` when the model has no ``predict_proba``.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # AUC needs class probabilities; not every estimator provides them.
    if hasattr(model, "predict_proba"):
        y_prob = model.predict_proba(X_test)
        # For a binary problem roc_auc_score expects a 1-D score for the
        # class with the greater label, not the full two-column matrix.
        if y_prob.shape[1] == 2:
            y_prob = y_prob[:, 1]
        auc = roc_auc_score(y_test, y_prob, multi_class="ovr")
    else:
        auc = None

    # zero_division=0 yields the same value as the default ("warn") but does
    # not emit UndefinedMetricWarning when a class is never predicted.
    weighted = {"average": "weighted", "zero_division": 0}

    results = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred, **weighted),
        "Recall": recall_score(y_test, y_pred, **weighted),
        "F1": f1_score(y_test, y_pred, **weighted),
        "AUC": auc,
        "MCC": matthews_corrcoef(y_test, y_pred),
    }

    return results


In [7]:
# Candidate classifiers to compare, keyed by the display name used in the
# results table and in the saved file names.
models = {
    # The explicit multi_class argument is deprecated since scikit-learn 1.5;
    # with the default lbfgs solver a multinomial model is already fitted
    # whenever there are more than two classes, so it is simply omitted.
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "KNN": KNeighborsClassifier(n_neighbors=5),
    "Naive Bayes": GaussianNB(),
    "Random Forest": RandomForestClassifier(
        n_estimators=100,
        max_depth=20,
        random_state=42,
        n_jobs=-1,
    ),
    "XGBoost": XGBClassifier(
        objective="multi:softprob",
        num_class=7,
        n_estimators=150,
        max_depth=6,
        learning_rate=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        eval_metric="mlogloss",
        random_state=42,
    ),
}


In [8]:
# Train every candidate model, collect its test metrics and persist the
# fitted estimator under model/<Name_With_Underscores>.pkl.
results_list = []

# Create the output directory once, before the loop: it does not depend on
# the model, and it must exist even if `models` happens to be empty.
os.makedirs("model", exist_ok=True)

# Models that are trained on the standardized features; the others are
# given the raw features.
scaled_model_names = {"Logistic Regression", "KNN"}

for name, model in models.items():

    print(f"\nTraining {name}...")

    if name in scaled_model_names:
        train_features, test_features = X_train_scaled, X_test_scaled
    else:
        train_features, test_features = X_train, X_test

    res = evaluate_model(model, train_features, test_features, y_train, y_test)
    res["Model"] = name
    results_list.append(res)

    # Save the fitted model
    joblib.dump(model, f"model/{name.replace(' ', '_')}.pkl")



Training Logistic Regression...





Training Decision Tree...

Training KNN...

Training Naive Bayes...

Training Random Forest...

Training XGBoost...


In [9]:
# One row per model, indexed by model name; the bare expression at the end
# renders the comparison table.
results_df = pd.DataFrame(results_list).set_index("Model")

results_df


Unnamed: 0_level_0,Accuracy,Precision,Recall,F1,AUC,MCC
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Logistic Regression,0.7291,0.723096,0.7291,0.719691,0.937374,0.557401
Decision Tree,0.8192,0.819332,0.8192,0.819245,0.847267,0.710592
KNN,0.8276,0.82527,0.8276,0.825701,0.939327,0.721787
Naive Bayes,0.4555,0.655217,0.4555,0.41134,0.886335,0.311253
Random Forest,0.8487,0.851116,0.8487,0.844546,0.980982,0.754338
XGBoost,0.8269,0.827336,0.8269,0.824509,0.97596,0.719316


In [10]:
# Persist the metrics table next to the saved models.
results_path = "model/model_results.csv"
results_df.to_csv(results_path)


In [11]:
# Persist the fitted scaler so the same standardization can be re-applied
# to new data before using the models that were trained on scaled features.
scaler_path = "model/scaler.pkl"
joblib.dump(scaler, scaler_path)


['model/scaler.pkl']