In [None]:
# =========================
# MULTICLASS RANDOM FOREST
# NUMERICAL + CATEGORICAL PREPROCESSING
# =========================

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# ---------- Load Dataset ----------
# Location of the training CSV, kept in one place so it is easy to change.
TRAIN_CSV_PATH = "/kaggle/input/mse-2-ai-201-b-ai-d/train.csv"
df = pd.read_csv(TRAIN_CSV_PATH)

# ---------- Separate Features & Target ----------
# Rows with a missing label carry no supervision signal.  Filling them with
# the most frequent class would invent ground truth and distort both the
# training data and the evaluation metrics, so those rows are dropped.
labelled = df.dropna(subset=['Class'])
X = labelled.drop('Class', axis=1)   # features
y = labelled['Class']                # target (no NaN left)

# ---------- Identify Numerical & Categorical Columns ----------
# 'number' matches every integer/float width (not only the 64-bit ones) and
# 'category' is accepted next to 'object'.  Columns matched by neither
# selector are not passed to the model, because ColumnTransformer's default
# is remainder='drop'.
num_cols = X.select_dtypes(include='number').columns
cat_cols = X.select_dtypes(include=['object', 'category']).columns

# ---------- Train-Test Split (stratified when possible) ----------
# y contains no missing values by this point, so the split can be stratified
# to keep the class proportions the same in the train and test partitions.
try:
    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=0.2,
        random_state=42,
        stratify=y
    )
except ValueError:
    # Stratification cannot be done in some cases (e.g. a class with a single
    # sample); fall back to the plain random split instead of aborting.
    print("Stratified split not possible; using a plain random split.")
    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=0.2,
        random_state=42
    )

# ---------- Numerical Pipeline ----------
# Missing numeric values are replaced by the column median.
numeric_steps = [('imputer', SimpleImputer(strategy='median'))]
num_pipeline = Pipeline(numeric_steps)

# ---------- Categorical Pipeline ----------
# Fill gaps with the most frequent value, then one-hot encode; the encoder is
# configured with handle_unknown='ignore' for categories not seen during fit.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
]
cat_pipeline = Pipeline(categorical_steps)

# ---------- Combine Pipelines ----------
# Route each group of columns through its matching pipeline.
column_routes = [
    ('num', num_pipeline, num_cols),
    ('cat', cat_pipeline, cat_cols),
]
preprocessor = ColumnTransformer(column_routes)

# ---------- Random Forest Model ----------
# Hyper-parameters gathered in one mapping so they are easy to review.
forest_params = {
    'n_estimators': 100,
    'random_state': 42,
    'class_weight': 'balanced',
}
rf = RandomForestClassifier(**forest_params)

# ---------- Full Pipeline ----------
# Preprocessing and classifier are chained so both are fitted together.
model = Pipeline([
    ('preprocessing', preprocessor),
    ('classifier', rf),
])

# ---------- Train Model ----------
model.fit(X_train, y_train)

# ---------- Predictions ----------
y_pred = model.predict(X_test)

# ---------- Evaluation ----------
# Compute every metric first, then print them in a fixed order.
evaluation = (
    ("Accuracy:", accuracy_score(y_test, y_pred)),
    ("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred)),
    ("\nClassification Report:\n", classification_report(y_test, y_pred)),
)
for heading, value in evaluation:
    print(heading, value)