In [None]:
# ===========================================================
# IMPORT LIBRARIES
# ===========================================================
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score

# ===========================================================
# LOAD DATA
# Read the competition's train and test CSV files into DataFrames.
# ===========================================================
TRAIN_CSV_PATH = "/kaggle/input/ai-201-b-mse-2-ai-c/train.csv"
TEST_CSV_PATH = "/kaggle/input/ai-201-b-mse-2-ai-c/test.csv"

train = pd.read_csv(TRAIN_CSV_PATH)
test = pd.read_csv(TEST_CSV_PATH)

# ===========================================================
# EXTRACT TRAIN IDS
# (Skipped: the training data has no id column, so there is nothing to
#  extract or merge on. The line below is kept only as a placeholder.)
# ===========================================================
# train_ids = train['id']   # <-- not applicable: train has no id column

# ===========================================================
# TARGET COLUMN AND FEATURE / TARGET SPLIT
# ===========================================================
# Name of the label column in the training data.
target = "NObeyesdad"

# y holds the labels; X holds every remaining column of `train`.
y = train[target]
X = train.drop(target, axis="columns")

# ===========================================================
# TRAIN-VALIDATION SPLIT
# ===========================================================
# Hold out 20% of the rows for validation. The split is stratified on the
# target so both partitions keep the class proportions of `y`: an
# unstratified split of a multi-class target can leave a class
# under-represented (or entirely missing) in one partition, which skews the
# macro-averaged metrics and can break label encoding and ROC-AUC
# computation downstream. random_state is fixed so the split is reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# ===========================================================
# IDENTIFY NUMERIC & CATEGORICAL COLUMNS
# ===========================================================
# Partition the feature columns by dtype so that every column is routed to
# exactly one preprocessing branch. Selecting "number" (rather than only
# int64/float64) also captures narrower numeric dtypes such as int32 or
# float32, and taking the complement for the categorical side means that no
# non-numeric column (object, category, string, bool, ...) is left
# unselected and therefore silently dropped by the ColumnTransformer.
numeric_features = X.select_dtypes(include="number").columns
categorical_features = X.select_dtypes(exclude="number").columns

# ===========================================================
# PREPROCESSING PIPELINES
# ===========================================================
# Numeric branch: median imputation followed by standardisation.
numeric_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
]
numeric_pipeline = Pipeline(numeric_steps)

# Categorical branch: most-frequent imputation followed by one-hot encoding;
# handle_unknown="ignore" keeps transform from failing on unseen categories.
categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
]
categorical_pipeline = Pipeline(categorical_steps)

# Route each column group through its own branch.
preprocessor = ColumnTransformer([
    ("num", numeric_pipeline, numeric_features),
    ("cat", categorical_pipeline, categorical_features),
])

# ===========================================================
# LABEL ENCODER FOR TARGET (STRING → NUMBER)
# ===========================================================
# Fit the encoder on the complete target column rather than on y_train
# alone: the label vocabulary is not learned information, and fitting on
# the full set guarantees that le.transform(y_val) cannot raise on a class
# that happened to land only in the validation split. When the training
# split already contains every class, the resulting encoding is identical.
le = LabelEncoder()
le.fit(y)
y_train_enc = le.transform(y_train)
y_val_enc = le.transform(y_val)

# ===========================================================
# MODEL, FINAL PIPELINE AND TRAINING
# ===========================================================
# Random-forest hyperparameters, with a fixed seed for reproducibility.
rf_params = {
    "n_estimators": 500,
    "max_depth": 12,
    "min_samples_split": 6,
    "min_samples_leaf": 3,
    "random_state": 42,
}
model = RandomForestClassifier(**rf_params)

# Chain preprocessing and the classifier so that fitting and prediction
# always apply the same transformations.
pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", model),
])

# Fit on the training partition using the integer-encoded labels.
pipeline.fit(X_train, y_train_enc)

# ===========================================================
# VALIDATION PREDICTIONS AND PERFORMANCE METRICS
# ===========================================================
# Class probabilities (needed for ROC-AUC) and hard class predictions.
y_pred_proba = pipeline.predict_proba(X_val)
y_pred = pipeline.predict(X_val)

# Map the integer codes back to the original class names.
y_val_labels = le.inverse_transform(y_val_enc)
y_pred_labels = le.inverse_transform(y_pred)

# Macro-averaged metrics share one call shape, so they are reported in a loop.
macro_metrics = (
    ("Precision (Macro):", precision_score),
    ("Recall (Macro):", recall_score),
    ("F1-score (Macro):", f1_score),
)

print("=========== MODEL PERFORMANCE ===========")
print("Accuracy Score:", accuracy_score(y_val_enc, y_pred))
for heading, scorer in macro_metrics:
    print(heading, scorer(y_val_labels, y_pred_labels, average='macro'))
print("ROC-AUC Score:", roc_auc_score(y_val_enc, y_pred_proba, multi_class='ovr'))
print("========================================")

# ===========================================================
# PREPARE TEST DATA AND PREDICT
# ===========================================================
# Every column except "id" is fed to the fitted pipeline; the ids are set
# aside so they can be paired with the predictions afterwards.
test_features = test.drop("id", axis="columns")
test_ids = test["id"]

# Predict integer class codes, then translate them back to class names.
test_pred = pipeline.predict(test_features)
test_pred_labels = le.inverse_transform(test_pred)

# ===========================================================
# CREATE SUBMISSION FILE
# ===========================================================
# Two-column frame: the test ids and the predicted class names. The
# prediction column is keyed by `target` instead of a repeated string
# literal so the submission header cannot drift from the training label.
submission = pd.DataFrame({
    "id": test_ids,
    target: test_pred_labels,
})

# index=False keeps the DataFrame index out of the CSV.
submission.to_csv("submission.csv", index=False)
print("submission.csv saved successfully!")