In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, log_loss
from sklearn.utils.class_weight import compute_class_weight

# ==========================
# CONFIG
# ==========================
TARGET_COL = "NObeyesdad"
ID_COL = "id"
OUTPUT_FILE = "submission1.csv"
# Directory that contains train.csv and test.csv. Kept as a single constant so
# the notebook can be pointed at another location by editing one line; the
# read below raises FileNotFoundError when the directory is not present.
DATA_DIR = "/kaggle/input/ai-201-b-mse-2-ai-d"

# ==========================
# LOAD DATA
# ==========================
train = pd.read_csv(f"{DATA_DIR}/train.csv")
test = pd.read_csv(f"{DATA_DIR}/test.csv")

# ==========================
# HANDLE ID COL
# ==========================
# Keep the test identifiers for the submission file before the column is
# removed from the feature frame. When the test file has no ID column, fall
# back to its row index so the submission step still has one value per row
# instead of failing here with a KeyError.
if ID_COL in test.columns:
    test_ids = test[ID_COL]
else:
    test_ids = pd.Series(test.index, index=test.index, name=ID_COL)

# The identifier carries no predictive signal; errors="ignore" makes the drop
# a no-op when the column is absent.
train = train.drop(columns=[ID_COL], errors="ignore")
test = test.drop(columns=[ID_COL], errors="ignore")

# ==========================
# SPLIT FEATURES/TARGET
# ==========================
# The label column becomes y; every remaining column is a feature.
y = train[TARGET_COL]
X = train.drop(TARGET_COL, axis=1)

# ==========================
# COLUMN TYPES
# ==========================
# Text/categorical columns are one-hot encoded later; numeric ones are scaled.
cat_cols = list(X.select_dtypes(include=["object", "category"]).columns)
num_cols = list(X.select_dtypes(include="number").columns)

# ==========================
# TRAIN / VALIDATION SPLIT
# ==========================
# Split BEFORE computing any imputation or scaling statistics so validation
# rows never influence values the model is trained on.
# Stratify to keep the class mix equal in both folds; fall back to a plain
# random split when some class has fewer than two rows, because stratification
# needs at least two members per class.
stratify_labels = y if y.value_counts().min() >= 2 else None
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=stratify_labels
)

# ==========================
# IMPUTE MISSING VALUES
# ==========================
# Fill values come from the training fold only: column means for numeric
# features, most frequent value for categorical features.
fill_values = X_train[num_cols].mean().to_dict()
if cat_cols:
    # mode() yields no rows for an empty column selection, hence the guard.
    fill_values.update(X_train[cat_cols].mode().iloc[0].to_dict())

# fillna with a dict fills each listed column with its own value and returns a
# new frame, so re-running this section gives the same result.
X_train = X_train.fillna(fill_values)
X_val = X_val.fillna(fill_values)
test = test.fillna(fill_values)

# ==========================
# PREPROCESSOR
# ==========================
pre = ColumnTransformer(
    transformers=[
        # Categories unseen during fit are encoded as all-zero rows.
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
        ("num", StandardScaler(), num_cols)
    ]
)

# Fit the encoder/scaler on the training fold only, then reuse it unchanged.
X_train_pre = pre.fit_transform(X_train)
X_val_pre = pre.transform(X_val)
test_pre = pre.transform(test)

# ==========================
# CLASS WEIGHTS (VERY IMPORTANT)
# ==========================
# "balanced" weights are inversely proportional to class frequency. They are
# computed from y_train, the labels the model is actually fit on, so the
# validation labels play no part in training.
classes = np.unique(y_train)
weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
class_weight_dict = dict(zip(classes, weights))

# ==========================
# OPTIMIZED RANDOM FOREST
# ==========================
model = RandomForestClassifier(
    n_estimators=1200,
    max_depth=35,
    min_samples_split=4,
    min_samples_leaf=2,
    max_features="sqrt",
    bootstrap=True,
    class_weight=class_weight_dict,
    random_state=42,   # fixed seed so repeated runs build the same forest
    n_jobs=-1          # use every available CPU core
)

# ==========================
# TRAIN
# ==========================
model.fit(X_train_pre, y_train)

# ==========================
# VALIDATION METRICS
# ==========================
val_pred = model.predict(X_val_pre)
val_proba = model.predict_proba(X_val_pre)

acc = accuracy_score(y_val, val_pred)
# predict_proba has one column per class in model.classes_. Passing those
# labels explicitly keeps log_loss from raising when the validation fold
# happens to be missing one of the trained classes.
ll = log_loss(y_val, val_proba, labels=model.classes_)

print("===================================")
print("VALIDATION RESULTS")
print("Accuracy:", acc)
print("Log Loss:", ll)
print("===================================")

# ==========================
# SUBMISSION
# ==========================
final_pred = model.predict(test_pre)

# One row per test record: its identifier and the predicted class label.
# The identifier column is named from ID_COL so the config constant stays the
# single source of truth for that column name.
submission = pd.DataFrame({
    ID_COL: test_ids,
    TARGET_COL: final_pred
})

submission.to_csv(OUTPUT_FILE, index=False)
print("Submission saved to:", OUTPUT_FILE)
print(submission.head())

Loading data...


FileNotFoundError: [Errno 2] No such file or directory: '/kaggle/input/ai-201-b-mse-2-ai-d/train.csv'