# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, log_loss

# ==========================================
# 1. USER CONFIGURATION (EDIT THIS PART ONLY)
# ==========================================
# Column names: the label to predict and the per-row identifier.
TARGET_COL = "NObeyesdad"
ID_COL = "id"

# Locations of the input CSV files.
TRAIN_PATH = "/kaggle/input/ai-201-b-mse-2-aiml-c/train.csv"
TEST_PATH = "/kaggle/input/ai-201-b-mse-2-aiml-c/test.csv"

# Name of the CSV file the predictions are written to.
OUTPUT_FILE = "submission.csv"
# ==========================================

# 2. Load Data
# Read the training file first, then the test file, into DataFrames.
print("Loading data...")
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in (TRAIN_PATH, TEST_PATH)
)

# Report (rows, columns) of each frame as a quick sanity check.
print(f"Train shape: {train_data.shape}")
print(f"Test shape: {test_data.shape}")

# 3. Handle ID Columns
# Keep the test identifiers for the submission file (None when the test
# frame has no ID column), then remove the ID column from both frames so
# it is not treated as a feature.
test_ids = None
if ID_COL in test_data.columns:
    test_ids = test_data[ID_COL]
    test_data = test_data.drop(columns=[ID_COL])

if ID_COL in train_data.columns:
    train_data = train_data.drop(columns=[ID_COL])

# 4. Separate Features (X) and Target (y)
# y is the label column; X is every remaining column of the training frame.
y = train_data[TARGET_COL]
X = train_data.drop(columns=[TARGET_COL])

# 5. Dynamic Feature Selection
# Split the feature columns by dtype: object/category columns are treated as
# categorical, numeric dtypes as numerical.
categorical_frame = X.select_dtypes(include=['object', 'category'])
numerical_frame = X.select_dtypes(include=['number'])
cat_cols = list(categorical_frame.columns)
num_cols = list(numerical_frame.columns)

print(f"\nDetected {len(cat_cols)} categorical columns: {cat_cols}")
print(f"Detected {len(num_cols)} numerical columns: {num_cols}")

# 6. Impute Missing Values
# Numerical gaps are filled with the column mean and categorical gaps with the
# column mode. Both statistics come from the training features (X) and are
# applied to X and to the test frame alike.
# NOTE(review): the statistics are computed on all of X, i.e. before the
# train/validation split performed later in this script, so validation rows
# contribute to the fill values.
print("\nImputing missing values...")

if num_cols:
    mean_vals = X[num_cols].mean()
    for frame in (X, test_data):
        frame[num_cols] = frame[num_cols].fillna(mean_vals)

if cat_cols:
    # mode() can return several rows on ties; the first row is used.
    mode_vals = X[cat_cols].mode().iloc[0]
    for frame in (X, test_data):
        frame[cat_cols] = frame[cat_cols].fillna(mode_vals)

# 7. Visualization (Optional)
# Bar chart of how many training rows fall into each target class.
plt.figure(figsize=(6, 4))
class_axes = sns.countplot(data=train_data, x=TARGET_COL)
class_axes.set_title(f"Class Distribution for {TARGET_COL}")
plt.show()

# 8. Define Preprocessor
# One-hot encode the categorical columns (categories unseen during fit are
# ignored at transform time) and standardise the numerical columns.
#
# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to
# `sparse_output`, and 1.4 removed the old name. Try the current spelling
# first and fall back to the legacy one on older releases, where the unknown
# keyword raises TypeError at construction.
try:
    onehot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    onehot_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

preprocessor = ColumnTransformer(transformers=[
    ('cat', onehot_encoder, cat_cols),
    ('num', StandardScaler(), num_cols),
])

# 9. Train/Validation Split
# Hold out 30% of the rows for validation, stratified on the target so both
# parts keep the same class proportions; the seed makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

# 10. Fit Preprocessor
# The preprocessor is fitted on the training part only and then applied
# unchanged to the validation part and to the test frame.
print("\nTransforming data...")
X_train_pre = preprocessor.fit_transform(X_train)
X_val_pre, test_data_pre = (
    preprocessor.transform(frame) for frame in (X_val, test_data)
)

# 11. Encode labels (handles string labels)
# The encoder learns the label set from the training part only, so the
# validation labels must all have been seen during fit.
le = LabelEncoder().fit(y_train)
y_train_enc = le.transform(y_train)
y_val_enc = le.transform(y_val)

# 12. Train Model
print("\nTraining Random Forest...")
forest_params = {
    'n_estimators': 1000,
    'random_state': 42,
    'class_weight': 'balanced',
    'n_jobs': -1,
}
rfc = RandomForestClassifier(**forest_params).fit(X_train_pre, y_train_enc)

# 13. Evaluate - ROC AUC and log loss on the training and validation parts
print("Evaluating model...")

# Class-probability matrices; columns follow the order of rfc.classes_.
train_proba = rfc.predict_proba(X_train_pre)
val_proba = rfc.predict_proba(X_val_pre)

# Two probability columns means a binary problem, otherwise multiclass.
n_classes = train_proba.shape[1]
is_binary = (n_classes == 2)

# ROC AUC and log loss are guarded separately so that a failure in one metric
# does not hide the other. Passing `labels=rfc.classes_` pins the class set to
# the probability columns instead of letting the metric infer it from y_true.
try:
    if is_binary:
        # Binary case: score with the probability of the positive class.
        roc_train = roc_auc_score(y_train_enc, train_proba[:, 1])
        roc_val = roc_auc_score(y_val_enc, val_proba[:, 1])
    else:
        # Multiclass case: one-vs-rest, macro-averaged over classes.
        roc_train = roc_auc_score(y_train_enc, train_proba, multi_class='ovr',
                                  average='macro', labels=rfc.classes_)
        roc_val = roc_auc_score(y_val_enc, val_proba, multi_class='ovr',
                                average='macro', labels=rfc.classes_)
except ValueError as e:
    print(f"Could not calculate metrics: {e}")
else:
    print(f"Training ROC AUC: {roc_train:.4f}")
    print(f"Validation ROC AUC: {roc_val:.4f}")

try:
    loss_train = log_loss(y_train_enc, train_proba, labels=rfc.classes_)
    loss_val = log_loss(y_val_enc, val_proba, labels=rfc.classes_)
except ValueError as e:
    print(f"Could not calculate metrics: {e}")
else:
    print(f"Training Log Loss: {loss_train:.4f}")
    print(f"Validation Log Loss: {loss_val:.4f}")

# Optional diagnostic: if test.csv carries true labels (rare in competitions),
# report ROC AUC and log loss on it as well.
# The computation is guarded like the validation metrics because it is purely
# informational: a ValueError here (for example labels the encoder never saw
# during fit) must not stop the script before the submission file is written.
if TARGET_COL in test_data.columns:
    print("\nDetected target column in test.csv; computing metrics on test set as well.")
    try:
        y_test = test_data[TARGET_COL]
        y_test_enc = le.transform(y_test)
        test_proba = rfc.predict_proba(test_data_pre)
        if is_binary:
            # Binary case: score with the probability of the positive class.
            roc_test = roc_auc_score(y_test_enc, test_proba[:, 1])
        else:
            # Multiclass case: one-vs-rest, macro-averaged, with the class set
            # pinned to the model's probability columns.
            roc_test = roc_auc_score(y_test_enc, test_proba, multi_class='ovr',
                                     average='macro', labels=rfc.classes_)
        loss_test = log_loss(y_test_enc, test_proba, labels=rfc.classes_)
    except ValueError as e:
        print(f"Could not calculate metrics: {e}")
    else:
        print(f"Test ROC AUC: {roc_test:.4f}")
        print(f"Test Log Loss: {loss_test:.4f}")
else:
    print("\nNo target column in test.csv detected — cannot compute ROC AUC on the test file used for submission.")

# ==========================================
# 14. SUBMISSION (one predicted label per test row)
# ==========================================

print("\nGenerating submission...")

# Predict the encoded class for every test row and map it back to the
# original label values.
test_pred_enc = rfc.predict(test_data_pre)
test_pred = le.inverse_transform(test_pred_enc)

# Assemble the output columns: the ID column comes first when the test file
# had one, otherwise only the predictions are written.
submission_columns = {}
if test_ids is not None:
    submission_columns[ID_COL] = test_ids
submission_columns[TARGET_COL] = test_pred
submission_df = pd.DataFrame(submission_columns)

# Write the CSV without the DataFrame index and show a preview.
submission_df.to_csv(OUTPUT_FILE, index=False)
print(f"Correct submission saved to {OUTPUT_FILE}")
print(submission_df.head())
