In [None]:
# ======================================================
# 1. IMPORT LIBRARIES
# ======================================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier

import warnings
warnings.filterwarnings("ignore")

# ======================================================
# 2. LOAD DATA
# ======================================================

# Read the training and test tables; both paths are relative, so the CSV
# files are looked up in the current working directory.
train_df, test_df = (
    pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv")
)

# TODO: CHANGE THIS to the name of the label column in train.csv.
TARGET = "target"

# ======================================================
# 3. BASIC DATA INSPECTION
# ======================================================

# Quick look at the training data: first rows, column dtypes / non-null
# counts, and how many rows fall into each target class.
print(train_df.head())
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would only add a stray "None" line.
train_df.info()
print(train_df[TARGET].value_counts())

# ======================================================
# 4. DATA CLEANING & PREPROCESSING
# ======================================================

# Fill missing values column by column.
#   * numeric columns     -> median of the training column
#   * non-numeric columns -> most frequent value of the training column
# The fill value is always computed on train_df and then reused for test_df,
# so both tables are imputed consistently and no statistic is learned from
# the test set.
for col in train_df.columns:
    train_col = train_df[col]

    if pd.api.types.is_numeric_dtype(train_col):
        fill_value = train_col.median()
    else:
        modes = train_col.mode()
        # mode() is empty when the column holds nothing but NaN.
        fill_value = modes.iloc[0] if not modes.empty else np.nan

    # An entirely empty training column gives nothing to impute with.
    if pd.isna(fill_value):
        continue

    # Assign the result back instead of calling fillna(inplace=True) on the
    # selected column: the chained in-place form is deprecated in recent
    # pandas and does not modify the DataFrame under copy-on-write.
    train_df[col] = train_col.fillna(fill_value)

    # Columns that exist only in train_df (in particular TARGET) are not
    # present in test_df; indexing them there would raise KeyError.
    if col in test_df.columns:
        test_df[col] = test_df[col].fillna(fill_value)

# ======================================================
# 5. DATA VISUALIZATION
# ======================================================

# Target distribution (Multiclass)
# Bar chart with one bar per target class, showing the number of training
# rows in each class.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(data=train_df, x=TARGET, ax=ax)
ax.set_title("Target Class Distribution")
plt.show()

# ======================================================
# 6. IQR OUTLIER ANALYSIS
# ======================================================

# Cap outliers in every numeric feature using the 1.5 * IQR rule.
# The bounds are computed on train_df only and the same bounds are applied
# to test_df.
numerical_cols = train_df.select_dtypes(include=np.number).columns
# errors="ignore": when TARGET is not numeric it is not in this index, and a
# plain drop() would raise KeyError.
numerical_cols = numerical_cols.drop(TARGET, errors="ignore")

for col in numerical_cols:
    q1, q3 = train_df[col].quantile([0.25, 0.75])
    iqr = q3 - q1
    lower = q1 - 1.5 * iqr
    upper = q3 + 1.5 * iqr

    # Values outside [lower, upper] are replaced by the nearest bound.
    train_df[col] = train_df[col].clip(lower, upper)
    # Skip columns that the test table does not contain.
    if col in test_df.columns:
        test_df[col] = test_df[col].clip(lower, upper)

# ======================================================
# 7. ENCODING CATEGORICAL VARIABLES
# ======================================================

# Replace every text (object-dtype) column by integer codes.
# Each column gets its own LabelEncoder, stored in `encoders` under the
# column name, so no mapping is lost when the next column is fitted. If
# TARGET is a text column, encoders[TARGET].inverse_transform(...) turns
# predicted codes back into the original labels.
encoders = {}
cat_cols = train_df.select_dtypes(include="object").columns

for col in cat_cols:
    le = LabelEncoder()
    train_df[col] = le.fit_transform(train_df[col])
    encoders[col] = le

    # TARGET (or any other train-only column) does not exist in test_df.
    if col == TARGET or col not in test_df.columns:
        continue

    # LabelEncoder.transform raises ValueError for categories it has not
    # seen during fit. Map through the learned classes instead and give
    # categories that appear only in the test data the code -1.
    code_of = {label: code for code, label in enumerate(le.classes_)}
    test_df[col] = test_df[col].map(code_of).fillna(-1).astype(int)

# ======================================================
# 8. CORRELATION ANALYSIS
# ======================================================

# Heatmap of the pairwise correlations between all columns of train_df
# (target included); cell values are not annotated.
plt.figure(figsize=(10, 6))
corr_matrix = train_df.corr()
sns.heatmap(corr_matrix, annot=False, cmap="coolwarm")
plt.title("Correlation Heatmap")
plt.show()

# ======================================================
# 9. FEATURE & TARGET SPLIT
# ======================================================

# y is the label column; X is every remaining column of train_df.
y = train_df[TARGET]
X = train_df.drop(columns=[TARGET])

# ======================================================
# 10. FEATURE SCALING
# ======================================================
# (Not mandatory for Random Forest, but kept for pipeline consistency)

# Standardise the features. The scaler is fitted on the training features
# only and then applied unchanged to the test features.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# Select the test features by name, in the training column order. Passing
# test_df as-is fails (or mis-aligns features) as soon as the test table has
# extra columns or a different column order than X.
test_scaled = scaler.transform(test_df[X.columns])

# ======================================================
# 11. TRAIN-TEST SPLIT (MULTICLASS SAFE)
# ======================================================

# Hold out 20% of the labelled rows for validation. Stratifying on y keeps
# the class proportions the same in both parts, which matters for
# multiclass targets; the fixed seed makes the split reproducible.
split_options = {
    "test_size": 0.2,
    "random_state": 42,
    "stratify": y,
}
X_train, X_val, y_train, y_val = train_test_split(X_scaled, y, **split_options)

# ======================================================
# 12. RANDOM FOREST MODEL TRAINING (MULTICLASS)
# ======================================================

# Baseline model: a 200-tree random forest with a fixed seed, trained on the
# training part of the split using all available CPU cores (n_jobs=-1).
# fit() returns the estimator itself, so construction and training chain.
rf = RandomForestClassifier(
    n_estimators=200, n_jobs=-1, random_state=42
).fit(X_train, y_train)

# ======================================================
# 13. MODEL EVALUATION
# ======================================================

# Score the baseline forest on the held-out validation rows.
y_pred = rf.predict(X_val)

val_accuracy = accuracy_score(y_val, y_pred)
val_confusion = confusion_matrix(y_val, y_pred)
val_report = classification_report(y_val, y_pred)

print("Accuracy:", val_accuracy)
print("\nConfusion Matrix:\n", val_confusion)
print("\nClassification Report:\n", val_report)

# ======================================================
# 14. HYPERPARAMETER TUNING (MULTICLASS)
# ======================================================

# Exhaustive search over 2 * 3 * 2 * 2 = 24 random-forest configurations,
# each scored by accuracy with 3-fold cross-validation on the training part
# of the split. The candidate fits run in parallel on all cores.
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
)

base_forest = RandomForestClassifier(random_state=42)
grid = GridSearchCV(
    estimator=base_forest,
    param_grid=param_grid,
    scoring="accuracy",
    cv=3,
    n_jobs=-1,
)
grid.fit(X_train, y_train)

# Estimator built with the best-scoring parameter combination.
best_model = grid.best_estimator_
print("Best Parameters:", grid.best_params_)

# ======================================================
# 15. FINAL EVALUATION
# ======================================================

# Accuracy of the tuned model on the same validation rows that were used
# for the baseline model.
final_pred = best_model.predict(X_val)
final_accuracy = accuracy_score(y_val, final_pred)
print("Final Accuracy:", final_accuracy)

# ======================================================
# 16. TEST PREDICTION & KAGGLE SUBMISSION
# ======================================================

# Predict the scaled test rows with the tuned model and write the result as
# a two-column CSV: a row identifier and the predicted target.
test_predictions = best_model.predict(test_scaled)

# The identifier is the row index of test_df; change if Kaggle specifies
# a dedicated id column.
submission = pd.DataFrame({"id": test_df.index})
submission[TARGET] = test_predictions

submission.to_csv("submission.csv", index=False)
print("submission.csv generated successfully!")
