In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input directory and print the full path of
# every file found, one per line.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for full_path in (os.path.join(dirname, name) for name in filenames):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# ============================
# 1. IMPORT LIBRARIES
# ============================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier

import warnings
warnings.filterwarnings("ignore")

# ============================
# 2. LOAD DATA
# ============================

# Read the training and test CSVs from the current working directory.
train_df = pd.read_csv("train.csv")   # change filename if needed
test_df  = pd.read_csv("test.csv")

TARGET = "target"   # 🔴 CHANGE THIS ONLY (name of the label column in train.csv)

# Fail fast with a readable message if TARGET is mistyped; otherwise the
# first symptom would be a less obvious KeyError further down the notebook.
if TARGET not in train_df.columns:
    raise KeyError(
        f"TARGET column {TARGET!r} not found in train.csv; "
        f"available columns: {list(train_df.columns)}"
    )

# ============================
# 3. BASIC DATA CHECK
# ============================

print(train_df.head())
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() would add a stray "None" line to the output.
train_df.info()
print(train_df.describe())

# ============================
# 4. DATA CLEANING
# ============================

# Handle missing values.
#
# Rows without a label cannot be used for supervised training, and imputing a
# class label would fabricate ground truth, so those rows are dropped.
train_df = train_df.dropna(subset=[TARGET])

# Each feature is filled with a statistic computed on the training set
# (median for numeric columns, most frequent value otherwise) and the SAME
# value is applied to the test set, so both frames are imputed consistently.
for col in train_df.columns:
    if col == TARGET:
        # Already handled above; a test set normally has no target column.
        continue

    observed = train_df[col].dropna()
    if observed.empty:
        # Entirely missing in train: no median/mode can be computed.
        continue

    if pd.api.types.is_numeric_dtype(train_df[col]):
        fill_value = observed.median()
    else:
        fill_value = observed.mode()[0]

    # Assign the result back instead of calling fillna(inplace=True) on
    # df[col]: that chained in-place form acts on a temporary object under
    # pandas Copy-on-Write and would leave the frame unchanged.
    train_df[col] = train_df[col].fillna(fill_value)
    if col in test_df.columns:
        test_df[col] = test_df[col].fillna(fill_value)

# ============================
# 5. OUTLIER ANALYSIS (IQR METHOD)
# ============================

# Numeric feature columns to cap. errors="ignore" keeps this working when the
# target is non-numeric and therefore not part of the numeric selection.
numerical_cols = (
    train_df.select_dtypes(include=np.number).columns.drop(TARGET, errors="ignore")
)

# Cap each numeric feature at [Q1 - 1.5*IQR, Q3 + 1.5*IQR]. The bounds are
# computed on the training set and applied unchanged to the test set.
for col in numerical_cols:
    Q1 = train_df[col].quantile(0.25)
    Q3 = train_df[col].quantile(0.75)
    IQR = Q3 - Q1

    if not IQR > 0:
        # IQR == 0 (e.g. a binary or mostly-constant feature) would make both
        # bounds equal and flatten the column to a single value; a NaN IQR
        # (all-missing column) gives no usable bounds. Leave such columns as-is.
        continue

    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR

    train_df[col] = train_df[col].clip(lower, upper)
    if col in test_df.columns:
        test_df[col] = test_df[col].clip(lower, upper)

# ============================
# 6. DATA VISUALIZATION
# ============================

# Count plot of the target column: one bar per distinct target value.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(data=train_df, x=TARGET, ax=ax)
ax.set_title("Target Variable Distribution")
plt.show()

# ============================
# 7. ENCODING CATEGORICAL VARIABLES
# ============================

# Used for the target only, so that after this section
# label_encoder.inverse_transform(...) maps integer predictions back to the
# original class labels whenever the target was categorical.
label_encoder = LabelEncoder()

# Every non-numeric column is treated as categorical.
categorical_cols = pd.Index(
    [c for c in train_df.columns if not pd.api.types.is_numeric_dtype(train_df[c])]
)

for col in categorical_cols:
    if col == TARGET:
        # The target is encoded in train only; test_df is not expected to
        # contain it.
        train_df[col] = label_encoder.fit_transform(train_df[col])
        continue

    # Integer codes follow the sorted order of the categories seen in train.
    codes, categories = pd.factorize(train_df[col], sort=True)
    train_df[col] = codes

    if col in test_df.columns:
        # get_indexer returns -1 for values never seen in train, instead of
        # raising the way LabelEncoder.transform does on unseen labels.
        test_df[col] = categories.get_indexer(test_df[col])

# ============================
# 8. CORRELATION ANALYSIS
# ============================

# Heatmap of the pairwise correlations between the columns of train_df.
fig, ax = plt.subplots(figsize=(10, 6))
corr_matrix = train_df.corr()
sns.heatmap(corr_matrix, annot=False, cmap="coolwarm", ax=ax)
ax.set_title("Correlation Heatmap")
plt.show()

# ============================
# 9. FEATURE / TARGET SPLIT
# ============================

X = train_df.drop(TARGET, axis=1)
y = train_df[TARGET]

# ============================
# 10. TRAIN-VALIDATION SPLIT
# ============================

# Split BEFORE scaling so the scaler's mean/std are learned from the training
# rows only; fitting it on all rows would leak validation statistics into the
# features the model is trained on.
X_train_raw, X_val_raw, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# ============================
# 11. FEATURE SCALING
# ============================

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_val = scaler.transform(X_val_raw)

# Full training matrix scaled with the train-only statistics; kept so any
# later cell that refers to X_scaled still finds it.
X_scaled = scaler.transform(X)

# Select exactly the training feature columns, in training order, so the
# scaler (and later the model) sees the test features in the layout it was
# fitted on. Raises KeyError if test_df lacks any of them.
test_scaled = scaler.transform(test_df[X.columns])

# ============================
# 12. MODEL TRAINING
# ============================

# Baseline random forest with a fixed seed, fitted on the training split.
rf = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# ============================
# 13. MODEL EVALUATION
# ============================

y_pred = rf.predict(X_val)

# Each metric is computed and printed in turn against the validation labels.
for heading, metric in (
    ("Accuracy:", accuracy_score),
    ("Confusion Matrix:\n", confusion_matrix),
    ("Classification Report:\n", classification_report),
):
    print(heading, metric(y_val, y_pred))

# ============================
# 14. HYPERPARAMETER TUNING
# ============================

# Hyperparameter values tried by the grid search (every combination).
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
)

# 3-fold cross-validated search on the training split, scored by accuracy
# and run on all available cores.
grid = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    scoring="accuracy",
    cv=3,
    n_jobs=-1,
).fit(X_train, y_train)

best_model = grid.best_estimator_

print("Best Parameters:", grid.best_params_)

# ============================
# 15. FINAL EVALUATION
# ============================

# Score the tuned model on the validation split.
final_pred = best_model.predict(X_val)
final_accuracy = accuracy_score(y_val, final_pred)

print("Final Accuracy:", final_accuracy)

# ============================
# 16. TEST PREDICTIONS (KAGGLE)
# ============================

# Predict on the scaled test features with the tuned model.
test_predictions = best_model.predict(test_scaled)

# Two-column submission: the test frame's index as the identifier, and the
# predicted target.
submission_columns = {
    "id": test_df.index,   # change column name if Kaggle specifies
    TARGET: test_predictions,
}
submission = pd.DataFrame(submission_columns)

submission.to_csv("submission.csv", index=False)
print("Submission file generated successfully!")
