In [None]:
#import libraries
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import f1_score, precision_score, recall_score

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import MinMaxScaler

import warnings
warnings.filterwarnings("ignore")


In [None]:
# Load the training and test CSV files into DataFrames
train_path = "/kaggle/input/your-dataset/train.csv"
test_path = "/kaggle/input/your-dataset/test.csv"

train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

# Preview the first rows of the training data
train_df.head()


In [None]:
#basic data understanding
# A notebook cell only renders its final expression, so the summary statistics
# are passed to display() explicitly; otherwise their output is discarded.
# display() is provided by the IPython/Jupyter kernel without an import.
train_df.info()                 # prints column dtypes and non-null counts itself
display(train_df.describe())    # summary statistics of the numeric columns
train_df.isnull().sum()         # missing values per column (rendered as cell output)


In [None]:
# Target variable
TARGET = 'target'   # <-- CHANGE THIS to the name of the label column

# Labels are the target column; features are every other column of train_df.
y = train_df[TARGET]
X = train_df.drop(columns=[TARGET])


In [1]:
#Handling missing values
# Fill values are computed from the training features only and reused for the
# test set, so both sets are imputed with the same value per column.
# Object columns use the most frequent value, all other columns the median.
for col in X.columns:
    if X[col].dtype == 'object':
        modes = X[col].mode()
        if modes.empty:
            # No observed value in this column, so there is nothing to impute with.
            continue
        fill_value = modes.iloc[0]
    else:
        fill_value = X[col].median()

    # Assign the result back instead of calling fillna(inplace=True) on a
    # column selection: the chained in-place form is deprecated in pandas 2.x
    # and does not modify the frame under Copy-on-Write.
    X[col] = X[col].fillna(fill_value)
    test_df[col] = test_df[col].fillna(fill_value)


NameError: name 'X' is not defined

In [2]:
# Target distribution: number of samples per class in y
fig, ax = plt.subplots()
sns.countplot(x=y, ax=ax)
plt.show()


NameError: name 'sns' is not defined

In [None]:
# Numerical feature distributions: one histogram per numeric column of X
hist_figsize = (15, 10)
X.hist(figsize=hist_figsize)
plt.show()


In [None]:
# Box plots of all numeric features, drawn on a single figure
numeric_features = X.select_dtypes(include=np.number)

plt.figure(figsize=(15, 8))
sns.boxplot(data=numeric_features)
plt.xticks(rotation=90)  # draw the tick labels vertically
plt.show()


In [None]:
#pairplot
# Only a random subset of rows is plotted. The sample size is capped at the
# number of available rows (DataFrame.sample raises when asked for more rows
# than exist without replacement) and seeded so the figure is reproducible.
pairplot_rows = min(500, len(train_df))
sns.pairplot(train_df.sample(n=pairplot_rows, random_state=42))
plt.show()


In [None]:
#outlier capping (IQR rule)
# Values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] are clipped to the nearest bound.
# The bounds are computed from the training features and applied to the test
# set as well, so both sets go through the same preprocessing.
for col in X.select_dtypes(include=np.number):
    Q1 = X[col].quantile(0.25)
    Q3 = X[col].quantile(0.75)
    IQR = Q3 - Q1
    if IQR == 0:
        # lower == upper here; clipping would overwrite the whole column with
        # a single constant, so such columns are left untouched.
        continue
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR
    X[col] = X[col].clip(lower=lower, upper=upper)
    test_df[col] = test_df[col].clip(lower=lower, upper=upper)


In [3]:
#encoding categorical values
# Each object column is label-encoded on the training features. The learned
# label -> code mapping is kept per column and applied to the test set with a
# dictionary lookup, so a category that never appeared in training becomes -1
# instead of making LabelEncoder.transform raise a ValueError.
le = LabelEncoder()
category_codes = {}   # column name -> {label: integer code}
for col in X.select_dtypes(include='object'):
    X[col] = le.fit_transform(X[col])
    codes = {label: code for code, label in enumerate(le.classes_)}
    category_codes[col] = codes
    # map() yields NaN for labels missing from the mapping; encode those as -1.
    test_df[col] = test_df[col].map(codes).fillna(-1).astype(int)


In [None]:
#feature scaling
# NOTE(review): the scaler is fitted on all of X, i.e. before the
# train/validation split performed further down, so validation rows contribute
# to the scaling statistics. Consider fitting on the training split only.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Select the test columns by the training feature names so that extra columns
# in test_df (for example an id column) or a different column order cannot
# break or silently misalign the transform.
test_scaled = scaler.transform(test_df[X.columns])


In [None]:
#correlation analysis
# Rebuild a DataFrame with the original feature names; without them the
# heatmap axes are labelled with bare positional integers.
scaled_features = pd.DataFrame(X_scaled, columns=X.columns)

plt.figure(figsize=(12,8))
sns.heatmap(scaled_features.corr(), cmap='coolwarm')
plt.title("Correlation Heatmap")
plt.show()


In [None]:
# Train/validation split: 20% of the rows are held out, stratified on y
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_val, y_train, y_val = train_test_split(X_scaled, y, **split_options)


In [None]:
#all models defined
# Candidate classifiers keyed by display name. Every estimator that accepts a
# seed gets the same fixed random_state so repeated runs give the same scores.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
    "KNN": KNeighborsClassifier(),
    # probability=True adds an internal cross-validation step during fit;
    # the seed makes that step reproducible.
    "SVM": SVC(probability=True, random_state=42),
    "Naive Bayes": GaussianNB()
}


In [None]:
# Train all models in a loop and score each one on the validation split
results = []

for model_name, estimator in models.items():
    estimator.fit(X_train, y_train)
    val_preds = estimator.predict(X_val)

    # (accuracy, weighted F1) for this model
    scores = (
        accuracy_score(y_val, val_preds),
        f1_score(y_val, val_preds, average='weighted'),
    )
    results.append((model_name, *scores))

    print(f"\n{model_name}")
    print("Accuracy:", scores[0])
    print(classification_report(y_val, val_preds))


In [None]:
# Compare models: tabulate the collected scores, highest accuracy first
score_columns = ["Model", "Accuracy", "F1"]
results_df = pd.DataFrame(results, columns=score_columns)
results_df.sort_values(by="Accuracy", ascending=False)


In [None]:
#hyperparameter tuning of the random forest model
# Search grid: 2 * 3 * 2 = 12 parameter combinations.
param_grid = {
    'n_estimators': [100,200],
    'max_depth': [None,10,20],
    'min_samples_split': [2,5]
}

# 5-fold cross-validated grid search on the training split, scored by accuracy.
# n_jobs=-1 evaluates the candidates in parallel on all available cores; the
# fixed random_state keeps the result independent of the execution order.
grid = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1
)

grid.fit(X_train, y_train)
# Estimator refitted on all of X_train with the best parameter combination.
best_model = grid.best_estimator_

print("Best Params:", grid.best_params_)


In [None]:
# Final evaluation of the tuned model on the held-out validation split
final_preds = best_model.predict(X_val)

final_accuracy = accuracy_score(y_val, final_preds)
print("Final Accuracy:", final_accuracy)

# Confusion matrix first, then the per-class precision/recall/F1 report
for report in (confusion_matrix, classification_report):
    print(report(y_val, final_preds))


In [None]:
#train on full data
# Refit the tuned estimator on all scaled training rows (X_scaled covers both
# the training and the validation split) before it predicts on the test set.
best_model.fit(X_scaled, y)


In [None]:
# Predict on the scaled test features and write the submission file
test_predictions = best_model.predict(test_scaled)

# The prediction column is named after TARGET so it stays in sync with the
# label column configured at the top of the notebook.
submission = pd.DataFrame({
    "id": test_df.index,     # CHANGE if needed
    TARGET: test_predictions
})

submission.to_csv("submission.csv", index=False)
submission.head()
