# Project 
## Kaggle Competition-Predicting Introvert from the Extrovert

In [5]:
# 📦 LIBRARIES
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings("ignore")

In [6]:
train=pd.read_csv(r"C:\Users\user\Desktop\Kaggle Competitions\playground-series-s5e7\train.csv")
test=pd.read_csv(r"C:\Users\user\Desktop\Kaggle Competitions\playground-series-s5e7\test.csv")
submission=pd.read_csv(r"C:\Users\user\Desktop\Kaggle Competitions\playground-series-s5e7\sample_submission.csv")

# 🎯 SEPARATE TARGET
X = train.drop(columns=["id", "Personality"])
y = train["Personality"]
X_test = test.drop(columns=["id"])
test_ids = test["id"]

# 🧼 HANDLE OBJECT COLUMNS (LABEL ENCODE)
cat_cols = X.select_dtypes(include="object").columns
for col in cat_cols:
    le = LabelEncoder()
    full_col = pd.concat([X[col], X_test[col]], axis=0).astype(str)
    le.fit(full_col)
    X[col] = le.transform(X[col].astype(str))
    X_test[col] = le.transform(X_test[col].astype(str))

# 🧽 HANDLE MISSING VALUES
imputer = SimpleImputer(strategy="median")
X = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)
X_test = pd.DataFrame(imputer.transform(X_test), columns=X_test.columns)

# 📏 SCALE
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_test_scaled = scaler.transform(X_test)

# 🔠 ENCODE TARGET
le_y = LabelEncoder()
y_enc = le_y.fit_transform(y)

# 🤖 BASE MODELS
xgb = XGBClassifier(n_estimators=500, max_depth=6, learning_rate=0.03, subsample=0.85,
                    colsample_bytree=0.9, random_state=42, use_label_encoder=False, eval_metric='mlogloss')
lgbm = LGBMClassifier(n_estimators=600, learning_rate=0.03, max_depth=7, subsample=0.9, colsample_bytree=0.9, random_state=42)
cat = CatBoostClassifier(iterations=400, depth=6, learning_rate=0.03, verbose=0, random_seed=42)
rf = RandomForestClassifier(n_estimators=400, max_depth=10, random_state=42)

# 🧠 STACKING MODEL
stack = StackingClassifier(
    estimators=[('xgb', xgb), ('lgbm', lgbm), ('cat', cat), ('rf', rf)],
    final_estimator=GradientBoostingClassifier(n_estimators=200, learning_rate=0.05, max_depth=5),
    cv=5,
    n_jobs=-1,
    passthrough=True
)

# 🎓 CROSS VALIDATION CHECK (Optional)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
for fold, (train_idx, val_idx) in enumerate(skf.split(X_scaled, y_enc)):
    X_train_fold, X_val_fold = X_scaled[train_idx], X_scaled[val_idx]
    y_train_fold, y_val_fold = y_enc[train_idx], y_enc[val_idx]
    
    stack.fit(X_train_fold, y_train_fold)
    val_preds = stack.predict(X_val_fold)
    acc = accuracy_score(y_val_fold, val_preds)
    print(f"Fold {fold + 1} Accuracy: {round(acc, 4)}")

# 🧪 FINAL PREDICTION
stack.fit(X_scaled, y_enc)
final_preds = stack.predict(X_test_scaled)
final_labels = le_y.inverse_transform(final_preds)

# 📤 SUBMISSION
submission["Personality"] = final_labels
submission["id"] = test_ids
submission.to_csv(r"c:\Users\user\Desktop\Kaggle Competitions\playground-series-s5e7\submissionLC.csv", index=False)
print("Submission is Complete")

Fold 1 Accuracy: 0.9695
Fold 2 Accuracy: 0.9668
Fold 3 Accuracy: 0.9655
Fold 4 Accuracy: 0.9703
Fold 5 Accuracy: 0.9717
Submission is Complete


In [7]:
submission

Unnamed: 0,id,Personality
0,18524,Extrovert
1,18525,Introvert
2,18526,Extrovert
3,18527,Extrovert
4,18528,Introvert
...,...,...
6170,24694,Extrovert
6171,24695,Introvert
6172,24696,Extrovert
6173,24697,Extrovert
