In [1]:
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import pandas as pd
from xgboost import XGBClassifier

import warnings
warnings.filterwarnings("ignore", category=UserWarning)

# Train a stacking ensemble (XGBoost + random forest + logistic regression) on a
# stratified split of the prepared training data and report hold-out metrics.

# One seed shared by the split and every seeded learner, so changing it in a
# single place re-seeds the whole experiment.
RANDOM_STATE = 42

# Load training data. Features are every column except "id" and the target
# column "Personality".
train_df = pd.read_csv("../data/prepared/train_fully_prepared.csv")
X = train_df.drop(columns=["id", "Personality"])
y = train_df["Personality"]

# Train/validation split, stratified on the target so both parts keep the same
# class proportions. test_size=0.25 is scikit-learn's default, written out so
# the size of the hold-out set is visible to the reader.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=RANDOM_STATE
)

# Final tuned XGBClassifier.
# `use_label_encoder` is deliberately not passed: the argument is deprecated and
# ignored by current XGBoost releases, which only emit a warning when they see it.
final_model = XGBClassifier(
    colsample_bytree=0.7,
    gamma=0.2,
    learning_rate=0.1,
    max_depth=3,
    n_estimators=200,
    reg_alpha=0,
    reg_lambda=1.5,
    subsample=0.7,
    eval_metric="logloss",
    random_state=RANDOM_STATE,
)

# Base learners whose predictions become the inputs of the final estimator.
estimators = [
    ("xgb", final_model),
    ("rf", RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE)),
    ("lr", LogisticRegression(max_iter=1000)),
]

# Stacking classifier with LogisticRegression as final estimator.
# cv=5: the final estimator is fitted on cross-validated predictions of the base
# learners. passthrough=False: it sees only those predictions, not the raw features.
stacking_clf = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(),
    cv=5,
    passthrough=False,
)

# Train the stacking model on the training part only; the validation part stays
# unseen so the metrics below are hold-out estimates.
stacking_clf.fit(X_train, y_train)

# Predict on validation set
y_pred = stacking_clf.predict(X_val)

# Evaluation
print("Accuracy:", accuracy_score(y_val, y_pred))
print("Classification Report:\n", classification_report(y_val, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_val, y_pred))



Accuracy: 0.9656652360515021
Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.98      0.98      3325
           1       0.94      0.92      0.93      1102

    accuracy                           0.97      4427
   macro avg       0.96      0.95      0.95      4427
weighted avg       0.97      0.97      0.97      4427

Confusion Matrix:
 [[3259   66]
 [  86 1016]]


In [2]:
# Predict on the prepared test set with the fitted stacking model and write the
# submission file.

# Load test set. Every column except "id" is passed to the model as a feature.
test_df = pd.read_csv("../data/prepared/test_fully_prepared.csv")
X_test = test_df.drop(columns=["id"])
test_ids = test_df["id"]

# Predict using the trained stacking model (`stacking_clf` comes from the
# training cell, which must have been run first).
y_pred = stacking_clf.predict(X_test)

# Convert numerical labels to string labels.
# NOTE(review): assumes the data preparation encoded Extrovert as 0 and
# Introvert as 1 - confirm against the preparation step.
label_names = {0: "Extrovert", 1: "Introvert"}

# Build the labels on the same index as the ids: the DataFrame constructor
# aligns Series by index, so a fresh 0..n-1 index would only match by accident.
y_labels = pd.Series(y_pred, index=test_ids.index).map(label_names)

# Series.map leaves NaN for values missing from the mapping; fail loudly rather
# than write a submission that contains empty labels.
if y_labels.isna().any():
    unexpected = sorted(set(y_pred.tolist()) - set(label_names))
    raise ValueError(f"Unexpected predicted labels: {unexpected}")

# Build the submission DataFrame
submission = pd.DataFrame({
    "id": test_ids,
    "Personality": y_labels,
})

# Save the submission file
submission.to_csv("submission.csv", index=False)
print("Submission saved as submission.csv")

Submission saved as submission.csv
