In [None]:
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import make_scorer, accuracy_score

import warnings
# Suppress UserWarning messages for the remainder of the kernel session.
warnings.filterwarnings("ignore", category=UserWarning)

# Load the prepared training data.
df = pd.read_csv("../data/prepared/train_fully_prepared.csv")

# Split features and target; the "id" column is excluded from the features.
X = df.drop(columns=["id", "Personality"])
y = df["Personality"]

# Stratified 80/20 train/validation split. The validation part is kept out of
# the grid search and is only used for the hold-out check at the end of this cell.
X_train, X_val, y_train, y_val = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

# Hyperparameter grid: 3 * 2 * 3 * 3 * 3 * 3 * 3 * 3 = 4374 candidates.
param_grid = {
    'max_depth': [3, 5, 7],                      # tree depth
    'n_estimators': [100, 200],                  # number of boosting rounds
    'learning_rate': [0.01, 0.05, 0.1],          # learning rate
    'subsample': [0.7, 0.85, 1.0],               # training instance subsampling
    'colsample_bytree': [0.7, 0.9, 1.0],         # feature subsampling for each tree
    'gamma': [0, 0.2, 0.5],                      # minimum loss reduction to make a split
    'reg_alpha': [0, 0.1, 0.5],                  # L1 regularization (sparsity)
    'reg_lambda': [1, 1.5, 2.0],                 # L2 regularization (weight smoothing)
}

# Base estimator; the grid search overrides the parameters listed above.
# NOTE(review): `use_label_encoder` presumably only matters on older xgboost
# releases - confirm against the installed version before removing it.
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)

# Exhaustive search scored by accuracy with 3-fold cross-validation
# (4374 candidates x 3 folds = 13122 fits), using all available cores.
grid_search = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid,
    scoring=make_scorer(accuracy_score),
    cv=3,
    verbose=1,
    n_jobs=-1
)

# Run the search on the training part only.
grid_search.fit(X_train, y_train)

# Report the winning configuration and its cross-validated accuracy.
print("Best Parameters:", grid_search.best_params_)
print("Best Accuracy:", grid_search.best_score_)

# Hold-out check: score the refit best estimator on the validation split, which
# played no part in the search. This uses the otherwise-unused X_val / y_val and
# gives an estimate that is not biased by the hyperparameter selection itself.
val_accuracy = accuracy_score(y_val, grid_search.predict(X_val))
print("Validation Accuracy:", val_accuracy)


Fitting 3 folds for each of 4374 candidates, totalling 13122 fits
Best Parameters: {'colsample_bytree': 0.7, 'gamma': 0.2, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 200, 'reg_alpha': 0, 'reg_lambda': 1.5, 'subsample': 0.7}
Best Accuracy: 0.9697824921753423


In [3]:
# Train the final XGBoost model on all labelled data (X, y).
# The hyperparameters are copied by hand from the "Best Parameters" line printed
# by the grid search; update them here if the search is rerun and the winner changes.
final_model = XGBClassifier(
    colsample_bytree=0.7,
    gamma=0.2,
    learning_rate=0.1,
    max_depth=3,
    n_estimators=200,
    reg_alpha=0,
    reg_lambda=1.5,
    subsample=0.7,
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42
)
final_model.fit(X, y)

# Load the test set; "id" is kept aside for the submission and not used as a feature.
test_df = pd.read_csv("../data/prepared/test_fully_prepared.csv")
X_test = test_df.drop(columns=["id"])
test_ids = test_df["id"]

# Predict and translate the numeric classes back to label strings.
# Assumes the prepared target is encoded 0 = Extrovert, 1 = Introvert - TODO confirm
# against the data preparation step.
y_pred = final_model.predict(X_test)
# Reuse the index of test_ids so the DataFrame constructor pairs each prediction
# with its own id row-for-row instead of relying on both having a default index.
y_labels = pd.Series(y_pred, index=test_ids.index).map({0: "Extrovert", 1: "Introvert"})

# Series.map yields NaN for any value missing from the mapping; fail loudly
# rather than writing blank labels into the submission file.
assert y_labels.notna().all(), "prediction contains a class outside the {0, 1} label mapping"

# Build submission
submission = pd.DataFrame({
    "id": test_ids,
    "Personality": y_labels
})

# Save
submission.to_csv("submission.csv", index=False)
