# Introvert vs Extrovert Prediction

This notebook aims to predict whether a person is an introvert or an extrovert based on behavioral and social features. We'll perform exploratory data analysis (EDA), robust preprocessing, feature engineering, and model tuning for the best accuracy.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import VarianceThreshold
import warnings
# Silence every warning category for the rest of the session so cell output stays compact.
warnings.simplefilter('ignore')

# Single seed shared by the train/validation split and the random forest.
RANDOM_STATE = 42

## Data Loading

In [None]:
# Read the train and test splits from the relative ../input directory.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/playground-series-s5e7/train.csv',
        '../input/playground-series-s5e7/test.csv',
    )
)

# Preview the first rows of the training data.
display(train_df.head())

## EDA (Quick Overview)

In [None]:
# Column dtypes and non-null counts. DataFrame.info() writes its report itself
# and returns None, so it is called bare: wrapping it in print() would append
# a stray "None" line to the output.
train_df.info()

# Number of missing values per column.
print(train_df.isnull().sum())

# Class balance of the target column.
sns.countplot(x='Personality', data=train_df)
plt.title('Target Distribution')
plt.show()

## Preprocessing & Feature Engineering

In [None]:
# Drop ID
# Guarded so the cell can be re-run: after the first run `id` is already gone
# from both frames, and an unguarded column lookup/drop would raise KeyError.
if 'id' in test_df.columns:
    test_ids = test_df['id']  # kept so the submission file can carry the ids
    test_df = test_df.drop(columns=['id'])
train_df = train_df.drop(columns=['id'], errors='ignore')

# Encode target
label_to_int = {'Introvert': 0, 'Extrovert': 1}
y = train_df['Personality'].map(label_to_int)

# Series.map leaves NaN for any label that is not a key of the mapping.
# Fail here with a clear message instead of letting NaN targets reach the
# train/validation split.
if y.isna().any():
    unexpected = train_df.loc[y.isna(), 'Personality'].unique().tolist()
    raise ValueError(f'Unmapped Personality labels: {unexpected}')

# Feature matrix: every remaining column except the target.
X = train_df.drop(columns=['Personality'])

In [None]:
# Feature engineering: Social Activity Score
# The score is the sum of three sociability columns, with missing entries counted as 0.
social_cols = ['Social_event_attendance', 'Going_outside', 'Friends_circle_size']
for frame in (X, test_df):
    # Builtin sum starts at 0 and adds the zero-filled columns left to right.
    frame['Social_Score'] = sum(frame[name].fillna(0) for name in social_cols)

In [None]:
# Identify feature types
# The categorical columns are listed by hand; every other column of X is
# treated as numerical, in X's own column order.
categorical_cols = ['Stage_fear', 'Drained_after_socializing']
numerical_cols = list(filter(lambda name: name not in categorical_cols, X.columns))
print('Numerical:', numerical_cols)
print('Categorical:', categorical_cols)

## Preprocessing Pipeline

In [None]:
# Numerical columns: fill missing values with the column mean, then standardize.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode. Categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own sub-pipeline; the numerical block
# is listed first, followed by the categorical block.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

# Drops zero-variance (constant) features after preprocessing.
feature_selector = VarianceThreshold(threshold=0.0)

## Train/Validation Split

In [None]:
# Hold out 20% of the rows for validation. Stratifying on y keeps the class
# ratio the same in both parts; the fixed seed makes the split repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    stratify=y,
    random_state=RANDOM_STATE,
    test_size=0.2,
)

## Model Pipeline & Hyperparameter Tuning

In [None]:
# Preprocessing, variance filter and classifier are chained in one pipeline,
# so during cross-validation each fold fits all of them on its own training part.
pipe = Pipeline(steps=[
    ('pre', preprocessor),
    ('selector', feature_selector),
    ('clf', RandomForestClassifier(class_weight='balanced', random_state=RANDOM_STATE)),
])

# Random forest settings to search: 2 x 4 x 3 = 24 combinations.
param_grid = {
    'clf__n_estimators': [100, 200],
    'clf__max_depth': [None, 5, 10, 20],
    'clf__min_samples_split': [2, 5, 10],
}

# 5-fold cross-validated search scored on accuracy, using all available cores.
grid_search = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)
print(f'Best Params: {grid_search.best_params_}')

## Evaluation

In [None]:
# Score the refitted best pipeline on the held-out validation rows.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_valid)
print('Validation Accuracy:', accuracy_score(y_valid, y_pred))
print(classification_report(y_valid, y_pred))

# Confusion matrix as a heatmap. Tick labels follow the target encoding
# (0 = Introvert, 1 = Extrovert).
cm = confusion_matrix(y_valid, y_pred)
class_labels = ['Introvert', 'Extrovert']
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

## Feature Importance

In [None]:
# Names of the columns produced by the preprocessor: the numerical block
# first, then the one-hot encoded categorical block.
onehot_encoder = best_model.named_steps['pre'].named_transformers_['cat'].named_steps['onehot']
all_feat_names = (
    numerical_cols +
    list(onehot_encoder.get_feature_names_out(categorical_cols))
)

# The classifier only sees the columns that survive the VarianceThreshold
# step, so the names must be filtered with the selector's support mask.
# Without this, any dropped column would shift the labels on the plot.
support_mask = best_model.named_steps['selector'].get_support()
selected_feat_names = np.array(all_feat_names)[support_mask]

importances = best_model.named_steps['clf'].feature_importances_
if len(selected_feat_names) != len(importances):
    raise ValueError(
        f'Feature name count ({len(selected_feat_names)}) does not match '
        f'importance count ({len(importances)})'
    )

# Bars sorted from most to least important.
indices = np.argsort(importances)[::-1]
plt.figure(figsize=(9,5))
plt.title("Feature Importances")
plt.bar(range(len(importances)), importances[indices])
plt.xticks(range(len(importances)), selected_feat_names[indices], rotation=90)
plt.ylabel('Importance')
plt.tight_layout()
plt.show()

## Predict on Test Set & Save Submission

In [None]:
# Predict the encoded class for every test row with the tuned pipeline.
test_pred = best_model.predict(test_df)

# Map the encoded predictions back to the original string labels.
predicted_labels = np.where(test_pred == 1, 'Extrovert', 'Introvert')

# Pair each test id with its predicted label and write the submission file.
output = pd.DataFrame({'id': test_ids, 'Personality': predicted_labels})
output.to_csv('submission.csv', index=False)
output.head()

## Conclusion

*With robust preprocessing, feature engineering, and hyperparameter tuning, this workflow aims for high accuracy on personality prediction. The feature importances and the confusion matrix can guide further improvements. Try more advanced models or additional domain features for even better results!*