# Kaggle Introvert vs Extrovert Classification - Kaggle Notebook

This notebook provides a complete analysis and modeling pipeline for the Kaggle Playground Series S5E7 competition, optimized for Kaggle's environment.

## Competition: Predict the Introverts from the Extroverts
- Competition URL: https://www.kaggle.com/competitions/playground-series-s5e7

## Table of Contents
1. [Setup and Imports](#setup)
2. [Data Loading](#data-loading)
3. [Exploratory Data Analysis](#eda)
4. [Feature Engineering](#feature-engineering)
5. [Model Training and Evaluation](#model-training)
6. [Final Predictions](#predictions)


## 1. Setup and Imports

Kaggle notebooks ship with all of the libraries used here pre-installed, so no `pip install` step is required; we only import what the pipeline needs.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
import os
warnings.filterwarnings('ignore')

# Machine Learning imports
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.feature_selection import SelectKBest, f_classif
import xgboost as xgb
import lightgbm as lgb

# Plot appearance: start from matplotlib's stock style sheet, then switch the
# colour cycle to seaborn's 'husl' palette (order matters: the style sheet
# would otherwise overwrite the palette).
plt.style.use('default')
sns.set_palette('husl')

# Let pandas show every column and use the full output width when a
# DataFrame is displayed.
pd.options.display.max_columns = None
pd.options.display.width = None

print('Libraries imported successfully!')

## 2. Data Loading

In Kaggle notebooks, once the competition has been added as an input, its data is available read-only under `/kaggle/input/<competition-name>/` (here `/kaggle/input/playground-series-s5e7/`).

In [None]:
# Directory where Kaggle mounts the competition files
data_path = Path('/kaggle/input/playground-series-s5e7')

# Read the three competition tables in one pass (train, test, submission template)
train_df, test_df, sample_submission = (
    pd.read_csv(data_path / csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

print('Data loaded successfully!')
print(f'Training data shape: {train_df.shape}')
print(f'Test data shape: {test_df.shape}')
print(f'Sample submission shape: {sample_submission.shape}')

# Preview the first rows of each table under its own heading
for section_heading, section_frame in (
    ('\n=== Training Data Sample ===', train_df),
    ('\n=== Test Data Sample ===', test_df),
    ('\n=== Sample Submission ===', sample_submission),
):
    print(section_heading)
    display(section_frame.head())

In [None]:
# Basic dataset information: structure, missing values and dtypes of both tables
print('=== Dataset Info ===')
print('\n--- Training Data Info ---')
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
train_df.info()

print('\n--- Test Data Info ---')
test_df.info()

# Per-column count of missing entries
print('\n--- Missing Values ---')
print('Training data missing values:')
print(train_df.isnull().sum())
print('\nTest data missing values:')
print(test_df.isnull().sum())

# Per-column dtypes, to spot columns that need encoding
print('\n--- Data Types ---')
print('Training data types:')
print(train_df.dtypes)
print('\nTest data types:')
print(test_df.dtypes)

## 3. Exploratory Data Analysis

In [None]:
# Inspect the class balance of the label column
target_col = 'Personality'  # Adjust if different

if target_col not in train_df.columns:
    # Nothing to analyse: list what is available so the name can be corrected
    print('Target column not found. Available columns:')
    print(train_df.columns.tolist())
else:
    # Absolute class frequencies, reused by the printout and both charts
    class_counts = train_df[target_col].value_counts()

    print(f'=== Target Variable: {target_col} ===')
    print(class_counts)
    print(f'\nTarget distribution (%):')
    print(train_df[target_col].value_counts(normalize=True) * 100)

    # Two panels side by side: bar chart of counts, pie chart of shares
    plt.figure(figsize=(10, 6))

    plt.subplot(1, 2, 1)
    class_counts.plot.bar()
    plt.title(f'{target_col} Distribution (Count)')
    plt.xlabel(target_col)
    plt.ylabel('Count')
    plt.xticks(rotation=45)

    plt.subplot(1, 2, 2)
    class_counts.plot.pie(autopct='%1.1f%%')
    plt.title(f'{target_col} Distribution (%)')
    plt.ylabel('')

    plt.tight_layout()
    plt.show()

In [None]:
# Overview of the feature columns, grouped by dtype
print('=== Feature Analysis ===')

# Numeric and object-typed columns, with the target left out of both lists
numeric_features = [
    column
    for column in train_df.select_dtypes(include=[np.number]).columns
    if column != target_col
]
categorical_features = [
    column
    for column in train_df.select_dtypes(include=['object']).columns
    if column != target_col
]

print(f'Numeric features ({len(numeric_features)}): {numeric_features}')
print(f'Categorical features ({len(categorical_features)}): {categorical_features}')

# Descriptive statistics for the numeric columns
if numeric_features:
    print('\n=== Numeric Features Summary ===')
    display(train_df[numeric_features].describe())

# Ten most frequent values of each categorical column
if categorical_features:
    print('\n=== Categorical Features Summary ===')
for feature in categorical_features:
    print(f'\n{feature}:')
    print(train_df[feature].value_counts().iloc[:10])

In [None]:
# Histogram grid for the numeric features (at most 12, four per row)
if numeric_features:
    n_cols = 4
    n_features = min(len(numeric_features), 12)  # Limit to 12 features
    n_rows = -(-n_features // n_cols)  # ceiling division

    plt.figure(figsize=(20, 5 * n_rows))

    for panel_number, feature in enumerate(numeric_features[:n_features], start=1):
        panel = plt.subplot(n_rows, n_cols, panel_number)
        train_df[feature].hist(bins=30, alpha=0.7, ax=panel)
        panel.set_title(f'{feature} Distribution')
        panel.set_xlabel(feature)
        panel.set_ylabel('Frequency')

    plt.tight_layout()
    plt.show()

In [None]:
# Pairwise correlation of the numeric features (needs at least two of them)
if len(numeric_features) > 1:
    plt.figure(figsize=(15, 12))
    correlation_matrix = train_df[numeric_features].corr()

    # Hide the diagonal and the upper triangle, which only mirror the lower one
    mask = np.triu(np.ones_like(correlation_matrix, dtype=bool))

    sns.heatmap(
        correlation_matrix,
        mask=mask,
        annot=True,
        cmap='coolwarm',
        center=0,
        square=True,
        linewidths=0.5,
        cbar_kws={"shrink": 0.8},
    )
    plt.title('Feature Correlation Matrix')
    plt.tight_layout()
    plt.show()

    # Collect every distinct pair (each pair visited once) whose absolute
    # correlation exceeds 0.8
    corr_columns = correlation_matrix.columns
    high_corr_pairs = [
        (left_name, right_name, correlation_matrix.iloc[row, col])
        for row, left_name in enumerate(corr_columns)
        for col, right_name in enumerate(corr_columns[row + 1:], start=row + 1)
        if abs(correlation_matrix.iloc[row, col]) > 0.8
    ]

    if not high_corr_pairs:
        print('\nNo highly correlated features found (|r| > 0.8)')
    else:
        print('\n=== Highly Correlated Features (|r| > 0.8) ===')
        for feat1, feat2, corr in high_corr_pairs:
            print(f'{feat1} - {feat2}: {corr:.3f}')

## 4. Feature Engineering

In [None]:
class FeatureEngineer:
    """Turn the raw train/test tables into scaled, fully numeric feature matrices.

    The instance keeps every transformer it fits (per-column label encoders,
    the target encoder, the scaler and the optional feature selector) so the
    same mappings can be reused afterwards, e.g. to decode predictions with
    ``target_encoder.inverse_transform``.
    """

    # Column names treated as row identifiers and dropped before modelling.
    ID_COLUMNS = ('id', 'Id', 'ID')

    def __init__(self):
        self.label_encoders = {}  # column name -> LabelEncoder fitted on that column
        self.scaler = StandardScaler()
        self.feature_selector = None  # set by select_features()
        self.target_encoder = LabelEncoder()

    @staticmethod
    def _align_test_columns(X_train, X_test):
        """Return a copy of X_test holding exactly X_train's columns, in X_train's order.

        Columns that exist only in the test table are dropped, since nothing
        fitted on the training table can use them.

        Raises:
            ValueError: if the test table lacks one or more training columns.
        """
        expected = list(X_train.columns)
        missing = [column for column in expected if column not in X_test.columns]
        if missing:
            raise ValueError(f'Test data is missing feature columns: {missing}')
        # .copy() so later column assignments act on an independent frame
        return X_test[expected].copy()

    def preprocess_data(self, train_df, test_df, target_col='Personality'):
        """Preprocess training and test data.

        Steps: split off the target, drop identifier columns, align the test
        columns with the training columns, label-encode the target and every
        object-typed feature, fill missing numeric values with the training
        median, and standardise all features with a scaler fitted on the
        training rows.

        Args:
            train_df: training table containing ``target_col``.
            test_df: test table with the same feature columns.
            target_col: name of the label column in ``train_df``.

        Returns:
            Tuple ``(X_train_scaled, X_test_scaled, y_train_encoded)``: two
            DataFrames that keep the original row indexes, and the encoded
            target array.

        Raises:
            KeyError: if ``target_col`` is not a column of ``train_df``.
            ValueError: if ``test_df`` lacks a feature column of ``train_df``.
        """
        print('Starting feature engineering...')

        # Separate features and target
        X_train = train_df.drop(columns=[target_col])
        y_train = train_df[target_col].copy()
        X_test = test_df.copy()

        # Remove ID columns if present
        X_train = X_train.drop(
            columns=[col for col in self.ID_COLUMNS if col in X_train.columns]
        )
        X_test = X_test.drop(
            columns=[col for col in self.ID_COLUMNS if col in X_test.columns]
        )

        # Guarantee both tables expose the same columns in the same order, so
        # the encoders and the scaler see identical layouts for train and test.
        X_test = self._align_test_columns(X_train, X_test)

        # Encode target variable
        y_train_encoded = self.target_encoder.fit_transform(y_train)

        # Handle categorical (object-typed) features
        categorical_features = X_train.select_dtypes(include=['object']).columns

        for feature in categorical_features:
            # Missing values become their own 'Unknown' category
            X_train[feature] = X_train[feature].fillna('Unknown')
            X_test[feature] = X_test[feature].fillna('Unknown')

            # Fit on train and test values together so categories that appear
            # only in the test table can still be transformed
            encoder = LabelEncoder()
            encoder.fit(pd.concat([X_train[feature], X_test[feature]]))
            self.label_encoders[feature] = encoder

            X_train[feature] = encoder.transform(X_train[feature])
            X_test[feature] = encoder.transform(X_test[feature])

        # Handle numeric features: fill gaps with the median of the training
        # column (the same value is used for the test column)
        numeric_features = X_train.select_dtypes(include=[np.number]).columns

        for feature in numeric_features:
            median_val = X_train[feature].median()
            X_train[feature] = X_train[feature].fillna(median_val)
            X_test[feature] = X_test[feature].fillna(median_val)

        # Scale features; the scaler is fitted on the training rows only
        X_train_scaled = self.scaler.fit_transform(X_train)
        X_test_scaled = self.scaler.transform(X_test)

        # Convert back to DataFrame, restoring column names and row indexes
        X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
        X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)

        print('Feature engineering completed!')
        print(f'Training features shape: {X_train_scaled.shape}')
        print(f'Test features shape: {X_test_scaled.shape}')

        return X_train_scaled, X_test_scaled, y_train_encoded

    def select_features(self, X_train, y_train, k=50):
        """Select the top k features by ``f_classif`` score.

        ``k`` is capped at the number of available columns. The fitted
        selector is stored on the instance for use by ``transform_test``.

        Returns:
            Tuple ``(X_train_selected, selected_features)``: the output of the
            selector's ``fit_transform`` and the names of the kept columns.
        """
        self.feature_selector = SelectKBest(score_func=f_classif, k=min(k, X_train.shape[1]))
        X_train_selected = self.feature_selector.fit_transform(X_train, y_train)

        selected_features = X_train.columns[self.feature_selector.get_support()]
        print(f'Selected {len(selected_features)} features: {selected_features.tolist()}')

        return X_train_selected, selected_features

    def transform_test(self, X_test):
        """Transform test data using the fitted feature selector.

        Returns ``X_test`` unchanged when ``select_features`` has not been
        called, i.e. when no selector has been fitted.
        """
        if self.feature_selector is None:
            return X_test
        return self.feature_selector.transform(X_test)

In [None]:
# Run the preprocessing pipeline on both tables
fe = FeatureEngineer()
preprocessed = fe.preprocess_data(train_df, test_df, target_col='Personality')
X_train, X_test, y_train = preprocessed

# Feature selection (optional) - uncomment to keep only the k best features
# X_train_selected, selected_features = fe.select_features(X_train, y_train, k=50)
# X_test_selected = fe.transform_test(X_test)

print('\nFeature engineering completed!')

## 5. Model Training and Evaluation

In [None]:
class ModelTrainer:
    """Train a fixed set of classifiers, compare them by cross-validated
    accuracy, and expose the best one for evaluation and prediction.

    Attributes set by ``train_models``:
        models: dict mapping model name to the estimator fitted on the full
            data passed to ``train_models``.
        best_model: the fitted estimator with the highest mean CV accuracy.
        best_model_name: the name of that estimator.
        best_score: its mean CV accuracy.
    """

    def __init__(self):
        self.models = {}
        self.best_model = None
        self.best_model_name = None
        self.best_score = 0

    def _resolve_model(self, model_name=None):
        """Return ``(model, label)`` for a named model, or for the best model
        when no name is given.

        Raises:
            KeyError: if ``model_name`` is given but no such model was trained.
            RuntimeError: if no name is given and no best model exists yet.
        """
        if model_name:
            if model_name not in self.models:
                raise KeyError(
                    f'Unknown model {model_name!r}; trained models: {list(self.models)}'
                )
            return self.models[model_name], model_name
        if self.best_model is None:
            raise RuntimeError('No trained model available; call train_models() first.')
        return self.best_model, 'Best Model'

    def train_models(self, X_train, y_train, cv_folds=5):
        """Train multiple models and compare performance.

        Each model is scored with stratified k-fold cross-validation
        (accuracy) and then fitted on all of ``X_train``/``y_train``.

        Args:
            X_train: training features.
            y_train: training labels.
            cv_folds: number of cross-validation folds.

        Returns:
            Dict mapping model name to ``{'mean', 'std', 'scores'}`` of its
            cross-validation accuracy.
        """
        print('Training models...')

        # Define models
        models = {
            'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1),
            'XGBoost': xgb.XGBClassifier(random_state=42, eval_metric='logloss'),
            'LightGBM': lgb.LGBMClassifier(random_state=42, verbose=-1),
            'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000)
        }

        # Cross-validation
        cv = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=42)
        results = {}

        # Start every run from a clean slate: otherwise a score from an
        # earlier call could keep a stale estimator (one that is no longer in
        # self.models) selected as the best model.
        self.best_model = None
        self.best_model_name = None
        self.best_score = 0

        for name, model in models.items():
            print(f'Training {name}...')
            scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='accuracy')
            results[name] = {
                'mean': scores.mean(),
                'std': scores.std(),
                'scores': scores
            }

            # Fit model on full training data
            model.fit(X_train, y_train)
            self.models[name] = model

            # Track best model
            if results[name]['mean'] > self.best_score:
                self.best_score = results[name]['mean']
                self.best_model = model
                self.best_model_name = name

        # Display results
        print('\n=== Model Performance (Cross-Validation) ===')
        for name, result in results.items():
            print(f'{name}: {result["mean"]:.4f} (+/- {result["std"] * 2:.4f})')
        if self.best_model_name is not None:
            print(f'Best model: {self.best_model_name} ({self.best_score:.4f})')

        return results

    def evaluate_model(self, X_val, y_val, model_name=None):
        """Evaluate a model on a validation set.

        Prints the accuracy and the classification report, and shows the
        confusion matrix as a heatmap.

        Args:
            X_val: validation features.
            y_val: validation labels.
            model_name: name of a trained model; the best model when omitted.

        Returns:
            The accuracy on the validation set.

        Raises:
            KeyError: if ``model_name`` does not name a trained model.
            RuntimeError: if no model has been trained yet.
        """
        model, model_name = self._resolve_model(model_name)

        y_pred = model.predict(X_val)
        accuracy = accuracy_score(y_val, y_pred)

        print(f'\n=== {model_name} Evaluation ===')
        print(f'Accuracy: {accuracy:.4f}')
        print('\nClassification Report:')
        print(classification_report(y_val, y_pred))

        # Confusion Matrix
        cm = confusion_matrix(y_val, y_pred)
        plt.figure(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
        plt.title(f'{model_name} - Confusion Matrix')
        plt.ylabel('True Label')
        plt.xlabel('Predicted Label')
        plt.show()

        return accuracy

    def predict(self, X_test, model_name=None):
        """Make predictions on a test set.

        Args:
            X_test: features to predict on.
            model_name: name of a trained model; the best model when omitted.

        Returns:
            Whatever the selected model's ``predict`` returns for ``X_test``.

        Raises:
            KeyError: if ``model_name`` does not name a trained model.
            RuntimeError: if no model has been trained yet.
        """
        model, _ = self._resolve_model(model_name)
        return model.predict(X_test)

In [None]:
# Hold out a stratified 20% of the training rows as a validation set
split_parts = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
    stratify=y_train,
)
X_train_split, X_val_split, y_train_split, y_val_split = split_parts

print(f'Training set: {X_train_split.shape}')
print(f'Validation set: {X_val_split.shape}')

# Fit and cross-validate every candidate model on the training part
trainer = ModelTrainer()
cv_results = trainer.train_models(X_train=X_train_split, y_train=y_train_split)

# Score the best model on the held-out validation part
val_accuracy = trainer.evaluate_model(X_val=X_val_split, y_val=y_val_split)

## 6. Final Predictions

In [None]:
# Refit the winning model on every training row before predicting
print('Retraining best model on full training data...')
trainer.best_model.fit(X_train, y_train)

# Predict encoded classes for the test rows, then map them back to the
# original label names
test_predictions = trainer.predict(X_test)
test_predictions_labels = fe.target_encoder.inverse_transform(test_predictions)

print(f'Test predictions shape: {test_predictions.shape}')
print(f'Prediction distribution:')

# One line per predicted label: count and share of all predictions
unique, counts = np.unique(test_predictions_labels, return_counts=True)
for label, count in zip(unique, counts):
    share = count / len(test_predictions_labels) * 100
    print(f'{label}: {count} ({share:.1f}%)')

In [None]:
# Build the submission from the template, replacing its label column with
# the predicted labels, and write it to disk
submission_filename = 'submission.csv'
submission = sample_submission.assign(Personality=test_predictions_labels)
submission.to_csv(submission_filename, index=False)

print(f'Submission saved as {submission_filename}')
print('\nSubmission preview:')
display(submission.head(10))

# Sanity checks on the written file
submission_size = os.stat(submission_filename).st_size
print(f'\nSubmission shape: {submission.shape}')
print(f'Submission file size: {submission_size} bytes')

In [None]:
# Recap of the whole run, printed as one block
summary_lines = [
    '=== Analysis Summary ===',
    f'Dataset: {train_df.shape[0]} training samples, {test_df.shape[0]} test samples',
    f'Features: {X_train.shape[1]} features after preprocessing',
    f'Best model validation accuracy: {val_accuracy:.4f}',
    f'Submission file: {submission_filename}',
    '\nReady for submission to Kaggle!',
]
print('\n'.join(summary_lines))