# Kaggle Introvert vs Extrovert Classification

This notebook provides a complete analysis and modeling pipeline for the Kaggle Introvert vs Extrovert classification competition.

## Table of Contents
1. [Data Loading and Setup](#data-loading)
2. [Exploratory Data Analysis](#eda)
3. [Feature Engineering](#feature-engineering)
4. [Model Training and Evaluation](#model-training)
5. [Final Predictions](#predictions)


## 1. Data Loading and Setup

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import sys
import warnings
# Suppress all warnings so cell output stays readable
warnings.filterwarnings('ignore')

# Extend the import search path with the sibling src directory
sys.path.append('../src')

# Plot appearance: stock matplotlib style combined with seaborn's 'husl' palette
plt.style.use('default')
sns.set_palette('husl')

# Reset pandas' column-count and width display options to None
for display_option in ('display.max_columns', 'display.width'):
    pd.set_option(display_option, None)

print('Libraries imported successfully!')

In [None]:
# Load the competition CSVs into train_df, test_df and sample_submission.
# Every later cell reads these three frames.
data_path = Path('../data/raw')

try:
    train_df = pd.read_csv(data_path / 'train.csv')
    test_df = pd.read_csv(data_path / 'test.csv')
    sample_submission = pd.read_csv(data_path / 'sample_submission.csv')
except FileNotFoundError as e:
    print(f'Error loading data: {e}')
    print('Please ensure data files are in the ../data/raw/ directory')
    # Re-raise after the hint: continuing would leave the frames undefined
    # and the following cells would fail with an unrelated NameError.
    raise
else:
    print('Data loaded successfully!')
    print(f'Training data shape: {train_df.shape}')
    print(f'Test data shape: {test_df.shape}')
    print(f'Sample submission shape: {sample_submission.shape}')

## 2. Exploratory Data Analysis

In [None]:
# Basic dataset information: column summary, first rows, summary statistics
print('=== Dataset Overview ===')
print('\nTraining Data Info:')
# DataFrame.info() writes its summary to stdout and returns None, so it is
# called directly; wrapping it in print() would emit a stray "None" line.
train_df.info()

print('\nFirst few rows of training data:')
display(train_df.head())

print('\nBasic statistics:')
display(train_df.describe())

In [None]:
# Missing values analysis: per-column null counts for both datasets, plus a
# bar chart of the percentage missing for each affected column.
print('=== Missing Values Analysis ===')
missing_train = train_df.isnull().sum()
missing_test = test_df.isnull().sum()

print('\nMissing values in training data:')
print(missing_train[missing_train > 0])

print('\nMissing values in test data:')
print(missing_test[missing_test > 0])

# Visualize missing values
if missing_train.sum() > 0 or missing_test.sum() > 0:
    fig, axes = plt.subplots(1, 2, figsize=(15, 6))

    # Percentage of rows missing per column, keeping only affected columns
    missing_train_pct = (missing_train / len(train_df)) * 100
    missing_train_pct = missing_train_pct[missing_train_pct > 0].sort_values(ascending=False)
    missing_test_pct = (missing_test / len(test_df)) * 100
    missing_test_pct = missing_test_pct[missing_test_pct > 0].sort_values(ascending=False)

    panels = (('Training', missing_train_pct), ('Test', missing_test_pct))
    for panel_ax, (dataset_label, pct_missing) in zip(axes, panels):
        if pct_missing.empty:
            # Only the other dataset has gaps: hide this panel instead of
            # leaving a blank, untitled axes in the figure.
            panel_ax.set_visible(False)
            continue
        pct_missing.plot(kind='bar', ax=panel_ax)
        panel_ax.set_title(f'Missing Values in {dataset_label} Data (%)')
        panel_ax.set_ylabel('Percentage')

    plt.tight_layout()
    plt.show()
else:
    print('No missing values found in the datasets!')

In [None]:
# Target variable analysis
# The target is the first column whose lowercased name is a known target name.
known_target_names = ['personality', 'target', 'label', 'class']
target_col = next(
    (column for column in train_df.columns if column.lower() in known_target_names),
    None,
)

if not target_col:
    print('Target column not found!')
else:
    target_counts = train_df[target_col].value_counts()

    print(f'=== Target Variable Analysis: {target_col} ===')
    print('\nValue counts:')
    print(target_counts)

    print('\nPercentage distribution:')
    print(train_df[target_col].value_counts(normalize=True) * 100)

    # Class balance shown two ways: absolute counts and share of the total
    fig, (count_ax, share_ax) = plt.subplots(1, 2, figsize=(15, 6))

    target_counts.plot(kind='bar', ax=count_ax)
    count_ax.set_title(f'Distribution of {target_col}')
    count_ax.set_ylabel('Count')
    count_ax.tick_params(axis='x', rotation=45)

    target_counts.plot(kind='pie', ax=share_ax, autopct='%1.1f%%')
    share_ax.set_title(f'Percentage Distribution of {target_col}')
    share_ax.set_ylabel('')

    plt.tight_layout()
    plt.show()

In [None]:
# Feature analysis
print('=== Feature Analysis ===')

# Split the columns by dtype, leaving the target out of both lists
numeric_features = [
    name
    for name in train_df.select_dtypes(include=[np.number]).columns.tolist()
    if name != target_col
]
categorical_features = [
    name
    for name in train_df.select_dtypes(include=['object']).columns.tolist()
    if name != target_col
]

print(f'Numeric features ({len(numeric_features)}): {numeric_features}')
print(f'Categorical features ({len(categorical_features)}): {categorical_features}')

# Summary statistics for the numeric columns
if numeric_features:
    print('\nNumeric features statistics:')
    display(train_df[numeric_features].describe())

# Cardinality of each categorical column; values are listed only when there
# are at most 10 distinct ones
if categorical_features:
    print('\nCategorical features info:')
    for feature in categorical_features:
        column = train_df[feature]
        unique_count = column.nunique()
        print(f'{feature}: {unique_count} unique values')
        if unique_count <= 10:
            print(f'  Values: {column.unique().tolist()}')
        print()

In [None]:
# Plot feature distributions: one histogram per numeric feature, three per row
if numeric_features:
    print('=== Numeric Feature Distributions ===')

    # Grid size: up to 3 columns, rows by ceiling division
    n_features = len(numeric_features)
    n_cols = min(3, n_features)
    n_rows = -(-n_features // n_cols)

    # squeeze=False always returns a 2-D array of axes, so a single ravel()
    # covers the one-plot, one-row and multi-row layouts alike.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 5 * n_rows), squeeze=False)
    axes = axes.ravel()

    for hist_ax, feature in zip(axes, numeric_features):
        train_df[feature].hist(bins=30, ax=hist_ax, alpha=0.7)
        hist_ax.set_title(f'Distribution of {feature}')
        hist_ax.set_xlabel(feature)
        hist_ax.set_ylabel('Frequency')

    # Hide the grid cells left over when the last row is not full
    for spare_ax in axes[n_features:]:
        spare_ax.set_visible(False)

    plt.tight_layout()
    plt.show()

In [None]:
# Correlation analysis: heatmap of pairwise correlations between numeric
# features, then a listing of pairs with |correlation| above 0.7.
if len(numeric_features) > 1:
    print('=== Correlation Analysis ===')

    correlation_matrix = train_df[numeric_features].corr()

    # The mask covers the upper triangle (diagonal included), so each pair is
    # drawn only once in the heatmap.
    upper_triangle = np.triu(np.ones_like(correlation_matrix, dtype=bool))
    plt.figure(figsize=(12, 10))
    sns.heatmap(
        correlation_matrix,
        mask=upper_triangle,
        annot=True,
        cmap='coolwarm',
        center=0,
        square=True,
        fmt='.2f',
    )
    plt.title('Feature Correlation Matrix')
    plt.tight_layout()
    plt.show()

    # Visit every unordered pair once (column index above row index)
    column_names = correlation_matrix.columns
    high_corr_pairs = []
    for row_pos, first_name in enumerate(column_names):
        for col_pos in range(row_pos + 1, len(column_names)):
            corr_val = correlation_matrix.iloc[row_pos, col_pos]
            if not abs(corr_val) > 0.7:
                continue
            high_corr_pairs.append((first_name, column_names[col_pos], corr_val))

    if not high_corr_pairs:
        print('\nNo highly correlated feature pairs found.')
    else:
        print('\nHighly correlated feature pairs (|correlation| > 0.7):')
        for feat1, feat2, corr in high_corr_pairs:
            print(f'{feat1} - {feat2}: {corr:.3f}')

## 3. Feature Engineering

In [None]:
# Import feature engineering module
# NOTE(review): feature_engineering is a project-local module, presumably
# found via the '../src' entry appended to sys.path in the setup cell — confirm.
from feature_engineering import FeatureEngineer
# train_test_split is used by the train/validation split cell in section 4
from sklearn.model_selection import train_test_split

# Initialize feature engineer (constructed with its default arguments)
feature_engineer = FeatureEngineer()

print('Feature engineering module loaded successfully!')

In [None]:
# Preprocess the data: split off the target, run the training features and
# target through the feature engineer, then transform the test set.
print('=== Feature Engineering ===')

if not target_col:
    print('Cannot proceed without target column!')
else:
    # Separate features and target
    y = train_df[target_col]
    X = train_df.drop(columns=[target_col])

    print(f'Original feature shape: {X.shape}')
    print(f'Target shape: {y.shape}')

    # NOTE(review): this runs on the full training set before the
    # train/validation split; if preprocess_data fits scalers or encoders,
    # validation rows influence them — confirm against FeatureEngineer.
    X_processed, y_processed = feature_engineer.preprocess_data(X, y)

    print(f'Processed feature shape: {X_processed.shape}')
    print(f'Processed target shape: {y_processed.shape}')

    # The test set goes through transform_features rather than preprocess_data
    X_test_processed = feature_engineer.transform_features(test_df)
    print(f'Processed test shape: {X_test_processed.shape}')

## 4. Model Training and Evaluation

In [None]:
# Import model training module
# NOTE(review): model_training is a project-local module, presumably found
# via the '../src' entry appended to sys.path in the setup cell — confirm.
from model_training import ModelTrainer
# Metrics used by the validation and ensemble evaluation cells below
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Initialize model trainer with the same seed (42) used for the data split
model_trainer = ModelTrainer(random_state=42)

print('Model training module loaded successfully!')

In [None]:
# Split data for training and validation: 80/20 hold-out, stratified on the
# target, seeded with 42.
X_train, X_val, y_train, y_val = train_test_split(
    X_processed,
    y_processed,
    stratify=y_processed,
    test_size=0.2,
    random_state=42,
)

print(f'Training set shape: {X_train.shape}')
print(f'Validation set shape: {X_val.shape}')

# Class proportions in each split
train_distribution = pd.Series(y_train).value_counts(normalize=True)
val_distribution = pd.Series(y_val).value_counts(normalize=True)
print('Training target distribution:', train_distribution, sep='\n')
print('\nValidation target distribution:', val_distribution, sep='\n')

In [None]:
# Perform cross-validation on the full processed training set
print('=== Cross-Validation Results ===')
cv_scores = model_trainer.cross_validate_models(X_processed, y_processed)

# Transpose so each top-level key of cv_scores becomes a row, then rank by 'mean'.
# NOTE(review): assumes cv_scores maps model name -> stats with a 'mean' entry — confirm.
results_df = pd.DataFrame(cv_scores).transpose().sort_values(by='mean', ascending=False)
display(results_df)

In [None]:
# Train the best model on the training split and score it on the hold-out split
print('=== Training Best Model ===')
best_model = model_trainer.train_best_model(X_train, y_train)
y_pred = best_model.predict(X_val)

# Hold-out metrics
accuracy = accuracy_score(y_val, y_pred)
print(f'Validation Accuracy: {accuracy:.4f}')
print('\nClassification Report:', classification_report(y_val, y_pred), sep='\n')

# Confusion matrix heatmap, true labels on the y-axis
cm = confusion_matrix(y_val, y_pred)
fig, cm_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=cm_ax)
cm_ax.set_title('Confusion Matrix')
cm_ax.set_ylabel('True Label')
cm_ax.set_xlabel('Predicted Label')
plt.show()

In [None]:
# Create ensemble model
print('=== Creating Ensemble Model ===')
# Fit on the training split only. X_processed contains the validation rows,
# so fitting on it and then scoring on X_val would inflate the ensemble's
# accuracy and make the comparison with the single model (which is trained
# on X_train) unfair.
ensemble_model = model_trainer.create_ensemble(X_train, y_train)

# Evaluate ensemble on the held-out validation set
y_pred_ensemble = ensemble_model.predict(X_val)
ensemble_accuracy = accuracy_score(y_val, y_pred_ensemble)

print(f'Ensemble Validation Accuracy: {ensemble_accuracy:.4f}')
print(f'Single Model Accuracy: {accuracy:.4f}')
print(f'Improvement: {ensemble_accuracy - accuracy:.4f}')

## 5. Final Predictions

In [None]:
# Make final predictions on test set
print('=== Generating Final Predictions ===')

# Use the ensemble only if it strictly beat the single model on validation;
# a tie keeps the single model.
if ensemble_accuracy > accuracy:
    final_model, model_name = ensemble_model, 'Ensemble'
else:
    final_model, model_name = best_model, 'Best Single Model'

print(f'Using {model_name} for final predictions')

# Predict on the processed test features
test_predictions = final_model.predict(X_test_processed)

print(f'Generated {len(test_predictions)} predictions')
print('Prediction distribution:', pd.Series(test_predictions).value_counts(), sep='\n')

In [None]:
# Create submission file: copy the sample submission, write the predictions
# into the target column, and save it as CSV.
submission = sample_submission.copy()
# NOTE(review): if preprocess_data encoded the target, these predictions may
# be encoded labels rather than the original class names — confirm before upload.
submission[target_col] = test_predictions

# Save submission, creating the output directory if it does not exist yet
submission_path = Path('../data/processed/submission.csv')
output_dir = submission_path.parent
output_dir.mkdir(parents=True, exist_ok=True)
submission.to_csv(submission_path, index=False)

print(f'Submission saved to: {submission_path}')
print('\nSubmission preview:')
display(submission.head(10))

print(f'\nSubmission shape: {submission.shape}')
target_share = submission[target_col].value_counts(normalize=True)
print('Submission target distribution:', target_share, sep='\n')

## Summary

This notebook provided a complete analysis pipeline for the Kaggle Introvert vs Extrovert classification competition:

1. **Data Loading**: Successfully loaded training, test, and submission data
2. **EDA**: Analyzed data structure, missing values, target distribution, and feature relationships
3. **Feature Engineering**: Applied preprocessing, feature creation, and scaling
4. **Model Training**: Trained multiple models, performed cross-validation, and created ensemble
5. **Predictions**: Generated final predictions and created submission file

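The final model's performance is given by the validation accuracy and classification report in Section 4; review those figures before uploading. The submission file is written to `../data/processed/submission.csv` and is ready for upload to Kaggle.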