# 🧼 Final Preprocessing Pipeline

This notebook performs final preprocessing on both `train.csv` and `test.csv` using consistent imputation strategies and missing value flags, based on prior feature importance analysis.


In [None]:
# Imports
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Read the raw train/test files
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

# Detach the target and the row identifiers from the feature frames.
# `pop` returns the column and removes it from the frame in one step, so
# `train` / `test` are left holding feature columns only.
y = train.pop('Personality')
train_ids = train.pop('id')
test_ids = test.pop('id')

# Feature groups: columns holding Yes/No answers, and columns treated as numeric
numeric_cols = ['Time_spent_Alone', 'Social_event_attendance', 'Going_outside',
                'Friends_circle_size', 'Post_frequency']
binary_cols = ['Stage_fear', 'Drained_after_socializing']

# Recode Yes/No as 1/0 ahead of imputation. Entries that are neither 'Yes'
# nor 'No' (missing values included) come out of `.map` as NaN and are left
# for the imputer to fill.
yes_no_codes = {'Yes': 1, 'No': 0}
for frame in (train, test):
    for col in binary_cols:
        frame[col] = frame[col].map(yes_no_codes)

# Both feature groups share one recipe: a single 'imputer' step that fills
# missing values and, via add_indicator=True, appends missing-value
# indicator columns. Only the fill strategy differs between the groups.
def _imputer_pipeline(strategy):
    """Return a one-step Pipeline whose 'imputer' step uses `strategy`."""
    return Pipeline([
        ('imputer', SimpleImputer(strategy=strategy, add_indicator=True))
    ])


binary_pipeline = _imputer_pipeline('most_frequent')
numeric_pipeline = _imputer_pipeline('median')

# Route each column group to its pipeline; the transformer names
# ('num' / 'bin') become the prefixes of the output column names.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_pipeline, numeric_cols),
    ('bin', binary_pipeline, binary_cols),
])

# Fit the imputers on train only, then apply the fitted transform to test
X_train = preprocessor.fit_transform(train)
X_test = preprocessor.transform(test)

# Use get_feature_names_out to get the column names the fitted transformer
# actually produced (e.g. 'num__Going_outside', 'bin__missingindicator_Stage_fear')
column_names = preprocessor.get_feature_names_out()

# Only the columns that are 0/1 by construction are cast to int: the encoded
# binary features ('bin__...') and every missing-value indicator. A blanket
# `astype(int)` over the whole matrix would silently truncate any non-integer
# numeric value or imputed median (e.g. 2.5 -> 2), so the numeric feature
# columns keep the float values returned by the imputer.
int_columns = [name for name in column_names
               if name.startswith('bin__') or '__missingindicator_' in name]
int_dtypes = {name: int for name in int_columns}

# Build final DataFrames
df_train_processed = pd.DataFrame(X_train, columns=column_names).astype(int_dtypes)
df_test_processed = pd.DataFrame(X_test, columns=column_names).astype(int_dtypes)

# Add back ID (as the first column) and target (as the last column)
df_train_processed.insert(0, 'id', train_ids)
df_train_processed['Personality'] = y

df_test_processed.insert(0, 'id', test_ids)

# Save processed files (no index column, so the files round-trip cleanly)
df_train_processed.to_csv('../data/train_processed.csv', index=False)
df_test_processed.to_csv('../data/test_processed.csv', index=False)


In [None]:
# ✅ Validation: re-read the files written above and sanity-check them

# Load the processed outputs back from disk so the checks run against what
# was actually saved, not against in-memory objects
train_check = pd.read_csv('../data/train_processed.csv')
test_check = pd.read_csv('../data/test_processed.csv')

# True if at least one cell anywhere in the frame is NaN
train_has_nan = train_check.isna().to_numpy().any()
test_has_nan = test_check.isna().to_numpy().any()

print("Train shape:", train_check.shape)
print("Test shape:", test_check.shape)

print("\nAny NaNs in train:", train_has_nan)
print("Any NaNs in test:", test_has_nan)

# Look-up table: raw feature name -> name of its missing-value indicator
# column, following the "<prefix>__missingindicator_<feature>" pattern
# (numeric features first, then binary ones).
missing_map = {
    col: f"{prefix}__missingindicator_{col}"
    for prefix, cols in (('num', numeric_cols), ('bin', binary_cols))
    for col in cols
}


def _check_missing_flags(raw_df, processed_df, flag_map):
    """Compare each indicator column's sum with the raw NaN count and print the result.

    Parameters
    ----------
    raw_df : pandas.DataFrame
        Frame read from the raw CSV; must contain every key of `flag_map`.
    processed_df : pandas.DataFrame
        Frame read from the processed CSV.
    flag_map : dict
        Maps a raw feature name to the name of its indicator column.

    Prints one line per feature: a trailing "✓" when the counts agree, a "⚠"
    otherwise. Nothing is returned and nothing is raised on a mismatch.
    """
    for col, flag_col in flag_map.items():
        raw_na_count = raw_df[col].isnull().sum()

        if flag_col not in processed_df.columns:
            # SimpleImputer(add_indicator=True) emits an indicator only for
            # features that had missing values when it was fitted, so the
            # column can legitimately be absent. That is consistent only if
            # the raw data has no NaNs for this feature either.
            status = "✓" if raw_na_count == 0 else "⚠"
            print(f"{flag_col}: no indicator column ({raw_na_count} raw NaNs) {status}")
            continue

        processed_count = processed_df[flag_col].sum()
        if processed_count == raw_na_count:
            print(f"{flag_col}: {processed_count} (expected {raw_na_count}) ✓")
        else:
            print(f"{flag_col}: {processed_count} (⚠ expected {raw_na_count})")


# Validation for train.csv
print("\n✅ Validation for train.csv:")
original_train = pd.read_csv('../data/train.csv')
_check_missing_flags(original_train, train_check, missing_map)

# Validation for test.csv
print("\n✅ Validation for test.csv:")
original_test = pd.read_csv('../data/test.csv')
_check_missing_flags(original_test, test_check, missing_map)



Train shape: (18524, 16)
Test shape: (6175, 15)

Any NaNs in train: False
Any NaNs in test: False

✅ Validation for train.csv:
num__missingindicator_Time_spent_Alone: 1190 (expected 1190) ✓
num__missingindicator_Social_event_attendance: 1180 (expected 1180) ✓
num__missingindicator_Going_outside: 1466 (expected 1466) ✓
num__missingindicator_Friends_circle_size: 1054 (expected 1054) ✓
num__missingindicator_Post_frequency: 1264 (expected 1264) ✓
bin__missingindicator_Stage_fear: 1893 (expected 1893) ✓
bin__missingindicator_Drained_after_socializing: 1149 (expected 1149) ✓

✅ Validation for test.csv:
num__missingindicator_Time_spent_Alone: 425 (expected 425) ✓
num__missingindicator_Social_event_attendance: 397 (expected 397) ✓
num__missingindicator_Going_outside: 466 (expected 466) ✓
num__missingindicator_Friends_circle_size: 350 (expected 350) ✓
num__missingindicator_Post_frequency: 408 (expected 408) ✓
bin__missingindicator_Stage_fear: 598 (expected 598) ✓
bin__missingindicator_Drained_a

# 🧼 Final Preprocessing Summary

This notebook performs safe and consistent preprocessing for `train.csv` and `test.csv`:

### ✅ Key Steps:
- Encoded `Yes`/`No` binary features as `1`/`0`
- Imputed:
  - Numeric features → median
  - Binary features → most frequent
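- Added missing-value indicator columns (named `<prefix>__missingindicator_<feature>`, e.g. `num__missingindicator_Going_outside`) for all features with `SimpleImputer(add_indicator=True)`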
- Used `get_feature_names_out()` to ensure indicator names match actual columns
- Converted all `True/False` flags to `0`/`1`
- Saved final outputs to:
  - `data/train_processed.csv`
  - `data/test_processed.csv`

### 🔍 Validation:
- Checked for absence of NaNs
- Verified that each `missingindicator` column's total matches the number of missing values for that feature in the raw datasets
- Output shows `✓` if counts match, `⚠` if not

This data is now ready for baseline modeling. 🚀
