# Feature Engineering

In [None]:
import pandas as pd
import joblib
import os

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, TargetEncoder

# --- Input ---------------------------------------------------------------
# Cleaned dataset that this notebook loads.
CLEANED_DATA_PATH = '../data/processed/lending-club-cleaned.csv'

# --- Outputs -------------------------------------------------------------
# Serialized (joblib) preprocessing pipeline.
PIPELINE_PATH = '../models/preprocessor.joblib'
# Engineered train and test sets go to separate files so the held-out test
# data never gets mixed back in with the training data.
TEST_DATA_PATH = '../data/processed/test_fe.csv'
TRAIN_DATA_PATH = '../data/processed/train_fe.csv'

In [None]:
# Load the cleaned dataset and preview its first five rows.
df = pd.read_csv(filepath_or_buffer=CLEANED_DATA_PATH)
df.head(n=5)

### Split Data

In [None]:
# Reload the cleaned data and work on a copy so the raw frame `df` is left
# untouched by the target encoding below.
df = pd.read_csv(CLEANED_DATA_PATH)
df_prep = df.copy()

# Binary target: 0 = 'Fully Paid', 1 = 'Charged Off'.
status_map = {'Fully Paid': 0, 'Charged Off': 1}
encoded_status = df_prep['loan_status'].map(status_map)

# Series.map turns every value that is absent from the mapping (including
# NaN) into NaN. Fail loudly here instead of letting NaN labels flow into the
# stratified split and the target-dependent encoders downstream.
unmapped = encoded_status.isna()
if unmapped.any():
    bad_values = df_prep.loc[unmapped, 'loan_status'].unique().tolist()
    raise ValueError(
        f"Unexpected loan_status values (expected only {list(status_map)}): {bad_values}"
    )
df_prep['loan_status'] = encoded_status

X = df_prep.drop(columns=['loan_status'])
y = df_prep['loan_status']

# 80/20 split, stratified on the target so both sets keep the same class
# balance; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Training Data Shape: {X_train.shape}")
print(f"Testing Data Shape:  {X_test.shape}")

### Define Pipeline

In [None]:
# Show every column when a DataFrame is displayed in the notebook.
pd.set_option('display.max_columns', None)

# Blueprint 1: emp_length -- fill missing values with the most frequent
# category, then encode on an ordered scale ('< 1 year' lowest, '10+ years'
# highest).
emp_categories = [
    '< 1 year',
    '1 year',
    *(f'{n} years' for n in range(2, 10)),
    '10+ years',
]

emp_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinal', OrdinalEncoder(categories=[emp_categories])),
])

# Blueprint 2: plain ordinal columns (no imputation step; missing values are
# not expected here). Each column is paired with its ordered category list;
# dict insertion order keeps ord_cols and ord_categories in lockstep.
_grade_letters = list('ABCDEFG')
_ordinal_spec = {
    'term': [' 36 months', ' 60 months'],
    'grade': _grade_letters,
    # A1..A5, B1..B5, ..., G1..G5
    'sub_grade': [f'{letter}{n}' for letter in _grade_letters for n in range(1, 6)],
    'verification_status': ['Not Verified', 'Verified', 'Source Verified'],
}
ord_cols = list(_ordinal_spec)
ord_categories = list(_ordinal_spec.values())

ord_pipe = Pipeline([
    ('ordinal', OrdinalEncoder(categories=ord_categories)),
])

# Blueprint 3: mort_acc -- median imputation only.
mort_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
])

In [None]:
# Categorical columns that get a dedicated (non one-hot) encoder; every other
# object-dtype column in the training features is one-hot encoded.
handled_categorical_cols = ['emp_length', 'term', 'grade', 'sub_grade', 'verification_status', 'purpose']

object_frame = X_train.select_dtypes(include=['object'])
all_categorical_cols = list(object_frame.columns)

# Set lookup for membership; the comprehension keeps the original column order.
already_handled = set(handled_categorical_cols)
ohe_cols = [name for name in all_categorical_cols if name not in already_handled]

print(f"Dynamically detected columns for OHE: {ohe_cols}")

In [None]:
# Route each group of columns to its own transformer; any column not listed
# here is passed through unchanged (remainder='passthrough').
preprocessor = ColumnTransformer(
    transformers=[
        ('emp', emp_pipe, ['emp_length']),
        ('ord', ord_pipe, ord_cols),
        ('mort', mort_pipe, ['mort_acc']),
        # random_state pins the shuffled folds TargetEncoder uses for its
        # internal cross-fitting in fit_transform, so target-encoded training
        # features are reproducible from run to run.
        ('target', TargetEncoder(target_type='binary', random_state=42), ['purpose']),
        ('ohe', OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False), ohe_cols),
    ],
    remainder='passthrough',
)

# scikit-learn >= 1.2: make transform return pandas DataFrames (with column
# names) instead of NumPy arrays.
preprocessor.set_output(transform="pandas")
# Fit on the training split only; y is needed because TargetEncoder learns its
# encodings from the target.
preprocessor.fit(X_train, y_train)

### Apply Transformation

In [None]:
# Build the engineered feature sets.
#
# The training set goes through fit_transform rather than transform on an
# already-fitted preprocessor: TargetEncoder only applies its internal
# cross-fitting inside fit_transform. fit(...).transform(...) on the same rows
# would encode every training row with statistics that include that row's own
# target, leaking the label into the features seen by downstream models.
# fit_transform (re)fits on the training data only, so the test set below is
# still transformed with training-only statistics.
# NOTE: the cross-fitting folds are shuffled; the encoded training values are
# reproducible only if the TargetEncoder has a fixed random_state.
X_train_fe = preprocessor.fit_transform(X_train, y_train)
X_test_fe = preprocessor.transform(X_test)

# Scikit-learn prefixes output columns with the transformer name (e.g.
# 'remainder__'); keep only the part after the last '__'.
X_train_fe.columns = [col.split('__')[-1] for col in X_train_fe.columns]
X_test_fe.columns = [col.split('__')[-1] for col in X_test_fe.columns]

print("Transformations applied.")
print(f"Final Training Features Shape: {X_train_fe.shape}")
print(f"Final Testing Features Shape:  {X_test_fe.shape}")

# Verify no nulls remain in the features (count once, then filter).
null_counts = X_train_fe.isnull().sum()
print(f"\nRemaining Nulls in Train:\n{null_counts[null_counts > 0]}")

### Save Dataset & Pipeline

In [None]:
# Make sure every output directory exists before writing anything.
for out_path in (TRAIN_DATA_PATH, TEST_DATA_PATH, PIPELINE_PATH):
    os.makedirs(os.path.dirname(out_path) or '.', exist_ok=True)

# Reattach the target so each CSV is self-contained for the next notebook.
# With pandas output enabled, the transformed features keep the original
# (shuffled) row labels from the split. pd.concat(axis=1) aligns on index
# labels, so BOTH sides are reset to positional indices; resetting only the
# target would pair features with the wrong targets and add NaN-filled rows.
train_df = pd.concat(
    [X_train_fe.reset_index(drop=True), y_train.reset_index(drop=True)], axis=1
)
test_df = pd.concat(
    [X_test_fe.reset_index(drop=True), y_test.reset_index(drop=True)], axis=1
)

train_df.to_csv(TRAIN_DATA_PATH, index=False)
test_df.to_csv(TEST_DATA_PATH, index=False)

# Persist the fitted preprocessor so the same transformations can be reused.
joblib.dump(preprocessor, PIPELINE_PATH)

print(f"Saved Train Data to: {TRAIN_DATA_PATH}")
print(f"Saved Test Data to:  {TEST_DATA_PATH}")
print(f"Saved Preprocessor Pipeline to: {PIPELINE_PATH}")