This notebook helps us transform and engineer additional features from the cleaned dataset:

- Convert categorical columns (`term`, `emp_length`, `loan_status`) into numerical form, keeping only loans whose `loan_status` is `Fully Paid` or `Charged Off`
- Parse date column `earliest_cr_line` and derive `credit_history_years`
- Save the fully cleaned and feature-enhanced dataset for modeling

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import yaml

# Load config file
# Explicit UTF-8 so the YAML file is decoded the same way on every platform
# instead of depending on the locale's default encoding.
with open('../config/config.yaml', 'r', encoding='utf-8') as f:
    # safe_load returns None for an empty file; fall back to an empty mapping
    # so the key check below reports the problem clearly.
    config = yaml.safe_load(f) or {}

# Fail fast: both paths are needed by this notebook, and the output path is
# otherwise only looked up after the full transformation has already run.
required_config_keys = ('cleaned_data_path', 'featured_data_path')
missing_config_keys = [key for key in required_config_keys if key not in config]
if missing_config_keys:
    raise KeyError(f"config.yaml is missing required keys: {missing_config_keys}")

# Load cleaned dataset from 02_data_cleaning
df = pd.read_csv(config['cleaned_data_path'])
print("Cleaned data loaded. Shape:", df.shape)

# STEP 1: Convert 'term' to integer (36/60)
# The pattern is a raw string so '\d' reaches the regex engine verbatim
# (a plain '\d' literal is an invalid escape sequence and warns on recent Pythons).
# expand=False makes extract return a Series, so a column is replaced by a column
# rather than by a one-column DataFrame.
term_digits = df['term'].str.extract(r'(\d+)', expand=False)
# Nullable 'Int64' keeps rows without any digits as <NA> instead of failing the cast.
df['term'] = term_digits.astype('Int64')


# STEP 2: Convert 'emp_length' to numeric
# The labels are listed in ascending order, so each label's position in the
# list is exactly the number of years it should map to (0 through 10).
emp_length_labels = [
    '< 1 year', '1 year', '2 years', '3 years', '4 years', '5 years',
    '6 years', '7 years', '8 years', '9 years', '10+ years',
]
emp_length_map = {label: years for years, label in enumerate(emp_length_labels)}
# 'n/a' is mapped to a missing value explicitly.
emp_length_map['n/a'] = np.nan

emp_length_years = df['emp_length'].map(emp_length_map)
df['emp_length'] = emp_length_years


# STEP 3: Encode 'loan_status' as binary
# Fully Paid (0) and Charged Off (1)
loan_status_codes = {'Fully Paid': 0, 'Charged Off': 1}

# Keep only rows whose status is one of the two encoded outcomes; every other
# status is dropped. The explicit .copy() makes the filtered frame an
# independent object, so the column assignments that follow do not raise
# SettingWithCopyWarning.
has_encoded_status = df['loan_status'].isin(list(loan_status_codes))
df = df[has_encoded_status].copy()
df['loan_status'] = df['loan_status'].map(loan_status_codes)


# STEP 4: Calculate credit history length
# Parse 'earliest_cr_line' using the '%b-%Y' format (abbreviated month name, 4-digit year).
# errors='coerce' turns values that do not match the format into NaT instead of raising.
earliest_cr_line = pd.to_datetime(df['earliest_cr_line'], format='%b-%Y', errors='coerce')

# Year the history length is measured against. Defaults to the previously
# hard-coded 2025; an optional 'reference_year' entry in config.yaml overrides it.
# NOTE(review): measuring against each loan's issue date may be more meaningful
# than a fixed year, but no issue-date column is visible here - confirm upstream.
reference_year = config.get('reference_year', 2025)

# Derive 'credit_history_years' as a whole-year difference (months are ignored).
df['credit_history_years'] = reference_year - earliest_cr_line.dt.year

# Drop the original column (reassignment instead of inplace=True)
df = df.drop(columns='earliest_cr_line')


# Final checks: report the frame's dimensions, the surviving columns, and the
# five columns with the most missing values.
final_shape = df.shape
remaining_columns = df.columns.tolist()
print("Final dataset shape:", final_shape)
print("Remaining columns:", remaining_columns)

missing_per_column = df.isna().sum()
most_missing = missing_per_column.sort_values(ascending=False).head()
print("\n Remaining missing values:")
print(most_missing)

# Write the feature-engineered frame to the configured CSV path (index omitted)
# and report where it was saved.
featured_data_path = config['featured_data_path']
df.to_csv(featured_data_path, index=False)
print("Feature-engineered dataset saved to:", featured_data_path)


Cleaned data loaded. Shape: (2257158, 22)
Final dataset shape: (1344079, 22)
Remaining columns: ['loan_amnt', 'term', 'int_rate', 'installment', 'grade', 'emp_length', 'home_ownership', 'annual_inc', 'verification_status', 'purpose', 'dti', 'delinq_2yrs', 'inq_last_6mths', 'open_acc', 'pub_rec', 'revol_bal', 'revol_util', 'total_acc', 'fico_range_low', 'fico_range_high', 'loan_status', 'credit_history_years']

 Remaining missing values:
emp_length     78104
loan_amnt          0
term               0
int_rate           0
installment        0
dtype: int64
Feature-engineered dataset saved to: ../data/featured_lendingclub.csv
