# Feature Engineering

In [1]:
import pandas as pd

from sklearn.preprocessing import LabelEncoder

CLEANED_DATA_PATH = '../data/processed/lending-club-cleaned.csv'
FE_DATA_PATH = '../data/processed/lending-club-fe.csv'

In [2]:
df = pd.read_csv(CLEANED_DATA_PATH)
df.head()

Unnamed: 0,loan_amnt,term,int_rate,installment,grade,sub_grade,emp_length,home_ownership,annual_inc,verification_status,...,earliest_cr_line,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,application_type,mort_acc,pub_rec_bankruptcies
0,30000,36 months,22.35,1151.16,D,D5,5 years,MORTGAGE,100000.0,Source Verified,...,Jan-2012,11,1,15603,37.0,19,w,Joint App,1,1
1,40000,60 months,16.14,975.71,C,C4,< 1 year,MORTGAGE,45000.0,Verified,...,Jun-2009,18,0,34971,64.5,37,w,Joint App,1,0
2,20000,36 months,7.56,622.68,A,A3,10+ years,MORTGAGE,100000.0,Not Verified,...,Feb-1999,9,0,25416,29.9,19,w,Joint App,5,0
3,4500,36 months,11.31,147.99,B,B3,10+ years,RENT,38500.0,Not Verified,...,Dec-2003,12,0,4472,15.3,25,w,Individual,0,0
4,8425,36 months,27.27,345.18,E,E5,3 years,MORTGAGE,450000.0,Verified,...,Oct-1997,21,0,36812,65.7,37,w,Joint App,4,0


In [23]:
df_fe = df.copy()

### Convert datetime

In [24]:
# Convert the text to actual Datetime objects
# format='%b-%Y' tells Python that "Mar-2003" is Month-Year
df_fe['earliest_cr_line'] = pd.to_datetime(df_fe['earliest_cr_line'], format='%b-%Y')

# Pick a "Reference Date"
# Since this is historical data, we shouldn't use "today" (2026).
# We should use a date relevant to the dataset, like 2020 or the max date in the data.
# Let's assume the analysis is happening on Dec 31, 2020.
reference_date = pd.to_datetime('2015-09-30')

# Calculate the difference (Days -> Years)
# We divide by 365.25 to account for leap years
df_fe['credit_history_years'] = (reference_date - df_fe['earliest_cr_line']).dt.days / 365.25

# Check the results
print(df_fe[['earliest_cr_line', 'credit_history_years']].head())

  earliest_cr_line  credit_history_years
0       2012-01-01              3.745380
1       2009-06-01              6.329911
2       1999-02-01             16.659822
3       2003-12-01             11.830253
4       1997-10-01             17.995893


In [25]:
# Drop the original date column because the model can't handle it
df_fe = df_fe.drop(columns=['earliest_cr_line'])

# Verify it's gone
print(df_fe.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2436 entries, 0 to 2435
Data columns (total 24 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   loan_amnt             2436 non-null   int64  
 1   term                  2436 non-null   object 
 2   int_rate              2436 non-null   float64
 3   installment           2436 non-null   float64
 4   grade                 2436 non-null   object 
 5   sub_grade             2436 non-null   object 
 6   emp_length            2436 non-null   object 
 7   home_ownership        2436 non-null   object 
 8   annual_inc            2436 non-null   float64
 9   verification_status   2436 non-null   object 
 10  issue_d               2436 non-null   object 
 11  loan_status           2436 non-null   object 
 12  purpose               2436 non-null   object 
 13  dti                   2436 non-null   float64
 14  open_acc              2436 non-null   int64  
 15  pub_rec              

### Encoding

We'll make everything numerical.

Object columns:
```
term                     2
grade                    7
sub_grade               30
emp_length              11
home_ownership           4
verification_status      3
issue_d                  3
loan_status              2
purpose                 12
initial_list_status      2
application_type         2
```

In [26]:
# ORDINAL ENCODING
ordinal_cols = {
    'term': [' 36 months', ' 60 months'],
    'grade': ['A', 'B', 'C', 'D', 'E', 'F', 'G'],
    'sub_grade': [
        'A1', 'A2', 'A3', 'A4', 'A5',
        'B1', 'B2', 'B3', 'B4', 'B5',
        'C1', 'C2', 'C3', 'C4', 'C5',
        'D1', 'D2', 'D3', 'D4', 'D5',
        'E1', 'E2', 'E3', 'E4', 'E5',
        'F1', 'F2', 'F3', 'F4', 'F5',
        'G1', 'G2', 'G3', 'G4', 'G5'
    ],
    'emp_length': [
        '< 1 year', '1 year', '2 years', 
        '3 years', '4 years', '5 years', 
        '6 years', '7 years', '8 years', 
        '9 years', '10+ years'
    ],
    'verification_status': ['Not Verified', 'Verified', 'Source Verified'],
    'issue_d': ['Oct-2018', 'Nov-2018', 'Dec-2018'],
    
}

for col, order in ordinal_cols.items():
    if col in df_fe.columns:
        mapper = {val: i for i, val in enumerate(order)}
        df_fe[col] = df_fe[col].map(mapper)

# ONE-HOT ENCODING
categorical_cols = df_fe.select_dtypes(include=['object']).columns.tolist()

if 'loan_status' in categorical_cols:
    categorical_cols.remove('loan_status')

print(f"Auto-encoding these columns: {categorical_cols}")

df_fe = pd.get_dummies(df_fe, columns=categorical_cols, drop_first=True)

# VERIFY
print("Final Shape:", df_fe.shape)

Auto-encoding these columns: ['home_ownership', 'purpose', 'initial_list_status', 'application_type']
Final Shape: (2436, 36)


In [27]:
df_fe.head()

Unnamed: 0,loan_amnt,term,int_rate,installment,grade,sub_grade,emp_length,annual_inc,verification_status,issue_d,...,purpose_house,purpose_major_purchase,purpose_medical,purpose_moving,purpose_other,purpose_renewable_energy,purpose_small_business,purpose_vacation,initial_list_status_w,application_type_Joint App
0,30000,0,22.35,1151.16,3,19,5,100000.0,2,2,...,False,False,False,False,False,False,False,False,True,True
1,40000,1,16.14,975.71,2,13,0,45000.0,1,2,...,False,False,False,False,False,False,False,False,True,True
2,20000,0,7.56,622.68,0,2,10,100000.0,0,2,...,False,False,False,False,False,False,False,False,True,True
3,4500,0,11.31,147.99,1,7,10,38500.0,0,2,...,False,False,False,False,False,False,False,False,True,False
4,8425,0,27.27,345.18,4,24,3,450000.0,1,2,...,False,False,False,False,False,False,False,False,True,True


### Save

In [29]:
df_fe.to_csv(FE_DATA_PATH, index=False)

print(f"Final dataset shape: {df_fe.shape}")
print(f"\nFeature-engineered dataset saved to: {FE_DATA_PATH}")

Final dataset shape: (2436, 36)

Feature-engineered dataset saved to: ../data/processed/lending-club-fe.csv
