# Feature Engineering

In [1]:
import pandas as pd

from sklearn.preprocessing import LabelEncoder

# Input: the cleaned loan data that this notebook reads.
CLEANED_DATA_PATH = "../data/processed/lending-club-cleaned.csv"
# Output: destination of the feature-engineered frame written by the final cell.
FE_DATA_PATH = "../data/processed/lending-club-fe.csv"

In [2]:
# Read the cleaned dataset from disk and preview its first five rows.
df = pd.read_csv(filepath_or_buffer=CLEANED_DATA_PATH)
df.head(n=5)

Unnamed: 0,loan_amnt,term,int_rate,installment,grade,sub_grade,emp_length,home_ownership,annual_inc,verification_status,...,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,application_type,mort_acc,pub_rec_bankruptcies,mort_acc_missing
0,5000,36 months,11.53,164.96,B,B5,Unknown,OWN,21000.0,Verified,...,9.0,2.0,19947,55.7,27.0,w,Individual,0.0,1.0,0
1,30375,36 months,13.49,1030.64,C,C2,10+ years,OWN,108000.0,Verified,...,8.0,2.0,8959,48.7,14.0,w,Individual,2.0,0.0,0
2,13250,36 months,18.75,484.02,D,D3,10+ years,MORTGAGE,50000.0,Source Verified,...,7.0,0.0,8854,53.0,19.0,f,Individual,2.0,0.0,0
3,7000,36 months,7.07,216.37,A,A2,10+ years,MORTGAGE,50000.0,Not Verified,...,12.0,0.0,7258,31.8,28.0,w,Individual,1.0,0.0,0
4,5000,36 months,7.29,155.05,A,A4,10+ years,OWN,60000.0,Not Verified,...,7.0,0.0,3872,32.8,22.0,f,Individual,1.0,0.0,1


In [32]:
# Work on an independent (deep) copy so the loaded frame `df` stays untouched.
df_fe = df.copy(deep=True)

### Convert datetime

In [33]:
# Parse the "Mon-YYYY" text (e.g. "Mar-2003") into real datetime values and
# express each date as an age in years measured back from a fixed reference date.

# Reference date for the age features.
# The data is historical, so "today" is deliberately not used; instead the
# analysis is assumed to take place on Dec 31, 2018.
reference_date = pd.to_datetime('2018-12-31')

# source date column -> derived "years before reference_date" column
date_to_years_col = {
    'earliest_cr_line': 'credit_history_years',
    'issue_d': 'issue_years',
}

for date_col, years_col in date_to_years_col.items():
    # format='%b-%Y' tells pandas that the text is Month-Year
    df_fe[date_col] = pd.to_datetime(df_fe[date_col], format='%b-%Y')
    # Days -> years; dividing by 365.25 accounts for leap years
    elapsed = reference_date - df_fe[date_col]
    df_fe[years_col] = elapsed.dt.days / 365.25

# Check the results: each date column next to its derived age column
for date_col, years_col in date_to_years_col.items():
    print(df_fe[[date_col, years_col]].head())

  earliest_cr_line  credit_history_years
0       1978-05-01             40.668036
1       2007-05-01             11.668720
2       1992-04-01             26.748802
3       2005-09-01             13.330595
4       1999-01-01             19.997262
     issue_d  issue_years
0 2015-07-01     3.501711
1 2016-07-01     2.499658
2 2013-06-01     5.582478
3 2017-08-01     1.415469
4 2011-01-01     7.997262


In [34]:
# Drop the original date columns: they are datetime-typed, which the model
# can't handle, and the numeric credit_history_years / issue_years columns
# derived from them are kept instead.
df_fe = df_fe.drop(columns=['earliest_cr_line', 'issue_d'])

# Verify they are gone. DataFrame.info() prints the summary itself and returns
# None, so it is called directly; wrapping it in print() would emit a stray
# "None" after the summary.
df_fe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4994 entries, 0 to 4993
Data columns (total 25 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   loan_amnt             4994 non-null   int64  
 1   term                  4994 non-null   object 
 2   int_rate              4994 non-null   float64
 3   installment           4994 non-null   float64
 4   grade                 4994 non-null   object 
 5   sub_grade             4994 non-null   object 
 6   emp_length            4994 non-null   object 
 7   home_ownership        4994 non-null   object 
 8   annual_inc            4994 non-null   float64
 9   verification_status   4994 non-null   object 
 10  loan_status           4994 non-null   object 
 11  purpose               4994 non-null   object 
 12  dti                   4994 non-null   float64
 13  open_acc              4994 non-null   float64
 14  pub_rec               4994 non-null   float64
 15  revol_bal            

### Encoding

We'll make everything numerical.

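Object columns and their unique-value counts in the cleaned data (`issue_d` and `earliest_cr_line` were already converted to numeric features and dropped above):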
```
term                     2
grade                    7
sub_grade               35
emp_length              11
home_ownership           4
verification_status      3
issue_d                124
loan_status              2
purpose                 14
earliest_cr_line       490
initial_list_status      2
application_type         2
```

In [35]:
# Binary target: 0 = 'Fully Paid', 1 = 'Charged Off'.
loan_status_codes = {'Fully Paid': 0, 'Charged Off': 1}

# Series.map() silently turns every value missing from the mapping into NaN --
# including the already-encoded 0/1 values if this cell is run a second time.
# Fail loudly instead of silently wiping out the target column.
unexpected_statuses = set(df_fe['loan_status'].unique()) - set(loan_status_codes)
if unexpected_statuses:
    raise ValueError(
        f"Unexpected loan_status values: {sorted(map(str, unexpected_statuses))}. "
        "Expected only 'Fully Paid' / 'Charged Off' (was this cell already run?)."
    )

df_fe['loan_status'] = df_fe['loan_status'].map(loan_status_codes)

In [36]:
# ORDINAL ENCODING
# Each column is paired with its levels listed from lowest to highest rank;
# a level's position in the list becomes its integer code. Levels that are not
# listed (e.g. the 'Unknown' emp_length) come out of .map() as NaN.
grade_levels = ['A', 'B', 'C', 'D', 'E', 'F', 'G']

ordinal_cols = {
    'term': [' 36 months', ' 60 months'],
    'grade': list(grade_levels),
    # A1..A5, B1..B5, ..., G1..G5
    'sub_grade': [f'{grade}{step}' for grade in grade_levels for step in range(1, 6)],
    'emp_length': [
        '< 1 year',
        '1 year', '2 years', '3 years',
        '4 years', '5 years', '6 years',
        '7 years', '8 years', '9 years',
        '10+ years',
    ],
    'verification_status': ['Not Verified', 'Verified', 'Source Verified'],
}

for column_name, ranked_levels in ordinal_cols.items():
    # Skip columns that are not present in the frame.
    if column_name not in df_fe.columns:
        continue
    level_to_rank = dict(zip(ranked_levels, range(len(ranked_levels))))
    df_fe[column_name] = df_fe[column_name].map(level_to_rank)


# TARGET ENCODING
# Replace each purpose with the mean of loan_status among rows sharing it.
# NOTE(review): the means are computed over every row of df_fe, so this feature
# carries target information for all rows -- confirm this is acceptable with
# respect to any later train/test split.
purpose_target_encoding = df_fe.groupby('purpose')['loan_status'].mean()
print('Purpose Target Encoding:')
print(purpose_target_encoding)

df_fe['purpose'] = df_fe['purpose'].map(purpose_target_encoding)

# ONE-HOT ENCODING
# Whatever is still object-typed at this point gets dummy columns; the target
# is excluded in case it is still stored as text.
categorical_cols = [
    name
    for name in df_fe.select_dtypes(include=['object']).columns
    if name != 'loan_status'
]

print(f"\nAuto-encoding these columns: {categorical_cols}")

# drop_first=True drops the first level of each column.
df_fe = pd.get_dummies(df_fe, columns=categorical_cols, drop_first=True)

# VERIFY
print("\nFinal Shape:", df_fe.shape)

Purpose Target Encoding:
purpose
car                   0.169811
credit_card           0.163516
debt_consolidation    0.211666
educational           0.000000
home_improvement      0.170968
house                 0.162162
major_purchase        0.179688
medical               0.229730
moving                0.200000
other                 0.222571
renewable_energy      0.666667
small_business        0.298507
vacation              0.222222
wedding               0.400000
Name: loan_status, dtype: float64

Auto-encoding these columns: ['home_ownership', 'initial_list_status', 'application_type']

Final Shape: (4994, 27)


In [37]:
# Preview the first five rows of the encoded frame.
df_fe.head(n=5)

Unnamed: 0,loan_amnt,term,int_rate,installment,grade,sub_grade,emp_length,annual_inc,verification_status,loan_status,...,mort_acc,pub_rec_bankruptcies,mort_acc_missing,credit_history_years,issue_years,home_ownership_MORTGAGE,home_ownership_OWN,home_ownership_RENT,initial_list_status_w,application_type_Joint App
0,5000,0,11.53,164.96,1,9,,21000.0,1,0,...,0.0,1.0,0,40.668036,3.501711,False,True,False,True,False
1,30375,0,13.49,1030.64,2,11,10.0,108000.0,1,0,...,2.0,0.0,0,11.66872,2.499658,False,True,False,True,False
2,13250,0,18.75,484.02,3,17,10.0,50000.0,2,0,...,2.0,0.0,0,26.748802,5.582478,True,False,False,False,False
3,7000,0,7.07,216.37,0,1,10.0,50000.0,0,0,...,1.0,0.0,0,13.330595,1.415469,True,False,False,True,False
4,5000,0,7.29,155.05,0,3,10.0,60000.0,0,0,...,1.0,0.0,1,19.997262,7.997262,False,True,False,False,False


In [42]:
# Count the nulls per column once and report only the columns that have any.
null_counts = df_fe.isnull().sum()
columns_with_nulls = null_counts[null_counts > 0]
print(f'Null values after encoding:\n{columns_with_nulls}')

Null values after encoding:
emp_length    285
dtype: int64


### Impute Missing Values after Encoding

In [43]:
# Flag the rows whose emp_length is missing *before* the gaps are filled, so the
# "was unknown" signal survives the imputation.
emp_length_missing = df_fe['emp_length'].isna()

# Re-running this cell after the fill would find no NaN left and overwrite the
# indicator with all zeros. Only (re)build it when it does not exist yet or
# there are still gaps to flag, which keeps the cell safe to run more than once.
if 'emp_length_unknown' not in df_fe.columns or emp_length_missing.any():
    df_fe['emp_length_unknown'] = emp_length_missing.astype(int)

# Fill the gaps with the most frequent encoded value.
mode_value = df_fe['emp_length'].mode()[0]
df_fe['emp_length'] = df_fe['emp_length'].fillna(mode_value)

print(f"Mode value used for imputation: {mode_value}")
print(f"Missing emp_length values imputed: {df_fe['emp_length_unknown'].sum()}")

Mode value used for imputation: 10.0
Missing emp_length values imputed: 285


### Save

In [44]:
# Persist the feature-engineered frame (without the index column), then report
# its final shape and where it was written.
df_fe.to_csv(path_or_buf=FE_DATA_PATH, index=False)

final_shape = df_fe.shape
print(f"Final dataset shape: {final_shape}")
print(f"\nFeature-engineered dataset saved to: {FE_DATA_PATH}")

Final dataset shape: (4994, 28)

Feature-engineered dataset saved to: ../data/processed/lending-club-fe.csv
