# Credit Risk Assessment: Data Wrangling Pipeline

---

### Import Libraries and set Configurations

In [1]:
import os
import warnings
from pathlib import Path
import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import sparse
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import VarianceThreshold

pd.set_option("display.max_columns", 200)
warnings.filterwarnings("ignore")

random_state = 42
rare_min = 200
data_filepath = Path("../../data/raw/accepted_2007_to_2018Q4.csv") 

### Import Dataset

In [2]:
df = pd.read_csv(data_filepath, low_memory=False)
print(f"DatasetLoaded: {data_filepath.name}")
print("Raw Shape:", df.shape)
display(df.head())

DatasetLoaded: accepted_2007_to_2018Q4.csv
Raw Shape: (2260701, 151)


Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,url,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,mths_since_last_major_derog,policy_code,application_type,annual_inc_joint,dti_joint,verification_status_joint,acc_now_delinq,tot_coll_amt,tot_cur_bal,open_acc_6m,open_act_il,open_il_12m,open_il_24m,mths_since_rcnt_il,total_bal_il,il_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,total_cu_tl,inq_last_12m,acc_open_past_24mths,avg_cur_bal,bc_open_to_buy,bc_util,chargeoff_within_12_mths,delinq_amnt,mo_sin_old_il_acct,mo_sin_old_rev_tl_op,mo_sin_rcnt_rev_tl_op,mo_sin_rcnt_tl,mort_acc,mths_since_recent_bc,mths_since_recent_bc_dlq,mths_since_recent_inq,mths_since_recent_revol_delinq,num_accts_ever_120_pd,num_actv_bc_tl,num_actv_rev_tl,num_bc_sats,num_bc_tl,num_il_tl,num_op_rev_tl,num_rev_accts,num_rev_tl_bal_gt_0,num_sats,num_tl_120dpd_2m,num_tl_30dpd,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,revol_bal_joint,sec_app_fico_range_low,sec_app_fico_range_high,sec_app_earliest_cr_line,sec_app_inq_last_6mths,sec_app_mort_acc,sec_app_open_acc,sec_app_revol_util,sec_app_open_act_il,sec_app_num_rev_accts,sec_app_chargeoff_within_12_mths,sec_app_collections_12_mths_ex_med,sec_app_mths_since_last_major_derog,hardship_flag,hardship_type,hardship_reason,hardship_status,deferral_term,hardship_amount,hardship_start_date,hardship_end_date,payment_plan_start_date,hardship_length,hardship_dpd,hardship_loan_status,orig_projected_additional_accrued_interest,hardship_payoff_balance_amount,hardship_last_payment_amount,disbursement_method,debt_settlement_flag,debt_settlement_flag_date,settlement_status,settlement_date,settlement_amount,settlement_percentage,settlement_term
0,68407277,,3600.0,3600.0,3600.0,36 months,13.99,123.03,C,C4,leadman,10+ years,MORTGAGE,55000.0,Not Verified,Dec-2015,Fully Paid,n,https://lendingclub.com/browse/loanDetail.acti...,,debt_consolidation,Debt consolidation,190xx,PA,5.91,0.0,Aug-2003,675.0,679.0,1.0,30.0,,7.0,0.0,2765.0,29.7,13.0,w,0.0,0.0,4421.723917,4421.72,3600.0,821.72,0.0,0.0,0.0,Jan-2019,122.67,,Mar-2019,564.0,560.0,0.0,30.0,1.0,Individual,,,,0.0,722.0,144904.0,2.0,2.0,0.0,1.0,21.0,4981.0,36.0,3.0,3.0,722.0,34.0,9300.0,3.0,1.0,4.0,4.0,20701.0,1506.0,37.2,0.0,0.0,148.0,128.0,3.0,3.0,1.0,4.0,69.0,4.0,69.0,2.0,2.0,4.0,2.0,5.0,3.0,4.0,9.0,4.0,7.0,0.0,0.0,0.0,3.0,76.9,0.0,0.0,0.0,178050.0,7746.0,2400.0,13734.0,,,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
1,68355089,,24700.0,24700.0,24700.0,36 months,11.99,820.28,C,C1,Engineer,10+ years,MORTGAGE,65000.0,Not Verified,Dec-2015,Fully Paid,n,https://lendingclub.com/browse/loanDetail.acti...,,small_business,Business,577xx,SD,16.06,1.0,Dec-1999,715.0,719.0,4.0,6.0,,22.0,0.0,21470.0,19.2,38.0,w,0.0,0.0,25679.66,25679.66,24700.0,979.66,0.0,0.0,0.0,Jun-2016,926.35,,Mar-2019,699.0,695.0,0.0,,1.0,Individual,,,,0.0,0.0,204396.0,1.0,1.0,0.0,1.0,19.0,18005.0,73.0,2.0,3.0,6472.0,29.0,111800.0,0.0,0.0,6.0,4.0,9733.0,57830.0,27.1,0.0,0.0,113.0,192.0,2.0,2.0,4.0,2.0,,0.0,6.0,0.0,5.0,5.0,13.0,17.0,6.0,20.0,27.0,5.0,22.0,0.0,0.0,0.0,2.0,97.4,7.7,0.0,0.0,314017.0,39475.0,79300.0,24667.0,,,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
2,68341763,,20000.0,20000.0,20000.0,60 months,10.78,432.66,B,B4,truck driver,10+ years,MORTGAGE,63000.0,Not Verified,Dec-2015,Fully Paid,n,https://lendingclub.com/browse/loanDetail.acti...,,home_improvement,,605xx,IL,10.78,0.0,Aug-2000,695.0,699.0,0.0,,,6.0,0.0,7869.0,56.2,18.0,w,0.0,0.0,22705.924294,22705.92,20000.0,2705.92,0.0,0.0,0.0,Jun-2017,15813.3,,Mar-2019,704.0,700.0,0.0,,1.0,Joint App,71000.0,13.85,Not Verified,0.0,0.0,189699.0,0.0,1.0,0.0,4.0,19.0,10827.0,73.0,0.0,2.0,2081.0,65.0,14000.0,2.0,5.0,1.0,6.0,31617.0,2737.0,55.9,0.0,0.0,125.0,184.0,14.0,14.0,5.0,101.0,,10.0,,0.0,2.0,3.0,2.0,4.0,6.0,4.0,7.0,3.0,6.0,0.0,0.0,0.0,0.0,100.0,50.0,0.0,0.0,218418.0,18696.0,6200.0,14877.0,,,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
3,66310712,,35000.0,35000.0,35000.0,60 months,14.85,829.9,C,C5,Information Systems Officer,10+ years,MORTGAGE,110000.0,Source Verified,Dec-2015,Current,n,https://lendingclub.com/browse/loanDetail.acti...,,debt_consolidation,Debt consolidation,076xx,NJ,17.06,0.0,Sep-2008,785.0,789.0,0.0,,,13.0,0.0,7802.0,11.6,17.0,w,15897.65,15897.65,31464.01,31464.01,19102.35,12361.66,0.0,0.0,0.0,Feb-2019,829.9,Apr-2019,Mar-2019,679.0,675.0,0.0,,1.0,Individual,,,,0.0,0.0,301500.0,1.0,1.0,0.0,1.0,23.0,12609.0,70.0,1.0,1.0,6987.0,45.0,67300.0,0.0,1.0,0.0,2.0,23192.0,54962.0,12.1,0.0,0.0,36.0,87.0,2.0,2.0,1.0,2.0,,,,0.0,4.0,5.0,8.0,10.0,2.0,10.0,13.0,5.0,13.0,0.0,0.0,0.0,1.0,100.0,0.0,0.0,0.0,381215.0,52226.0,62500.0,18000.0,,,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
4,68476807,,10400.0,10400.0,10400.0,60 months,22.45,289.91,F,F1,Contract Specialist,3 years,MORTGAGE,104433.0,Source Verified,Dec-2015,Fully Paid,n,https://lendingclub.com/browse/loanDetail.acti...,,major_purchase,Major purchase,174xx,PA,25.37,1.0,Jun-1998,695.0,699.0,3.0,12.0,,12.0,0.0,21929.0,64.5,35.0,w,0.0,0.0,11740.5,11740.5,10400.0,1340.5,0.0,0.0,0.0,Jul-2016,10128.96,,Mar-2018,704.0,700.0,0.0,,1.0,Individual,,,,0.0,0.0,331730.0,1.0,3.0,0.0,3.0,14.0,73839.0,84.0,4.0,7.0,9702.0,78.0,34000.0,2.0,1.0,3.0,10.0,27644.0,4567.0,77.5,0.0,0.0,128.0,210.0,4.0,4.0,6.0,4.0,12.0,1.0,12.0,0.0,4.0,6.0,5.0,9.0,10.0,7.0,19.0,6.0,12.0,0.0,0.0,0.0,4.0,96.6,60.0,0.0,0.0,439570.0,95768.0,20300.0,88097.0,,,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,


In [3]:
df = df[df["id"].notna()].copy() #drops rows like the footer with no id data

status_mapping = {"Fully Paid": 0, "Charged Off": 1}
df=df[df["loan_status"].isin(status_mapping.keys())].copy()
df["default"] = df["loan_status"].map(status_mapping)
print("After mapping loan_status to default:\n", df.shape)
print("Class distribution:\n", df["default"].value_counts(normalize=True))

After mapping loan_status to default:
 (1345310, 152)
Class distribution:
 default
0    0.800374
1    0.199626
Name: proportion, dtype: float64


### EDA

In [4]:
print("Column count:", len(df.columns))
display(df.info(memory_usage="deep"))

Column count: 152
<class 'pandas.core.frame.DataFrame'>
Index: 1345310 entries, 0 to 2260697
Columns: 152 entries, id to default
dtypes: float64(113), int64(1), object(38)
memory usage: 3.5 GB


None

In [5]:
#Check for null values
df.isnull().sum()

id                             0
member_id                1345310
loan_amnt                      0
funded_amnt                    0
funded_amnt_inv                0
                          ...   
settlement_date          1312034
settlement_amount        1312034
settlement_percentage    1312034
settlement_term          1312034
default                        0
Length: 152, dtype: int64

In [6]:
#Check for duplicates
df.duplicated().sum()

0

In [7]:
#Descriptive statistics
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
member_id,0.0,,,,,,,
loan_amnt,1345310.0,14419.972014,8717.050787,500.00,8000.0000,12000.00,20000.0000,40000.00
funded_amnt,1345310.0,14411.556630,8713.117909,500.00,8000.0000,12000.00,20000.0000,40000.00
funded_amnt_inv,1345310.0,14389.141598,8715.493989,0.00,7875.0000,12000.00,20000.0000,40000.00
int_rate,1345310.0,13.239619,4.768716,5.31,9.7500,12.74,15.9900,30.99
...,...,...,...,...,...,...,...,...
hardship_last_payment_amount,5754.0,184.689314,196.459790,0.01,39.5700,120.97,267.6050,1407.86
settlement_amount,33276.0,5029.933417,3684.827275,44.21,2228.6175,4174.68,6884.2375,33601.00
settlement_percentage,33276.0,47.691708,7.306107,0.20,45.0000,45.00,50.0000,521.35
settlement_term,33276.0,13.158132,8.235592,0.00,6.0000,14.00,18.0000,181.00


In [8]:
#Check for object columns
print("Object columns:")
df.describe(include=["object"]).T

Object columns:


Unnamed: 0,count,unique,top,freq
id,1345310,1345310,68407277,1.0
term,1345310,2,36 months,1020743.0
grade,1345310,7,B,392741.0
sub_grade,1345310,35,C1,85494.0
emp_title,1259525,378353,Teacher,21268.0
emp_length,1266799,11,10+ years,442199.0
home_ownership,1345310,6,MORTGAGE,665579.0
verification_status,1345310,3,Source Verified,521273.0
issue_d,1345310,139,Mar-2016,48937.0
loan_status,1345310,2,Fully Paid,1076751.0


### Data Cleaning

In [9]:
#Drop columns that are irrelevant to the model or result in high leakage
leaky_cols = ["total_pymnt","total_pymnt_inv","total_rec_prncp","total_rec_int",
    "total_rec_late_fee","recoveries","collection_recovery_fee",
    "out_prncp","out_prncp_inv","last_pymnt_d","last_pymnt_amnt",
    "next_pymnt_d","last_credit_pull_d","hardship_flag","hardship_status",
    "settlement_status", "pymnt_plan", "initial_list_status", "application_type", "addr_state"]
identifiers_to_drop=["id","member_id","policy_code","url"]
free_text_to_drop=["emp_title","title","desc","zip_code"]
constant_cols= [c for c in df.columns if df[c].nunique(dropna=False)==1]

na_threshold = 0.30
high_na_cols = df.columns[df.isna().mean()>na_threshold].tolist()
drop_cols = set(leaky_cols+identifiers_to_drop+free_text_to_drop+constant_cols+high_na_cols)
df.drop(columns=[c for c in drop_cols if c in df.columns], inplace=True)

print(f"Dropped {len(drop_cols)} columns. New shape is: {df.shape}")

Dropped 81 columns. New shape is: (1345310, 71)


### Missing Indicator Flags

In [10]:
for col in df.columns:
    if df[col].isna().any():
        df[f"{col}_na"]=df[col].isna().astype(int)
print("missing values flag added")

missing values flag added


### Feature Engineering

In [11]:
pct_cols = ["int_rate", "revol_util"]
for col in pct_cols:
    if col in df.columns:
        df[col]=pd.to_numeric(df[col].astype(str).str.rstrip("%"),errors="coerce")/100

date_cols = ["issue_d", "earliest_cr_line"]
for col in date_cols:
    if col in df.columns:
        df[col] = pd.to_datetime(df[col], format="%b-%Y", errors="coerce")

df["credit_age_months"] = (df["issue_d"]-df["earliest_cr_line"]).dt.days//30
df["loan_to_income"]= df["loan_amnt"]/(df["annual_inc"]+1)
df["installment_to_income"]= df["installment"]/(df["annual_inc"]+1)

for col in ["loan_amnt","annual_inc"]:
    if col in df.columns:
        df[col]=np.log1p(df[col])

print("Formats standardized (percentages, dates, ordinals)")
display(df[pct_cols + date_cols].head())

Formats standardized (percentages, dates, ordinals)


Unnamed: 0,int_rate,revol_util,issue_d,earliest_cr_line
0,0.1399,0.297,2015-12-01,2003-08-01
1,0.1199,0.192,2015-12-01,1999-12-01
2,0.1078,0.562,2015-12-01,2000-08-01
4,0.2245,0.645,2015-12-01,1998-06-01
5,0.1344,0.684,2015-12-01,1987-10-01


### Encoding

In [12]:
ordinal_cols = [c for c in ("grade","sub_grade","emp_length") if c in df.columns]
numeric_cols = df.select_dtypes(include="number").columns.difference(["default"])
categorical_cols = (df.select_dtypes(include="object").columns.difference(["loan_status"]).difference(ordinal_cols))

grade_order = list("ABCDEFG")
sub_grade_order = [g+str(i) for g in grade_order for i in range(1,6)]
emp_len_order = ['< 1 year','1 year','2 years','3 years','4 years','5 years',
                 '6 years','7 years','8 years','9 years','10+ years']

ordinal_enc = OrdinalEncoder(
    categories=[grade_order, sub_grade_order, emp_len_order][: len(ordinal_cols)],
    handle_unknown="use_encoded_value", unknown_value=-1
)

nom_enc = OneHotEncoder(handle_unknown="ignore", sparse_output=True, min_frequency=rare_min)

num_pipe = Pipeline([
    ("impute", SimpleImputer(strategy="median")),
    ("scale",  StandardScaler(with_mean=False)) 
])

preprocessor = ColumnTransformer([
    ("num", num_pipe,list(numeric_cols)),
    ("ord", ordinal_enc,ordinal_cols),
    ("cat", nom_enc,list(categorical_cols)),
])

print(
    f"Numeric: {len(numeric_cols)}, Ordinal: {len(ordinal_cols)}, Nominal: {len(categorical_cols)}"
)

Numeric: 103, Ordinal: 3, Nominal: 6


### Train-Test Split

In [13]:
X = df.drop(columns=["default", "loan_status"])
y = df["default"]

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=random_state,
    stratify=y,
)

print("Train shape:", X_train.shape)
print("Test shape :", X_test.shape)

Train shape: (1076248, 114)
Test shape : (269062, 114)


In [16]:
Xt_train = preprocessor.fit_transform(X_train)
Xt_test  = preprocessor.transform(X_test)

#drop all-zero columns created by split
vt = VarianceThreshold(threshold=0.0)
Xt_train = vt.fit_transform(Xt_train)
Xt_test  = vt.transform(Xt_test)
Xt_train_sparse = sparse.csr_matrix(Xt_train)
Xt_test_sparse  = sparse.csr_matrix(Xt_test)

print("encoded train", Xt_train.shape)

encoded train (1076248, 134)


### Save Data

In [18]:
processed = Path("../../data/processed"); processed.mkdir(exist_ok=True)
sparse.save_npz(processed/"X_train.npz",Xt_train_sparse)
sparse.save_npz(processed/"X_test.npz",Xt_test_sparse)
y_train.to_csv(processed/"y_train.csv",index=False)
y_test.to_csv(processed/"y_test.csv",index=False)
joblib.dump(preprocessor, processed/"preprocessor.joblib")
joblib.dump(vt, processed/"vt.joblib")
print("saved to", processed.resolve())

saved to /Users/pranavrao/Documents/ai-ml-projects/github-repos/aura-xai-finrisk-llm/data/processed


In [23]:
readme = f"""\
# Lending Club – Preprocessed Data Manifest

_All data in this folder was generated on {{pd.Timestamp.utcnow().date()}}._

## 1. Raw Input  
`{data_filepath.name}` — full **accepted loans 2007–2018Q4** CSV from Kaggle.

## 2. Target Definition  
* **loan_status → default**  
  * 1 = “Charged Off”  
  * 0 = “Fully Paid”  
* All other statuses were **dropped**.

## 3. Column Filtering  
* **Identifiers & URLs**: `id`, `member_id`, `url`, `policy_code` → _dropped_  
* **Free-text / high-cardinality**: `emp_title`, `title`, `desc` → _dropped_  
* **Post-issuance leakage** (payments, recoveries, balances, hardship flags) → _dropped_  
* **Constant columns** (`nunique()==1`) → _dropped_  
* **High-missing** (> {na_threshold:.0%} NaN) → _dropped_  
* Final shape after drops: **{df.shape}**.

## 4. Feature Engineering  
| Derived Feature | Formula | Notes |
|-----------------|---------|-------|
| `loan_to_income` | `loan_amnt / (annual_inc+1)` | ratio |
| `installment_to_income` | `installment / (annual_inc+1)` | ratio |
| `credit_age_months` | months between `issue_d` and `earliest_cr_line` | borrower history |
| `term_months` / `long_term` | extracted from text “36 months” / `(term==60)` | duration flag |
| Log-transform | `loan_amnt`, `annual_inc` → `log1p()` | reduce skew |

## 5. Missing-Value Strategy  
* **Numeric** → `median` imputation  
* **Ordinal** (`grade`, `sub_grade`, `emp_length`) → sentinel **–1** via `OrdinalEncoder`  
* **Nominal** → `most_frequent` (then rare-lump)  
* Missing-indicator flags added for every column that contained NaNs.

## 6. Encoding  
* **OrdinalEncoder** with explicit category order  
* **OneHotEncoder** with `min_frequency={rare_min}` (rare values lumped into “other”)  
* Output kept **sparse CSR**.

## 7. Scaling  
* `StandardScaler(with_mean=False)` applied to numeric features (safe for CSR).

## 8. Variance Filter  
* `VarianceThreshold(0.0)` removed zero-variance columns created after train/test split.

## 9. Train/Test Split  
* Stratified 80 / 20 on `default`  
* Shapes: `Xt_train` {Xt_train.shape}   `Xt_test` {Xt_test.shape}

## 10. Class Imbalance  
* Ratio ≈ {round(y_train.value_counts()[0]/y_train.value_counts()[1],2)} : 1    
  * Handled with **`class_weight="balanced"`** for linear & tree models.

## 11. Saved Artefacts  
| File | Description |
|------|-------------|
| `X_train.npz` | CSR matrix – encoded train features |
| `X_test.npz`  | CSR matrix – encoded test features |
| `y_train.csv` | Train labels |
| `y_test.csv`  | Test labels |
| `preprocessor.joblib` | Fitted impute/scale/encode pipeline |
| `vt.joblib` | Fitted variance-threshold mask |

---
"""
save_path = Path("../../data/processed")
with open(save_path / "README.md", "w", encoding="utf-8") as f:
    f.write(readme)

print("README.md written to", (processed / "README.md").resolve())

README.md written to /Users/pranavrao/Documents/ai-ml-projects/github-repos/aura-xai-finrisk-llm/data/processed/README.md
