In [6]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import LabelEncoder, StandardScaler
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:

# Source CSV that is read in this cell, and destination CSV that later
# cells write the processed frame to.
data_path = r"D:\Infosys Intern\CreditPathAI_Oct_Batch\kaggle_dataset\data\02_interim\loan_data_clean.csv"
preprocessed_path = r"D:\Infosys Intern\CreditPathAI_Oct_Batch\kaggle_dataset\data\03_processed\loan_data_preprocessed.csv"

# Load the source CSV into the working frame used by the rest of the notebook.
df = pd.read_csv(filepath_or_buffer=data_path)

# Structural summary: dimensions, column names, null counts (restricted to
# columns that actually contain nulls) and the number of duplicated rows.
print("Shape:", df.shape)
print("Columns:", list(df.columns))
print("Missing values:\n", df.isna().sum().loc[lambda counts: counts > 0])
print("Duplicate rows:", df.duplicated().sum())


Shape: (38480, 37)
Columns: ['Unnamed: 0', 'id', 'member_id', 'loan_amnt', 'funded_amnt', 'funded_amnt_inv', 'term', 'int_rate', 'installment', 'emp_length', 'home_ownership', 'annual_inc', 'verification_status', 'issue_d', 'loan_status', 'purpose', 'zip_code', 'addr_state', 'dti', 'delinq_2yrs', 'earliest_cr_line', 'inq_last_6mths', 'mths_since_last_delinq', 'open_acc', 'pub_rec', 'revol_bal', 'revol_util', 'total_acc', 'total_pymnt', 'total_pymnt_inv', 'total_rec_prncp', 'total_rec_int', 'last_pymnt_d', 'last_pymnt_amnt', 'next_pymnt_d', 'last_credit_pull_d', 'repay_fail']
Missing values:
 loan_amnt                     1
funded_amnt                   1
funded_amnt_inv               1
installment                   1
emp_length                  993
annual_inc                    2
delinq_2yrs                   1
inq_last_6mths                1
mths_since_last_delinq    24363
open_acc                      1
pub_rec                       1
revol_bal                     4
revol_util       

In [4]:
# Handling Missing Values
#
# Numeric columns are filled with their own median; object columns are
# filled with their most frequent value. The filled frame is then written
# to `preprocessed_path`.

numeric_cols = df.select_dtypes(include=[np.number]).columns
for col in numeric_cols:
    df[col] = df[col].fillna(df[col].median())

categorical_cols = df.select_dtypes(include=["object"]).columns
for col in categorical_cols:
    modes = df[col].mode()
    # mode() returns an empty Series when every value in the column is NaN;
    # indexing it with [0] would raise, so such a column is left untouched.
    if modes.empty:
        continue
    df[col] = df[col].fillna(modes.iloc[0])

df.to_csv(preprocessed_path, index=False)

print("Missing values handled")
# Total number of nulls still present; a non-zero value means at least one
# column had no usable fill value (for example, it was entirely NaN).
print(df.isnull().sum().sum())


Missing values handled
0


In [9]:
# Data Type Conversion
#
# Steps performed by this cell, in order:
#   1. drop the columns listed in `drop_cols`;
#   2. keep only rows whose `loan_amnt` is present and strictly positive;
#   3. derive `emp_length_years` from the `emp_length` labels;
#   4. convert `term` to an integer by extracting its digits;
#   5. label-encode every remaining object column;
#   6. write the frame to `preprocessed_path`.

drop_cols = [
    'Unnamed: 0', 'id', 'member_id', 'zip_code', 'addr_state',
    'issue_d', 'earliest_cr_line', 'mths_since_last_delinq',
    'last_pymnt_d', 'next_pymnt_d', 'last_credit_pull_d'
]

# errors='ignore' already skips names that are not present, so no separate
# membership filter is needed and the cell tolerates being re-run.
df = df.drop(columns=drop_cols, errors='ignore')
print("After Dropping Columns:", df.shape)

df = df.dropna(subset=['loan_amnt'])
df = df[df['loan_amnt'] > 0]

emp_map = {
    '< 1 year': 0,
    '1 year': 1, '2 years': 2, '3 years': 3, '4 years': 4,
    '5 years': 5, '6 years': 6, '7 years': 7, '8 years': 8,
    '9 years': 9, '10+ years': 10
}

# The label-encoding loop further down overwrites `emp_length` with integer
# codes. If this cell is executed again, mapping those codes through the
# string-keyed `emp_map` matches nothing and would reset every
# `emp_length_years` value to -1. The derived column is therefore only
# (re)computed while `emp_length` is not yet numeric, or when the derived
# column does not exist at all.
emp_length_already_encoded = pd.api.types.is_numeric_dtype(df['emp_length'])
if not emp_length_already_encoded or 'emp_length_years' not in df.columns:
    # Labels that are not keys of emp_map become -1.
    df['emp_length_years'] = df['emp_length'].map(emp_map).fillna(-1)


# Strip the unit text, pull out the first run of digits and cast to int.
# astype(str) first keeps this working when `term` is already an integer.
df['term'] = (
    df['term']
    .astype(str)
    .str.replace(" months", "", regex=False)
    .str.replace(" month", "", regex=False)
    .str.extract(r"(\d+)", expand=False)
    .astype(int)
)

# NOTE: the same encoder object is refit for every column, so after the loop
# `le` only holds the classes of the last object column that was encoded.
le = LabelEncoder()
for col in df.select_dtypes(include="object").columns:
    df[col] = le.fit_transform(df[col])

print("After Encoding:")
print(df.head())



df.to_csv(preprocessed_path, index=False)
print(" Saved after type conversion")

After Dropping Columns: (38479, 27)
After Encoding:
   loan_amnt  funded_amnt  funded_amnt_inv  term  int_rate  installment  \
1     2500.0       2500.0           2500.0    36     13.98        85.42   
2     5000.0       5000.0           5000.0    36     15.95       175.67   
3     7000.0       7000.0           7000.0    36      9.91       225.58   
4     2000.0       2000.0           2000.0    36      5.42        60.32   
5     3600.0       3600.0           3600.0    36     10.25       116.59   

   emp_length  home_ownership  annual_inc  verification_status  ...  \
1           4               4     20004.0                    0  ...   
2           4               4     59000.0                    0  ...   
3           1               0     53796.0                    0  ...   
4           1               4     30000.0                    0  ...   
5           1               0    675048.0                    0  ...   

   revol_bal  revol_util  total_acc  total_pymnt  total_pymnt_inv  \
1

In [None]:
#Encode Features
#
# NOTE: everything between the triple quotes below is a string literal, not
# live code, so this cell performs no encoding and writes no file when run.
# NOTE(review): the saved output under this cell looks like it comes from an
# earlier run in which this code was active — confirm and clear if stale.

# The text inside the string is a one-hot encoding of selected categorical
# columns (not an ordinal encoding of emp_length; that mapping is done via
# `emp_map` in the data-type conversion cell).
"""
# One-hot encoding for selected categorical variables
onehot_cols = ["home_ownership", "purpose", "verification_status", "addr_state"]
existing_cols = [col for col in onehot_cols if col in df.columns]
df = pd.get_dummies(df, columns=existing_cols, drop_first=True)

print(" Categorical features encoded")
df.head()

df.to_csv(preprocessed_path, index=False)
print(" Saved after encoding")
"""

 Categorical features encoded
 Saved after encoding


In [None]:
#Feature Engineering
#
# NOTE: everything between the triple quotes below is a string literal, not
# live code, so this cell adds no columns and writes no file when run.
# If it is re-enabled, note that "issue_d" is listed in `drop_cols` of the
# data-type conversion cell, so the loan-age branch is skipped once that
# cell has run. The subtraction `today - df["issue_d"]` presumably needs
# `issue_d` to be a datetime column — TODO confirm before enabling.

"""
# Example features
if all(col in df.columns for col in ["annual_inc", "loan_amnt"]):
    df["income_to_loan_ratio"] = df["annual_inc"] / df["loan_amnt"]

if all(col in df.columns for col in ["revol_bal", "annual_inc"]):
    df["credit_utilization_ratio"] = df["revol_bal"] / df["annual_inc"]

if "issue_d" in df.columns:
    today = pd.to_datetime("today")
    df["loan_age_in_months"] = (today - df["issue_d"]).dt.days // 30

if all(col in df.columns for col in ["total_pymnt", "loan_amnt"]):
    df["payment_to_loan_ratio"] = df["total_pymnt"] / df["loan_amnt"]

print("Feature engineering done")
df.head()

# Save
df.to_csv(preprocessed_path, index=False)
print("Saved after feature engineering")

"""

In [11]:
# Text preview of the first five rows of the current working frame.
print(df.head(n=5))

   loan_amnt  funded_amnt  funded_amnt_inv  term  int_rate  installment  \
1     2500.0       2500.0           2500.0    36     13.98        85.42   
2     5000.0       5000.0           5000.0    36     15.95       175.67   
3     7000.0       7000.0           7000.0    36      9.91       225.58   
4     2000.0       2000.0           2000.0    36      5.42        60.32   
5     3600.0       3600.0           3600.0    36     10.25       116.59   

   emp_length  annual_inc  loan_status    dti  ...  purpose_6  purpose_7  \
1         NaN     20004.0            4  19.86  ...      False      False   
2         NaN     59000.0            0  19.57  ...      False      False   
3         NaN     53796.0            5  10.80  ...      False      False   
4         NaN     30000.0            5   3.60  ...      False      False   
5         NaN    675048.0            4   1.55  ...      False      False   

   purpose_8  purpose_9  purpose_10  purpose_11  purpose_12  purpose_13  \
1      False     