In [128]:
import pandas as pd
# Location of the raw loan export, named once so it is easy to repoint.
LOAN_CSV_PATH = "/content/loan.csv"

# low_memory=False: parse the file in one go rather than in chunks, which
# avoids mixed-type inference within a column.
df = pd.read_csv(LOAN_CSV_PATH, low_memory=False)
df.shape


(39717, 111)

In [129]:
# Keep only completed loans: rows whose status is anything other than
# 'Fully Paid' or 'Charged Off' are dropped.
# .copy() makes the filtered frame independent of the raw frame before a new
# column is added to it.
completed_statuses = ['Fully Paid', 'Charged Off']
df = df[df['loan_status'].isin(completed_statuses)].copy()

# Create target variable: 1 = 'Charged Off', 0 = 'Fully Paid'.
# A vectorized comparison replaces the per-row Python lambda.
df['default'] = (df['loan_status'] == 'Charged Off').astype('int64')

# Check result: frame shape and class balance of the target
df.shape, df['default'].value_counts(normalize=True)


((38577, 112),
 default
 0    0.854136
 1    0.145864
 Name: proportion, dtype: float64)

In [130]:
# Columns carried forward into the model; everything else is discarded.
risk_columns = [
    # loan characteristics
    'loan_amnt', 'term', 'int_rate', 'installment', 'purpose',
    # borrower profile
    'annual_inc', 'emp_length', 'home_ownership', 'dti',
    # credit-file fields
    'delinq_2yrs', 'revol_util', 'open_acc', 'total_acc', 'pub_rec',
    # grade fields
    'grade', 'sub_grade',
    # target created from loan_status
    'default',
]

# Narrow the frame to exactly these columns, in this order.
df = df.loc[:, risk_columns]

df.shape


(38577, 17)

In [131]:
# Percentage of missing values per column, largest first
n_rows = len(df)
missing_pct = df.isna().sum().div(n_rows).mul(100)
missing_pct.sort_values(ascending=False)


Unnamed: 0,0
emp_length,2.677761
revol_util,0.129611
loan_amnt,0.0
installment,0.0
term,0.0
purpose,0.0
annual_inc,0.0
home_ownership,0.0
int_rate,0.0
dti,0.0


In [132]:
# Fill emp_length: missing values become an explicit 'Unknown' label
df['emp_length'] = df['emp_length'].fillna('Unknown')

# Convert revol_util to numeric (remove % if present).
# The column is passed through astype(str) first so this cell can be re-run:
# once revol_util is already float, a bare `.str` accessor raises
# AttributeError. Missing values survive the round trip as NaN.
df['revol_util'] = (
    df['revol_util']
    .astype(str)
    .str.replace('%', '', regex=False)
    .astype(float)
)

# Fill revol_util with median.
# NOTE(review): the median is computed over the whole frame, i.e. before the
# train/test split that happens later in the notebook.
df['revol_util'] = df['revol_util'].fillna(df['revol_util'].median())

# Re-check missing values (percent per column)
(df.isna().sum() / len(df)) * 100


Unnamed: 0,0
loan_amnt,0.0
term,0.0
int_rate,0.0
installment,0.0
purpose,0.0
annual_inc,0.0
emp_length,0.0
home_ownership,0.0
dti,0.0
delinq_2yrs,0.0


In [133]:
import numpy as np

# --- Convert term ---
# Extract the first run of digits from each value and store it as an integer.
#   * raw string: '\d' inside a plain literal is an invalid escape sequence
#   * expand=False: get a Series back instead of a one-column DataFrame
#   * astype(str): keeps the cell re-runnable once term is already numeric
df['term'] = df['term'].astype(str).str.extract(r'(\d+)', expand=False).astype(int)

# --- Robust emp_length conversion ---
def emp_length_to_num(x):
    """Convert an employment-length label to an integer number of years.

    Parameters
    ----------
    x : str, int or missing
        A label such as '< 1 year', '3 years', '10+ years' or 'Unknown',
        a missing value, or an integer produced by an earlier call.

    Returns
    -------
    int
        -1 for missing, 'Unknown' or unparsable input; 0 for labels
        containing '<'; 10 for labels containing '10+'; otherwise the leading
        integer of a label containing 'year'. Integer input is returned
        unchanged.
    """
    if pd.isna(x):
        return -1

    # Already-converted values pass through, so applying the function a second
    # time (e.g. when the cell is re-run) does not collapse every row to -1.
    if isinstance(x, (int, np.integer)):
        return int(x)

    text = str(x)
    if 'Unknown' in text:
        return -1
    if '<' in text:
        return 0
    if '10+' in text:
        return 10
    if 'year' in text:
        try:
            return int(text.split()[0])
        except ValueError:
            # First token is not a number: treat as unknown.
            return -1
    return -1

# Run the converter over every value and store the numeric codes back
emp_length_numeric = df['emp_length'].apply(emp_length_to_num)
df['emp_length'] = emp_length_numeric

# Quick check: first rows of both converted columns, plus the distribution
# of emp_length codes in ascending order
preview = df[['term', 'emp_length']].head()
emp_length_counts = df['emp_length'].value_counts().sort_index()
preview, emp_length_counts


  df['term'] = df['term'].str.extract('(\d+)').astype(int)


(   term  emp_length
 0    36          10
 1    60           0
 2    36          10
 3    36          10
 5    36           3,
 emp_length
 -1     1033
  0     4508
  1     3169
  2     4291
  3     4012
  4     3342
  5     3194
  6     2168
  7     1711
  8     1435
  9     1226
  10    8488
 Name: count, dtype: int64)

In [134]:
# Split the frame into the target vector y and the feature matrix X
target_col = 'default'
y = df[target_col]
X = df.drop(columns=[target_col])

X.shape, y.shape



((38577, 16), (38577,))

In [135]:
# Columns to expand into indicator (dummy) columns
cat_cols = ['purpose', 'home_ownership', 'grade', 'sub_grade']

# One-hot encode. drop_first=True omits the first level of each column so the
# indicators of one column are not perfectly collinear.
# NOTE(review): int_rate looks to still be '%'-suffixed text at this point; it
# is only converted in a later cell, which then rebuilds X_encoded.
X_encoded = pd.get_dummies(data=X, columns=cat_cols, drop_first=True)

X_encoded.shape


(38577, 69)

In [136]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows; stratifying on y keeps the default rate equal in
# both parts, and the fixed seed makes the split repeatable.
split_options = {'test_size': 0.3, 'random_state': 42, 'stratify': y}
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, **split_options)

# Sanity check: shapes and default rate of each part
X_train.shape, X_test.shape, y_train.mean(), y_test.mean()


((27003, 69),
 (11574, 69),
 np.float64(0.14587268081324298),
 np.float64(0.14584413340245378))

In [137]:
from sklearn.model_selection import train_test_split

# Convert interest rate to numeric.
# astype(str) comes first so the cell can be re-run: once int_rate is already
# float, calling `.str` on it directly raises AttributeError.
df['int_rate'] = (
    df['int_rate']
    .astype(str)
    .str.replace('%', '', regex=False)
    .astype(float)
)

# Update X and encoded X (because df changed)
X = df.drop(columns=['default'])

X_encoded = pd.get_dummies(
    X,
    columns=['purpose', 'home_ownership', 'grade', 'sub_grade'],
    drop_first=True
)

# Recreate train-test split with the same proportion, seed and stratification
# as the earlier split, now on a matrix whose int_rate column is numeric.
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y
)


In [138]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

# Baseline: logistic regression fitted on the unscaled feature matrix
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Column 1 of predict_proba is the probability of class 1 (default)
train_pred, test_pred = (
    model.predict_proba(part)[:, 1] for part in (X_train, X_test)
)

# AUC on the training part and on the held-out part
roc_auc_score(y_train, train_pred), roc_auc_score(y_test, test_pred)


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


(np.float64(0.6143736091771348), np.float64(0.6142442685476996))

In [139]:
from sklearn.preprocessing import StandardScaler

# Standardise the features. The scaler is fitted on the training part only and
# then applied unchanged to the test part.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Refit logistic regression on the scaled inputs
model = LogisticRegression(max_iter=2000).fit(X_train_scaled, y_train)

# Re-evaluate: AUC from the class-1 probabilities of each part
train_scores = model.predict_proba(X_train_scaled)[:, 1]
test_scores = model.predict_proba(X_test_scaled)[:, 1]
train_auc = roc_auc_score(y_train, train_scores)
test_auc = roc_auc_score(y_test, test_scores)

train_auc, test_auc


(np.float64(0.7022172240437042), np.float64(0.7042888454447047))

In [140]:
# Generate PD scores on test set (probability of class 1 = default)
test_pd = model.predict_proba(X_test_scaled)[:, 1]

# Create risk buckets: bin edges on the PD scale and one label per interval
pd_bins = [0, 0.05, 0.10, 0.20, 1.0]
pd_labels = ['Low Risk', 'Medium Risk', 'High Risk', 'Very High Risk']

# include_lowest=True closes the first interval on the left, so a PD of
# exactly 0 is labelled 'Low Risk' instead of becoming NaN.
risk_bucket = pd.cut(test_pd, bins=pd_bins, labels=pd_labels, include_lowest=True)

# Create summary: one row per test loan with its score, bucket and outcome
risk_summary = pd.DataFrame({
    'PD': test_pd,
    'Risk_Bucket': risk_bucket,
    'Actual_Default': y_test.values
})

# Bucket sizes and observed default rate per bucket.
# observed=False is passed explicitly: Risk_Bucket is categorical, and relying
# on the implicit default for `observed` is deprecated in pandas.
bucket_counts = risk_summary['Risk_Bucket'].value_counts()
bucket_default_rate = (
    risk_summary
    .groupby('Risk_Bucket', observed=False)['Actual_Default']
    .mean()
)
bucket_counts, bucket_default_rate


  risk_summary['Risk_Bucket'].value_counts(), risk_summary.groupby('Risk_Bucket')['Actual_Default'].mean()


(Risk_Bucket
 High Risk         4741
 Medium Risk       3160
 Very High Risk    2518
 Low Risk          1155
 Name: count, dtype: int64,
 Risk_Bucket
 Low Risk          0.034632
 Medium Risk       0.073101
 High Risk         0.144062
 Very High Risk    0.291501
 Name: Actual_Default, dtype: float64)

In [141]:
# Define approval policy
def approval_policy(bucket):
    """Translate a risk-bucket label into a lending decision.

    'Low Risk' gives 'Approve', 'Medium Risk' gives 'Approve with Limits',
    and any other value gives 'Reject'.
    """
    decisions = (
        ('Low Risk', 'Approve'),
        ('Medium Risk', 'Approve with Limits'),
    )
    for label, decision in decisions:
        if bucket == label:
            return decision
    # Everything not explicitly approved above is rejected.
    return 'Reject'

# Attach the decision implied by each loan's risk bucket
decisions = risk_summary['Risk_Bucket'].apply(approval_policy)
risk_summary['Decision'] = decisions

# Policy summary: number of loans and observed default rate per decision
policy_summary = (
    risk_summary
    .groupby('Decision')['Actual_Default']
    .agg(['count', 'mean'])
)

policy_summary


Unnamed: 0_level_0,count,mean
Decision,Unnamed: 1_level_1,Unnamed: 2_level_1
Approve,1155,0.034632
Approve with Limits,3160,0.073101
Reject,7259,0.195206
