In [131]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
from statsmodels.stats.outliers_influence import variance_inflation_factor
import statsmodels.api as sm


In [160]:
# Let wide frames render with up to 1000 columns instead of being elided.
pd.options.display.max_columns = 1000

In [161]:
# Load the raw dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('dummy_credit_risk_data.csv')

In [162]:
# Show the column names as a plain Python list.
list(df.columns)

['ID',
 'sourcing_month',
 'state',
 'channel',
 'income_seg',
 'ticket_size',
 'age',
 'branch_visit_freq',
 'referral_source',
 'doc_complete_flag',
 'dpd_90_ever']

In [163]:
#1 setting features

# Guarantee an identifier column; fall back to the row position when absent.
if not ('ID' in df.columns):
    df['ID'] = np.arange(df.shape[0])

# Candidate predictors (categorical and numeric mixed).
features = [
    'state',
    'channel',
    'income_seg',
    'ticket_size',
    'age',
    'branch_visit_freq',
    'referral_source',
    'doc_complete_flag',
]
# Binary outcome column used as the "bad" flag throughout the notebook.
target = 'dpd_90_ever'

# Keep only rows where every predictor and the target are present.
required_columns = features + [target]
df = df.dropna(subset=required_columns)


In [164]:
#2
# WoE transformation of variables

def calculate_woe_iv(df, feature, target, bins=5):
    """Compute Weight of Evidence (WoE) and Information Value (IV) for one feature.

    Numeric features (any numeric dtype except bool) are cut into up to
    ``bins`` quantile buckets with ``pd.qcut`` (duplicate edges are dropped);
    every other feature is grouped on its raw values. ``target`` is summed per
    group as the "bad" count, and ``count - sum`` is taken as the "good" count.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. A float column named ``f"{feature}_woe"`` is written to
        this frame in place.
    feature : str
        Column to transform.
    target : str
        Column holding the 0/1 outcome.
    bins : int, default 5
        Number of quantile buckets requested for numeric features.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The same ``df`` object with the new WoE column, and a per-group table
        with columns ``woe`` and ``iv``.
    """
    values = df[feature]

    # Bin on "is numeric" rather than on two hard-coded dtype names so that
    # int32 / float32 / nullable numeric columns are bucketed as well.
    is_numeric = (
        pd.api.types.is_numeric_dtype(values)
        and not pd.api.types.is_bool_dtype(values)
    )
    if is_numeric:
        binned = pd.qcut(values, q=bins, duplicates='drop')
    else:
        binned = values

    # `observed=False` pins the historical behaviour for categorical keys and
    # avoids the pandas FutureWarning about the changing default.
    grouped = df.groupby(binned, observed=False)[target].agg(['count', 'sum'])
    grouped['good'] = grouped['count'] - grouped['sum']
    grouped['bad'] = grouped['sum']
    grouped['dist_good'] = grouped['good'] / grouped['good'].sum()
    grouped['dist_bad'] = grouped['bad'] / grouped['bad'].sum()
    # The 1e-6 terms keep the log finite when a group has no goods or no bads.
    grouped['woe'] = np.log((grouped['dist_good'] + 1e-6) / (grouped['dist_bad'] + 1e-6))
    grouped['iv'] = (grouped['dist_good'] - grouped['dist_bad']) * grouped['woe']

    woe_map = grouped['woe'].to_dict()
    # Mapping a categorical (qcut) series can yield a categorical result;
    # cast so the WoE column is always plain float.
    df[f'{feature}_woe'] = binned.map(woe_map).astype(float)

    return df, grouped[['woe', 'iv']]


In [165]:
# applying the same
woe_tables = {}
woe_features = []

for feature in features:
    woe_col = f'{feature}_woe'
    df, woe_table = calculate_woe_iv(df, feature, target)
    # Skip constant WoE: a column with a single distinct value is not kept
    # as a model input and its table is not recorded.
    if df[woe_col].nunique() <= 1:
        continue
    woe_features.append(woe_col)
    woe_tables[feature] = woe_table



  grouped = df.groupby(binned)[target].agg(['count', 'sum'])
  grouped = df.groupby(binned)[target].agg(['count', 'sum'])
  grouped = df.groupby(binned)[target].agg(['count', 'sum'])


In [166]:
#3
# getting VIF

# Coerce the WoE columns to numeric and drop rows with any missing WoE value.
X_vif = df[woe_features].apply(pd.to_numeric, errors='coerce')
X_vif = X_vif.dropna()

# Restrict the target and the main frame to the surviving rows, then give all
# three objects the same fresh 0..n-1 index. Previously only `df` was
# re-indexed, so whenever rows had been dropped, anything derived from `X_vif`
# (e.g. the log-odds matrix) was misaligned when concatenated back onto `df`.
y = df.loc[X_vif.index, target].reset_index(drop=True)
df = df.loc[X_vif.index].reset_index(drop=True)
X_vif = X_vif.reset_index(drop=True)

# NOTE(review): the VIFs are computed on the design matrix without a constant
# column; confirm that this is the intended variant.
vif_df = pd.DataFrame({
    "Variable": X_vif.columns,
    "VIF": [variance_inflation_factor(X_vif.values, i) for i in range(X_vif.shape[1])],
})
print("📊 VIF for WoE Variables:\n", vif_df.sort_values(by="VIF", ascending=False))


📊 VIF for WoE Variables:
                 Variable       VIF
3        ticket_size_woe  9.380611
4                age_woe  8.603886
2         income_seg_woe  2.745699
1            channel_woe  2.440204
0              state_woe  1.591374
6    referral_source_woe  1.398850
5  branch_visit_freq_woe  1.368175
7  doc_complete_flag_woe  1.025801


In [167]:
#4

#fit data and print var stats

# Design matrix = WoE variables plus an intercept column.
X_const = sm.add_constant(X_vif)

model = sm.Logit(y, X_const)
fit_options = dict(
    method='newton',     # Reliable for small datasets
    maxiter=500,         # Increased iterations
    disp=True,           # Show convergence output
)
result = model.fit(**fit_options)

# Second table of summary2() holds the per-coefficient statistics.
coefficient_table = result.summary2().tables[1]
summary_df = coefficient_table[['Coef.', 'Std.Err.', 'P>|z|']]
print("\n📈 Coefficients, Std Errors, and P-values:\n", summary_df)


Optimization terminated successfully.
         Current function value: 0.214497
         Iterations 9

📈 Coefficients, Std Errors, and P-values:
                           Coef.   Std.Err.          P>|z|
const                 -2.678920   0.014376   0.000000e+00
state_woe             -0.879843   0.112108   4.221621e-15
channel_woe           -1.073752   0.040159  1.720416e-157
income_seg_woe        -0.961249   0.031949  7.144364e-199
ticket_size_woe       -0.008077   0.126070   9.489157e-01
age_woe               -0.161770   0.106848   1.300198e-01
branch_visit_freq_woe  0.024033   0.058763   6.825605e-01
referral_source_woe    0.029294   0.058677   6.176042e-01
doc_complete_flag_woe -4.744563  10.676486   6.567580e-01


In [168]:
#5
#get variable level and final PDs and merging to main

fitted_params = result.params
intercept = fitted_params['const']
coefs = fitted_params.drop('const')
# Column order follows the coefficient order.
X_aligned = X_vif[coefs.index]

# Per-variable contribution to the log-odds: WoE value times its coefficient.
# (The '_pd' suffix marks these contribution columns; they are log-odds terms.)
log_odds_matrix = X_aligned.multiply(coefs, axis=1).add_suffix('_pd')
log_odds_matrix['intercept'] = intercept
# Row total over every contribution column plus the intercept.
log_odds_matrix['total_log_odds'] = log_odds_matrix.sum(axis=1)
# Logistic transform of the total log-odds.
log_odds_matrix['PD'] = 1 / (1 + np.exp(-log_odds_matrix['total_log_odds']))
log_odds_matrix.head()


Unnamed: 0,state_woe_pd,channel_woe_pd,income_seg_woe_pd,ticket_size_woe_pd,age_woe_pd,branch_visit_freq_woe_pd,referral_source_woe_pd,doc_complete_flag_woe_pd,intercept,total_log_odds,PD
0,-0.007572,-0.089396,-0.146575,2.3e-05,-0.006957,-0.008548,0.007217,0.004003,-2.67892,-2.926725,0.050848
1,-0.15761,-0.089396,-0.146575,-0.001601,-0.040184,0.007057,-0.007513,-0.008865,-2.67892,-3.123607,0.042144
2,-0.15761,-0.089396,-0.146575,-0.00485,-0.040184,-0.003178,-0.00423,0.004003,-2.67892,-3.12094,0.042252
3,0.207518,-0.089396,-0.146575,2.3e-05,-0.006957,-0.003178,-0.007513,0.004003,-2.67892,-2.720995,0.061746
4,0.007442,-0.089396,-0.146575,0.002413,0.047804,0.007057,-0.007513,0.004003,-2.67892,-2.853685,0.054491


In [169]:
# Attach the per-variable log-odds, total log-odds and PD columns to the main frame.
# Columns left over from an earlier run of this cell are dropped first, so
# re-running it does not create duplicate column names.
df = pd.concat(
    [df.drop(columns=log_odds_matrix.columns, errors='ignore'), log_odds_matrix],
    axis=1,
)

In [170]:
# Preview the first five rows of the enriched frame.
df.head(n=5)

Unnamed: 0,ID,sourcing_month,state,channel,income_seg,ticket_size,age,branch_visit_freq,referral_source,doc_complete_flag,dpd_90_ever,state_woe,channel_woe,income_seg_woe,ticket_size_woe,age_woe,branch_visit_freq_woe,referral_source_woe,doc_complete_flag_woe,state_woe_pd,channel_woe_pd,income_seg_woe_pd,ticket_size_woe_pd,age_woe_pd,branch_visit_freq_woe_pd,referral_source_woe_pd,doc_complete_flag_woe_pd,intercept,total_log_odds,PD
0,0,2024-03-01,TN,ch2,med,10248.0,30,4,Friend,1_,0,0.008606,0.083255,0.152484,-0.002874,0.043006,-0.355693,0.246353,-0.000844,-0.007572,-0.089396,-0.146575,2.3e-05,-0.006957,-0.008548,0.007217,0.004003,-2.67892,-2.926725,0.050848
1,1,2024-05-01,MH,ch2,med,11917.0,32,1,Branch,0_,0,0.179134,0.083255,0.152484,0.19826,0.248399,0.293643,-0.256472,0.001868,-0.15761,-0.089396,-0.146575,-0.001601,-0.040184,0.007057,-0.007513,-0.008865,-2.67892,-3.123607,0.042144
2,2,2024-03-01,MH,ch2,med,12389.0,34,3,Agent,1_,0,0.179134,0.083255,0.152484,0.600512,0.248399,-0.13224,-0.144388,-0.000844,-0.15761,-0.089396,-0.146575,-0.00485,-0.040184,-0.003178,-0.00423,0.004003,-2.67892,-3.12094,0.042252
3,3,2024-05-01,WB,ch2,med,9147.0,29,3,Branch,1_,0,-0.235857,0.083255,0.152484,-0.002874,0.043006,-0.13224,-0.256472,-0.000844,0.207518,-0.089396,-0.146575,2.3e-05,-0.006957,-0.003178,-0.007513,0.004003,-2.67892,-2.720995,0.061746
4,4,2024-04-01,UP,ch2,med,8895.0,28,1,Branch,1_,0,-0.008459,0.083255,0.152484,-0.298718,-0.295504,0.293643,-0.256472,-0.000844,0.007442,-0.089396,-0.146575,0.002413,0.047804,0.007057,-0.007513,0.004003,-2.67892,-2.853685,0.054491


In [171]:
#6 getting scores at var and consumer level

# Step 1: Score parameters
# With this factor, a change of ln(2) in log-odds moves the score by 50 points.
factor = 50 / np.log(2)
offset = 0
base_score = 450

# Step 2: Extract variable-level log-odds (already renamed with '_pd').
# 'intercept' and 'total_log_odds' do not carry the suffix, so no extra
# exclusion is needed.
pd_suffix = '_pd'
log_odds_cols = [col for col in df.columns if col.endswith(pd_suffix)]

# Step 3: Calculate variable-level scores (higher log-odds -> lower score).
score_contributions = factor * (offset - df[log_odds_cols])
# Swap only the trailing suffix; str.replace would also rewrite a '_pd' that
# happens to appear inside a feature name.
score_contributions.columns = [col[:-len(pd_suffix)] + '_score' for col in log_odds_cols]

# Step 4: Calculate intercept score
df['intercept_score'] = factor * (offset - df['intercept'])

# Step 5: Final score
df['final_score'] = base_score + score_contributions.sum(axis=1) + df['intercept_score']

# Step 6: Merge score contributions. Score columns from an earlier run of this
# cell are dropped first so a re-run does not duplicate column names.
df = pd.concat(
    [df.drop(columns=score_contributions.columns, errors='ignore'), score_contributions],
    axis=1,
)


In [172]:
#7 making score bins and risk ranking

# Deciles of the final score.
df['score_bins'] = pd.qcut(df['final_score'], q=10)

# `observed=False` is passed explicitly: 'score_bins' is categorical and
# relying on the default raises a pandas FutureWarning.
t1 = (
    df.groupby('score_bins', observed=False)
      .agg({'ID': 'count', 'dpd_90_ever': 'mean'})
      .reset_index()
)
# Express the bad rate as a percentage.
t1['dpd_90_ever'] = t1['dpd_90_ever'] * 100
t1

  t1 = df.groupby('score_bins').agg({'ID':'count','dpd_90_ever':'mean'}).reset_index()


Unnamed: 0,score_bins,ID,dpd_90_ever
0,"(510.77700000000004, 585.028]",10020,19.121756
1,"(585.028, 604.999]",9985,11.767651
2,"(604.999, 636.766]",10003,8.887334
3,"(636.766, 656.841]",10012,6.552137
4,"(656.841, 661.984]",10119,5.277201
5,"(661.984, 674.392]",10128,4.561611
6,"(674.392, 697.443]",9824,3.857899
7,"(697.443, 761.15]",9937,2.264265
8,"(761.15, 783.382]",10064,1.02345
9,"(783.382, 907.043]",9908,0.807428


In [177]:
#8 adding reason code

# Step 1: Identify variable level score columns (excludes final and intercept)
non_variable_scores = ('final_score', 'intercept_score')
score_cols = [
    col for col in df.columns
    if col.endswith('_score') and col not in non_variable_scores
]

# Step 2: Compute absolute impact
# NOTE(review): ranking on magnitude means a variable that raises the score
# can be reported as a reason code too - confirm this is intended.
abs_scores = df[score_cols].abs()

# Step 3: Rank top drivers
top_n = 3


def _top_drivers(row):
    """Return the names of the `top_n` largest entries of one row, largest first."""
    return row.sort_values(ascending=False).index[:top_n].tolist()


reason_codes = abs_scores.apply(_top_drivers, axis=1)

# Step 4: Assign reason codes (None when a row has fewer drivers than the slot)
for slot in range(3):
    df[f'reason_code_{slot + 1}'] = reason_codes.apply(
        lambda drivers, i=slot: drivers[i] if len(drivers) > i else None
    )

In [178]:
# Display the full scored frame, including the reason-code columns.
df

Unnamed: 0,ID,sourcing_month,state,channel,income_seg,ticket_size,age,branch_visit_freq,referral_source,doc_complete_flag,dpd_90_ever,state_woe,channel_woe,income_seg_woe,ticket_size_woe,age_woe,branch_visit_freq_woe,referral_source_woe,doc_complete_flag_woe,state_woe_pd,channel_woe_pd,income_seg_woe_pd,ticket_size_woe_pd,age_woe_pd,branch_visit_freq_woe_pd,referral_source_woe_pd,doc_complete_flag_woe_pd,intercept,total_log_odds,PD,intercept_score,final_score,state_woe_score,channel_woe_score,income_seg_woe_score,ticket_size_woe_score,age_woe_score,branch_visit_freq_woe_score,referral_source_woe_score,doc_complete_flag_woe_score,score_bins,reason_code_1,reason_code_2,reason_code_3
0,0,2024-03-01,TN,ch2,med,10248.0,30,4,Friend,1_,0,0.008606,0.083255,0.152484,-0.002874,0.043006,-0.355693,0.246353,-0.000844,-0.007572,-0.089396,-0.146575,0.000023,-0.006957,-0.008548,0.007217,0.004003,-2.67892,-2.926725,0.050848,193.243237,661.118552,0.546172,6.448527,10.573156,-0.001675,0.501846,0.616622,-0.520579,-0.288754,"(656.841, 661.984]",income_seg_woe_score,channel_woe_score,branch_visit_freq_woe_score
1,1,2024-05-01,MH,ch2,med,11917.0,32,1,Branch,0_,0,0.179134,0.083255,0.152484,0.198260,0.248399,0.293643,-0.256472,0.001868,-0.157610,-0.089396,-0.146575,-0.001601,-0.040184,0.007057,-0.007513,-0.008865,-2.67892,-3.123607,0.042144,193.243237,675.320604,11.369175,6.448527,10.573156,0.115515,2.898631,-0.509054,0.541963,0.639455,"(674.392, 697.443]",state_woe_score,income_seg_woe_score,channel_woe_score
2,2,2024-03-01,MH,ch2,med,12389.0,34,3,Agent,1_,0,0.179134,0.083255,0.152484,0.600512,0.248399,-0.132240,-0.144388,-0.000844,-0.157610,-0.089396,-0.146575,-0.004850,-0.040184,-0.003178,-0.004230,0.004003,-2.67892,-3.120940,0.042252,193.243237,675.128219,11.369175,6.448527,10.573156,0.349884,2.898631,0.229248,0.305114,-0.288754,"(674.392, 697.443]",state_woe_score,income_seg_woe_score,channel_woe_score
3,3,2024-05-01,WB,ch2,med,9147.0,29,3,Branch,1_,0,-0.235857,0.083255,0.152484,-0.002874,0.043006,-0.132240,-0.256472,-0.000844,0.207518,-0.089396,-0.146575,0.000023,-0.006957,-0.003178,-0.007513,0.004003,-2.67892,-2.720995,0.061746,193.243237,646.278321,-14.969227,6.448527,10.573156,-0.001675,0.501846,0.229248,0.541963,-0.288754,"(636.766, 656.841]",state_woe_score,income_seg_woe_score,channel_woe_score
4,4,2024-04-01,UP,ch2,med,8895.0,28,1,Branch,1_,0,-0.008459,0.083255,0.152484,-0.298718,-0.295504,0.293643,-0.256472,-0.000844,0.007442,-0.089396,-0.146575,0.002413,0.047804,0.007057,-0.007513,0.004003,-2.67892,-2.853685,0.054491,193.243237,655.849866,-0.536858,6.448527,10.573156,-0.174046,-3.448305,-0.509054,0.541963,-0.288754,"(636.766, 656.841]",income_seg_woe_score,channel_woe_score,age_woe_score
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,99995,2024-06-01,WB,ch2,med,8404.0,26,1,Friend,1_,0,-0.235857,0.083255,0.152484,-0.298718,-0.295504,0.293643,0.246353,-0.000844,0.207518,-0.089396,-0.146575,0.002413,0.047804,0.007057,0.007217,0.004003,-2.67892,-2.638880,0.066678,193.243237,640.354955,-14.969227,6.448527,10.573156,-0.174046,-3.448305,-0.509054,-0.520579,-0.288754,"(636.766, 656.841]",state_woe_score,income_seg_woe_score,channel_woe_score
99996,99996,2024-04-01,WB,ch2,med,8258.0,26,2,Agent,0_,0,-0.235857,0.083255,0.152484,-0.298718,-0.295504,-0.077685,-0.144388,0.001868,0.207518,-0.089396,-0.146575,0.002413,0.047804,-0.001867,-0.004230,-0.008865,-2.67892,-2.672118,0.064639,193.243237,642.752585,-14.969227,6.448527,10.573156,-0.174046,-3.448305,0.134674,0.305114,0.639455,"(636.766, 656.841]",state_woe_score,income_seg_woe_score,channel_woe_score
99997,99997,2024-02-01,MH,ch4,med,7347.0,23,0,Friend,0_,0,0.179134,1.642352,0.152484,-0.298718,-0.281871,0.293643,0.246353,0.001868,-0.157610,-1.763478,-0.146575,0.002413,0.045598,0.007057,0.007217,-0.008865,-2.67892,-4.693164,0.009075,193.243237,788.540194,11.369175,127.208069,10.573156,-0.174046,-3.289219,-0.509054,-0.520579,0.639455,"(783.382, 907.043]",channel_woe_score,state_woe_score,income_seg_woe_score
99998,99998,2024-03-01,WB,ch2,high,10933.0,32,2,Branch,0_,0,-0.235857,0.083255,1.774253,0.198260,0.248399,-0.077685,-0.256472,0.001868,0.207518,-0.089396,-1.705500,-0.001601,-0.040184,-0.001867,-0.007513,-0.008865,-2.67892,-4.326328,0.013044,193.243237,762.078573,-14.969227,6.448527,123.025799,0.115515,2.898631,0.134674,0.541963,0.639455,"(761.15, 783.382]",income_seg_woe_score,state_woe_score,channel_woe_score


In [179]:
#### stress testing

# Number of rows per sourcing month, in chronological (sorted-label) order.
sourcing_months = df['sourcing_month']
monthly_counts = sourcing_months.value_counts().sort_index()
print("📊 Volume by sourcing month:\n", monthly_counts)


📊 Volume by sourcing month:
 sourcing_month
2024-01-01    10000
2024-02-01    15000
2024-03-01    20000
2024-04-01    20000
2024-05-01    20000
2024-06-01    15000
Name: count, dtype: int64


In [181]:
# Per-month mean / standard deviation of PD and score, plus the row count.
monthly_summary = (
    df.groupby('sourcing_month')
      .agg(
          avg_PD=('PD', 'mean'),
          std_PD=('PD', 'std'),
          avg_score=('final_score', 'mean'),
          std_score=('final_score', 'std'),
          volume=('ID', 'count'),
      )
      .reset_index()
)
print("Monthly PD & Score Summary:\n", monthly_summary)


Monthly PD & Score Summary:
   sourcing_month    avg_PD    std_PD   avg_score  std_score  volume
0     2024-01-01  0.064252  0.056499  673.576109  77.408278   10000
1     2024-02-01  0.064643  0.056936  673.255760  77.730502   15000
2     2024-03-01  0.064474  0.057101  673.767045  78.112669   20000
3     2024-04-01  0.064195  0.057111  674.418705  78.372052   20000
4     2024-05-01  0.063339  0.055951  674.878238  77.596507   20000
5     2024-06-01  0.064445  0.057138  673.532155  77.580304   15000


In [184]:
stress_month = '2024-02-01'
stable_month = '2024-05-01'


def _rows_for_month(month):
    """Return the rows of `df` whose sourcing month equals `month`."""
    return df[df['sourcing_month'] == month]


def _headline_metrics(frame):
    """Average PD, average score and row count for one slice."""
    return {
        'avg_PD': frame['PD'].mean(),
        'avg_score': frame['final_score'].mean(),
        'volume': len(frame),
    }


stress_df = _rows_for_month(stress_month)
stable_df = _rows_for_month(stable_month)

stress_metrics = _headline_metrics(stress_df)
stable_metrics = _headline_metrics(stable_df)

print("Stress Month Metrics:", stress_metrics)
print("Stable Month Metrics:", stable_metrics)


Stress Month Metrics: {'avg_PD': 0.0646429512747669, 'avg_score': 673.2557597241129, 'volume': 15000}
Stable Month Metrics: {'avg_PD': 0.06333855478006052, 'avg_score': 674.8782375592319, 'volume': 20000}


In [189]:
def _relative_deviation_from_median(series):
    """Absolute deviation of each value from the series median, relative to that median."""
    center = series.median()
    return ((series - center) / center).abs()


# Despite the '_flag' names these columns hold the deviation magnitude, not a boolean.
monthly_summary['PD_drift_flag'] = _relative_deviation_from_median(monthly_summary['avg_PD'])
monthly_summary['score_drift_flag'] = _relative_deviation_from_median(monthly_summary['avg_score'])
monthly_summary

Unnamed: 0,sourcing_month,avg_PD,std_PD,avg_score,std_score,volume,PD_drift_flag,score_drift_flag
0,2024-01-01,0.064252,0.056499,673.576109,77.408278,10000,0.001499,0.000142
1,2024-02-01,0.064643,0.056936,673.25576,77.730502,15000,0.004571,0.000617
2,2024-03-01,0.064474,0.057101,673.767045,78.112669,20000,0.001949,0.000142
3,2024-04-01,0.064195,0.057111,674.418705,78.372052,20000,0.002394,0.001109
4,2024-05-01,0.063339,0.055951,674.878238,77.596507,20000,0.0157,0.001791
5,2024-06-01,0.064445,0.057138,673.532155,77.580304,15000,0.001499,0.000207


In [193]:
### scenario testing

# Make sure every WoE column is numeric before shifting it; unparsable values become NaN.
woe_cols = [col for col in df.columns if col.endswith('_woe')]
df[woe_cols] = df[woe_cols].apply(pd.to_numeric, errors='coerce')

# One independent copy of the scored frame per scenario.
df_stressed, df_adverse, df_optimistic = (df.copy() for _ in range(3))

In [194]:
# Stressed scenario: additive shifts applied to the WoE inputs.
stressed_shifts = {
    'income_seg_woe': -0.3,
    'age_woe': -0.2,
    'ticket_size_woe': 0.4,
    'doc_complete_flag_woe': -0.5,
}
for column, shift in stressed_shifts.items():
    df_stressed[column] += shift


In [195]:
# Adverse scenario: additive shifts applied to the WoE inputs.
# NOTE(review): the recorded run reports a lower average PD for this scenario
# than the monthly averages - confirm the sign of the channel_woe shift.
adverse_shifts = {
    'channel_woe': 0.3,
    'branch_visit_freq_woe': -0.2,
}
for column, shift in adverse_shifts.items():
    df_adverse[column] += shift


In [196]:
# Optimistic scenario: additive shifts applied to the WoE inputs.
optimistic_shifts = {
    'income_seg_woe': 0.3,
    'age_woe': 0.2,
    'doc_complete_flag_woe': 0.5,
}
for column, shift in optimistic_shifts.items():
    df_optimistic[column] += shift


In [197]:
def recalculate(df_scenario, coefs, intercept, factor, base_score):
    """Re-score a scenario frame from its (shifted) WoE columns.

    Parameters
    ----------
    df_scenario : pandas.DataFrame
        Frame containing every column named in ``coefs.index``. It is
        modified in place: ``PD_scenario`` and ``score_scenario`` are added.
    coefs : pandas.Series
        Coefficients indexed by the column they multiply.
    intercept : float
        Constant added to the summed contributions.
    factor : float
        Points per unit of log-odds.
    base_score : float
        Score assigned at zero log-odds.

    Returns
    -------
    pandas.DataFrame
        The same ``df_scenario`` object, with the two new columns.
    """
    contributions = df_scenario[coefs.index].multiply(coefs, axis=1)
    scenario_log_odds = contributions.sum(axis=1) + intercept

    # Logistic transform for the PD; the score falls as log-odds rise.
    df_scenario['PD_scenario'] = 1 / (1 + np.exp(-scenario_log_odds))
    df_scenario['score_scenario'] = base_score + factor * (0 - scenario_log_odds)
    return df_scenario

In [198]:
# Re-score each scenario frame with the fitted coefficients and score scaling.
df_stressed, df_adverse, df_optimistic = (
    recalculate(scenario_frame, coefs, intercept, factor, base_score)
    for scenario_frame in (df_stressed, df_adverse, df_optimistic)
)


In [199]:
# Average scenario PD and score, one row per scenario.
scenario_frames = {
    'Stressed': df_stressed,
    'Adverse': df_adverse,
    'Optimistic': df_optimistic,
}
summary = pd.DataFrame({
    'Scenario': list(scenario_frames),
    'Avg PD': [frame['PD_scenario'].mean() for frame in scenario_frames.values()],
    'Avg Score': [frame['score_scenario'].mean() for frame in scenario_frames.values()],
})
print(summary)


     Scenario    Avg PD   Avg Score
0    Stressed  0.423410  479.962042
1     Adverse  0.047838  697.571757
2  Optimistic  0.004895  868.248207
