## Examples of usage

In [1]:
import numpy as np
import random
import pandas as pd
import meliora.src.meliora.core as vt
from scipy.stats import norm

In [2]:
# Load the CSV fixtures used by the examples below.
# DATA_DIR is the fixture directory, relative to this notebook's location.
DATA_DIR = '../data'

test_data_main = pd.read_csv(f'{DATA_DIR}/pd_test_data_main.csv')
test_data_add = pd.read_csv(f'{DATA_DIR}/pd_test_data_period_2.csv')
# pd_test_data comes from the same file as test_data_main; copy the parsed
# frame instead of reading and parsing the CSV a second time.
pd_test_data = test_data_main.copy()
test_pd_psi = pd.read_csv(f'{DATA_DIR}/pd_bellini.csv')
german_data = pd.read_csv(f'{DATA_DIR}/german_data.csv')
lgd_t_test = pd.read_csv(f'{DATA_DIR}/lgd_t_test.csv')
pd_transition_matrix = pd.read_csv(f'{DATA_DIR}/pd_transition_matrix.csv')
pd_calibration = pd.read_csv(f'{DATA_DIR}/pd_calibration.csv')
lgd_dataset = pd.read_csv(f'{DATA_DIR}/lgd_dataset.csv')

In [39]:
# Preview the first five rows of the LGD t-test data
lgd_t_test.head(5)

Unnamed: 0,facility,predicted_lgd,realised_lgd,segment,ead
0,1,0.8,0.83,8,5860
1,2,0.75,0.81,7,6653
2,3,0.42,0.45,4,3192
3,4,0.99,1.0,9,7322
4,5,0.9,0.86,9,5513


In [3]:
# Preview the first five rows of the realised/predicted outcome data
lgd_dataset.head(5)

Unnamed: 0,realised_outcome,predicted_outcome
0,1,3
1,1,3
2,1,3
3,2,2
4,2,2


In [4]:
# Preview the first five rows of the period-1 / period-2 rating pairs
pd_transition_matrix.head(5)

Unnamed: 0,period_1_ratings,period_2_ratings
0,6,4
1,6,5
2,3,2
3,5,2
4,7,8


In [5]:
# Binomial test: the result table has one row per rating class,
# with a p-value and a "Reject H0" flag.
vt.binomial_test(
    test_data_main,
    "ratings",
    "default_flag",
    "predicted_pd",
)

Unnamed: 0,Rating class,Predicted PD,Total count,Defaults,Actual Default Rate,p_value,Reject H0
0,A,0.103429,406,55,0.135468,0.023892,True
1,B,0.099524,480,41,0.085417,0.867441,False
2,C,0.095719,114,10,0.087719,0.660553,False


In [6]:
# Jeffreys test: same table layout as the binomial test above.
# The trailing 0.05 is presumably the significance level — TODO confirm
# against the function signature.
vt.jeffreys_test(
    test_data_main,
    'ratings',
    'default_flag',
    'predicted_pd',
    0.05,
)

Unnamed: 0,Rating class,Predicted PD,Total count,Defaults,Actual Default Rate,p_value,Reject H0
0,A,0.103429,406,55,0.135468,0.019959,True
1,B,0.099524,480,41,0.085417,0.849552,False
2,C,0.095719,114,10,0.087719,0.598649,False


In [7]:
# Brier score: displays a single float
vt.brier_score(
    test_data_main,
    "ratings",
    "default_flag",
    "predicted_pd",
)

0.0012895084997917349

In [8]:
# Hosmer-Lemeshow test: the recorded result is a two-item list
# (a float followed by a boolean flag).
vt.hosmer_test(
    test_data_main,
    "ratings",
    "default_flag",
    "predicted_pd",
)

[0.1302500370047408, False]

In [9]:
# Spiegelhalter test: the recorded result is a (float, boolean) tuple
vt.spiegelhalter_test(
    test_data_main,
    "ratings",
    "default_flag",
    "predicted_pd",
)

(-0.6637590511485174, False)

In [10]:
# ROC AUC of the predicted PD against the default flag
vt.roc_auc(
    test_data_main,
    "default_flag",
    "predicted_pd",
)

0.5008547549702419

In [11]:
# Spearman correlation of two short example sequences;
# only the correlation coefficient of the result is displayed.
x = [1, 2, 3, 4, 5]
y = [5, 6, 7, 8, 7]
vt.spearman_correlation(x, y).correlation

0.8207826816681233

In [12]:
# Somers' D on two binary example sequences; the recorded result
# carries the statistic, a p-value and the contingency table.
x = [0, 1, 1, 1, 1]
y = [1, 1, 1, 0, 1]
vt.somersd(x, y)

SomersDResult(statistic=-0.25, pvalue=0.3613104285261787, table=array([[0, 1],
       [1, 3]]))

In [13]:
# Kendall tau on two example sequences of equal length
x = [1, 2, 3, 2, 1, 3, 4, 2, 5, 2, 6, 5, 5]
y = [5, 5, 6, 2, 1, 4, 4, 2, 1, 2, 1, 5, 5]

# The result unpacks into the coefficient and its p-value; show the coefficient.
tau, pvalue = vt.kendall_tau(x, y)
tau

0.030306509211290782

In [14]:
# Population Stability Index (PSI)
# Bucket the remaining maturity into 10 bins, stored as integer bin codes.
test_pd_psi['remaining_mat_bin'] = pd.cut(
    test_pd_psi['remaining_mat'],
    bins=10,
    labels=False,
)
# Label each row by vintage: before 2007 -> 'period_1', otherwise 'period_2'.
test_pd_psi['year_bins'] = np.where(
    test_pd_psi['vintage_year'] < 2007,
    'period_1',
    'period_2',
)

# Write the augmented frame to the current working directory.
test_pd_psi.to_csv('test_pd_psi.csv', index=False)

# Run the PSI calculation and display the element at index 1 of its result.
kala = vt.population_stability_index(
    test_pd_psi,
    'year_bins',
    'remaining_mat_bin',
)
kala[1]

1.0344129494141174

In [15]:
# Information Value (IV) of 'checkingstatus' against 'GoodCredit'
zen = vt.information_value(
    german_data,
    'checkingstatus',
    'GoodCredit',
)

# Index 1 of the result is a scalar; in the recorded outputs it equals the
# sum of the per-value IV column shown by zen[0].
zen[1]

0.6660115033513336

In [16]:
# Index 0 of the IV result: the per-value breakdown table (WoE and IV per category)
zen[0]

Unnamed: 0,Variable,Value,All,Bad,Share,Bad Rate,Distribution Good,Distribution Bad,WoE,IV
0,checkingstatus,A11,274,135,0.274,0.492701,0.198571,0.45,-0.818099,0.205693
1,checkingstatus,A12,269,105,0.269,0.390335,0.234286,0.35,-0.401392,0.046447
3,checkingstatus,A13,63,14,0.063,0.222222,0.07,0.046667,0.405465,0.009461
2,checkingstatus,A14,394,46,0.394,0.116751,0.497143,0.153333,1.176263,0.40441


In [17]:
# LGD t-test evaluated per segment; rows are ordered by segment for display.
df = vt.lgd_t_test(
    lgd_t_test,
    'predicted_lgd',
    'realised_lgd',
    level="segment",
    segment_col='segment',
).sort_values(by='segment')
df

Unnamed: 0,segment,N,realised_lgd_mean,pred_lgd_mean,s2,mean_error,t_stat,p_value
9,0,92,0.043913,0.052065,0.002571,-0.008152,-1.542031,0.936732
4,1,91,0.141758,0.144286,0.003266,-0.002527,-0.421905,0.66295
8,2,103,0.245728,0.232427,0.002932,0.013301,2.49293,0.007139
7,3,107,0.344673,0.338879,0.003593,0.005794,1.0,0.159794
2,4,105,0.448952,0.44819,0.003265,0.000762,0.136637,0.445791
6,5,97,0.548247,0.54866,0.003404,-0.000412,-0.069611,0.527676
5,6,114,0.645614,0.634561,0.003372,0.011053,2.032136,0.022243
1,7,108,0.744815,0.749815,0.003264,-0.005,-0.909441,0.817419
0,8,92,0.844348,0.839239,0.003223,0.005109,0.863116,0.195171
3,9,91,0.946044,0.93011,0.002358,0.015934,3.130404,0.001177


In [40]:
# Sum of the per-segment p-values from the LGD t-test table above
df.loc[:, 'p_value'].sum()

3.7760920875724846

In [18]:
# Migration matrix statistics: the recorded result is a pair of floats
vt.migration_matrices_statistics(
    pd_transition_matrix,
    'period_1_ratings',
    'period_2_ratings',
)

(0.43581081081081086, 0.8108108108108109)

In [19]:
# Bayesian error rate of the predicted PD against the default flag
vt.bayesian_error_rate(
    test_data_main,
    "default_flag",
    "predicted_pd",
)

0.106

In [20]:
# Conditional Information Entropy Ratio (CIER)
vt.conditional_information_entropy_ratio(
    pd_calibration,
    "realised_pd",
    "count",
)

0.024548595310375846

In [21]:
# Kullback-Leibler distance, computed from the same columns as the CIER cell
vt.kullback_leibler_dist(
    pd_calibration,
    "realised_pd",
    "count",
)

0.006240325352140225

In [22]:
# Gini coefficient. The recorded value equals 2 * AUC - 1 for the AUC
# recorded in the ROC cell, which was computed on the same CSV.
vt.gini(
    pd_test_data,
    "default_flag",
    "predicted_pd",
)

0.0017095099404838

In [23]:
# Kolmogorov-Smirnov statistic
# NOTE(review): the recorded result (statistic=0.869, pvalue=0.0) looks hard to
# reconcile with the recorded ROC AUC of ~0.50 for the same inputs — confirm
# which two samples the function actually compares.
vt.kolmogorov_smirnov_stat(
    test_data_main,
    "default_flag",
    "predicted_pd",
)

KstestResult(statistic=0.869, pvalue=0.0)

In [34]:
# Keep the KS result so that its fields can be inspected in the next cell
a = vt.kolmogorov_smirnov_stat(
    test_data_main,
    "default_flag",
    "predicted_pd",
)

In [35]:
# First field of the KstestResult, i.e. the KS statistic
a.statistic

0.869

In [52]:
# Herfindahl multiple period
# FIXME(review): this call is disabled. The recorded output of this cell is
# "AttributeError: 'str' object has no attribute 'columns'", so the second
# positional argument is presumably expected to be a DataFrame rather than a
# column name — confirm the signature before re-enabling.
# vt.herfindahl_multiple_period_test(pd_calibration, "rating", "realised_pd", "count")

AttributeError: 'str' object has no attribute 'columns'

In [25]:
# Loss Capture Ratio: takes the EAD, predicted LGD and realised LGD columns
vt.loss_capture_ratio(
    lgd_t_test['ead'],
    lgd_t_test['predicted_lgd'],
    lgd_t_test['realised_lgd'],
)

1.0000874459653837

In [26]:
# Preview the first five rows of the LGD t-test data
lgd_t_test.head(5)

Unnamed: 0,facility,predicted_lgd,realised_lgd,segment,ead
0,1,0.8,0.83,8,5860
1,2,0.75,0.81,7,6653
2,3,0.42,0.45,4,3192
3,4,0.99,1.0,9,7322
4,5,0.9,0.86,9,5513


In [38]:
# Cumulative LGD Accuracy Ratio (CLAR)
# NOTE(review): the recorded value is ~3.2; an accuracy ratio would normally
# be expected not to exceed 1 — confirm the argument order and the
# implementation.
vt.cumulative_lgd_accuracy_ratio(
    lgd_dataset,
    'predicted_outcome',
    'realised_outcome',
)

3.1999999999999997

In [28]:
# Migration matrix stability: the recorded result is a tuple whose first item
# is a period_1 x period_2 table of z-values (NaN where undefined).
vt.migration_matrix_stability(
    pd_transition_matrix,
    'period_1_ratings',
    'period_2_ratings',
)

  z_ij = num / np.sqrt(den_a + den_b + den_c)
  z_ij = num / np.sqrt(den_a + den_b + den_c)


(period_2_ratings         1         2         3         4         5         6  \
 period_1_ratings                                                               
 1                      NaN  5.441072 -1.315587  3.585686       NaN       NaN   
 2                -1.465656       NaN  0.301985 -0.301985  2.750494       NaN   
 3                -2.466006  0.000000       NaN  0.000000  0.635724  2.111195   
 4                 0.378968  0.918559 -0.580948       NaN  0.334021  0.000000   
 5                 2.905370  0.501531 -1.420463 -0.378625       NaN -2.493813   
 6                      NaN  3.877284 -0.448308 -1.799123  1.290339       NaN   
 7                      NaN       NaN  3.119251 -1.978141  2.689264 -1.538968   
 8                      NaN       NaN       NaN  1.816590  1.562315 -1.562315   
 
 period_2_ratings         7         8  
 period_1_ratings                      
 1                      NaN       NaN  
 2                      NaN       NaN  
 3                      NaN 

In [47]:
# Keep the migration matrix stability result for the checks in the next cells
q = vt.migration_matrix_stability(
    pd_transition_matrix,
    'period_1_ratings',
    'period_2_ratings',
)

  z_ij = num / np.sqrt(den_a + den_b + den_c)
  z_ij = num / np.sqrt(den_a + den_b + den_c)


In [50]:
# Grand total of the first result table: column sums first, then their sum
q[0].sum(axis=0).sum()

23.81875738112634

In [51]:
# Grand total of the second result table: column sums first, then their sum
q[1].sum(axis=0).sum()

23.11530838816691

In [29]:
# Pearson correlation of two short example sequences
# NOTE(review): the recorded output of this cell is a SpearmanrResult whose
# correlation is identical to the Spearman cell's value for the same x and y,
# so vt.pearson_correlation appears to return a Spearman result — verify the
# library implementation.
x = [1, 2, 3, 4, 5]
y = [5, 6, 7, 8, 7]
vt.pearson_correlation(x, y)

SpearmanrResult(correlation=0.8207826816681233, pvalue=0.08858700531354381)

In [30]:
# ELBE t-test: the recorded result is a one-row table with the facility count,
# mean LGD, t-statistic and p-value.
vt.elbe_t_test(
    lgd_t_test,
    "predicted_lgd",
    "realised_lgd",
)

Unnamed: 0,facilities,lgd_mean,t_stat,p_value
0,1000,0.49357,2.072427,0.038482


In [31]:
# Loss Shortfall from the EAD, predicted LGD and realised LGD columns
vt.loss_shortfall(
    lgd_t_test,
    "ead",
    "predicted_lgd",
    "realised_lgd",
)

-0.008480989922580617

In [32]:
# Mean Absolute Deviation on the PD calibration data
vt.mean_absolute_deviation(
    pd_calibration,
    "rating",
    "realised_pd",
    "count",
)

6999.929781818181

In [33]:
# Normal test: takes the predicted and realised PD columns; the recorded
# result is a one-row table (estimate, t_stat, p_value, outcome).
vt.normal_test(
    pd_calibration['predicted_pd'],
    pd_calibration['realised_pd'],
)

Unnamed: 0,estimate,t_stat,p_value,outcome
0,0.001,0.015093,0.505873,


In [41]:
# Normal test: keep the result table, then display its p-value.
q = vt.normal_test(pd_calibration['predicted_pd'], pd_calibration['realised_pd'])
# The recorded output of this cell is the scalar p-value, which a bare
# assignment cannot produce. The normal-test result shown in the previous cell
# has a single row, so take that row's 'p_value' explicitly.
q['p_value'].iloc[0]


0.5058725359972235

#### Relevant links
- https://machinelearningmastery.com/divergence-between-probability-distributions/
- https://medium.com/@monadsblog/the-kullback-leibler-divergence-5071c707a4a6
- https://documentation.sas.com/doc/en/pgmsascdc/v_011/statug/statug_code_logiex19.htm