## Examples of usage

In [1]:
import numpy as np
import random
import pandas as pd
import meliora.src.meliora.core as vt
from scipy.stats import norm

In [2]:
# Load the four example datasets from the sibling data directory.
# The files are read in the order listed and bound to the names on the left.
# NOTE(review): test_data_add is loaded but not used by any cell visible in this notebook.
test_data_main, test_data_add, test_pd_psi, german_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/pd_test_data_main.csv',
        '../data/pd_test_data_period_2.csv',
        '../data/pd_bellini.csv',
        '../data/german_data.csv',
    )
)

In [3]:
# Binomial test on the main test data, grouped by the "ratings" column.
# The displayed result has one row per rating class with the predicted PD, counts,
# actual default rate, a p_value and a "Reject H0" flag.
vt.binomial_test(test_data_main, "ratings", "default_flag", "predicted_pd")

Unnamed: 0,Rating class,Predicted PD,Total count,Defaults,Actual Default Rate,p_value,Reject H0
0,A,0.103429,406,55,0.135468,0.023892,True
1,B,0.099524,480,41,0.085417,0.867441,False
2,C,0.095719,114,10,0.087719,0.660553,False


In [4]:
# Jeffreys test on the main test data, grouped by the 'ratings' column.
# The displayed result has one row per rating class with a p_value and a "Reject H0" flag.
# NOTE(review): the last argument (0.05) is presumably the significance level — confirm against vt.jeffreys_test.
vt.jeffreys_test(test_data_main, 'ratings', 'default_flag', 'predicted_pd', 0.05)

Unnamed: 0,Rating class,Predicted PD,Total count,Defaults,Actual Default Rate,p_value,Reject H0
0,A,0.103429,406,55,0.135468,0.019959,True
1,B,0.099524,480,41,0.085417,0.849552,False
2,C,0.095719,114,10,0.087719,0.598649,False


In [5]:
# Brier score on the main test data; the call returns a single float.
# NOTE(review): the value displayed below equals 3x the rating-level `mse` computed later in this
# notebook (three rating classes), i.e. it looks like a sum rather than a mean of squared
# rating-level errors — confirm this is the intended definition.
vt.brier_score(test_data_main, "ratings", "default_flag", "predicted_pd")

0.0012895084997917349

In [6]:
# Hosmer-Lemeshow test on the main test data.
# The call returns a two-element list (a float and a bool in the output below);
# presumably [p-value, reject-H0 flag] — TODO confirm against vt.hosmer_test.
vt.hosmer_test(test_data_main, "ratings", "default_flag", "predicted_pd")

[0.1302500370047408, False]

In [7]:
# Spiegelhalter test on the main test data.
# The call returns a two-element tuple (a float and a bool in the output below);
# presumably (test statistic, reject-H0 flag) — TODO confirm against vt.spiegelhalter_test.
vt.spiegelhalter_test(test_data_main, "ratings", "default_flag", "predicted_pd")

(-0.6637590511485174, False)

In [8]:
# ROC AUC of predicted PD against the default flag (a single score, not a hypothesis test).
# The call returns one float; the value displayed below is approximately 0.5.
vt.roc_auc(test_data_main, "default_flag", "predicted_pd")

0.5008547549702419

In [9]:
# Spearman rank correlation between two short example sequences.
x, y = [1, 2, 3, 4, 5], [5, 6, 7, 8, 7]

# Keep the full result object, then display only its correlation coefficient.
spearman_result = vt.spearman_corr(x, y)
spearman_result.correlation

0.8207826816681233

In [10]:
# Somers' D for two short binary example sequences.
x, y = [0, 1, 1, 1, 1], [1, 1, 1, 0, 1]

# Display the whole result object (statistic, p-value and contingency table).
somers_result = vt.somersd(x, y)
somers_result

SomersDResult(statistic=-0.25, pvalue=0.3613104285261787, table=array([[0, 1],
       [1, 3]]))

In [11]:
# Kendall's tau for two example sequences of equal length (13 values each).
x = [1, 2, 3, 2, 1, 3, 4, 2, 5, 2, 6, 5, 5]
y = [5, 5, 6, 2, 1, 4, 4, 2, 1, 2, 1, 5, 5]

# The result unpacks into the coefficient and its p-value; only the coefficient is displayed.
kendall_result = vt.kendall_tau(x, y)
tau, pvalue = kendall_result
tau

0.030306509211290782

In [12]:
# PSI example (presumably Population Stability Index) on binned remaining maturity,
# split by vintage period.

# Cut remaining maturity into 10 equal-width bins, stored as integer bin codes.
maturity_bin_codes = pd.cut(test_pd_psi['remaining_mat'], bins=10, labels=False)
test_pd_psi['remaining_mat_bin'] = maturity_bin_codes

# Label each row by vintage year: before 2007 -> 'period_1', otherwise 'period_2'.
is_early_vintage = test_pd_psi['vintage_year'] < 2007
test_pd_psi['year_bins'] = np.where(is_early_vintage, 'period_1', 'period_2')

# Export data (relative path, so the file lands in the current working directory; no index column)
test_pd_psi.to_csv('test_pd_psi.csv', index=False)

# Show results: display the element at index 1 of the returned object
kala = vt.psi(test_pd_psi, 'year_bins', 'remaining_mat_bin')
kala[1]

1.0344129494141174

In [13]:
# Information Value (IV) of 'checkingstatus' with respect to the 'GoodCredit' column
zen = vt.calc_iv(german_data, 'checkingstatus', 'GoodCredit')

# Element 1 of the result is a single float; it matches the sum of the IV column
# of the per-value table held in zen[0] (displayed in the next cell).
zen[1]

0.6660115033513336

In [14]:
# Element 0 of the calc_iv result: a table with one row per value of the variable,
# including WoE and IV columns (see output below).
zen[0]

Unnamed: 0,Variable,Value,All,Bad,Share,Bad Rate,Distribution Good,Distribution Bad,WoE,IV
0,checkingstatus,A11,274,135,0.274,0.492701,0.198571,0.45,-0.818099,0.205693
1,checkingstatus,A12,269,105,0.269,0.390335,0.234286,0.35,-0.401392,0.046447
3,checkingstatus,A13,63,14,0.063,0.222222,0.07,0.046667,0.405465,0.009461
2,checkingstatus,A14,394,46,0.394,0.116751,0.497143,0.153333,1.176263,0.40441


In [15]:
# Hand-rolled z-test on the mean squared error between realised default rates and
# predicted PDs, computed on rating-level aggregates.
# NOTE(review): presumably a manual re-derivation of what vt.spiegelhalter_test does — confirm.

# Transform input data into the required format: one row per rating with the mean
# predicted PD, the number of observations, the number of defaults and the default rate
df = test_data_main.groupby('ratings').agg({'predicted_pd': "mean", 'default_flag': ["count", "sum", "mean"]})
df.columns = ["PD", "N", "D", "Default Rate"]

realised_values = df["Default Rate"]
predicted_values = df["PD"]
n_classes = len(realised_values)

# Significance level of the two-sided test (defined once instead of repeating the literal)
alpha = 0.05

# Calculate mean squared error
errors = realised_values - predicted_values
mse = (errors**2).sum() / n_classes

# Calculate null expectation and variance of MSE
expectations = (predicted_values * (1 - predicted_values)).sum() / n_classes
variances = (predicted_values * (1 - 2 * predicted_values)**2 * (1 - predicted_values)).sum() / n_classes**2

# Calculate standardized statistic
# NOTE(review): these moments are the ones for individual 0/1 outcomes, but they are applied
# here to rating-level default rates — confirm this is intended.
z_score = (mse - expectations) / np.sqrt(variances)  # todo: check formula

# Two-sided p-value of the standardized statistic
p_value = 2 * norm.sf(abs(z_score))

# Reject H0 when |z| exceeds the two-sided critical value. The critical value is the
# (1 - alpha/2) quantile, so the comparison must use the absolute value; comparing the
# signed z_score would never reject for large negative statistics.
outcome = abs(z_score) > norm.ppf(1 - alpha / 2)

In [16]:
# Per-rating difference between the realised default rate and the mean predicted PD
errors

ratings
A    0.032039
B   -0.014107
C   -0.008000
dtype: float64

In [17]:
# Mean of the squared per-rating errors
mse

0.00042983616659724496

In [18]:
# 97.5% quantile of the standard normal, i.e. the critical value of a two-sided test at the 5% level
norm.ppf(1 - 0.05/2)

1.959963984540054

#### Relevant links
- https://machinelearningmastery.com/divergence-between-probability-distributions/
- https://medium.com/@monadsblog/the-kullback-leibler-divergence-5071c707a4a6
- https://documentation.sas.com/doc/en/pgmsascdc/v_011/statug/statug_code_logiex19.htm