# Early Warning Models for Financial Crisis using Machine Learning Techniques

### Yelebe Desta and Seth Tenberg

# Lit Review

# Cleaning the Data

In [54]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


In [55]:
data = pd.read_excel("C:/Users/sjten/Downloads/JSTdatasetR6 (1).xlsx")

# Exploring the Data

In [56]:
data.describe()

Unnamed: 0,year,country,iso,ifs,pop,rgdpmad,rgdpbarro,rconsbarro,gdp,iy,...,eq_capgain,eq_dp,eq_capgain_interp,eq_tr_interp,eq_dp_interp,bond_rate,eq_div_rtn,capital_tr,risky_tr,safe_tr
0,1870,Australia,AUS,193,1775.000000,3273.239437,13.836157,21.449734,208.78,0.109266,...,-0.070045,0.071417,,,,0.049118,0.066415,,,
1,1871,Australia,AUS,193,1675.000000,3298.507463,13.936864,19.930801,211.56,0.104579,...,0.041654,0.065466,,,,0.048446,0.068193,,,
2,1872,Australia,AUS,193,1722.000000,3553.426249,15.044247,21.085006,227.40,0.130438,...,0.108945,0.062997,,,,0.047373,0.069861,,,
3,1873,Australia,AUS,193,1769.000000,3823.629169,16.219443,23.254910,266.54,0.124986,...,0.083086,0.064484,,,,0.046720,0.069842,,,
4,1874,Australia,AUS,193,1822.000000,3834.796926,16.268228,23.458050,287.58,0.141960,...,0.119389,0.063503,,,,0.046533,0.071085,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2713,2016,USA,USA,111,322701.246359,33012.767871,108.756035,108.051774,18695.10,0.195831,...,0.014868,0.021309,,,,0.018417,0.021626,0.048536,0.067177,0.003139
2714,2017,USA,USA,111,324756.935396,33543.679862,110.591099,109.978043,19479.60,0.204547,...,0.170363,0.019363,,,,0.023300,0.022662,0.121381,0.160881,0.017584
2715,2018,USA,USA,111,326497.132575,34338.770756,113.306861,112.567306,20527.20,0.208586,...,0.121093,0.018716,,,,0.029100,0.020982,0.092210,0.129585,0.000581
2716,2019,USA,USA,111,328018.681916,34961.835051,115.229692,114.509455,21372.60,0.210205,...,0.061144,0.019312,,,,0.021442,0.020492,0.080606,0.084776,0.070134


We want to make a early warning indicator model, thus our dependent variable of interest is not when the crisis occurs, but a year or two *before* the crisis occurs. Let's code that

In [57]:
data['indicator'] = ((data['crisisJST'].shift(-1) == 1) | (data['crisisJST'].shift(-2) == 1)).astype(int)


Now we will remove the year of the crisis along with the following four years to remove post crisis bias, which has been shown to affect analysis in previous research.

In [58]:
crisis_rows = data[data['crisisJST'] == 1].index.tolist()
crisis_rows_incremented1 = [row + 1 for row in crisis_rows]
crisis_rows_incremented2 = [row + 2 for row in crisis_rows]
crisis_rows_incremented3 = [row + 3 for row in crisis_rows]
crisis_rows_incremented4 = [row + 4 for row in crisis_rows]

crisis_rows_df = pd.DataFrame(
    {'crisis_rows': crisis_rows,
     'crisis_rows_incremented1': crisis_rows_incremented1,
     'crisis_rows_incremented2': crisis_rows_incremented2,
     'crisis_rows_incremented3': crisis_rows_incremented3,
     'crisis_rows_incremented4': crisis_rows_incremented4})

rows_to_remove = pd.concat([crisis_rows_df[col] for col in crisis_rows_df.columns])

# Sort the values in the combined column
rows_to_remove = rows_to_remove.sort_values()
# remove the rows from the data to account for crisis bias
data = data.drop(data.index[rows_to_remove])
# we have removed 435 from the dataset

Unnamed: 0,year,country,iso,ifs,pop,rgdpmad,rgdpbarro,rconsbarro,gdp,iy,...,eq_dp,eq_capgain_interp,eq_tr_interp,eq_dp_interp,bond_rate,eq_div_rtn,capital_tr,risky_tr,safe_tr,indicator
0,1870,Australia,AUS,193,1775.000000,3273.239437,13.836157,21.449734,208.78,0.109266,...,0.071417,,,,0.049118,0.066415,,,,0
1,1871,Australia,AUS,193,1675.000000,3298.507463,13.936864,19.930801,211.56,0.104579,...,0.065466,,,,0.048446,0.068193,,,,0
2,1872,Australia,AUS,193,1722.000000,3553.426249,15.044247,21.085006,227.40,0.130438,...,0.062997,,,,0.047373,0.069861,,,,0
3,1873,Australia,AUS,193,1769.000000,3823.629169,16.219443,23.254910,266.54,0.124986,...,0.064484,,,,0.046720,0.069842,,,,0
4,1874,Australia,AUS,193,1822.000000,3834.796926,16.268228,23.458050,287.58,0.141960,...,0.063503,,,,0.046533,0.071085,,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2713,2016,USA,USA,111,322701.246359,33012.767871,108.756035,108.051774,18695.10,0.195831,...,0.021309,,,,0.018417,0.021626,0.048536,0.067177,0.003139,0
2714,2017,USA,USA,111,324756.935396,33543.679862,110.591099,109.978043,19479.60,0.204547,...,0.019363,,,,0.023300,0.022662,0.121381,0.160881,0.017584,0
2715,2018,USA,USA,111,326497.132575,34338.770756,113.306861,112.567306,20527.20,0.208586,...,0.018716,,,,0.029100,0.020982,0.092210,0.129585,0.000581,0
2716,2019,USA,USA,111,328018.681916,34961.835051,115.229692,114.509455,21372.60,0.210205,...,0.019312,,,,0.021442,0.020492,0.080606,0.084776,0.070134,0


Now we wil account for extraordinary events in history, and remove the years during which World War 1, the Great Depression, and World War 2. Then we will also create two new variables for our analysis. We create a measure of global credit creation, and a measure for the global slope of the yield curve.

In [79]:
# we will create a variable the measures the slope of the yield curve, subtracting short term interest rates from long term interest rates
data['slope'] = (data['ltrate'] - data['stir'])
# removing years for WW1, WW2, and Great Depression
data_filtered = data[(data['year'] <= 1914) | (data['year'] >= 1918)]
data = data_filtered[(data_filtered['year'] <= 1933) | (data_filtered['year'] >= 1945)]
data['year']
# now we will create our global slope and credit variables
grouped = data.groupby('year')
data = grouped.apply(lambda x: x.assign(
    global_credit=x.loc[x['country'] != x['country'].iloc[0], 'tloans'].mean(),
    global_slope=x.loc[x['country'] != x['country'].iloc[0], 'slope'].mean()
))

# Ungroup
data= data.reset_index(drop=True)

#making Debt Servicing Ratio Variable
data['DSR'] = (data['tloans'] - (data['ltrate']/data['gdp']))


In [82]:
data_rate = data.groupby('country').apply(lambda group: 
    group.assign(
        cpi_rate = (group['cpi'] - group['cpi'].shift(1)) / group['cpi'].shift(1),
        rconsbarro_rate = (group['rconsbarro'] - group['rconsbarro'].shift(1)) / group['rconsbarro'].shift(1),
        hpnom_rate = (group['hpnom'] - group['hpnom'].shift(1)) / group['hpnom'].shift(1),
        money_rate = (group['money'] - group['money'].shift(1)) / group['money'].shift(1),
        DSR_rate = (group['DSR'] / group['gdp'] - group['DSR'].shift(1) / group['gdp'].shift(1)),
        tloans_rate = (group['tloans'] / group['gdp'] - group['tloans'].shift(1) / group['gdp'].shift(1)),
        ca_rate = (group['ca'] / group['gdp'] - group['ca'].shift(1) / group['gdp'].shift(1)),
        iy_rate = (group['iy'] - group['iy'].shift(1)),
        stock_rate =  (group['capital_tr'] - group['capital_tr'].shift(1)),
        debtgdp_rate = (group['debtgdp'] - group['debtgdp'].shift(1))
    )
).reset_index(drop=True)


In [69]:
# remova nas
# explore data
# make logistic model 
# eventually add more variables 

# Computing Baseline Prediction (yield curve) (logistic model)

In [72]:
data

Unnamed: 0,year,country,iso,ifs,pop,rgdpmad,rgdpbarro,rconsbarro,gdp,iy,...,eq_dp_interp,bond_rate,eq_div_rtn,capital_tr,risky_tr,safe_tr,indicator,slope,global_credit,global_slope
0,1870,Australia,AUS,193,1775.000000,3273.239437,13.836157,21.449734,2.087800e+02,0.109266,...,,0.049118,0.066415,,,,0,0.031817,2.463521e+01,0.953905
1,1870,Canada,CAN,156,3781.000000,1694.525258,7.014665,,3.825530e-01,,...,,,,,,,0,,2.463521e+01,0.953905
2,1870,Germany,DEU,134,40804.000000,1839.079503,11.063086,9.480000,1.250757e-11,0.123486,...,,0.046269,0.068695,,,0.050492,0,-2.123067,2.463521e+01,0.953905
3,1870,Denmark,DNK,128,1888.000000,2003.177966,8.920334,13.060000,6.690000e-01,0.121076,...,,0.048000,,,,,0,,2.463521e+01,0.953905
4,1870,Spain,ESP,184,16349.645454,1133.324528,6.603847,9.368696,5.970432e+03,0.047005,...,,0.111940,,,,,0,6.194030,2.463521e+01,0.953905
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2061,2020,Netherlands,NLD,138,17663.002528,25358.151649,109.820750,95.508774,1.763177e+06,0.213010,...,,-0.003768,0.016600,0.120931,0.140567,0.016912,0,0.048483,1.756434e+07,0.407939
2062,2020,Norway,NOR,142,5224.650291,29086.336860,102.769261,110.062421,3.413450e+06,0.300770,...,,0.008175,0.037000,,,0.023002,0,0.214687,1.756434e+07,0.407939
2063,2020,Portugal,PRT,182,10446.582135,14315.341284,103.543886,100.823087,4.011396e+07,0.190832,...,,0.004167,0.000381,,,-0.003109,0,0.841817,1.756434e+07,0.407939
2064,2020,Sweden,SWE,144,10142.400065,26912.147706,112.242402,105.423534,4.983360e+06,0.247787,...,,-0.000383,0.015700,,,0.008296,0,0.096667,1.756434e+07,0.407939


# ML technique one 

# ML technique two etc

# showcase prediction capabilities and compare