In [36]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pandas.plotting import lag_plot
import seaborn as sns
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import roc_curve
from sklearn.model_selection import GridSearchCV

In [2]:
# Load the modelling dataset from the working directory.
df = pd.read_csv('modelling_dataset_final.csv')
# Keep the date and the regime label aside before df's columns are relabelled below.
y_regime=df[['date','two_factor_regimes']]

In [3]:
# Read the feature spreadsheet and turn it into a lookup dict:
# first spreadsheet column -> key, second spreadsheet column -> value.
# 'date' is pre-seeded so it maps to itself.
df_col = pd.read_excel('Final Features.xlsx')
D_col = {'date': 'date'}
# Select the two columns by position with .iloc, so the lookup does not rely on
# integer keys being treated as positions on a label-indexed row, and pair
# them up directly instead of looping over iterrows().
D_col.update(zip(df_col.iloc[:, 0], df_col.iloc[:, 1]))

In [4]:
# Column count of the raw dataset, before any relabelling.
print('original # col: {}'.format(df.shape[1]))

original # col: 316


In [5]:
# Relabel every column through the D_col lookup built earlier; labels that are
# not keys of D_col come back as NaN.
# NOTE(review): this overwrites df.columns in place, so running the cell a
# second time applies the lookup to the already-relabelled names.
df.columns = df.columns.map(D_col)
print('unique # col: {}'.format(len(set(df.columns))))

unique # col: 30


In [6]:
# Columns that now share a label are collapsed into one by summing them within each row.
df_final = df.groupby(df.columns, axis=1).sum()
# 28 feature columns plus 'date' remain; the NaN label does not form a group.
print('new shape {}'.format(df_final.shape))
# Index by date (the shape printed above still counts 'date' as a column).
df_final = df_final.set_index('date')
df_final.head()

new shape (808, 29)


Unnamed: 0_level_0,accounting,accounting_neg,accounting_pos,disaster,disaster_neg,economy,economy_neg,economy_pos,finance,finance_neg,...,operations_neg,operations_pos,other,political,political_neg,political_pos,product,tech,tech_neg,tech_pos
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1999-04-23,24.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,4.0,24.0,...,0.0,140.0,0.0,0.0,0.0,0.0,8.0,0.0,0.0,4.0
1999-07-23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,28.0,...,24.0,92.0,0.0,0.0,0.0,0.0,8.0,0.0,0.0,8.0
1999-10-22,16.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,34.0,...,4.0,116.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2000-04-28,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,84.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,32.0
2000-07-28,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,8.0,...,8.0,52.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0


In [7]:
# Index the regime labels by date so they line up with the feature frame.
# The frame is rebound rather than modified with inplace=True, and the guard
# makes the cell safe to re-run: once 'date' is the index it is no longer a
# column, and calling set_index('date') again would raise KeyError.
if 'date' in y_regime.columns:
    y_regime = y_regime.set_index('date')
y_regime.head()

Unnamed: 0_level_0,two_factor_regimes
date,Unnamed: 1_level_1
1999-04-23,0.0
1999-07-23,0.0
1999-10-22,1.0
2000-04-28,1.0
2000-07-28,1.0


In [8]:
# Class balance of the regime label: the two classes are not severely imbalanced.
y_regime['two_factor_regimes'].value_counts()

0.0    571
1.0    237
Name: two_factor_regimes, dtype: int64

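Random-split alternative, kept for reference only (the split applied further down is chronological: first 70% of rows for training, the remainder for testing): `X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=.3)`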

In [9]:
# Lagged copies of the feature frame: each row carries the values from
# 3, 2 and 1 rows earlier, with the lag recorded in the column suffix.
df_t_minus_3 = df_final.shift(3).add_suffix('_t-3')
df_t_minus_2 = df_final.shift(2).add_suffix('_t-2')
df_t_minus_1 = df_final.shift(1).add_suffix('_t-1')

# Feature matrix: the three lagged frames side by side, oldest lag first.
X = pd.concat([df_t_minus_3, df_t_minus_2, df_t_minus_1], axis=1)

In [10]:
# Shape check on the concatenated lag frames (rows, lagged feature columns).
X.shape

(808, 84)

In [11]:
# Drop the first three rows: shift(3) leaves them without a complete lag history.
# X is rebuilt from the lag frames instead of re-slicing itself, so re-running
# this cell cannot strip a further three rows each time and fall out of step with y.
X = pd.concat([df_t_minus_3, df_t_minus_2, df_t_minus_1], axis=1).iloc[3:]
X.head()

Unnamed: 0_level_0,accounting_t-3,accounting_neg_t-3,accounting_pos_t-3,disaster_t-3,disaster_neg_t-3,economy_t-3,economy_neg_t-3,economy_pos_t-3,finance_t-3,finance_neg_t-3,...,operations_neg_t-1,operations_pos_t-1,other_t-1,political_t-1,political_neg_t-1,political_pos_t-1,product_t-1,tech_t-1,tech_neg_t-1,tech_pos_t-1
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2000-04-28,24.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,4.0,24.0,...,4.0,116.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2000-07-28,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,28.0,...,0.0,84.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,32.0
2000-10-27,16.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,34.0,...,8.0,52.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
2001-01-26,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,48.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
2001-04-27,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,8.0,...,0.0,55.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0


In [12]:
# Row count of the trimmed feature matrix.
X.shape[0]

805

In [51]:
# Target: change in the regime label from one row to the next
# (label at t minus label at t-1).
y_regime_t = y_regime.copy(deep=False)
y_regime_t_minus_1 = y_regime_t.shift(periods=1)
y_difference = y_regime_t - y_regime_t_minus_1
# Skip the first three rows, matching the trim applied to X; this also removes
# the NaN that shift() leaves in the first row, so the integer cast can succeed.
y = y_difference.iloc[3:].astype(int)

In [52]:
# Preview the first rows of the regime-change target.
y.head()

Unnamed: 0_level_0,two_factor_regimes
date,Unnamed: 1_level_1
2000-04-28,0
2000-07-28,0
2000-10-27,0
2001-01-26,0
2001-04-27,0


In [53]:
# Distinct values taken by the target.
y['two_factor_regimes'].unique()

array([0, 1], dtype=int64)

In [56]:
# Row count of the target; it should equal the row count of X.
y.shape[0]

805

In [30]:
# Chronological 70/30 split: the earliest rows train the model, the rest test it.
# The test size is derived from the train size instead of being rounded on its
# own. np.round rounds exact halves to the nearest even integer, so rounding
# 0.7*n and 0.3*n separately can produce sizes that add up to more than n
# (n = 805 gives 564 + 242 = 806), which put one row in both train and test.
train_len = int(np.round(0.7 * len(X)))
test_len  = len(X) - train_len

# Positional slices that meet exactly at train_len, so the sets cannot overlap.
X_train = X.iloc[:train_len]
X_test  = X.iloc[train_len:]
y_train = y.iloc[:train_len]
y_test  = y.iloc[train_len:]

In [61]:
# Baseline: ordinary least squares of the regime-change target on the lagged features.
# NOTE(review): sm.OLS does not add an intercept by itself and no constant
# column is added to X_train here - confirm that fitting without one is intended.
linear_model = sm.OLS(endog=y_train, exog=X_train)
result = linear_model.fit()
print(result.summary2())

                  Results: Ordinary least squares
Model:              OLS                Adj. R-squared:     0.024    
Dependent Variable: two_factor_regimes AIC:                -556.5336
Date:               2019-11-14 00:31   BIC:                -192.3891
No. Observations:   564                Log-Likelihood:     362.27   
Df Model:           84                 F-statistic:        1.164    
Df Residuals:       480                Prob (F-statistic): 0.168    
R-squared:          0.169              Scale:              0.019039 
--------------------------------------------------------------------
                      Coef.  Std.Err.    t    P>|t|   [0.025  0.975]
--------------------------------------------------------------------
accounting_t-3       -0.0000   0.0000 -1.4496 0.1478 -0.0001  0.0000
accounting_neg_t-3    0.0004   0.0003  1.1019 0.2711 -0.0003  0.0010
accounting_pos_t-3   -0.0001   0.0001 -1.1731 0.2413 -0.0002  0.0000
disaster_t-3          0.0001   0.0001  1.2153 0.2248 