In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pandas.plotting import lag_plot
import seaborn as sns
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import roc_curve
from sklearn.model_selection import GridSearchCV

In [2]:
# Load the modelling dataset and pull out the regime label together with its date.
df = pd.read_csv('modelling_dataset_final.csv')
# Take an explicit copy: y_regime is re-indexed in a later cell and df's columns are
# renamed later on, so the target frame must not share state with df.
y_regime = df[['date', 'two_factor_regimes']].copy()

In [3]:
# Read the feature sheet and turn its first two columns into a lookup dict.
# presumably column 0 holds raw column names and column 1 the feature group they
# belong to — verify against 'Final Features.xlsx'.
df_col = pd.read_excel('Final Features.xlsx')
D_col = {'date': 'date'}  # 'date' maps to itself so it is still called 'date' after the rename
# Pair the two columns positionally via .iloc. The previous row[0] / row[1] lookups
# relied on integer keys falling back to positions on a label-indexed Series, which
# pandas has deprecated, and iterrows() built a Series per row for no benefit.
D_col.update(zip(df_col.iloc[:, 0], df_col.iloc[:, 1]))

In [4]:
# Column count of the raw dataset, before any renaming or grouping.
print('original # col: {}'.format(df.shape[1]))

original # col: 316


In [5]:
# Relabel every column through the dict created earlier; a column with no entry
# in D_col is labelled NaN.
# NOTE(review): this rewrites df.columns in place, so running the cell a second
# time maps the already-renamed labels again.
df.columns = [D_col.get(label, np.nan) for label in df.columns]
print('unique # col: {}'.format(len(set(df.columns))))

unique # col: 30


In [6]:
# Columns that now share a label are collapsed into one by summing them row-wise.
# NOTE(review): groupby(..., axis=1) is deprecated in recent pandas releases — confirm
# the pinned pandas version before upgrading.
df_final = (
    df.groupby(df.columns, axis=1)
      .sum()
)
# The shape is reported while 'date' is still a regular column:
# 28 feature columns without nan, plus 'date'.
print('new shape {}'.format(df_final.shape))
df_final = df_final.set_index('date')
df_final.head()

new shape (808, 29)


Unnamed: 0_level_0,accounting,accounting_neg,accounting_pos,disaster,disaster_neg,economy,economy_neg,economy_pos,finance,finance_neg,...,operations_neg,operations_pos,other,political,political_neg,political_pos,product,tech,tech_neg,tech_pos
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1999-04-23,24.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,4.0,24.0,...,0.0,140.0,0.0,0.0,0.0,0.0,8.0,0.0,0.0,4.0
1999-07-23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,28.0,...,24.0,92.0,0.0,0.0,0.0,0.0,8.0,0.0,0.0,8.0
1999-10-22,16.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,34.0,...,4.0,116.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2000-04-28,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,84.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,32.0
2000-07-28,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,8.0,...,8.0,52.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0


In [7]:
# Feature groups kept for modelling.
important_features = [
    'economy_neg', 'industry_neg', 'management_pos', 'disaster', 'operations_neg',
    'finance_pos', 'management', 'product', 'finance_neg', 'tech_pos',
]
# Keep df_final's own column order; names missing from df_final are simply ignored.
important_columns = [label for label in df_final.columns if label in important_features]
df_final = df_final.loc[:, important_columns]
print('new shape {}'.format(df_final.shape))
df_final.head()

new shape (808, 10)


Unnamed: 0_level_0,disaster,economy_neg,finance_neg,finance_pos,industry_neg,management,management_pos,operations_neg,product,tech_pos
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1999-04-23,0.0,0.0,24.0,4.0,0.0,92.0,184.0,0.0,8.0,4.0
1999-07-23,0.0,0.0,28.0,12.0,0.0,80.0,140.0,24.0,8.0,8.0
1999-10-22,0.0,0.0,34.0,4.0,4.0,40.0,188.0,4.0,0.0,0.0
2000-04-28,0.0,0.0,0.0,24.0,8.0,16.0,104.0,0.0,0.0,32.0
2000-07-28,0.0,0.0,8.0,4.0,16.0,36.0,84.0,8.0,0.0,4.0


In [8]:
# Pairwise correlations between the selected features, rendered as a colour-graded table.
# The displayed values are all high (roughly 0.64-0.97), i.e. the features are strongly
# collinear — keep that in mind when reading per-feature regression coefficients.
corr = df_final.corr()
corr.style.background_gradient(cmap='coolwarm')

Unnamed: 0,disaster,economy_neg,finance_neg,finance_pos,industry_neg,management,management_pos,operations_neg,product,tech_pos
disaster,1.0,0.726438,0.64377,0.690666,0.705445,0.66547,0.77219,0.768862,0.736404,0.732681
economy_neg,0.726438,1.0,0.920381,0.876759,0.930784,0.847048,0.807716,0.895931,0.76682,0.70286
finance_neg,0.64377,0.920381,1.0,0.954127,0.969406,0.904203,0.820914,0.912841,0.760636,0.717124
finance_pos,0.690666,0.876759,0.954127,1.0,0.930304,0.931595,0.890529,0.937602,0.838139,0.797116
industry_neg,0.705445,0.930784,0.969406,0.930304,1.0,0.87907,0.880051,0.927656,0.817737,0.799698
management,0.66547,0.847048,0.904203,0.931595,0.87907,1.0,0.860882,0.937038,0.872653,0.803093
management_pos,0.77219,0.807716,0.820914,0.890529,0.880051,0.860882,1.0,0.922266,0.924731,0.948529
operations_neg,0.768862,0.895931,0.912841,0.937602,0.927656,0.937038,0.922266,1.0,0.914944,0.8502
product,0.736404,0.76682,0.760636,0.838139,0.817737,0.872653,0.924731,0.914944,1.0,0.927022
tech_pos,0.732681,0.70286,0.717124,0.797116,0.799698,0.803093,0.948529,0.8502,0.927022,1.0


In [9]:
# Index the regime label by date so it lines up with the date-indexed feature frame.
# Guarded and reassigned rather than set in place: once 'date' has become the index,
# a second run of this cell would otherwise raise KeyError, and y_regime started
# life as a column slice of df, which should not be mutated in place.
if 'date' in y_regime.columns:
    y_regime = y_regime.set_index('date')
y_regime.head()

Unnamed: 0_level_0,two_factor_regimes
date,Unnamed: 1_level_1
1999-04-23,0.0
1999-07-23,0.0
1999-10-22,1.0
2000-04-28,1.0
2000-07-28,1.0


In [10]:
# Class balance of the raw regime label: not highly imbalanced.
# NOTE(review): the target used for modelling is built later as the row-to-row
# difference of this label, so these counts do not describe that target's balance.
y_regime.two_factor_regimes.value_counts()

0.0    571
1.0    237
Name: two_factor_regimes, dtype: int64

# NOTE(review): X and y are only assigned in later cells, so this line cannot run at this point of a top-to-bottom execution; the split is redone chronologically further down — candidate for removal.
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=.3)

In [11]:
# Lagged copies of every feature, 3, 2 and 1 rows back. Each column name gets a
# '_t-k' suffix so the three frames can be placed side by side without clashes.
df_t_minus_3 = df_final.shift(3).add_suffix('_t-3')
df_t_minus_2 = df_final.shift(2).add_suffix('_t-2')
df_t_minus_1 = df_final.shift(1).add_suffix('_t-1')

# Design matrix: oldest lag first, most recent lag last.
lagged_frames = [df_t_minus_3, df_t_minus_2, df_t_minus_1]
X = pd.concat(lagged_frames, axis=1)

In [12]:
# Inspection only: shows the type of the column index; the result is not stored.
type(df_final.columns)

pandas.core.indexes.base.Index

In [13]:
# Sanity check: 3 lags x 10 features should give 30 columns.
X.shape

(808, 30)

In [14]:
# Drop the leading rows whose lagged features are NaN because shift() had no
# earlier rows to pull from.
# The row count is derived from df_final instead of slicing X relative to itself,
# so re-running this cell no longer strips three more rows each time.
max_lag = 3  # largest shift used when building X
X = X.tail(len(df_final) - max_lag)
X.head()

Unnamed: 0_level_0,disaster_t-3,economy_neg_t-3,finance_neg_t-3,finance_pos_t-3,industry_neg_t-3,management_t-3,management_pos_t-3,operations_neg_t-3,product_t-3,tech_pos_t-3,...,disaster_t-1,economy_neg_t-1,finance_neg_t-1,finance_pos_t-1,industry_neg_t-1,management_t-1,management_pos_t-1,operations_neg_t-1,product_t-1,tech_pos_t-1
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2000-04-28,0.0,0.0,24.0,4.0,0.0,92.0,184.0,0.0,8.0,4.0,...,0.0,0.0,34.0,4.0,4.0,40.0,188.0,4.0,0.0,0.0
2000-07-28,0.0,0.0,28.0,12.0,0.0,80.0,140.0,24.0,8.0,8.0,...,0.0,0.0,0.0,24.0,8.0,16.0,104.0,0.0,0.0,32.0
2000-10-27,0.0,0.0,34.0,4.0,4.0,40.0,188.0,4.0,0.0,0.0,...,0.0,0.0,8.0,4.0,16.0,36.0,84.0,8.0,0.0,4.0
2001-01-26,0.0,0.0,0.0,24.0,8.0,16.0,104.0,0.0,0.0,32.0,...,0.0,0.0,8.0,0.0,4.0,28.0,28.0,0.0,0.0,4.0
2001-04-27,0.0,0.0,8.0,4.0,16.0,36.0,84.0,8.0,0.0,4.0,...,0.0,0.0,12.0,8.0,16.0,48.0,60.0,0.0,0.0,8.0


In [15]:
# Rows left after removing the first three (808 - 3 = 805).
len(X)

805

In [16]:
# Target: change in the regime label from one row to the next, regime(t) - regime(t-1).
y_regime_t = y_regime.copy(deep=False)
y_regime_t_minus_1 = y_regime_t.shift(periods=1)
y_difference = y_regime_t.sub(y_regime_t_minus_1)
# Skip the first three rows, matching the rows removed from X, then cast to int.
y = y_difference.iloc[3:]
y = y.astype(int)

In [17]:
# Preview of the regime-change target.
y.head()

Unnamed: 0_level_0,two_factor_regimes
date,Unnamed: 1_level_1
2000-04-28,0
2000-07-28,0
2000-10-27,0
2001-01-26,0
2001-04-27,0


In [18]:
# Distinct target values: 0 means the label did not change between consecutive rows,
# 1 / -1 mean it moved up / down.
y.two_factor_regimes.unique()

array([ 0, -1,  1], dtype=int64)

In [19]:
# Should equal len(X) so features and target stay aligned row for row.
len(y)

805

In [20]:
# Chronological 70/30 split (no shuffling): the first 70% of rows train the model,
# every row after that is held out for testing.
train_len = int(np.round(0.7 * len(X)))
# The test size is whatever remains. Rounding 0.7*n and 0.3*n independently can add
# up to n + 1 (for n = 805: 564 + 242), which previously placed one row in both the
# training and the test set.
test_len = len(X) - train_len

X_train = X.iloc[:train_len]
X_test = X.iloc[train_len:]
y_train = y.iloc[:train_len]
y_test = y.iloc[train_len:]

# Train and test must partition the data exactly, with no shared rows.
assert len(X_train) + len(X_test) == len(X)
assert len(y_train) + len(y_test) == len(y)

In [21]:
# Fit ordinary least squares of the regime-change target on the lagged features
# and print the coefficient table.
# NOTE(review): X_train is passed as-is, so the model has no intercept term (the
# summary reports Df Model = 30 for 30 regressors) — confirm this is intended.
# NOTE(review): the target only takes the values -1, 0 and 1 but is treated as a
# continuous response here.
linear_model=sm.OLS(y_train,X_train)
result=linear_model.fit()
print(result.summary2())

                  Results: Ordinary least squares
Model:              OLS                Adj. R-squared:     0.024    
Dependent Variable: two_factor_regimes AIC:                -604.5538
Date:               2019-11-14 13:45   BIC:                -474.5022
No. Observations:   564                Log-Likelihood:     332.28   
Df Model:           30                 F-statistic:        1.463    
Df Residuals:       534                Prob (F-statistic): 0.0553   
R-squared:          0.076              Scale:              0.019034 
--------------------------------------------------------------------
                       Coef.  Std.Err.    t    P>|t|   [0.025 0.975]
--------------------------------------------------------------------
disaster_t-3           0.0000   0.0000  1.0950 0.2740 -0.0000 0.0001
economy_neg_t-3        0.0000   0.0000  0.8641 0.3879 -0.0000 0.0001
finance_neg_t-3        0.0000   0.0000  1.2241 0.2214 -0.0000 0.0000
finance_pos_t-3        0.0000   0.0000  0.7406 0.4592