In [2]:
import numpy as np
import pandas as pd
from linearmodels import PanelOLS
from linearmodels import RandomEffects

#data is unique by gvkey and fyear
data = pd.read_csv('fundamentals_annual.csv')

income = pd.read_csv('income.csv')
data = data.merge(income, on = ['gvkey','fyear'],  suffixes=('', '_drop'))
data = data[[c for c in data.columns if not c.endswith('_drop')]]

shares = pd.read_csv('shares.csv')
data = data.merge(shares, on = ['gvkey','fyear'],  suffixes=('', '_drop'))
data = data[[c for c in data.columns if not c.endswith('_drop')]]

data.drop(['consol','popsrc','indfmt'],axis=1,inplace=True) #same for all rows
data.drop(['dvpd','opiti','tii','uopi'], axis=1, inplace=True) #NaN values only
data.drop(['datadate','tic','conm','fyr'],axis=1,inplace=True) #dont need these fields

data.sort_values(by=['gvkey','fyear'],inplace=True) #sort by gvkey and fyear

In [3]:
#define a threshold for missing values
perc = 5.0 
min_count =  int(((100-perc)/100)*data.shape[0] + 1)
data = data.dropna( axis=1, thresh=min_count)

#drop columns with zero variance
for i in data.columns:
    if(len(data[i].unique()) == 1):
        data.drop(i, axis=1,inplace=True)

#row-wise NA dropping
data.dropna(inplace=True)

In [4]:
#shifting the NI column to remove look ahead bias
df = data[data.columns]
ni = df.groupby('gvkey')['ni'].shift(-1)
df['ni'] = ni
df.dropna(inplace=True)

In [5]:
ind = df.groupby(['gvkey']).tail(2).index
df_test = df.filter(items = ind , axis=0)
df_train = df.drop(df.groupby(['gvkey']).tail(2).index)

In [6]:
""" Perform a forward-backward feature selection 
    based on p-value from statsmodels.api.OLS
    Arguments:
        X - pandas.DataFrame with candidate features
        y - list-like with the target
        initial_list - list of features to start with (column names of X)
        threshold_in - include a feature if its p-value < threshold_in
        threshold_out - exclude a feature if its p-value > threshold_out
        verbose - whether to print the sequence of inclusions and exclusions
    Returns: list of selected features 
    Always set threshold_in < threshold_out to avoid infinite looping.
"""
def stepwise_selection(df, X, y, reg_model, initial_list=[], threshold_in=0.01, threshold_out = 0.05, verbose=True):
    included = list(initial_list)
    while True:
        changed=False
        # forward step
        excluded = list(set(X.columns)-set(included))
        new_pval = pd.Series(index=excluded)
        for new_column in excluded:
            exog = sm.add_constant(pd.DataFrame(X[included+[new_column]]))
            model = reg_model(df.ni, exog).fit()
            new_pval[new_column] = model.pvalues[new_column]
        best_pval = new_pval.min()
        if best_pval < threshold_in:
            best_feature = new_pval.idxmin()
            included.append(best_feature)
            changed=True
            if verbose:
                print('Add  {:30} with p-value {:.6}'.format(best_feature, best_pval))

        # backward step
        model = reg_model(df.ni, sm.add_constant(pd.DataFrame(X[included]))).fit()
        # use all coefs except intercept
        pvalues = model.pvalues.iloc[1:]
        worst_pval = pvalues.max() # null if pvalues is empty
        if worst_pval > threshold_out:
            changed=True
            worst_feature = pvalues.idxmax()
            included.remove(worst_feature)
            if verbose:
                print('Drop {:30} with p-value {:.6}'.format(worst_feature, worst_pval))
        if not changed:
            break
    return included



In [7]:
#Panel Regression data preparation
import statsmodels.api as sm
from linearmodels.panel import PooledOLS
cols = list(df.columns)
cols.remove('gvkey')
cols.remove('fyear')
cols.remove('ni')
gvkey = data['gvkey']
fyear = data['fyear']
df_train['fyear'] = fyear
df_train['gvkey'] = gvkey
df_train = df_train.set_index(['gvkey','fyear'])

df_test['fyear'] = fyear
df_test['gvkey'] = gvkey
df_test = df_test.set_index(['gvkey','fyear'])

X = df_train.drop('ni',axis=1)
y = df_train['ni']

In [12]:
from sklearn.metrics import r2_score
def test_pred(exog, model):
    ans = model.predict(exog)
    y_pred = ans.reset_index()['predictions']
    print(r2_score(df_test['ni'],y_pred))
    rmse = sum((df_test['ni'].reset_index()['ni']**2 - y_pred**2))**(1/2)
    print(rmse)

In [13]:
result = stepwise_selection(df_train,X,y,reg_model = PooledOLS)
exog_vars = result
exog = sm.add_constant(df_train[exog_vars])
pooled_res = PooledOLS(df_train.ni, exog).fit()
exog = sm.add_constant(df_test[exog_vars])
test_pred(exog, pooled_res)

  new_pval = pd.Series(index=excluded)
  x = pd.concat(x[::order], 1)


Add  ebit                           with p-value 0.0
Add  ebitda                         with p-value 0.0
Add  at                             with p-value 0.0
Add  dltt                           with p-value 2.22045e-16
Add  dvt                            with p-value 4.55636e-13
Add  pi                             with p-value 9.77887e-08
Add  cshpri                         with p-value 3.5142e-07
Add  gp                             with p-value 4.25471e-07
Add  ap                             with p-value 5.50495e-07
Drop at                             with p-value 0.243381
Add  seq                            with p-value 5.62209e-06
Add  icapt                          with p-value 5.12923e-14
Add  txt                            with p-value 0.000270234
Add  ch                             with p-value 0.000797542
Add  revt                           with p-value 0.000375084
Add  csho                           with p-value 0.00367426
0.6965735155723121
10981.678606823869


In [14]:
#RandomEffects Regression
from linearmodels.panel import RandomEffects
result = stepwise_selection(df_train,X,y,reg_model = RandomEffects)
exog_vars = result
exog = sm.add_constant(df_train[exog_vars])
re_res = RandomEffects(df_train.ni, exog).fit()

exog = sm.add_constant(df_test[exog_vars])
test_pred(exog, re_res)

  new_pval = pd.Series(index=excluded)
  x = pd.concat(x[::order], 1)


Add  ebit                           with p-value 0.0
Add  ebitda                         with p-value 0.0
Add  at                             with p-value 0.0
Add  dltt                           with p-value 2.22045e-16
Add  dvt                            with p-value 4.55636e-13
Add  pi                             with p-value 9.77887e-08
Add  cshpri                         with p-value 3.5142e-07
Add  gp                             with p-value 4.25471e-07
Add  ap                             with p-value 5.50495e-07
Drop at                             with p-value 0.243381
Add  seq                            with p-value 5.62209e-06
Add  icapt                          with p-value 5.12923e-14
Add  txt                            with p-value 0.000270234
Add  ch                             with p-value 0.000797542
Add  revt                           with p-value 0.000375084
Add  csho                           with p-value 0.00367426
0.6965735155723121
10981.678606823869


In [15]:
#Between OLS regression
from linearmodels.panel import BetweenOLS
result = stepwise_selection(df_train,X,y,reg_model = BetweenOLS)
exog_vars = result
exog = sm.add_constant(df_train[exog_vars])
be_res = BetweenOLS(df_train.ni, exog).fit()
exog = sm.add_constant(df_test[exog_vars])
test_pred(exog, be_res)

  new_pval = pd.Series(index=excluded)
  x = pd.concat(x[::order], 1)


Add  ebit                           with p-value 0.0
Add  ebitda                         with p-value 0.0
Add  at                             with p-value 0.0
Add  icapt                          with p-value 0.0
Add  pi                             with p-value 0.0
Drop icapt                          with p-value 0.95356
Add  txt                            with p-value 5.35759e-07
Drop ebitda                         with p-value 0.636802
Add  revt                           with p-value 3.69627e-09
Add  ap                             with p-value 0.0066993
Add  ebitda                         with p-value 0.00047346
0.6038566080637257
(5.791922776270194e-15+94.58927717449237j)


In [17]:
#PanelOLS regression - entity effects is true
# exog = sm.add_constant(df_train[result])
# mod = PanelOLS(df_train.ni, exog, entity_effects=True, drop_absorbed=True).fit()

# exog = sm.add_constant(df_test.drop('ni',axis=1))
# test_pred(exog, mod)

In [None]:
# #time effects is true

# exog = sm.add_constant(X)
# mod = PanelOLS(df_train.ni, exog, entity_effects=True, time_effects=True, drop_absorbed =True).fit()

# exog = sm.add_constant(df_test[exog_vars])
# test_pred(exog, ans, mod)