In [14]:
import numpy as np
import pandas as pd
from linearmodels import PanelOLS
from linearmodels import RandomEffects

#data is unique by gvkey and fyear
data = pd.read_csv('fundamentals_annual/fundamentals_annual.csv')

income = pd.read_csv('fundamentals_annual/income.csv')
data = data.merge(income, on = ['gvkey','fyear'],  suffixes=('', '_drop'))
data = data[[c for c in data.columns if not c.endswith('_drop')]]

shares = pd.read_csv('shares.csv')
data = data.merge(shares, on = ['gvkey','fyear'],  suffixes=('', '_drop'))
data = data[[c for c in data.columns if not c.endswith('_drop')]]

data.drop(['consol','popsrc','indfmt'],axis=1,inplace=True) #same for all rows
data.drop(['dvpd','opiti','tii','uopi'], axis=1, inplace=True) #NaN values only
data.drop(['datadate','tic','conm','fyr'],axis=1,inplace=True) #dont need these fields

data.sort_values(by=['gvkey','fyear'],inplace=True) #sort by gvkey and fyear

In [15]:
market_data = pd.read_csv('market_data.csv')
data = data.merge(market_data, left_on= 'fyear', right_on='caldt')

In [16]:
#define a threshold for missing values
perc = 5.0 
min_count =  int(((100-perc)/100)*data.shape[0] + 1)
data = data.dropna( axis=1, thresh=min_count)

#drop columns with zero variance
for i in data.columns:
    if(len(data[i].unique()) == 1):
        data.drop(i, axis=1,inplace=True)

#row-wise NA dropping
data.dropna(inplace=True)

In [17]:
#shifting the NI column to remove look ahead bias
df = data[data.columns]
ni = df.groupby('gvkey')['ni'].shift(-1)
df['ni'] = ni
df.dropna(inplace=True)

In [18]:
df

Unnamed: 0,gvkey,fyear,acominc,ap,at,ch,cshpri,dltt,dvt,ebit,...,sic,ni,pi,csho,caldt,t90ret,b5ret,cpiret,SNP500,SNP500_CD
0,2080,2001,3.047,15.010,301.403,5.347,11.702,7.482,9.378,-2.206,...,2511,6.741,-3.684,11.727,2001,0.044809,0.082663,0.015517,-0.015676,0.012427
1,2113,2001,0.000,2.430,9.289,0.192,1.065,0.000,1.440,1.763,...,2330,0.116,1.698,1.065,2001,0.044809,0.082663,0.015517,-0.015676,0.012427
2,2444,2001,-44.700,214.500,3157.500,108.500,87.800,600.200,43.800,191.100,...,3510,135.200,132.200,87.799,2001,0.044809,0.082663,0.015517,-0.015676,0.012427
4,3622,2001,0.010,3.695,60.200,0.388,9.167,36.773,0.000,5.056,...,2211,2.487,0.114,9.421,2001,0.044809,0.082663,0.015517,-0.015676,0.012427
5,3647,2001,0.007,24.327,287.713,31.993,11.230,107.001,0.000,15.228,...,2211,-24.887,-6.140,11.320,2001,0.044809,0.082663,0.015517,-0.015676,0.012427
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2560,62262,2020,0.000,0.197,19.356,2.925,9.583,3.902,0.000,-0.562,...,3100,0.524,-1.766,9.884,2020,0.006920,0.073770,0.013621,0.013940,0.010226
2567,65850,2020,0.000,3.748,48.905,13.993,28.644,0.975,0.000,-5.888,...,3290,12.811,-6.160,28.949,2020,0.006920,0.073770,0.013621,0.013940,0.010226
2568,66261,2020,0.000,12.935,197.259,8.984,1.707,95.491,0.001,20.422,...,5700,31.197,15.616,1.589,2020,0.006920,0.073770,0.013621,0.013940,0.010226
2576,137432,2020,-1.322,49.800,420.231,16.458,6.921,170.680,0.000,17.092,...,2300,20.296,-14.080,6.890,2020,0.006920,0.073770,0.013621,0.013940,0.010226


In [19]:
""" Perform a forward-backward feature selection 
    based on p-value from statsmodels.api.OLS
    Arguments:
        X - pandas.DataFrame with candidate features
        y - list-like with the target
        initial_list - list of features to start with (column names of X)
        threshold_in - include a feature if its p-value < threshold_in
        threshold_out - exclude a feature if its p-value > threshold_out
        verbose - whether to print the sequence of inclusions and exclusions
    Returns: list of selected features 
    Always set threshold_in < threshold_out to avoid infinite looping.
"""
def stepwise_selection(df, X, y, reg_model, initial_list=[], threshold_in=0.01, threshold_out = 0.05, verbose=True):
    included = list(initial_list)
    while True:
        changed=False
        # forward step
        excluded = list(set(X.columns)-set(included))
        new_pval = pd.Series(index=excluded)
        for new_column in excluded:
            exog = sm.add_constant(pd.DataFrame(X[included+[new_column]]))
            model = reg_model(df.ni, exog).fit()
            new_pval[new_column] = model.pvalues[new_column]
        best_pval = new_pval.min()
        if best_pval < threshold_in:
            best_feature = new_pval.idxmin()
            included.append(best_feature)
            changed=True
            if verbose:
                print('Add  {:30} with p-value {:.6}'.format(best_feature, best_pval))

        # backward step
        model = reg_model(df.ni, sm.add_constant(pd.DataFrame(X[included]))).fit()
        # use all coefs except intercept
        pvalues = model.pvalues.iloc[1:]
        worst_pval = pvalues.max() # null if pvalues is empty
        if worst_pval > threshold_out:
            changed=True
            worst_feature = pvalues.idxmax()
            included.remove(worst_feature)
            if verbose:
                print('Drop {:30} with p-value {:.6}'.format(worst_feature, worst_pval))
        if not changed:
            break
    return included



In [20]:
#Panel Regression data preparation
import statsmodels.api as sm
from linearmodels.panel import PooledOLS
cols = list(df.columns)
cols.remove('gvkey')
cols.remove('fyear')
cols.remove('ni')
gvkey = data['gvkey']
fyear = data['fyear']
df['fyear'] = fyear
df['gvkey'] = gvkey
df = df.set_index(['gvkey','fyear'])

In [21]:
X = df.drop('ni',axis=1)
y=df['ni']
result = stepwise_selection(df,X,y,reg_model = PooledOLS)
exog_vars = result
exog = sm.add_constant(df[exog_vars])
mod = PooledOLS(df.ni, exog)
pooled_res = mod.fit()
print(pooled_res)
print()
print('Rsquared')
print(pooled_res.rsquared)

  new_pval = pd.Series(index=excluded)


Add  pi                             with p-value 0.0
Add  acominc                        with p-value 1.33227e-15
Add  at                             with p-value 0.0
Add  icapt                          with p-value 0.0
Add  ebit                           with p-value 0.0
Add  ebitda                         with p-value 0.0
Drop pi                             with p-value 0.29865
Add  ch                             with p-value 0.0
Add  dvt                            with p-value 6.6536e-08
Add  txt                            with p-value 0.000280418
Add  txp                            with p-value 0.00199581
Add  seq                            with p-value 0.00228002
Add  dltt                           with p-value 1.69997e-12
Add  ap                             with p-value 3.58724e-07
Add  cshpri                         with p-value 0.00049775
                          PooledOLS Estimation Summary                          
Dep. Variable:                     ni   R-squared:          

In [22]:
#RandomEffects Regression
from linearmodels.panel import RandomEffects
result = stepwise_selection(df,X,y,reg_model = RandomEffects)
exog_vars = result
exog = sm.add_constant(df[exog_vars])
mod = RandomEffects(df.ni, exog)
re_res = mod.fit()
print(re_res)
print()
print('Rsquared')
print(re_res.rsquared)

  new_pval = pd.Series(index=excluded)


Add  pi                             with p-value 0.0
Add  acominc                        with p-value 1.33227e-15
Add  at                             with p-value 0.0
Add  icapt                          with p-value 0.0
Add  ebit                           with p-value 0.0
Add  ebitda                         with p-value 0.0
Drop pi                             with p-value 0.29865
Add  ch                             with p-value 0.0
Add  dvt                            with p-value 6.6536e-08
Add  txt                            with p-value 0.000280418
Add  txp                            with p-value 0.00199581
Add  seq                            with p-value 0.00228002
Add  dltt                           with p-value 1.69997e-12
Add  ap                             with p-value 3.58724e-07
Add  cshpri                         with p-value 0.00049775
                        RandomEffects Estimation Summary                        
Dep. Variable:                     ni   R-squared:          

In [23]:
#Between OLS regression
from linearmodels.panel import BetweenOLS
result = stepwise_selection(df,X,y,reg_model = BetweenOLS)
exog_vars = result
exog = sm.add_constant(df[exog_vars])
mod = BetweenOLS(df.ni, exog)
be_res = mod.fit()
print(be_res)

  new_pval = pd.Series(index=excluded)


Add  pi                             with p-value 0.0
Add  at                             with p-value 0.0
Add  ebitda                         with p-value 0.0
Drop at                             with p-value 0.405037
Add  revt                           with p-value 1.80966e-13
Add  txt                            with p-value 4.44089e-16
Add  at                             with p-value 2.16227e-12
Add  gp                             with p-value 0.000605426
Add  seq                            with p-value 0.000263377
Add  opeps                          with p-value 0.000451913
Add  SNP500                         with p-value 0.00303204
Add  invt                           with p-value 0.00296824
                         BetweenOLS Estimation Summary                          
Dep. Variable:                     ni   R-squared:                        0.9966
Estimator:                 BetweenOLS   R-squared (Between):              0.9966
No. Observations:                 185   R-squared (Wit

In [24]:
def stepwise_selection_panel(df, X, y, reg_model, drop_absorbed =True, entity_effects =False, time_effects = False, initial_list=[], threshold_in=0.01, threshold_out = 0.07, verbose=True):
    included = list(initial_list)
    while True:
        changed=False
        # forward step
        excluded = list(set(X.columns)-set(included))
        new_pval = pd.Series(index=excluded)
        for new_column in excluded:
            exog = sm.add_constant(pd.DataFrame(X[included+[new_column]]))
            model = reg_model(df.ni, exog, entity_effects = entity_effects, time_effects = time_effects, drop_absorbed=True).fit()
            new_pval[new_column] = model.pvalues[new_column]
        best_pval = new_pval.min()
        if best_pval < threshold_in:
            best_feature = new_pval.idxmin()
            included.append(best_feature)
            changed=True
            if verbose:
                print('Add  {:30} with p-value {:.6}'.format(best_feature, best_pval))

        # backward step
        model = reg_model(df.ni, sm.add_constant(pd.DataFrame(X[included])), drop_absorbed =True, entity_effects =False, time_effects = False).fit()
        # use all coefs except intercept
        pvalues = model.pvalues.iloc[1:]
        worst_pval = pvalues.max() # null if pvalues is empty
        if worst_pval > threshold_out:
            changed=True
            worst_feature = pvalues.idxmax()
            included.remove(worst_feature)
            if verbose:
                print('Drop {:30} with p-value {:.6}'.format(worst_feature, worst_pval))
        if not changed:
            break
    return included

In [25]:
#PanelOLS regression - entity effects is true
# result = stepwise_selection_panel(df.drop(['sic','invt'],axis=1),X.drop(['sic','invt'],axis=1),y,reg_model = PanelOLS, entity_effects = True)
exog_vars = result #df.drop('ni',axis=1).columns
exog = sm.add_constant(df[exog_vars])
mod = PanelOLS(df.ni, exog, entity_effects=True, drop_absorbed=True)
mod = mod.fit()
print(mod)

                          PanelOLS Estimation Summary                           
Dep. Variable:                     ni   R-squared:                        0.4650
Estimator:                   PanelOLS   R-squared (Between):             -8.1345
No. Observations:                2199   R-squared (Within):               0.4650
Date:                Mon, Feb 07 2022   R-squared (Overall):             -2.7872
Time:                        13:00:30   Log-likelihood                -1.648e+04
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      174.15
Entities:                         185   P-value                           0.0000
Avg Obs:                       11.886   Distribution:                 F(10,2004)
Min Obs:                       1.0000                                           
Max Obs:                       20.000   F-statistic (robust):             174.15
                            

In [33]:
#time effects is true
result = stepwise_selection_panel(df.drop(['sic','invt','cpiret','SNP500','SNP500_CD','caldt','t90ret','b5ret'],axis=1),
                                    X.drop(['sic','invt','cpiret','SNP500','SNP500_CD','caldt','t90ret','b5ret'],axis=1),y,reg_model = PanelOLS, entity_effects = True, time_effects=True)
exog_vars = result #df.drop('ni',axis=1).columns
exog = sm.add_constant(df[exog_vars])
mod = PanelOLS(df.ni, exog, entity_effects=True, time_effects=True, drop_absorbed =True)
fe_te_res = mod.fit()
print(fe_te_res)

  new_pval = pd.Series(index=excluded)


Add  pi                             with p-value 0.0
Add  dltt                           with p-value 0.0
Add  revt                           with p-value 0.0
Drop dltt                           with p-value 0.150845
Add  at                             with p-value 0.0
Add  dvt                            with p-value 0.0
Add  ebit                           with p-value 0.0
Add  ebitda                         with p-value 0.0
Drop revt                           with p-value 0.58808
Add  revt                           with p-value 0.0
Drop revt                           with p-value 0.58808
Add  revt                           with p-value 0.0
Drop revt                           with p-value 0.58808
Add  revt                           with p-value 0.0
Drop revt                           with p-value 0.58808
Add  revt                           with p-value 0.0
Drop revt                           with p-value 0.58808
Add  revt                           with p-value 0.0
Drop revt            

KeyboardInterrupt: 