In [1]:
import numpy as np
import pandas as pd
from linearmodels import PanelOLS
from linearmodels import RandomEffects

#data is unique by gvkey and fyear
data = pd.read_csv('fundamentals_annual.csv')

income = pd.read_csv('income.csv')
data = data.merge(income, on = ['gvkey','fyear'],  suffixes=('', '_drop'))
data = data[[c for c in data.columns if not c.endswith('_drop')]]

shares = pd.read_csv('shares.csv')
data = data.merge(shares, on = ['gvkey','fyear'],  suffixes=('', '_drop'))
data = data[[c for c in data.columns if not c.endswith('_drop')]]

data.drop(['consol','popsrc','indfmt'],axis=1,inplace=True) #same for all rows
data.drop(['dvpd','opiti','tii','uopi'], axis=1, inplace=True) #NaN values only
data.drop(['datadate','tic','conm','fyr'],axis=1,inplace=True) #dont need these fields

data.sort_values(by=['gvkey','fyear'],inplace=True) #sort by gvkey and fyear

In [17]:
data[data['ni']<=0]

Unnamed: 0,gvkey,fyear,ap,at,ch,cshpri,dltt,dvt,ebit,ebitda,...,opeps,revt,seq,txdi,txp,txt,sic,ni,pi,csho
1,2080,2001,15.010,301.403,5.347,11.702,7.482,9.378,-2.206,8.396,...,0.01,305.676,234.472,-0.824,0.000,-1.042,2511,-2.642,-3.684,11.727
3,2080,2003,15.127,280.380,15.181,11.609,0.000,9.261,1.970,12.119,...,0.56,316.857,220.018,-1.154,1.530,0.462,2511,-0.470,4.867,11.600
19,2080,2019,23.677,275.766,19.687,10.286,0.000,5.133,2.772,17.495,...,0.07,452.087,178.670,-2.854,0.000,0.188,2511,-1.928,-1.740,10.116
20,2080,2020,23.426,402.549,45.799,9.970,113.834,4.545,-13.202,0.277,...,-0.81,385.863,158.030,1.966,0.000,-6.365,2511,-10.421,-16.786,9.943
25,2113,2003,1.525,6.847,0.110,1.171,0.000,0.352,-1.079,-1.046,...,-0.83,28.440,4.904,-0.079,0.000,-0.425,2330,-0.970,-1.395,1.171
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2756,271841,2016,87.455,1601.527,9.078,181.488,200.920,0.000,-129.176,-118.758,...,-1.03,302.368,592.747,27.468,15.936,30.764,1520,-357.677,-154.291,181.953
2757,271841,2017,77.026,868.977,8.613,13.446,161.725,0.000,-159.837,-150.375,...,-19.14,198.579,228.120,-7.829,14.018,-6.974,1520,-256.591,-293.292,13.551
2758,271841,2018,60.239,652.563,8.344,20.574,139.751,0.000,-32.363,-26.864,...,-3.31,270.746,126.912,-6.483,14.795,-5.618,1520,-108.368,-114.438,19.892
2759,271841,2019,52.981,631.911,3.094,34.292,36.415,0.000,3.478,7.072,...,-0.13,108.789,219.276,-9.269,17.382,-8.776,1520,-6.478,-15.344,58.509


In [2]:
#define a threshold for missing values
perc = 5.0 
min_count =  int(((100-perc)/100)*data.shape[0] + 1)
data = data.dropna( axis=1, thresh=min_count)

#drop columns with zero variance
for i in data.columns:
    if(len(data[i].unique()) == 1):
        data.drop(i, axis=1,inplace=True)

data.dropna(inplace=True)

In [4]:
#create the eps column and drop inf values
df = data[data.columns]
ni = df.groupby('gvkey')['ni'].shift(-1)
df['ni'] = ni
df.dropna(inplace=True)

In [5]:
#PooledOLS regression
import statsmodels.api as sm
from linearmodels.panel import PooledOLS
cols = list(df.columns)
cols.remove('gvkey')
cols.remove('fyear')
cols.remove('ni')
gvkey = data['gvkey']
fyear = data['fyear']
df['fyear'] = fyear
df['gvkey'] = gvkey
df = df.set_index(['gvkey','fyear'])
exog_vars = cols
exog = sm.add_constant(df[exog_vars])
mod = PooledOLS(df.ni, exog)
pooled_res = mod.fit()
print(pooled_res)

                          PooledOLS Estimation Summary                          
Dep. Variable:                     ni   R-squared:                        0.6473
Estimator:                  PooledOLS   R-squared (Between):              0.9636
No. Observations:                2333   R-squared (Within):               0.4633
Date:                Mon, Feb 07 2022   R-squared (Overall):              0.6473
Time:                        03:24:27   Log-likelihood                 -1.75e+04
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      201.94
Entities:                         185   P-value                           0.0000
Avg Obs:                       12.611   Distribution:                 F(21,2311)
Min Obs:                       1.0000                                           
Max Obs:                       21.000   F-statistic (robust):             201.76
                            

  x = pd.concat(x[::order], 1)


In [6]:
#RandomEffects Regression
from linearmodels.panel import RandomEffects
mod = RandomEffects(df.ni, exog)
re_res = mod.fit()
print(re_res)

                        RandomEffects Estimation Summary                        
Dep. Variable:                     ni   R-squared:                        0.6473
Estimator:              RandomEffects   R-squared (Between):              0.9636
No. Observations:                2333   R-squared (Within):               0.4633
Date:                Mon, Feb 07 2022   R-squared (Overall):              0.6473
Time:                        03:24:32   Log-likelihood                 -1.75e+04
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      201.94
Entities:                         185   P-value                           0.0000
Avg Obs:                       12.611   Distribution:                 F(21,2311)
Min Obs:                       1.0000                                           
Max Obs:                       21.000   F-statistic (robust):             201.76
                            

In [7]:
#Between OLS regression
from linearmodels.panel import BetweenOLS
mod = BetweenOLS(df.ni, exog)
be_res = mod.fit()
print(be_res)

                         BetweenOLS Estimation Summary                          
Dep. Variable:                     ni   R-squared:                        0.9963
Estimator:                 BetweenOLS   R-squared (Between):              0.9963
No. Observations:                 185   R-squared (Within):               0.1512
Date:                Mon, Feb 07 2022   R-squared (Overall):              0.4557
Time:                        03:24:37   Log-likelihood                   -831.03
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      2107.0
Entities:                         185   P-value                           0.0000
Avg Obs:                       12.611   Distribution:                  F(21,163)
Min Obs:                       1.0000                                           
Max Obs:                       21.000   F-statistic (robust):             2151.1
                            

In [8]:
#PanelOLS regression - entity effects is true
mod = PanelOLS(df.ni, exog, entity_effects=True, drop_absorbed=True)
fe_res = mod.fit()
print(fe_res)

Variables have been fully absorbed and have removed from the regression:

sic



                          PanelOLS Estimation Summary                           
Dep. Variable:                     ni   R-squared:                        0.5550
Estimator:                   PanelOLS   R-squared (Between):             -5.9493
No. Observations:                2333   R-squared (Within):               0.5550
Date:                Mon, Feb 07 2022   R-squared (Overall):             -1.8531
Time:                        03:24:42   Log-likelihood                -1.725e+04
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      132.69
Entities:                         185   P-value                           0.0000
Avg Obs:                       12.611   Distribution:                 F(20,2128)
Min Obs:                       1.0000                                           
Max Obs:                       21.000   F-statistic (robust):             132.75
                            

In [9]:
#time effects is true
mod = PanelOLS(df.ni, exog, entity_effects=True, time_effects=True, drop_absorbed =True)
fe_te_res = mod.fit()
print(fe_te_res)

                          PanelOLS Estimation Summary                           
Dep. Variable:                     ni   R-squared:                        0.5473
Estimator:                   PanelOLS   R-squared (Between):             -5.8887
No. Observations:                2333   R-squared (Within):               0.5546
Date:                Mon, Feb 07 2022   R-squared (Overall):             -1.8257
Time:                        03:24:48   Log-likelihood                -1.722e+04
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      127.37
Entities:                         185   P-value                           0.0000
Avg Obs:                       12.611   Distribution:                 F(20,2107)
Min Obs:                       1.0000                                           
Max Obs:                       21.000   F-statistic (robust):             127.32
                            