In [71]:
import numpy as np
import pandas as pd
from linearmodels import PanelOLS
from linearmodels import RandomEffects

#data is unique by gvkey and fyear
data = pd.read_csv('fundamentals_annual.csv')

fyear = pd.read_csv('income.csv')
data = data.merge(fyear, on = ['gvkey','fyear'],  suffixes=('', '_drop'))
data = data[[c for c in data.columns if not c.endswith('_drop')]]

data.drop(['consol','popsrc','indfmt'],axis=1,inplace=True) #same for all rows
data.drop(['dvpd','opiti','tii','uopi'], axis=1, inplace=True) #NaN values only
data.drop(['gld','gleps','glp'], axis=1, inplace=True) #more than 90% values are NaN

data.sort_values(by=['gvkey','fyear','fyr'],inplace=True) #sort by gvkey and fyear

perc = 10.0 
min_count =  int(((100-perc)/100)*data.shape[0] + 1)
data = data.dropna( axis=1, thresh=min_count)


for i in data.columns:
    if(len(data[i].unique()) == 1):
        data.drop(i, axis=1,inplace=True)



In [72]:
df = pd.DataFrame()
df[['gvkey','fyear']] = data[['gvkey','fyear']]
df[data.columns[6:]] = data[data.columns[6:]]

#fill missing values using forward fill and backward fill and take average
#what this means is that the asset value in that year lied between the the asset value the year before and the one
#the year after
temp = df.groupby('gvkey').fillna(method='ffill')
temp = temp.fillna(0)
temp2 = df.groupby('gvkey').fillna(method='bfill')
temp2 = temp2.fillna(0)

cols = data.columns[6:]
for i in cols:
    df[i] = (temp[i] + temp2[i])//2

 


In [73]:
ni = df.groupby('gvkey')['ni'].shift(-1)
df['ni'] = ni
df.dropna(inplace=True)

In [74]:
df.columns

Index(['gvkey', 'fyear', 'acominc', 'ap', 'at', 'ch', 'cshpri', 'dltt', 'dvt',
       'ebit', 'ebitda', 'gp', 'icapt', 'invt', 'lt', 'opeps', 'revt', 'seq',
       'txdi', 'txp', 'txt', 'sic', 'ni', 'pi'],
      dtype='object')

In [75]:
import statsmodels.api as sm
from linearmodels.panel import PooledOLS
gvkey = data['gvkey']
fyear = data['fyear']
df['fyear'] = fyear
df['gvkey'] = gvkey
df = df.set_index(['gvkey','fyear'])
exog_vars = ['acominc', 'ap', 'at', 'ch', 'cshpri', 'dltt', 'dvt','ebit', 'ebitda', 'gp', 'icapt', 'invt', 'lt', 'opeps', 'revt', 'seq','txdi', 'txp', 'txt', 'sic', 'pi']
exog = sm.add_constant(df[exog_vars])
mod = PooledOLS(df.ni, exog)
pooled_res = mod.fit()
print(pooled_res)

                          PooledOLS Estimation Summary                          
Dep. Variable:                     ni   R-squared:                        0.6453
Estimator:                  PooledOLS   R-squared (Between):              0.9649
No. Observations:                2568   R-squared (Within):               0.4749
Date:                Mon, Feb 07 2022   R-squared (Overall):              0.6453
Time:                        01:28:07   Log-likelihood                -1.923e+04
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      220.61
Entities:                         192   P-value                           0.0000
Avg Obs:                       13.375   Distribution:                 F(21,2546)
Min Obs:                       1.0000                                           
Max Obs:                       21.000   F-statistic (robust):             220.61
                            

In [76]:
from linearmodels.panel import RandomEffects

mod = RandomEffects(df.ni, exog)
re_res = mod.fit()
print(re_res)

                        RandomEffects Estimation Summary                        
Dep. Variable:                     ni   R-squared:                        0.6453
Estimator:              RandomEffects   R-squared (Between):              0.9649
No. Observations:                2568   R-squared (Within):               0.4749
Date:                Mon, Feb 07 2022   R-squared (Overall):              0.6453
Time:                        01:28:15   Log-likelihood                -1.923e+04
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      220.61
Entities:                         192   P-value                           0.0000
Avg Obs:                       13.375   Distribution:                 F(21,2546)
Min Obs:                       1.0000                                           
Max Obs:                       21.000   F-statistic (robust):             220.61
                            

In [77]:
from linearmodels.panel import BetweenOLS
mod = BetweenOLS(df.ni, exog)
be_res = mod.fit()
print(be_res)

                         BetweenOLS Estimation Summary                          
Dep. Variable:                     ni   R-squared:                        0.9974
Estimator:                 BetweenOLS   R-squared (Between):              0.9974
No. Observations:                 192   R-squared (Within):               0.0451
Date:                Mon, Feb 07 2022   R-squared (Overall):              0.3731
Time:                        01:28:16   Log-likelihood                   -827.54
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      3084.5
Entities:                         192   P-value                           0.0000
Avg Obs:                       13.375   Distribution:                  F(21,170)
Min Obs:                       1.0000                                           
Max Obs:                       21.000   F-statistic (robust):             3084.5
                            

In [78]:
mod = PanelOLS(df.ni, exog, entity_effects=True, drop_absorbed=True)
fe_res = mod.fit()
print(fe_res)

                          PanelOLS Estimation Summary                           
Dep. Variable:                     ni   R-squared:                        0.5446
Estimator:                   PanelOLS   R-squared (Between):             -4.7109
No. Observations:                2568   R-squared (Within):               0.5446
Date:                Mon, Feb 07 2022   R-squared (Overall):             -1.3146
Time:                        01:28:23   Log-likelihood                -1.901e+04
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      140.90
Entities:                         192   P-value                           0.0000
Avg Obs:                       13.375   Distribution:                 F(20,2356)
Min Obs:                       1.0000                                           
Max Obs:                       21.000   F-statistic (robust):             140.90
                            

Variables have been fully absorbed and have removed from the regression:

sic



In [60]:
mod = PanelOLS(df.ni, exog, entity_effects=True, time_effects=True, drop_absorbed =True)
fe_te_res = mod.fit()
print(fe_te_res)

                          PanelOLS Estimation Summary                           
Dep. Variable:                     ni   R-squared:                        0.5364
Estimator:                   PanelOLS   R-squared (Between):             -4.6313
No. Observations:                2568   R-squared (Within):               0.5440
Date:                Mon, Feb 07 2022   R-squared (Overall):             -1.2815
Time:                        01:22:22   Log-likelihood                -1.898e+04
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      135.06
Entities:                         192   P-value                           0.0000
Avg Obs:                       13.375   Distribution:                 F(20,2335)
Min Obs:                       1.0000                                           
Max Obs:                       21.000   F-statistic (robust):             135.06
                            