In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import warnings
warnings.filterwarnings("ignore")

In [2]:
#data is unique by gvkey and fyear
data = pd.read_csv('fundamentals_annual.csv')

fyear = pd.read_csv('income.csv')
data = data.merge(fyear, on = ['gvkey','fyear'],  suffixes=('', '_drop'))
data = data[[c for c in data.columns if not c.endswith('_drop')]]

# fyear and fyrc share same values
# fdate is the date when the data is finalized for the fiscal year. Just fiscal year alone will not tell us anything.

In [3]:
data.drop(['consol','popsrc','indfmt'],axis=1,inplace=True) #same for all rows
data.drop(['dvpd','opiti','tii','uopi'], axis=1, inplace=True) #NaN values only
data.drop(['gld','gleps','glp'], axis=1, inplace=True) #more than 90% values are NaN

data.sort_values(by=['gvkey','fyear','fyr'],inplace=True) #sort by gvkey and fyear

#drop columns with constant value 
for i in data.columns:
    if(len(data[i].unique()) == 1):
        data.drop(i, axis=1,inplace=True)
        
#drop columns with more than 10% missing values?
perc = 10.0 
min_count =  int(((100-perc)/100)*data.shape[0] + 1)
data = data.dropna( axis=1, thresh=min_count)
data

In [5]:
#methods
#1. filter by gvkey to get records for all fyear and then run a time series regression with lag variables. 
#2. Train a model with all firms in the dataset.
#3  Use datadate that falls in the previous fiscal year to predict the earnings of the company in the next year. 



FYEAR indicates the fiscal year that the company is in at the time of DATADATE.
Compustat it is pretty simple to understand, DATADATE gives us the annual close of fiscal period, in the case of MSFT
(June FY) this might be 6/30/2010, and the FYEAR is 2010

we can work with datadate and fyear 
fdate will give the end of the fiscal year. 
datadate will tell the date the data was recorded
fyr and fyear together should match the month and year of datadate

DATADATE is COMPUSTAT’s approximation of the fiscal period-end date, but I have seen cases where this is off a 
few days from the company’s actual fiscal year-end date in EDGAR filings because DATADATE will be the last day of the month. APDEDATE is defined by COMPUSTAT as, “This item represents the actual date the company closes accounting for the period, which can be different from the last day of the month in the period.” Both of these dates may be thought of as the Balance Sheet date one would find in the actual corporate fillings.

In [6]:
#why is there such a wide gap between the two dates?
(pd.to_datetime(data['fdate']) - pd.to_datetime(data['datadate'])).sort_values(ascending =False)

1806   1850 days
1757   1773 days
1472   1767 days
1817   1609 days
1776   1602 days
          ...   
2717         NaT
2718         NaT
2719         NaT
2720         NaT
2721         NaT
Length: 2761, dtype: timedelta64[ns]

In [22]:
df = pd.DataFrame()
df['gvkey'] = data['gvkey']
df[data.columns[6:]] = data[data.columns[6:]]

#fill missing values using forward fill and backward fill and take average
#what this means is that the asset value in that year lied between the the asset value the year before and the one
#the year after
temp = df.groupby('gvkey').fillna(method='ffill')
temp = temp.fillna(0)
temp2 = df.groupby('gvkey').fillna(method='bfill')
temp2 = temp2.fillna(0)

cols = data.columns[6:]
for i in cols:
    df[i] = (temp[i] + temp2[i])//2

#another way of filling in missing values could be just taking a percentage for example, income is 20% of assets etc
#df.isna().sum()

In [23]:
#removing the look ahead bias by shifting the data
ni = df.groupby('gvkey')['ni'].shift(-1)
df['ni'] = ni
df.dropna(inplace=True)

#very high correlations between features. its expected since they are mathematically derived from each other
df[cols].corr().style.background_gradient(cmap='coolwarm')

Unnamed: 0,acominc,ap,at,ch,cshpri,dltt,dvt,ebit,ebitda,gp,icapt,invt,lt,opeps,revt,seq,txdi,txp,txt,sic,ni,pi
acominc,1.0,-0.718536,-0.675922,-0.654351,-0.268496,-0.520493,0.013072,-0.118594,-0.348207,-0.262957,-0.517325,-0.275763,-0.722819,0.002224,-0.671743,-0.460392,-0.207568,-0.482783,-0.290332,-0.054193,0.157472,0.064563
ap,-0.718536,1.0,0.928935,0.844927,0.499046,0.665371,0.362642,0.527357,0.700782,0.553419,0.78626,0.516661,0.945894,0.006805,0.852936,0.753803,0.141089,0.792612,0.564477,0.026076,0.283494,0.394897
at,-0.675922,0.928935,1.0,0.902123,0.574203,0.771709,0.506335,0.659981,0.823301,0.660834,0.920939,0.648782,0.985066,0.009429,0.912819,0.901284,0.098957,0.883674,0.660263,-0.014125,0.348553,0.51812
ch,-0.654351,0.844927,0.902123,1.0,0.615507,0.738849,0.474014,0.616551,0.794106,0.74605,0.899384,0.643362,0.85701,0.033132,0.943579,0.886533,0.114978,0.810649,0.615171,-0.009096,0.297659,0.466237
cshpri,-0.268496,0.499046,0.574203,0.615507,1.0,0.596596,0.567446,0.598643,0.67944,0.727072,0.672181,0.532608,0.507951,-0.024134,0.651905,0.663876,0.021891,0.553031,0.536055,-0.022275,0.319851,0.441703
dltt,-0.520493,0.665371,0.771709,0.738849,0.596596,1.0,0.555954,0.625458,0.753803,0.699917,0.865831,0.803599,0.7292,0.027154,0.802845,0.78416,0.010182,0.664681,0.558231,-0.123693,0.247954,0.419736
dvt,0.013072,0.362642,0.506335,0.474014,0.567446,0.555954,1.0,0.846608,0.803024,0.801133,0.657158,0.535914,0.416845,0.016776,0.527164,0.674758,-0.081626,0.521353,0.646249,-0.025138,0.581507,0.738206
ebit,-0.118594,0.527357,0.659981,0.616551,0.598643,0.625458,0.846608,1.0,0.945299,0.84228,0.77011,0.659711,0.57724,0.111172,0.661997,0.787412,-0.029635,0.668711,0.751891,-0.078155,0.676646,0.899414
ebitda,-0.348207,0.700782,0.823301,0.794106,0.67944,0.753803,0.803024,0.945299,1.0,0.905211,0.90759,0.711742,0.74545,0.076775,0.852004,0.914817,0.015123,0.798815,0.784858,-0.044001,0.547346,0.794718
gp,-0.262957,0.553419,0.660834,0.74605,0.727072,0.699917,0.801133,0.84228,0.905211,1.0,0.842353,0.670039,0.54898,0.034557,0.805821,0.851404,0.015864,0.640329,0.712123,-0.040915,0.426401,0.643005


In [35]:
def prediction(df1, components):
    df1 = df1.drop('gvkey',axis=1)
    alpha_vals = np.arange(0.01,1)
    X_train, X_test, y_train, y_test = train_test_split(df1.loc[:, df1.columns != 'ni'], df1['ni'], 
                                                        test_size=0.33, random_state=42)
    pipe = Pipeline(steps=[('scaler', StandardScaler()), ('pca', PCA(n_components= components)), 
                           ('ridge', Ridge(fit_intercept=True))])
    gsc = GridSearchCV(pipe, param_grid={ 'ridge__alpha': alpha_vals},cv=10, scoring='r2')
    gsc.fit(X_train, y_train)
    y_pred = gsc.predict(X_test)
    RMSE = (mean_squared_error(y_test,y_pred)**(1/2))
    print(r2_score(y_test, y_pred))
    print(RMSE)


In [36]:
#passing the entire dataframe for scaling, PCA and prediction
prediction(df, 11)

0.4712088880125791
525.8314053295571


In [37]:
#using the between estimates regression method. 
be = df.groupby('gvkey').mean()
be.reset_index(inplace=True)
prediction(be,11)

0.9509331763239879
66.38957376898622


In [34]:
be['ni'].std()

353.019727003478

In [None]:
#analysis of PCA
def PCA_analysis(temp, components):
    pca = PCA()
    dataset = pd.DataFrame()

    #checking correlation of target variable with different principal components.
    transformed = pca.fit_transform(temp.drop('ni',axis=1))
    for i in range(0, len(transformed[0])):
        dataset[i] = transformed[:,i]
        print(i, temp['ni'].corr(dataset[i]))

    pca = PCA(n_components=components)
    transformed = pca.fit_transform(temp.drop('ni',axis=1))
    print(pca.explained_variance_ratio_)
    print(sum(pca.explained_variance_ratio_))