In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import warnings
warnings.filterwarnings("ignore")

In [2]:
#data is unique by gvkey and fyear
data = pd.read_csv('fundamentals_annual/fundamentals_annual.csv')

income = pd.read_csv('fundamentals_annual/income.csv')
data = data.merge(income, on = ['gvkey','fyear'],  suffixes=('', '_drop'))
data = data[[c for c in data.columns if not c.endswith('_drop')]]

shares = pd.read_csv('shares.csv')
data = data.merge(shares, on = ['gvkey','fyear'],  suffixes=('', '_drop'))
data = data[[c for c in data.columns if not c.endswith('_drop')]]
# fyear and fyrc share same values
# fdate is the date when the data is finalized for the fiscal year. Just fiscal year alone will not tell us anything.

In [3]:
data.drop(['consol','popsrc','indfmt'],axis=1,inplace=True) #same for all rows


data.sort_values(by=['gvkey','fyear','fyr'],inplace=True) #sort by gvkey and fyear

#drop columns with constant value 
for i in data.columns:
    if(len(data[i].unique()) == 1):
        data.drop(i, axis=1,inplace=True)
        
#drop columns with more than 10% missing values?
perc = 10.0 
min_count =  int(((100-perc)/100)*data.shape[0] + 1)
data = data.dropna( axis=1, thresh=min_count)

#drop rest row-wise
data.drop('acominc',axis=1,inplace=True)
data.dropna(inplace=True)

In [4]:
#methods
#1. filter by gvkey to get records for all fyear and then run a time series regression with lag variables. 
#2. Train a model with all firms in the dataset.
#3  Use datadate that falls in the previous fiscal year to predict the earnings of the company in the next year. 



FYEAR indicates the fiscal year that the company is in at the time of DATADATE.
Compustat it is pretty simple to understand, DATADATE gives us the annual close of fiscal period, in the case of MSFT
(June FY) this might be 6/30/2010, and the FYEAR is 2010

we can work with datadate and fyear 
fdate will give the end of the fiscal year. 
datadate will tell the date the data was recorded
fyr and fyear together should match the month and year of datadate

DATADATE is COMPUSTAT’s approximation of the fiscal period-end date, but I have seen cases where this is off a 
few days from the company’s actual fiscal year-end date in EDGAR filings because DATADATE will be the last day of the month. APDEDATE is defined by COMPUSTAT as, “This item represents the actual date the company closes accounting for the period, which can be different from the last day of the month in the period.” Both of these dates may be thought of as the Balance Sheet date one would find in the actual corporate fillings.

In [5]:
df = pd.DataFrame()
df[data.columns] = data[data.columns]
df.drop(['datadate','tic','conm','fyr'],axis=1,inplace=True)

In [6]:
#removing the look ahead bias by shifting the data
ni = df.groupby('gvkey')['ni'].shift(-1)
df['ni'] = ni
df.dropna(inplace=True)

#very high correlations between features. its expected since they are mathematically derived from each other
df.corr().style.background_gradient(cmap='coolwarm')

Unnamed: 0,gvkey,fyear,ap,at,ch,cshpri,dltt,dvt,ebit,ebitda,gp,icapt,invt,lt,opeps,revt,seq,txdi,txp,txt,sic,ni,pi,csho
gvkey,1.0,0.093994,-0.102891,-0.12457,-0.119337,-0.049194,-0.16136,-0.107572,-0.113632,-0.122425,-0.120906,-0.148656,-0.17922,-0.112621,-0.08132,-0.145939,-0.13945,0.000408,-0.113769,-0.093099,0.042263,-0.069673,-0.08943,-0.036029
fyear,0.093994,1.0,0.035932,0.033548,0.037787,0.050585,0.078132,0.086228,0.070435,0.045318,0.044615,0.034395,0.061589,0.037745,0.002256,0.013565,0.020395,0.001661,-0.024652,0.01768,-0.026218,0.082848,0.06214,0.052115
ap,-0.102891,0.035932,1.0,0.9306,0.846262,0.5062,0.681619,0.358655,0.526485,0.7003,0.551256,0.791149,0.535248,0.946901,0.003817,0.852902,0.758261,0.151669,0.79601,0.571256,0.024677,0.288938,0.402955,0.489348
at,-0.12457,0.033548,0.9306,1.0,0.904036,0.585383,0.784898,0.505154,0.657291,0.82209,0.658316,0.922252,0.663099,0.985368,0.005477,0.912852,0.90233,0.112617,0.887275,0.668594,-0.014546,0.355046,0.52976,0.566504
ch,-0.119337,0.037787,0.846262,0.904036,1.0,0.625524,0.758197,0.473611,0.615221,0.794602,0.745956,0.904904,0.660425,0.858686,0.016998,0.945506,0.891084,0.119175,0.814886,0.619906,-0.007958,0.296114,0.472763,0.604169
cshpri,-0.049194,0.050585,0.5062,0.585383,0.625524,1.0,0.609318,0.587891,0.61603,0.696442,0.742932,0.689669,0.565047,0.515194,-0.025566,0.662642,0.685253,0.029918,0.571445,0.558095,-0.027467,0.337774,0.469258,0.987958
dltt,-0.16136,0.078132,0.681619,0.784898,0.758197,0.609318,1.0,0.564145,0.625709,0.76382,0.710867,0.873262,0.82677,0.74159,0.017133,0.818216,0.798399,0.0537,0.689269,0.590822,-0.128825,0.25424,0.449295,0.587406
dvt,-0.107572,0.086228,0.358655,0.505154,0.473611,0.587891,0.564145,1.0,0.853205,0.806505,0.804639,0.66029,0.55938,0.414396,0.01843,0.526569,0.679357,-0.066585,0.523026,0.664346,-0.031143,0.605651,0.768793,0.567514
ebit,-0.113632,0.070435,0.526485,0.657291,0.615221,0.61603,0.625709,0.853205,1.0,0.94497,0.842645,0.766983,0.663586,0.575076,0.107894,0.658521,0.785913,-0.022771,0.674088,0.760303,-0.076483,0.68869,0.919906,0.593834
ebitda,-0.122425,0.045318,0.7003,0.82209,0.794602,0.696442,0.76382,0.806505,0.94497,1.0,0.905057,0.908474,0.725627,0.744215,0.073622,0.850664,0.916401,0.024317,0.803764,0.794329,-0.043625,0.557131,0.81234,0.672285


In [7]:

def prediction(df1, components):
    df1 = df1.drop('gvkey',axis=1)
    alpha_vals = np.arange(0.01,1)
    X_train, X_test, y_train, y_test = train_test_split(df1.loc[:, df1.columns != 'ni'], df1['ni'], 
                                                        test_size=0.33, random_state=42)
    pipe = Pipeline(steps=[('scaler', StandardScaler()), ('pca', PCA(n_components= components)), 
                           ('ridge', Ridge(fit_intercept=True))])
    gsc = GridSearchCV(pipe, param_grid={ 'ridge__alpha': alpha_vals},cv=10, scoring='r2')
    gsc.fit(X_train, y_train)
    y_pred = gsc.predict(X_test)
    RMSE = (mean_squared_error(y_test,y_pred)**(1/2))
    print(r2_score(y_test, y_pred))
    print(RMSE)


In [8]:
#passing the entire dataframe for scaling, PCA and prediction
prediction(df, 11)

0.4659481037899208
599.4174740803446


In [9]:
#using the between estimates regression method. 
be = df.groupby('gvkey').mean()
be.reset_index(inplace=True)
prediction(be,11)

0.879622175311344
108.60927948936948


In [10]:
be['ni'].std()                  

357.6582742779234

In [11]:
#analysis of PCA
def PCA_analysis(temp, components):
    pca = PCA()
    dataset = pd.DataFrame()

    #checking correlation of target variable with different principal components.
    transformed = pca.fit_transform(temp.drop('ni',axis=1))
    for i in range(0, len(transformed[0])):
        dataset[i] = transformed[:,i]
        print(i, temp['ni'].corr(dataset[i]))

    pca = PCA(n_components=components)
    transformed = pca.fit_transform(temp.drop('ni',axis=1))
    print(pca.explained_variance_ratio_)
    print(sum(pca.explained_variance_ratio_))

***