In [541]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error


file_path = 'final_data.csv'
data = pd.read_csv(file_path).dropna()
data['Month'] = data['Month'].apply(lambda x : int(x))
data['Month'] = data['Month'].astype('category')


In [542]:
#Calculates multiplier for the following trading startegies on earnings days in X_test
#1. Given a prediction, take a long position for a day if prediction is positive, else short for a day
#2. Randomly go long or short for a day based on coin toss
#3. Always buy stock on earnings dal and sell it next day

def profit_compare(y_pred, y_test, X_test):
    X_test['DateTime'] = pd.to_datetime(X_test['Earnings Date and Time'])
    X_test.drop('Earnings Date and Time', axis = 1, inplace=True)
    X_test = X_test.reset_index()
    X_test = X_test.sort_values(by=['DateTime'])

    tickers = set(X_test['Ticker'].values)
    diffs = []

    for ticker in tickers:
        #print(ticker)
        mult_strat = 1
        mult_rand = 1
        mult_buy = 1
        X_ticker = X_test[X_test['Ticker']==ticker]
        indices = X_ticker.index

        for i in indices:
            #Update multiplier for strategy 1
            if y_pred[i] >= 0:
                mult_strat=mult_strat*(1+(y_test[i]/100))
            else:
                mult_strat=mult_strat*(1-(y_test[i]/100))
            #Update multiplier for strategy 2
            toss = np.random.binomial(1,0.5)
            if toss > 0:
                mult_rand=mult_rand*(1+(y_test[i]/100))
            else:
                mult_rand=mult_rand*(1-(y_test[i]/100))
            #Update multiplier for strategy 3
            mult_buy=mult_buy*(1+(y_test[i]/100))

        diffs.append([mult_strat, mult_rand, mult_buy])

    return(np.mean(diffs, axis=0))        

In [543]:
relevant_cols = ['average_volume_50_days', 'quant_score_NER',
       'financial_performance_score', 'market_position_score',
       'strategic_direction_score', 'operational_aspects_score',
       'financial_indicators_score', 'risks_challenges_score',
       'economic_factors_score', '% Change Revenue', '% Change EPS GAAP',
       '% Change EPS Normalized',
       'perc_change_prev_7', 'perc_change_7_15']

In [544]:
mses = []
mses_drop_outlier = []
signaccus = []
signaccus_drop_outlier = []
profits = []

for i in range(1000):
    data_train, data_test = train_test_split(data, test_size=0.2, shuffle=True, stratify=data['Ticker'])
    data_train = data_train.sort_values(by=['perc_change_next_prev'])

    #Uncomment the following for time based split
    #split_idx = int((1 - 0.2) * len(data))
    #data_train = data.iloc[:split_idx]
    #data_test = data.iloc[split_idx:]
    
    scaler = StandardScaler()
    scaler.fit(data_train[relevant_cols])
    data_train[relevant_cols] = scaler.transform(data_train[relevant_cols])
    
    months_onehot_train = pd.get_dummies(data=data_train['Month'], dtype=int)
    months_onehot_train.columns = months_onehot_train.columns.astype(str)
    data_train_final = pd.concat([data_train[relevant_cols], months_onehot_train], axis=1)

    linreg = LinearRegression()
    linreg.fit(data_train_final, data_train['perc_change_next_prev'])

    #linreg_drop_outlier = LinearRegression()
    #linreg_drop_outlier.fit(data_train_final.iloc[20:-20], data_train['perc_change_next_prev'].iloc[20:-20])

    data_test[relevant_cols] = scaler.transform(data_test[relevant_cols])

    months_onehot_test = pd.get_dummies(data=data_test['Month'], dtype=int)
    months_onehot_test.columns = months_onehot_test.columns.astype(str)
    data_test_final = pd.concat([data_test[relevant_cols], months_onehot_test], axis=1)

    y_predict = linreg.predict(data_test_final)
    #y_predict_drop_outlier = linreg_drop_outlier.predict(data_test_final)

    mses.append(mean_squared_error(data_test['perc_change_next_prev'], y_predict))
    #mses_drop_outlier.append(mean_squared_error(data_test['perc_change_next_prev'], y_predict_drop_outlier))
    
    signaccus.append((np.count_nonzero(np.sign(data_test['perc_change_next_prev'])+np.sign(y_predict))/len(y_predict)))
    #signaccus_drop_outlier.append((np.count_nonzero(np.sign(data_test['perc_change_next_prev'])+np.sign(y_predict_drop_outlier))/len(y_predict_drop_outlier)))

    profits.append(profit_compare(y_predict, data_test['perc_change_next_prev'].values, data_test))

In [545]:
np.mean(profits, axis=0)

array([1.03160187, 0.99901604, 1.02145182])

In [546]:
np.mean(mses)

26.008179706209337

In [547]:
np.mean(mses_drop_outlier)

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


nan

In [548]:
np.mean(signaccus)

0.5626784922394678

In [549]:
np.mean(signaccus_drop_outlier)

nan

In [550]:
linreg.coef_

array([ 0.05648714, -0.15782029,  0.36386665, -0.12192343,  0.21666329,
       -0.11237763,  0.18758229, -0.32396728,  0.03033194,  0.69632008,
       -0.56234186, -0.1283891 , -0.59230456,  0.14426428, -0.4733881 ,
       -0.4124459 ,  1.18324947, -0.45529712, -0.21522229,  0.4973902 ,
        0.15477917,  1.0730595 , -1.70711824, -0.15849942,  0.95008337,
       -0.43659064])