In [9]:
import pandas as pd
import statsmodels.api as sm
import warnings
import datetime
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
warnings.filterwarnings("ignore")

In [10]:
PATH = ''

# Connect to Google Drive

In [11]:
# #install python 3.9
# !sudo apt-get update -y
# !sudo apt-get install python3.10

# #change alternatives
# !sudo update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.7 1
# !sudo update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 2


In [12]:
! python --version

Python 3.7.13


In [13]:
from google.colab import drive
PATH = '/content/drive/'
drive.mount(PATH)

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [14]:
PATH += 'MyDrive/key_event_data/'

# Read Combined data

In [15]:
df = pd.read_csv(PATH+'combined_df.csv')
df.shape

(262377, 86)

In [16]:
df.drop(['indfmt','consol','popsrc','datafmt','curcdq'],axis=1,inplace=True)

In [17]:
df.columns

Index(['gvkey', 'datadate', 'fyearq', 'fqtr', 'datacqtr', 'datafqtr', 'fdateq',
       'actq', 'atq', 'chq', 'cogsq', 'cshoq', 'dlcq', 'dlttq', 'dpq', 'drcq',
       'intaccq', 'invtq', 'lctq', 'ltq', 'nimq', 'niq', 'npatq', 'oiadpq',
       'oibdpq', 'opepsq', 'req', 'revtq', 'saleq', 'txdiq', 'txpq', 'txtq',
       'uinvq', 'wcapq', 'xaccq', 'costat', 'gind', 'gsector', 'quarter',
       'pos_Executive/Board Changes - Other', 'pos_Client Announcements',
       'pos_Announcements of Earnings',
       'pos_Corporate Guidance - New/Confirmed', 'pos_Business Expansions',
       'pos_Product-Related Announcements', 'pos_Dividend Affirmations',
       'pos_Dividend Increases', 'pos_Earnings Calls',
       'pos_Company Conference Presentations', 'pos_Earnings Release Date',
       'pos_Annual General Meeting', 'pos_Ex-Div Date (Regular)',
       'pos_Board Meeting', 'pos_M&A Transaction Announcements',
       'pos_M&A Transaction Closings', 'pos_Private Placements',
       'pos_Fixed Income

In [18]:
df['niq'].isna().sum()/ len(df)

0.1085194205284762

In [19]:
#drop missing values in the NI column
df.dropna(subset = ['niq'],inplace=True)

#fill missing values in the sentiment columns with 0 (or neutral sentiment)
df.iloc[:,39:] = df.iloc[:,39:].fillna(0)

In [20]:
len(df)

233904

In [21]:
df.dropna(how='all',inplace=True)
drop_cols = []
for i in df:
    if(df[i].isna().sum()/len(df) >0.1):
        drop_cols.append(i)

df.drop(drop_cols,axis=1,inplace=True)

In [22]:
df.columns

Index(['gvkey', 'datadate', 'fyearq', 'fqtr', 'datacqtr', 'datafqtr', 'atq',
       'cogsq', 'cshoq', 'dlcq', 'dlttq', 'invtq', 'ltq', 'niq', 'oiadpq',
       'oibdpq', 'opepsq', 'req', 'revtq', 'saleq', 'txtq', 'costat', 'gind',
       'gsector', 'quarter', 'pos_Executive/Board Changes - Other',
       'pos_Client Announcements', 'pos_Announcements of Earnings',
       'pos_Corporate Guidance - New/Confirmed', 'pos_Business Expansions',
       'pos_Product-Related Announcements', 'pos_Dividend Affirmations',
       'pos_Dividend Increases', 'pos_Earnings Calls',
       'pos_Company Conference Presentations', 'pos_Earnings Release Date',
       'pos_Annual General Meeting', 'pos_Ex-Div Date (Regular)',
       'pos_Board Meeting', 'pos_M&A Transaction Announcements',
       'pos_M&A Transaction Closings', 'pos_Private Placements',
       'pos_Fixed Income Offerings',
       'pos_Special/Extraordinary Shareholders Meeting', 'pos_Conferences',
       'pos_Buyback Tranche Update', 'neg_Exe

In [23]:
#adding the shifted eps (next quarter eps)
shifted = df.groupby('gvkey',as_index=False)['niq'].shift(-1)
shifted.reset_index(inplace=True)
df.reset_index(inplace=True)
df = df.merge(shifted, on='index', suffixes =['','_shifted'])
df.dropna(subset=['niq_shifted'],inplace=True)

In [24]:
#adding change_eps and direction_change cols
df['change_niq'] = df['niq_shifted'] - df['niq']
df['dir'] = df['change_niq'].apply(lambda x: 0 if x < 0 else 1)
df.drop('niq_shifted',axis=1,inplace=True)

In [25]:
#dropping columns we will not use in prediction
df.drop(['fyearq','fqtr','datacqtr','datafqtr','gind','gsector','quarter','costat'],axis=1,inplace=True)

In [26]:
#adding percent change columns (from previous year values) for our predictors
predictors = list(df.columns[3:18])
temp = df.groupby('gvkey').shift(1)[predictors].reset_index()
df = df.merge(temp, on='index', suffixes=['','_shifted'])
df.dropna(how='all',inplace=True)

for i in predictors:
    df[i+'_perc_change'] =  (df[i] - df[i+'_shifted'])/abs(df[i+'_shifted'])
    
new = []
for i in predictors:
    new.append(i + '_perc_change')
predictors = predictors + new

In [27]:
#running univariate regression and keeping only those columns whose tvalue is statisically significant
keep_cols = []
def run_unireg(col):
    temp = df[[col,'dir']]
    temp.replace([np.inf, -np.inf], np.nan, inplace=True)
    temp.dropna(inplace=True)
    X = sm.add_constant(temp[col])
    y = temp['dir']
    log_reg = sm.Logit(y,X).fit()
    if(log_reg.tvalues[1] >= 2 or log_reg.tvalues[1] <= -2):
        keep_cols.append(col)

for i in predictors:
    print(i)
    run_unireg(i)

atq
Optimization terminated successfully.
         Current function value: 0.691590
         Iterations 3
cogsq
Optimization terminated successfully.
         Current function value: 0.691563
         Iterations 3
cshoq
Optimization terminated successfully.
         Current function value: 0.691606
         Iterations 3
dlcq
Optimization terminated successfully.
         Current function value: 0.691673
         Iterations 3
dlttq
Optimization terminated successfully.
         Current function value: 0.691587
         Iterations 3
invtq
Optimization terminated successfully.
         Current function value: 0.691629
         Iterations 3
ltq
Optimization terminated successfully.
         Current function value: 0.691589
         Iterations 3
niq
Optimization terminated successfully.
         Current function value: 0.688978
         Iterations 5
oiadpq
Optimization terminated successfully.
         Current function value: 0.691386
         Iterations 4
oibdpq
Optimization terminated suc

In [28]:
ratios = pd.read_csv(PATH+'ratios.csv')
ratios.drop('divyield',axis=1,inplace=True)
final = ratios.merge(df, left_on = ['gvkey','qdate'], right_on = ['gvkey','datadate'])
final.drop_duplicates(subset=['gvkey','qdate'],inplace=True)
cols = ratios.columns[4:-1]
final_pred = predictors + list(cols)
final.head()

Unnamed: 0,gvkey,adate,qdate,public_date,bm,evm,pe_op_basic,pe_op_dil,pe_exi,pe_inc,...,invtq_perc_change,ltq_perc_change,niq_perc_change,oiadpq_perc_change,oibdpq_perc_change,opepsq_perc_change,req_perc_change,revtq_perc_change,saleq_perc_change,txtq_perc_change
0,1004,05/31/2001,08/31/2001,10/31/2001,0.859,9.923,10.556,10.556,12.881,12.881,...,,,,,,,,,,
3,1004,05/31/2001,11/30/2001,01/31/2002,1.55,11.856,18.696,18.696,-5.375,-5.375,...,-0.21975,0.027508,-113.106996,-1.031405,-0.473398,-6.0,-0.274636,-0.286236,-0.286236,-224.277778
6,1004,05/31/2001,02/28/2002,04/30/2002,1.57,14.446,70.778,70.778,-6.813,-6.813,...,0.020781,-0.125656,0.957969,1.251462,0.03791,0.2,-0.024638,-0.009883,-0.009883,0.957977
9,1004,05/31/2002,05/31/2002,07/31/2002,0.935,22.965,-36.417,-36.417,-4.202,-4.202,...,0.056187,0.064161,-0.157642,-15.093023,-0.038273,0.0,-0.032537,0.02736,0.02736,-0.158553
12,1004,05/31/2002,08/31/2002,10/31/2002,1.682,24.601,-9.537,-9.537,-1.671,-1.671,...,-0.041352,0.041054,-0.840438,-3.976898,-0.27912,-0.875,-0.026633,0.025668,0.025668,-0.492334


In [29]:
#running univariate regression and keeping only those columns whose tvalue is statisically significant
keep_cols = []
def run_unireg(col):
    temp = final[[col,'dir']]
    temp.replace([np.inf, -np.inf], np.nan, inplace=True)
    temp.dropna(inplace=True)
    X = sm.add_constant(temp[col])
    y = temp['dir']
    log_reg = sm.Logit(y,X).fit()
    if(log_reg.tvalues[1] >= 2 or log_reg.tvalues[1] <= -2):
        keep_cols.append(col)

for i in final_pred:
    print(i)
    run_unireg(i)

atq
Optimization terminated successfully.
         Current function value: 0.691504
         Iterations 3
cogsq
Optimization terminated successfully.
         Current function value: 0.691491
         Iterations 3
cshoq
Optimization terminated successfully.
         Current function value: 0.691504
         Iterations 3
dlcq
Optimization terminated successfully.
         Current function value: 0.691592
         Iterations 3
dlttq
Optimization terminated successfully.
         Current function value: 0.691513
         Iterations 3
invtq
Optimization terminated successfully.
         Current function value: 0.691535
         Iterations 3
ltq
Optimization terminated successfully.
         Current function value: 0.691502
         Iterations 3
niq
Optimization terminated successfully.
         Current function value: 0.689615
         Iterations 5
oiadpq
Optimization terminated successfully.
         Current function value: 0.691341
         Iterations 4
oibdpq
Optimization terminated suc

In [30]:
keep_cols.remove('saleq')

In [31]:
#keeping only the statistically significant columns
data = final[['dir','public_date']+keep_cols].replace([np.inf, -np.inf], np.nan).dropna()

In [32]:
X = sm.add_constant(data.iloc[:,2:])
y = data['dir']
log_reg = sm.Logit(y,X).fit()
log_reg.summary()

Optimization terminated successfully.
         Current function value: 0.675266
         Iterations 7


0,1,2,3
Dep. Variable:,dir,No. Observations:,34383.0
Model:,Logit,Df Residuals:,34350.0
Method:,MLE,Df Model:,32.0
Date:,"Tue, 03 May 2022",Pseudo R-squ.:,0.02475
Time:,01:06:20,Log-Likelihood:,-23218.0
converged:,True,LL-Null:,-23807.0
Covariance Type:,nonrobust,LLR p-value:,3.6749999999999997e-227

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,0.2745,0.062,4.419,0.000,0.153,0.396
cogsq,-3.547e-05,1.91e-05,-1.862,0.063,-7.28e-05,1.87e-06
niq,-0.0015,9.88e-05,-14.881,0.000,-0.002,-0.001
oiadpq,0.0011,0.000,9.309,0.000,0.001,0.001
oibdpq,-7.609e-05,7.07e-05,-1.076,0.282,-0.000,6.25e-05
opepsq,-0.3382,0.018,-19.096,0.000,-0.373,-0.303
revtq,3.768e-05,1.78e-05,2.116,0.034,2.78e-06,7.26e-05
invtq_perc_change,1.964e-05,7.93e-06,2.477,0.013,4.1e-06,3.52e-05
niq_perc_change,-1.367e-06,2.29e-06,-0.597,0.551,-5.86e-06,3.12e-06


In [33]:
#keeping only the fields that were significant in multivariate regression
sig = log_reg.tvalues[1:].reset_index()
sig = sig[(sig[0]>=2) | (sig[0]<=-2)]
keep= list(sig['index'].values)

In [34]:
sent_cols = df.columns[18:60]
keep += list(sent_cols)

In [35]:
final.columns

Index(['gvkey', 'adate', 'qdate', 'public_date', 'bm', 'evm', 'pe_op_basic',
       'pe_op_dil', 'pe_exi', 'pe_inc',
       ...
       'invtq_perc_change', 'ltq_perc_change', 'niq_perc_change',
       'oiadpq_perc_change', 'oibdpq_perc_change', 'opepsq_perc_change',
       'req_perc_change', 'revtq_perc_change', 'saleq_perc_change',
       'txtq_perc_change'],
      dtype='object', length=165)

In [36]:
#preparing clean final data
final = final[['cusip','dir','public_date']+keep].replace([np.inf, -np.inf], np.nan).dropna()
final.sort_values('public_date',inplace=True)
final['public_date'] = pd.to_datetime(final['public_date'])
final.reset_index(drop=True,inplace=True)
final.head()

Unnamed: 0,cusip,dir,public_date,niq,oiadpq,opepsq,revtq,invtq_perc_change,bm,ps,...,neg_Annual General Meeting,neg_Ex-Div Date (Regular),neg_Board Meeting,neg_M&A Transaction Announcements,neg_M&A Transaction Closings,neg_Private Placements,neg_Fixed Income Offerings,neg_Special/Extraordinary Shareholders Meeting,neg_Conferences,neg_Buyback Tranche Update
0,03759810,0,2002-01-31,5.844,9.93,0.21,200.293,-0.972966,0.379,0.515,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,88033G10,1,2002-01-31,89.0,527.0,0.79,3394.0,0.422034,0.299,1.605,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,05333210,0,2002-01-31,84.077,155.504,0.78,1176.052,0.13513,0.127,1.465,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,00724F10,1,2002-01-31,34.289,64.324,0.18,264.54,-1.0,0.081,6.488,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,01623010,1,2002-01-31,3.004,0.367,0.43,8.119,-0.712313,0.566,4.429,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [38]:
final.to_csv(PATH+'final.csv')

In [None]:
X = final.iloc[:,3:]
y = final.iloc[:,1]
clf = RandomForestClassifier().fit(X, y)
fi = clf.feature_importances_.sort_values(ascending = False)
figure(figsize=(10, 15), dpi=80)
plt.title('feature_importance')
plt.barh(X.columns, fi)

AttributeError: ignored

In [None]:
prices = pd.read_csv(PATH+'prices.csv')
prices['date'] = pd.to_datetime(prices['date'])
prices['PRC'] = abs(prices['PRC'])
prices.head()

In [None]:
ff = pd.read_csv(PATH+'ff_factors.csv')
ff.dropna()

In [None]:
#creating datelist to loop over while creating portfolios
datelist = pd.date_range(pd.to_datetime('2002-12-31'), periods=73, freq='Q').tolist()

#store results for different kinds of cut-offs
results = pd.DataFrame(columns = ['50-50','60-40','70-30','80-20','90-10','95-05'])

# ROC_AUC Curve & Confusion matrix

In [None]:
from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sns

def print_confusion(model, train_X, train_y, test_X, test_y, plots = True):
    '''
    Prints the confusion matrix and accuracy score for the model
    Plots the confusion matrix if plots = True
    Plots the ROC curve
    '''
    y_pred_proba = model.predict_proba(test_X)[::,1] # Predict the probability of the positive class (default)

    #create ROC curve
    fpr, tpr, _ = metrics.roc_curve(test_y,  y_pred_proba) # Calculate the ROC curve
    auc = metrics.roc_auc_score(test_y, y_pred_proba) # Calculate the AUC score for the model
    
    # # Youden’s J statistic
    # metric = tpr - fpr
    # GMean Statistic
    metric = np.sqrt(tpr*(1-fpr)) # Calculate the GMean score for the model, This helps to determine the optimal threshold

    # Find index of largest value
    ix = np.argmax(metric) # Find the index of the largest value in the metric. This is the index of the optimal threshold (from the )
    
    threshold = round(_[ix],4) # Get the threshold value for the optimal threshold
    
    y_pred = np.where(y_pred_proba > threshold, 1, 0) # Predict the class for the test data
    test_acc = (test_y == y_pred).mean() # Calculate the accuracy score for the model
    
    if (test_y == model.predict(test_X)).mean() > test_acc: # Edge case handling
        threshold = 0.5

    train_acc = (train_y == np.where(model.predict_proba(train_X)[::,1] > threshold, 1, 0)).mean() # Calculate the training accuracy score for the model
    # print('\tTraining accuracy', round(train_acc,2))
    

    # print('\tValidation accuracy', round(test_acc,2))
    

    # print('\tValidation AUC-Score', round(auc,2))
    
    cf_matrix = metrics.confusion_matrix(test_y, y_pred) # Calculate the confusion matrix

    if plots: # Plot the confusion matrix and ROC curve
        fig, ax = plt.subplots(1,2, figsize=(10, 5))

        ax[0].plot(fpr,tpr,label="AUC="+str(round(auc,4)))
        ax[0].plot([0,1], [0,1], linestyle='--', label='No Skill')
        ax[0].scatter(fpr[ix], tpr[ix], marker='o', color='black', label= f'Best at {round(threshold, 4)}')
        ax[0].legend(loc=4)
        ax[0].set_ylabel('True Positive Rate')
        ax[0].set_xlabel('False Positive Rate')
        ax[0].set_title('ROC Curve')
        # plt.show()

        group_names = ['True Neg','False Neg','False Pos','True Pos']
        group_counts = ['{0:0.0f}'.format(value) for value in cf_matrix.flatten()]
        labels = [f'{v1}\n{v2}' for v1, v2 in zip(group_names,group_counts)]
        # group_percentages = ['{0:.2%}'.format(value) for value in cf_matrix.flatten()/np.sum(cf_matrix)]
        # labels = [f'{v1}\n{v2}\n{v3}' for v1, v2, v3 in zip(group_names,group_counts,group_percentages)]
        labels = np.asarray(labels).reshape(2,2)
        
        ax[1] = sns.heatmap(cf_matrix, annot= labels, fmt= '', cmap='Blues')
        ax[1].set_xlabel('Actual')
        ax[1].set_ylabel('Predicted')
        ax[1].set_xticklabels(['No Default','Default'])
        ax[1].set_yticklabels(['No Default','Default'])
        ax[1].set_title(f'Confusion Matrix @ threshold = {round(threshold,2)}')
        plt.show()
        
    return {'train_acc': train_acc, 
            'test_acc': test_acc, 
            'threshold': threshold, 
            'auc': auc
        }


# Use it as: 
# 
# model_result = print_confusion(model = base_model, 
#                               train_X = X_train, train_y = y_train, 
#                               test_X = X_val, test_y = y_val, 
#                               plots = True)


# Long Short Portfolio

In [None]:
from sklearn.model_selection import GridSearchCV

def grid_search_cv(X_train: pd.DataFrame, y_train: pd.DataFrame, base: object,parameters: dict, cv = 3)-> GridSearchCV:
    '''
    Returns a grid search object with the specified parameters
    '''
    model = GridSearchCV(base, parameters, n_jobs = -1, cv = cv, verbose = 5)
    model.fit(X= X_train, y = y_train)
    return model

In [None]:
returns = pd.DataFrame()
def long_only(ub, lb):
    long = 1000000
    port_ret = []
    metric_list = []
    auc_list = []
    accuracy_list = []
    insample_accuracy_list = []
    
    for i in datelist:
        training = final[(final['public_date'] <= i) & (final['public_date'] >= i - datetime.timedelta(365))]
        predicting = final[(final['public_date'] <= i + datetime.timedelta(90)) & (final['public_date'] > i)]
        price = prices[(prices['date'] > i) & (prices['date'] <= i + datetime.timedelta(90))]

        X = training.iloc[:,3:]
        y = training['dir']
        clf = RandomForestClassifier(criterion = 'gini')

        grid_clf = grid_search_cv(X_train= X, y_train = y, 
                                    base = clf, 
                                    parameters= {
                                                    "n_estimators": [50, 100, 150],
                                                })
        clf = grid_clf.best_estimator_

        model_result = print_confusion(model = clf, 
                              train_X = X, train_y = y, 
                              test_X = predicting.iloc[:,3:], test_y = predicting['dir'], 
                              plots = False)
        auc_list.append(model_result['auc'])
        accuracy_list.append(model_result['test_acc'])
        insample_accuracy_list.append(model_result['train_acc'])

        temp = pd.DataFrame()
        temp['cusip'] = predicting['cusip']
        temp['Actual'] =  predicting['dir']
        temp[['Predicted_0','Predicted_1']] = clf.predict_proba(predicting.iloc[:,3:])
        temp = temp.merge(price, left_on =['cusip'],right_on= ['CUSIP'])

        initial = temp.groupby('cusip').first().reset_index()
        initial['inv'] = initial['Predicted_1'].apply(lambda x : True if x >= ub else np.nan)
        initial.dropna(inplace=True)
        if(len(initial[initial['inv'] == True])==0):
            port_ret.append(0)
            continue
        initial['shares'] = long/len(initial)
        initial['val'] = initial['PRC']*initial['shares']

        end = temp.groupby('cusip').last().reset_index()
        end = end.merge(initial, on='cusip')
        end['val_final'] = end['shares'] * end['PRC_x']
        end.dropna(subset=['val_final'],inplace=True)
        
        port_val_ini = sum(initial['val'])
        port_val_fin = sum(end['val_final'])
        
        ret = (port_val_fin - port_val_ini)/port_val_ini
        port_ret.append(round(ret,2))
    
    auc = np.round(np.mean(auc_list),4)
    print(f'Threshold: {ub}-{lb}')
    print('\tAverage AUC: ',auc)
    acc = np.round(np.mean(accuracy_list),4)
    print('\tAverage Validation Accuracy: ',acc)
    insample_acc = np.round(np.mean(insample_accuracy_list),4)
    print('\tAverage Train Accuracy: ',insample_acc)
    return port_ret

In [None]:
returns_long = pd.DataFrame()
returns_long['Dates'] = datelist
returns_long['50'] = long_only(0.5,0.5)
# returns_long['60'] = long_only(0.6,0.4)
returns_long['70'] = long_only(0.7,0.3)
# returns_long['80'] = long_only(0.8,0.2)
returns_long['90'] = long_only(0.9,0.1)
returns_long['95'] = long_only(0.95,0.05)

Fitting 3 folds for each of 3 candidates, totalling 9 fits
Fitting 3 folds for each of 3 candidates, totalling 9 fits
Fitting 3 folds for each of 3 candidates, totalling 9 fits
Fitting 3 folds for each of 3 candidates, totalling 9 fits
Fitting 3 folds for each of 3 candidates, totalling 9 fits
Fitting 3 folds for each of 3 candidates, totalling 9 fits
Fitting 3 folds for each of 3 candidates, totalling 9 fits
Fitting 3 folds for each of 3 candidates, totalling 9 fits
Fitting 3 folds for each of 3 candidates, totalling 9 fits
Fitting 3 folds for each of 3 candidates, totalling 9 fits
Fitting 3 folds for each of 3 candidates, totalling 9 fits
Fitting 3 folds for each of 3 candidates, totalling 9 fits
Fitting 3 folds for each of 3 candidates, totalling 9 fits
Fitting 3 folds for each of 3 candidates, totalling 9 fits
Fitting 3 folds for each of 3 candidates, totalling 9 fits
Fitting 3 folds for each of 3 candidates, totalling 9 fits
Fitting 3 folds for each of 3 candidates, totalling 9 fi

In [None]:
cum_ret_long = returns_long.iloc[:,1:]+1
cum_ret_long = cum_ret_long.cumprod()
cum_ret_long.plot()

In [None]:
returns_long['Dates'] = returns_long['Dates'].astype('str')
returns_long['Dates_M'] =returns_long['Dates'].apply(lambda x: x[0:4] + x[5:7])
returns_long['Dates_M'] = pd.to_numeric(returns_long['Dates_M'])
merged_long = returns_long.merge(ff, left_on=['Dates_M'],right_on=['Date'])

In [None]:
X = sm.add_constant(merged_long.iloc[:,9:])
y = merged_long['60']
reg = sm.OLS(y,X, fit_intercept=True).fit()
reg.summary()

In [None]:
returns = pd.DataFrame()
def long_short(ub, lb):
    port_ret = []
    long = 1000000
    short = 1000000
    auc_list = []
    accuracy_list = []
    
    for i in datelist:
        training = final[(final['public_date'] <= i) & (final['public_date'] >= i - datetime.timedelta(365))]
        predicting = final[(final['public_date'] <= i + datetime.timedelta(90)) & (final['public_date'] > i)]
        price = prices[(prices['date'] > i) & (prices['date'] <= i + datetime.timedelta(90))]

        X = training.iloc[:,3:]
        y = training['dir']
        clf = RandomForestClassifier(criterion = 'gini')

        grid_clf = grid_search_cv(X_train= X, y_train = y, 
                                    base = clf, 
                                    parameters= {
                                                    "n_estimators": [50, 100, 150],
                                                })
        clf = grid_clf.best_estimator_

        model_result = print_confusion(model = clf, 
                              train_X = training.iloc[:,3:], train_y = training['dir'], 
                              test_X = predicting.iloc[:,3:], test_y = predicting['dir'], 
                              plots = False)
        auc_list.append(model_result['auc'])
        accuracy_list.append(model_result['test_acc'])

        temp = pd.DataFrame()
        temp['cusip'] = predicting['cusip']
        temp['Actual'] =  predicting['dir']
        temp[['Predicted_0','Predicted_1']] = clf.predict_proba(predicting.iloc[:,3:])
        temp = temp.merge(price, left_on =['cusip'],right_on= ['CUSIP'])

        initial = temp.groupby('cusip').first().reset_index()
        initial['inv'] = initial['Predicted_1'].apply(lambda x : True if x >= ub else (False if x <= lb else np.nan))
        initial.dropna(inplace=True)
        if(len(initial[initial['inv'] == True])==0 or len(initial[initial['inv'] == False])==0):
            port_ret.append(0)
            continue
        long_stocks = long/len(initial[initial['inv'] == True])
        short_stocks = short/len(initial[initial['inv'] == False])
        initial['val'] = initial['inv'].apply(lambda x: long_stocks if x else short_stocks)
        initial['shares'] = initial['val']/initial['PRC']

        end = temp.groupby('cusip').last().reset_index()
        end = end.merge(initial, on='cusip')
        end['val_final'] = end['shares'] * end['PRC_x']
        end.dropna(subset=['val_final'],inplace=True)
        port_val = abs(sum(end['val_final']))
        port_val_initial = abs(sum(initial['val']))
        ret = (port_val - port_val_initial)/port_val_initial
        port_ret.append(round(ret,2))
    
    auc = np.round(np.mean(auc_list),4)
    print(f'Threshold: {ub}-{lb}')
    print('\tAverage AUC: ',auc)
    acc = np.round(np.mean(accuracy_list),4)
    print('\tAverage Validation Accuracy: ',acc)
    return port_ret

In [None]:
returns = pd.DataFrame()
returns['Dates'] = datelist
returns['50'] = long_short(0.5,0.5)
returns['60'] = long_short(0.6,0.4)
returns['70'] = long_short(0.7,0.3)
returns['80'] = long_short(0.8,0.2)
returns['90'] = long_short(0.9,0.5)
returns['95'] = long_short(0.95,0.05)

In [None]:
cum_ret = (returns.iloc[:,1:5]) + 1
cum_ret = cum_ret.cumprod()
cum_ret.plot()

In [None]:
returns['Dates'] = returns['Dates'].astype('str')
returns['Dates_M'] = returns['Dates'].apply(lambda x: x[0:4] + x[5:7])
returns['Dates_M'] = pd.to_numeric(returns['Dates_M'])


In [None]:
merged = returns.merge(ff, left_on=['Dates_M'],right_on=['Date'])
X = sm.add_constant(merged.iloc[:,9:])
y = merged['60']
reg = sm.OLS(y,X, fit_intercept=True).fit()
reg.summary()

***