In [16]:
import pandas as pd
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from config import * 
from data_cleaning_util import prepare_earnings_data
from baseline_models import call_baseline_model
from finbert_models_utils import call_model, call_model_fin, meanpooling_withsvm, meanpooling_withsvm_fin

In [2]:
# Build the earnings dataset with prepare_earnings_data (imported from
# data_cleaning_util) and keep the result as raw_data.
# NOTE(review): no later cell visible here reads raw_data — confirm whether it
# is still needed or whether the model helpers load their own data.
raw_data = prepare_earnings_data()

In [5]:
# Train and evaluate the two attention-based architectures on the 1-day return
# target, then tabulate test AUC, its confidence interval, the standard error,
# and the test loss for each of them.

# AttnMLPPoolClassifier via call_model (labelled "Transcript Only" in the
# combined table further down).
(
    model_bert,
    test_loss_bert,
    test_auc_bert,
    test_auc_ci_bert,
    test_se_bert,
) = call_model(
    Model="AttnMLPPoolClassifier",
    dim=768,
    attn_hidden=256,
    hidden=256,
    dropout=0.2,
    return_period=1,
)

# AttnPoolTwoTower via call_model_fin (labelled "Transcript + Finance" in the
# combined table further down); fin_dim is forwarded as 4.
(
    model_fin,
    test_loss_fin,
    test_auc_fin,
    test_auc_ci_fin,
    test_se_fin,
) = call_model_fin(
    Model="AttnPoolTwoTower",
    dim=768,
    fin_dim=4,
    hidden=256,
    dropout=0.2,
    return_period=1,
)

# Side-by-side comparison, one row per architecture.
# NOTE: the name results_df is rebound in a later cell to the return value of
# meanpooling_withsvm, so this frame is only available until that cell runs.
results_df = pd.DataFrame(
    data={
        'Model': ['AttnMLPPoolClassifier', 'AttnPoolTwoTower'],
        'Test AUC': [test_auc_bert, test_auc_fin],
        'Test AUC CI': [test_auc_ci_bert, test_auc_ci_fin],
        'Test SE': [test_se_bert, test_se_fin],
        'Test Loss': [test_loss_bert, test_loss_fin],
    }
)

print(results_df)

epoch 01 | train_loss=0.6902 | val_loss=0.6912 | val_auc=0.464
epoch 02 | train_loss=0.6849 | val_loss=0.6905 | val_auc=0.479
epoch 03 | train_loss=0.6904 | val_loss=0.6951 | val_auc=0.483
epoch 04 | train_loss=0.7068 | val_loss=0.6989 | val_auc=0.495
epoch 05 | train_loss=0.7083 | val_loss=0.6979 | val_auc=0.489
epoch 06 | train_loss=0.7732 | val_loss=0.7258 | val_auc=0.515
epoch 07 | train_loss=0.7786 | val_loss=0.7060 | val_auc=0.498
epoch 08 | train_loss=0.6663 | val_loss=0.7175 | val_auc=0.490
epoch 09 | train_loss=0.7424 | val_loss=0.6997 | val_auc=0.499
epoch 10 | train_loss=0.7132 | val_loss=0.7249 | val_auc=0.499
epoch 11 | train_loss=0.7079 | val_loss=0.7175 | val_auc=0.493
epoch 12 | train_loss=0.6931 | val_loss=0.7247 | val_auc=0.497
epoch 13 | train_loss=0.6979 | val_loss=0.6986 | val_auc=0.509
Early stopping on AUC.
epoch 01 | train_loss=0.5801 | val_loss=0.7301 | val_auc=0.507
epoch 02 | train_loss=0.7209 | val_loss=0.7144 | val_auc=0.487
epoch 03 | train_loss=0.6549 | v

In [3]:
# Train and evaluate the MeanPoolClassifier on the 1-day return target.
# NOTE(review): attn_hidden is forwarded exactly as for the attention model;
# whether MeanPoolClassifier actually uses it is not visible here — confirm.
(
    model_bert_mean,
    test_loss_bert_mean,
    test_auc_bert_mean,
    test_auc_ci_bert_mean,
    test_se_bert_mean,
) = call_model(
    Model="MeanPoolClassifier",
    dim=768,
    attn_hidden=256,
    hidden=256,
    dropout=0.2,
    return_period=1,
)

epoch 01 | train_loss=0.7147 | val_loss=0.6922 | val_auc=0.459
epoch 02 | train_loss=0.7006 | val_loss=0.7138 | val_auc=0.458
epoch 03 | train_loss=0.6848 | val_loss=0.6942 | val_auc=0.438
epoch 04 | train_loss=0.7024 | val_loss=0.7241 | val_auc=0.442
epoch 05 | train_loss=0.6873 | val_loss=0.7000 | val_auc=0.436
epoch 06 | train_loss=0.6747 | val_loss=0.7048 | val_auc=0.438
epoch 07 | train_loss=0.6792 | val_loss=0.7038 | val_auc=0.434
epoch 08 | train_loss=0.7765 | val_loss=0.7078 | val_auc=0.440
Early stopping on AUC.


In [6]:
# Run the two SVM helpers on the 1-day return target. These are the runs the
# combined table labels "SVM (Transcript Only)" and "SVM (Transcript + Finance)";
# presumably they fit an SVM on mean-pooled embeddings, per the helper names.
# NOTE: despite the `_df` suffix, both results print as plain dicts holding
# 'best_C', 'val_auc', 'test_auc', 'test_auc_ci', 'test_se' and 'test_scores'.
results_df = meanpooling_withsvm(return_period=1)
results_fin_df = meanpooling_withsvm_fin(return_period=1)


def _scalar_summary(result):
    """Return ``result`` without its bulky per-sample ``test_scores`` array.

    Args:
        result: Mapping of metric names to values, as returned by the SVM
            helpers called above.

    Returns:
        A new dict with every entry of ``result`` except ``'test_scores'``.
        The input mapping is left untouched.
    """
    return {key: value for key, value in result.items() if key != 'test_scores'}


# Print only the summary metrics: dumping the raw score arrays floods the cell
# output and buries the numbers that matter. The full results stay available
# in results_df / results_fin_df for the cells that follow.
print("Mean Pooling BERT Results:")
print(_scalar_summary(results_df))
print("Mean Pooling FinBERT Results:")
print(_scalar_summary(results_fin_df))

Mean Pooling BERT Results:
{'best_C': 100.0, 'val_auc': 0.5285116653258246, 'test_auc': 0.4227951535643843, 'test_auc_ci': (0.3504740017236426, 0.49157225752970435), 'test_se': 0.03501538014700204, 'test_scores': array([ 1.85242288e-01,  5.46275914e-01, -2.04438124e+00, -5.60165946e-01,
        6.65023452e-01,  1.68731220e+00,  1.51480309e+00, -4.56299742e-01,
       -4.60878499e-01,  1.97965702e+00,  2.68604999e-01,  1.98934218e+00,
       -3.99626855e-01, -6.05492267e-01,  1.28565378e+00, -1.27469145e+00,
        2.80091462e-01,  3.85151325e-01,  7.28861611e-01,  7.53499570e-01,
       -1.38206989e+00,  5.41685708e-01, -1.16283399e-01, -2.44961532e-01,
       -2.18264855e-01,  9.73082470e-01,  4.68819847e-01,  4.22195353e-02,
       -6.21712177e-02,  2.29052728e+00, -5.25411531e-01,  3.54136871e-01,
       -1.58598325e+00,  8.16441454e-02,  1.26729328e-01,  1.91081069e-01,
       -1.63572880e-01,  2.57459619e+00,  8.51828520e-01,  1.79600280e-01,
       -1.51171973e+00,  1.69273925e+

In [8]:
# Echo the value returned by meanpooling_withsvm_fin as this cell's output.
# NOTE(review): the echoed value includes the full 'test_scores' array, so the
# rendered output is long.
results_fin_df

{'best_C': 100.0,
 'val_auc': 0.5289782783588094,
 'test_auc': 0.4199304968535738,
 'test_auc_ci': (0.36079787738083957, 0.48111173603299584),
 'test_se': 0.03216895364898068,
 'test_scores': array([-9.17548905e-01,  2.09834210e+00,  2.51790815e-01, -1.92579807e+00,
         2.59340154e-01, -1.11584156e+00, -6.04852946e-02,  1.19164409e+00,
         2.08072017e+00,  8.93984622e-01,  2.18479925e-01,  9.58132386e-01,
         1.84147346e-01,  1.41922512e-01,  1.79528995e+00, -1.22489033e+00,
         1.28473899e-01,  7.16776956e-01, -1.12311546e+00,  8.33200483e-01,
        -1.18333409e+00, -7.96346945e-02,  2.45220854e-01, -8.13482633e-01,
         2.58050470e-01, -6.18586779e-01, -1.10036730e-01, -1.37084407e+00,
        -3.51863278e-01, -4.43978632e-01,  8.76231198e-01, -4.61488950e-01,
         7.64218807e-01, -7.71048366e-01,  1.63053833e+00,  7.86932506e-01,
         2.14795893e-01,  2.54595279e+00,  2.64468376e+00,  1.57535144e+00,
         8.79038561e-01,  2.56875485e-01, -1.2313

In [14]:
# Gather the test metrics of every model evaluated above into one table:
# the three call_model / call_model_fin runs first, then the two SVM runs.
svm_runs = (results_df, results_fin_df)

combined_results = pd.DataFrame(
    data={
        'Model': [
            'AttnMLPPoolClassifier (Transcript Only)',
            'AttnPoolTwoTower (Transcript + Finance)',
            'MeanPoolClassifier (Transcript Only)',
            'SVM (Transcript Only)',
            'SVM (Transcript + Finance)',
        ],
        # Each metric column lists the three neural runs, followed by the same
        # metric looked up in each SVM result.
        'Test AUC': [test_auc_bert, test_auc_fin, test_auc_bert_mean]
        + [run['test_auc'] for run in svm_runs],
        'Test AUC CI': [test_auc_ci_bert, test_auc_ci_fin, test_auc_ci_bert_mean]
        + [run['test_auc_ci'] for run in svm_runs],
        'Test SE': [test_se_bert, test_se_fin, test_se_bert_mean]
        + [run['test_se'] for run in svm_runs],
    }
)

print(combined_results)

                                     Model  Test AUC  \
0  AttnMLPPoolClassifier (Transcript Only)  0.507702   
1  AttnPoolTwoTower (Transcript + Finance)  0.438903   
2     MeanPoolClassifier (Transcript Only)  0.471213   
3                    SVM (Transcript Only)  0.422795   
4               SVM (Transcript + Finance)  0.419930   

                                  Test AUC CI   Test SE  
0    (0.4395588518589867, 0.5732286517398468)  0.034867  
1    (0.3748612652608213, 0.5106609808102345)  0.033948  
2    (0.4056058562897322, 0.5409186401833461)  0.034629  
3   (0.3504740017236426, 0.49157225752970435)  0.035015  
4  (0.36079787738083957, 0.48111173603299584)  0.032169  


In [15]:
# Show the combined summary table as this cell's output.
combined_results

Unnamed: 0,Model,Test AUC,Test AUC CI,Test SE
0,AttnMLPPoolClassifier (Transcript Only),0.507702,"(0.4395588518589867, 0.5732286517398468)",0.034867
1,AttnPoolTwoTower (Transcript + Finance),0.438903,"(0.3748612652608213, 0.5106609808102345)",0.033948
2,MeanPoolClassifier (Transcript Only),0.471213,"(0.4056058562897322, 0.5409186401833461)",0.034629
3,SVM (Transcript Only),0.422795,"(0.3504740017236426, 0.49157225752970435)",0.035015
4,SVM (Transcript + Finance),0.41993,"(0.36079787738083957, 0.48111173603299584)",0.032169
