In [1]:
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score

In [2]:
# Ticker symbols to evaluate; each one is interpolated into the pickle file
# names read by the scoring loop and is used as a row label of the results.
symbols = [
    'aapl',
    'btc',
    'eth',
    'spx',
]

In [3]:
# Score each sentiment model against the user-declared label, per symbol.
#
# Every row appended to `scores` has the layout
#   [n_rows, pos_ratio, on_acc, vader_acc, bert_unb_acc, on_f1, vader_f1, bert_unb_f1]
# and rows are appended in the order of `symbols`.
#
# NOTE(review): pd.read_pickle can execute arbitrary code while unpickling;
# only point these paths at files you produced yourself.

# Only rows strictly later than this instant are evaluated. 04:00 UTC is
# 00:00 in New York on 2018-07-01 (daylight-saving time). Hoisted out of the
# loop because it does not depend on the symbol.
cutoff = pd.to_datetime('2018-07-01 04:00:00+00:00').tz_convert('America/New_York')

scores = []
for symbol in symbols:
    data = pd.read_pickle(f'data/stocktwits/datasets_clean/{symbol}_twits_012018_072019_clean.pickle')
    on_sent = pd.read_pickle(f'data/stocktwits/on_sentiment/on_{symbol}_sentiment.pickle')
    vader_sent = pd.read_pickle(f'data/stocktwits/vader_sentiment/vader_{symbol}_sentiment.pickle')
    bert_unb_sent = pd.read_pickle(f'data/stocktwits/bert_unb_sentiment/bert_unb_{symbol}_sentiment.pickle')
    # The 'bert_sentiment/bert_<symbol>_sentiment.pickle' file used to be
    # loaded here as well, but nothing ever read it, so the load was dropped.

    # Encode the declared label as 0 (Bearish) / 1 (Bullish).
    data = data.replace({'sentiment_declared': {'Bearish': 0, 'Bullish': 1}})

    # Binarise the continuous scores: >= 0 -> 1, otherwise 0.
    # NOTE(review): a NaN score compares False and therefore becomes 0 rather
    # than being dropped -- confirm that is intended.
    # The assignments align on the index; assumes the prediction frames share
    # the index of `data` -- TODO confirm.
    data['sentiment_on'] = (on_sent.sentiment >= 0).astype(int)
    data['sentiment_vader'] = (vader_sent.score >= 0).astype(int)
    data['sentiment_bert_unb'] = bert_unb_sent.replace(
        {'pred_label': {'Negative': 0, 'Positive': 1}}
    ).pred_label

    # Drop rows with any missing value, then force the label to an integer
    # dtype. Recent pandas versions deprecate the implicit downcast that
    # `replace` used to perform, which can leave this column as object dtype;
    # scikit-learn does not accept object-dtype numeric targets.
    data = data.dropna().astype({'sentiment_declared': int})

    # Index by creation time, expressed in New York time, and apply the cutoff.
    data.index = pd.to_datetime(data.created_at)
    data = data.tz_convert('America/New_York')
    data = data[data.index > cutoff]

    # Share of rows declared Bullish.
    pos_ratio = round(data.sentiment_declared.sum() / len(data), 3)

    on_acc = accuracy_score(data.sentiment_declared, data.sentiment_on)
    vader_acc = accuracy_score(data.sentiment_declared, data.sentiment_vader)
    bert_unb_acc = accuracy_score(data.sentiment_declared, data.sentiment_bert_unb)

    on_f1 = f1_score(data.sentiment_declared, data.sentiment_on)
    vader_f1 = f1_score(data.sentiment_declared, data.sentiment_vader)
    bert_unb_f1 = f1_score(data.sentiment_declared, data.sentiment_bert_unb)

    scores.append([len(data), pos_ratio, on_acc, vader_acc, bert_unb_acc, on_f1, vader_f1, bert_unb_f1])

In [8]:
# Tabulate the collected metrics: one row per symbol, one column per metric.
# The column order must match the order of the values appended to `scores`.
scores_df = pd.DataFrame(
    data=scores,
    index=symbols,
    columns=[
        'data_size', 'pos_ratio',
        'on_acc', 'vader_acc', 'bert_unb_acc',
        'on_f1', 'vader_f1', 'bert_unb_f1',
    ],
)

In [14]:
# Display accuracy and F1 for each model, with rows reordered to
# btc, eth, aapl, spx and every value rounded to three decimals.
(
    scores_df
    .reindex(['btc', 'eth', 'aapl', 'spx'])
    .loc[:, ['vader_acc', 'on_acc', 'bert_unb_acc', 'vader_f1', 'on_f1', 'bert_unb_f1']]
    .round(3)
)

Unnamed: 0,vader_acc,on_acc,bert_unb_acc,vader_f1,on_f1,bert_unb_f1
btc,0.704,0.643,0.855,0.809,0.749,0.91
eth,0.744,0.686,0.891,0.841,0.784,0.932
aapl,0.651,0.588,0.813,0.755,0.671,0.869
spx,0.589,0.562,0.74,0.669,0.604,0.784
