In [1]:
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score

In [2]:
# Tickers to evaluate; each one is spliced into the pickle file paths when the
# per-symbol data is loaded.
symbols = [
    'aapl',
    'btc',
    'eth',
    'spx',
]

In [3]:
# Accumulator for the per-symbol frames that are later concatenated.
data_agg = list()

In [4]:
scores = []  # metric rows; filled in by the scoring cell
# Start from an empty list on every run so that re-executing this cell cannot
# append each symbol's frame a second time and double-count rows downstream.
data_agg = []

# Lower bound of the evaluation window: 2018-07-01 04:00 UTC expressed in
# New York time. It does not depend on the symbol, so it is built once.
window_start = pd.to_datetime('2018-07-01 04:00:00+00:00').tz_convert('America/New_York')

# Columns that must end up as clean 0/1 integers for the metric functions.
label_cols = ['sentiment_declared', 'sentiment_on', 'sentiment_vader', 'sentiment_bert_unb']

for symbol in symbols:
    # NOTE(review): loading a pickle can execute arbitrary code; only read trusted files.
    data = pd.read_pickle('data/stocktwits/datasets_clean/' + symbol + '_twits_012018_072019_clean.pickle')
    on_sent = pd.read_pickle('data/stocktwits/on_sentiment/on_' + symbol + '_sentiment.pickle')
    vader_sent = pd.read_pickle('data/stocktwits/vader_sentiment/vader_' + symbol + '_sentiment.pickle')
    bert_unb_sent = pd.read_pickle('data/stocktwits/bert_unb_sentiment/bert_unb_' + symbol + '_sentiment.pickle')

    # Encode the declared label as 0/1. Series.map sends any unexpected label
    # to NaN, so such rows are removed by dropna() below instead of leaking
    # strings into the metrics.
    data['sentiment_declared'] = data['sentiment_declared'].map({'Bearish': 0, 'Bullish': 1})

    # Column assignment aligns on the index: this assumes every sentiment
    # pickle shares the twit frame's index (TODO confirm).
    # NOTE(review): a NaN score compares False against 0 and is therefore
    # labelled 0 (negative) rather than dropped; confirm the inputs have no NaN.
    data['sentiment_on'] = (on_sent.sentiment >= 0) + 0
    data['sentiment_vader'] = (vader_sent.score >= 0) + 0
    data['sentiment_bert_unb'] = bert_unb_sent.pred_label.map({'Negative': 0, 'Positive': 1})

    # Drop incomplete rows, then pin the label columns to int so their dtype
    # does not depend on pandas' implicit downcasting rules.
    data = data.dropna().astype({col: int for col in label_cols})

    # Index by posting time in New York time and keep only the evaluation window.
    data.index = pd.to_datetime(data.created_at)
    data = data.tz_convert('America/New_York')
    data = data[data.index > window_start]
    data_agg.append(data)
                               

In [5]:
# Stack the per-symbol frames row-wise into one pooled evaluation set.
data = pd.concat(data_agg)

In [6]:
# Share of rows whose declared label is 1 (Bullish). Because the labels are
# 0/1, this is also the accuracy an always-Bullish predictor would achieve,
# which makes it the baseline to read the accuracies against.
pos_ratio = round(data.sentiment_declared.sum() / len(data), 3)

# Accuracy of each predictor against the user-declared label.
on_acc = accuracy_score(data.sentiment_declared, data.sentiment_on)
vader_acc = accuracy_score(data.sentiment_declared, data.sentiment_vader)
bert_unb_acc = accuracy_score(data.sentiment_declared, data.sentiment_bert_unb)

# F1 of each predictor (sklearn defaults: binary average, positive label 1).
on_f1 = f1_score(data.sentiment_declared, data.sentiment_on)
vader_f1 = f1_score(data.sentiment_declared, data.sentiment_vader)
bert_unb_f1 = f1_score(data.sentiment_declared, data.sentiment_bert_unb)

# Rebind rather than append: appending made this cell non-idempotent, so a
# second run left two rows in `scores` and broke the single-label index used
# when the summary DataFrame is built from it.
scores = [[len(data), pos_ratio, on_acc, vader_acc, bert_unb_acc, on_f1, vader_f1, bert_unb_f1]]

In [7]:
# Show the raw metric row: [row count, pos_ratio, accuracies (on, vader,
# bert_unb), F1 scores (on, vader, bert_unb)].
scores

[[454401,
  0.714,
  0.623062009106494,
  0.6825623183047572,
  0.8370470135409033,
  0.7206663002689273,
  0.7879793629562125,
  0.8930993760304449]]

In [8]:
# Wrap the metric row in a labelled table; the single row is indexed 'symbols'.
scores_df = pd.DataFrame(
    data=scores,
    index=['symbols'],
    columns=[
        'data_size', 'pos_ratio',
        'on_acc', 'vader_acc', 'bert_unb_acc',
        'on_f1', 'vader_f1', 'bert_unb_f1',
    ],
)

In [10]:
# Display the full summary table at default precision.
scores_df

Unnamed: 0,data_size,pos_ratio,on_acc,vader_acc,bert_unb_acc,on_f1,vader_f1,bert_unb_f1
symbols,454401,0.714,0.623062,0.682562,0.837047,0.720666,0.787979,0.893099


In [11]:
# Compact view: accuracies first, then F1 scores, rounded to three decimals.
scores_df.loc[
    :,
    ['vader_acc', 'on_acc', 'bert_unb_acc', 'vader_f1', 'on_f1', 'bert_unb_f1'],
].round(3)

Unnamed: 0,vader_acc,on_acc,bert_unb_acc,vader_f1,on_f1,bert_unb_f1
symbols,0.683,0.623,0.837,0.788,0.721,0.893
