In [35]:
import pandas as pd
import numpy as np
from transformers import BertTokenizer, BertModel, RobertaTokenizer, RobertaModel, AutoTokenizer, AutoModel, pipeline
import torch
from tqdm import tqdm
import re
import math
tqdm.pandas()

In [11]:
# Load the data from data.xlsx
data = pd.read_excel('final_data.xlsx')

# getting the 'headline' column
headlines = data['headline']

In [12]:
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')

roberta_tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
roberta_model = RobertaModel.from_pretrained('roberta-base')

auto_tokenizer = AutoTokenizer.from_pretrained("chrommium/bert-base-multilingual-cased-finetuned-news-headlines")
auto_model = AutoModel.from_pretrained("chrommium/bert-base-multilingual-cased-finetuned-news-headlines")

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
device = torch.device('mps')
bert_model = bert_model.to(device)
model_roberta = roberta_model.to(device)
auto_model = auto_model.to(device)

In [14]:
# setting embedding for empty headlines
empty_bert_embedding = np.zeros((bert_model.config.hidden_size,))
empty_roberta_embedding = np.zeros((roberta_model.config.hidden_size,))
empty_auto_embedding = np.zeros((auto_model.config.hidden_size,))

In [38]:
def get_embeddings(text, tokenizer, model, empty_embedding):
    if type(text) == float and math.isnan(text):
        return empty_embedding
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=50).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).cpu().numpy()

In [41]:
bert_embeddings = headlines.progress_apply(lambda x: get_embeddings(x, bert_tokenizer, bert_model, empty_bert_embedding))
bert_embeddings_reshaped = np.vstack(bert_embeddings.values).reshape(bert_embeddings.shape[0], -1)
bert_df = pd.DataFrame(bert_embeddings_reshaped, columns=[f'bert_{i}' for i in range(bert_embeddings_reshaped.shape[1])])

roberta_embeddings = headlines.progress_apply(lambda x: get_embeddings(x, roberta_tokenizer, model_roberta, empty_roberta_embedding))
roberta_embeddings_reshaped = np.vstack(roberta_embeddings.values).reshape(roberta_embeddings.shape[0], -1)
roberta_df = pd.DataFrame(roberta_embeddings_reshaped, columns=[f'roberta_{i}' for i in range(roberta_embeddings_reshaped.shape[1])])

auto_embeddings = headlines.progress_apply(lambda x: get_embeddings(x, auto_tokenizer, auto_model, empty_auto_embedding))
auto_embeddings_reshaped = np.vstack(auto_embeddings.values).reshape(auto_embeddings.shape[0], -1)
auto_df = pd.DataFrame(auto_embeddings_reshaped, columns=[f'auto_{i}' for i in range(auto_embeddings_reshaped.shape[1])])

100%|██████████| 541823/541823 [24:59<00:00, 361.38it/s]  
100%|██████████| 541823/541823 [31:55<00:00, 282.80it/s]  
100%|██████████| 541823/541823 [28:52<00:00, 312.79it/s]  


In [42]:
bert_df.to_csv('final_bert_df.csv', index=False)
roberta_df.to_csv('final_roberta_df.csv', index=False)
auto_df.to_csv('final_auto_df.csv', index=False)

In [43]:
bert_df_read = pd.read_csv('bert_df.csv')
roberta_df_read = pd.read_csv('roberta_df.csv')
auto_df_read = pd.read_csv('auto_df.csv')

In [45]:
# Sentiment analysis pipeline
sentiment_pipeline = pipeline("sentiment-analysis", device=device)


def get_sentiment_score(text):
    if type(text) == float and math.isnan(text):
        return np.nan, np.nan
    result = sentiment_pipeline(text)[0]
    return result['label'], result['score']

tqdm.pandas()
data[['sentiment_label', 'sentiment_score']] = data['headline'].progress_apply(lambda x: pd.Series(get_sentiment_score(x)))

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
100%|██████████| 541823/541823 [20:19<00:00, 444.30it/s]  


In [46]:
bert_df_read['combined'] =  bert_df_read.apply(lambda row: row.values.tolist(), axis=1)
roberta_df_read['combined'] =  roberta_df_read.apply(lambda row: row.values.tolist(), axis=1)
auto_df_read['combined'] =  auto_df_read.apply(lambda row: row.values.tolist(), axis=1)

In [49]:
data['bert'] = bert_df_read['combined']
data['roberta'] = roberta_df_read['combined']
data['auto'] = auto_df_read['combined']

In [50]:
data.to_csv('final_data_with_features.csv', index=False)

In [55]:
def parse_combined_column(s):
    return eval(s)

df_read = pd.read_csv('data_with_features.csv', converters={'bert': parse_combined_column, 'auto': parse_combined_column, 'roberta': parse_combined_column})

In [56]:
df_read.head(10)

Unnamed: 0,headline,date,stock,adjusted_datetime,opening_price,closing_price,bert,roberta,auto,sentiment_label,sentiment_score
0,Sprucegrove Investment Management Ltd Buys Rya...,2020-05-15,ABEV,2020-05-15,2.02,2.02,"[0.2559071, 0.10364858, 0.37824497, -0.0081513...","[-0.008698055, 0.047732446, -0.02077271, 0.043...","[-0.44617787, -1.0307866, 0.65105325, 0.131997...",NEGATIVE,0.809008
1,"Westwood Global Investments, LLC Buys Ambev SA...",2020-05-14,ABEV,2020-05-14,1.97,2.04,"[0.1890944, 0.014032838, 0.13237679, 0.0806400...","[0.012474053, 0.086053416, 0.037017934, 0.0866...","[-0.4658743, -1.1817627, 0.5621838, 0.36915895...",POSITIVE,0.876715
2,"First Eagle Investment Management, LLC Buys Am...",2020-05-12,ABEV,2020-05-12,2.11,2.02,"[0.086543754, 0.10937805, 0.28820986, 0.246597...","[0.018468218, 0.10328719, 0.036899947, 0.08108...","[-0.39889896, -1.0680475, 0.7733227, 0.1657631...",NEGATIVE,0.952856
3,Ambev Reports —…—… First Quarter Results Under...,2020-05-07,ABEV,2020-05-07,2.1,2.0,"[-0.30589682, 0.17540877, 0.046134617, 0.03900...","[0.046941273, 0.101946086, -0.021644266, 0.132...","[-0.32791406, -0.91616255, 0.53547114, 0.04071...",NEGATIVE,0.925539
4,5 Latin American Stocks to Consider in Honor o...,2020-05-05,ABEV,2020-05-05,2.18,2.15,"[-0.40894422, -0.1794778, -0.28505862, 0.09229...","[0.12876071, 0.20147394, 0.070045024, -0.10620...","[-0.1931915, -0.33910504, 0.7017856, 0.0992713...",POSITIVE,0.998113
5,CORRECTION - Ambev's —…–9 Annual Report on For...,2020-04-15,ABEV,2020-04-15,2.31,2.29,"[-0.19625764, 0.067467004, 0.30294913, 0.02172...","[0.0044318372, 0.0933271, 0.025640752, 0.03964...","[-0.19258928, -0.84624475, 0.5516808, 0.031516...",NEGATIVE,0.978848
6,"Thinking about buying stock in Ambev, Advanced...",2020-02-27,ABEV,2020-02-27,3.17,3.25,"[0.16650629, -0.07729488, 0.46726334, 0.101827...","[0.028588777, 0.047011923, -0.036552325, 0.107...","[-0.3209373, -0.90578127, 0.8343287, 0.2536151...",POSITIVE,0.505855
7,"Fairhaven Wealth Management, LLC Buys First Tr...",2020-02-07,ABEV,2020-02-07,3.92,3.9,"[0.24698141, -0.37326866, 0.32759267, 0.288165...","[0.0056773126, 0.09306396, -0.04043027, 0.0089...","[-0.5195535, -0.9528065, 0.8224277, 0.14866994...",NEGATIVE,0.827918
8,"Investec Asset Management North America, Inc. ...",2020-02-07,ABEV,2020-02-07,3.92,3.9,"[-0.15901707, -0.39555636, -0.014359138, 0.284...","[0.027121706, 0.06587887, 0.026765367, 0.12634...","[-0.5821713, -0.71371293, 0.6212287, 0.3346102...",POSITIVE,0.880395
9,Alta Capital Management Llc Buys PerkinElmer I...,2020-01-30,ABEV,2020-01-30,4.24,4.27,"[0.009920761, -0.16916458, 0.30622596, 0.43343...","[0.07105354, 0.041213304, -0.036283303, 0.0822...","[-0.5046917, -0.9496251, 0.7434372, 0.10432256...",NEGATIVE,0.886313


In [57]:
type(df_read['bert'][0])

list