In [11]:
import pandas as pd
import numpy as np
from transformers import BertTokenizer, BertModel, RobertaTokenizer, RobertaModel, AutoTokenizer, AutoModel, pipeline
import torch
from tqdm import tqdm
import re
import math
tqdm.pandas()

In [12]:
# Load the data from data.xlsx
data = pd.read_excel('final_data.xlsx')

# getting the 'headline' column
headlines = data['headline']

In [13]:
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')

roberta_tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
roberta_model = RobertaModel.from_pretrained('roberta-base')

auto_tokenizer = AutoTokenizer.from_pretrained("chrommium/bert-base-multilingual-cased-finetuned-news-headlines")
auto_model = AutoModel.from_pretrained("chrommium/bert-base-multilingual-cased-finetuned-news-headlines")

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
device = torch.device('mps')
bert_model = bert_model.to(device)
model_roberta = roberta_model.to(device)
auto_model = auto_model.to(device)

In [15]:
# setting embedding for empty headlines
empty_bert_embedding = np.zeros((bert_model.config.hidden_size,))
empty_roberta_embedding = np.zeros((roberta_model.config.hidden_size,))
empty_auto_embedding = np.zeros((auto_model.config.hidden_size,))

In [18]:
def get_embeddings(text, tokenizer, model, empty_embedding):
    if type(text) == float and math.isnan(text):
        return empty_embedding
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=50).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).cpu().numpy()

In [41]:
bert_embeddings = headlines.progress_apply(lambda x: get_embeddings(x, bert_tokenizer, bert_model, empty_bert_embedding))
bert_embeddings_reshaped = np.vstack(bert_embeddings.values).reshape(bert_embeddings.shape[0], -1)
bert_df = pd.DataFrame(bert_embeddings_reshaped, columns=[f'bert_{i}' for i in range(bert_embeddings_reshaped.shape[1])])

roberta_embeddings = headlines.progress_apply(lambda x: get_embeddings(x, roberta_tokenizer, model_roberta, empty_roberta_embedding))
roberta_embeddings_reshaped = np.vstack(roberta_embeddings.values).reshape(roberta_embeddings.shape[0], -1)
roberta_df = pd.DataFrame(roberta_embeddings_reshaped, columns=[f'roberta_{i}' for i in range(roberta_embeddings_reshaped.shape[1])])

auto_embeddings = headlines.progress_apply(lambda x: get_embeddings(x, auto_tokenizer, auto_model, empty_auto_embedding))
auto_embeddings_reshaped = np.vstack(auto_embeddings.values).reshape(auto_embeddings.shape[0], -1)
auto_df = pd.DataFrame(auto_embeddings_reshaped, columns=[f'auto_{i}' for i in range(auto_embeddings_reshaped.shape[1])])

100%|██████████| 541823/541823 [24:59<00:00, 361.38it/s]  
100%|██████████| 541823/541823 [31:55<00:00, 282.80it/s]  
100%|██████████| 541823/541823 [28:52<00:00, 312.79it/s]  


In [42]:
bert_df.to_csv('final_bert_df.csv', index=False)
roberta_df.to_csv('final_roberta_df.csv', index=False)
auto_df.to_csv('final_auto_df.csv', index=False)

In [2]:
import pandas as pd
from tqdm import tqdm
tqdm.pandas()

In [2]:
bert_df_read = pd.read_csv('final_bert_df.csv')
roberta_df_read = pd.read_csv('final_roberta_df.csv')
auto_df_read = pd.read_csv('final_auto_df.csv')

In [5]:
auto_df_read['combined'] =  auto_df_read.progress_apply(lambda row: row.values.tolist(), axis=1)

100%|██████████| 541823/541823 [01:05<00:00, 34142.60it/s]IOStream.flush timed out
100%|██████████| 541823/541823 [01:31<00:00, 5926.14it/s] 


In [6]:
data = pd.read_csv('final_data_with_features.csv')

In [7]:
data['auto'] = auto_df_read['combined']

In [8]:
data.to_csv('final_data_with_features.csv', index=False)

In [45]:
# Sentiment analysis pipeline
sentiment_pipeline = pipeline("sentiment-analysis", device=device)


def get_sentiment_score(text):
    if type(text) == float and math.isnan(text):
        return np.nan, np.nan
    result = sentiment_pipeline(text)[0]
    return result['label'], result['score']

tqdm.pandas()
data[['sentiment_label', 'sentiment_score']] = data['headline'].progress_apply(lambda x: pd.Series(get_sentiment_score(x)))

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
100%|██████████| 541823/541823 [20:19<00:00, 444.30it/s]  


In [None]:
bert_df_read['combined'] =  bert_df_read.progress_apply(lambda row: row.values.tolist(), axis=1)
roberta_df_read['combined'] =  roberta_df_read.progress_apply(lambda row: row.values.tolist(), axis=1)
auto_df_read['combined'] =  auto_df_read.progress_apply(lambda row: row.values.tolist(), axis=1)

100%|██████████| 541823/541823 [02:48<00:00, 17258.84it/s]IOStream.flush timed out
100%|██████████| 541823/541823 [03:13<00:00, 2806.60it/s] 
 90%|████████▉ | 484932/541823 [07:40<01:24, 671.17it/s]  

In [49]:
data['bert'] = bert_df_read['combined']
data['roberta'] = roberta_df_read['combined']
data['auto'] = auto_df_read['combined']

In [50]:
data.to_csv('final_data_with_features.csv', index=False)

In [7]:
import pandas as pd
import numpy as np

In [8]:
def parse_combined_column(s):
    if s == '':
        return np.zeros(768)
    return eval(s)

df_read = pd.read_csv('final_data_with_features_2.csv', converters={'bert': parse_combined_column, 'auto': parse_combined_column, 'roberta': parse_combined_column})

In [None]:
import math

In [11]:
df_read.loc[df_read['headline'].apply(lambda x: isinstance(x, float)), 'bert'] = math.nan

In [13]:
df_read.loc[df_read['headline'].apply(lambda x: isinstance(x, float)), 'roberta'] = math.nan
df_read.loc[df_read['headline'].apply(lambda x: isinstance(x, float)), 'auto'] = math.nan

In [14]:
df_read.to_csv('final_data_with_features_2.csv', index=False)

In [6]:
df_read.head(10000)

Unnamed: 0,Open,Close,stock,Date,headline,bert,roberta,auto,sentiment_label,sentiment_score
0,0.000,0.882813,KR,1969-12-31,,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",,
1,0.000,0.882813,KR,1970-01-02,,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",,
2,0.000,0.890625,KR,1970-01-05,,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",,
3,0.000,0.898438,KR,1970-01-06,,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",,
4,0.000,0.882813,KR,1970-01-07,,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",,
...,...,...,...,...,...,...,...,...,...,...
9995,10.770,10.660000,KR,2009-08-06,,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",,
9996,10.705,10.450000,KR,2009-08-07,,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",,
9997,10.495,10.465000,KR,2009-08-10,,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",,
9998,10.465,10.475000,KR,2009-08-11,,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",,


In [5]:
type(df_read['bert'][0])

list