In [23]:
import pandas as pd
import torch
import warnings
warnings.filterwarnings("ignore")
from sklearn.metrics import (accuracy_score, 
                            classification_report, 
                            confusion_matrix)
from transformers import BertTokenizer, BertForSequenceClassification
from makeDataset import *
from torch.utils.data import DataLoader, TensorDataset

from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
# Report whether CUDA is usable, then pick the device used for inference.
print(torch.cuda.is_available())
# Fall back to the CPU when no GPU is visible; hard-coding 'cuda' only fails
# later, when a model or tensor is moved to the device on a CPU-only machine.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(DEVICE)

# Load the news dataset; `x` is the `title` column and `x2` the `text` column.
df = pd.read_csv('../datasets/stock_news.csv')
x = df['title']
x2 = df['text']

True
cuda


# Finbert Inference

In [24]:
# Batch size used by the DataLoaders built in this cell.
batch_size = 16

# FinBERT tokenizer plus the fine-tuned classifier loaded from a local directory.
tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone', do_lower_case=True)
model = BertForSequenceClassification.from_pretrained("saved_models/finbert_finetuned")
# Use the GPU when one is available and fall back to the CPU otherwise, instead
# of hard-coding 'cuda' (which fails on `.to(device)` when no GPU is present).
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

# NOTE(review): CustomDataset comes from `from makeDataset import *`; presumably
# it tokenizes each text up to `max_len` and yields dicts with 'input_ids' and
# 'attention_mask' (those keys are read by the inference loops) - confirm.
# Titles are limited to 32 tokens, article bodies to 512.
test_dataset = CustomDataset(x, None, tokenizer, max_len=32)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

test_dataset2 = CustomDataset(x2, None, tokenizer, max_len=512)
test_loader2 = DataLoader(test_dataset2, batch_size=batch_size, shuffle=False)

# Label name <-> class index lookups.
# NOTE(review): assumes this order matches the label encoding used when the
# model was fine-tuned - confirm against the training code.
mapper = {'negative': 0, 'neutral': 1, 'positive': 2}
reverse_mappers = {0: 'negative', 1: 'neutral', 2: 'positive'}

### Predictions based on the article title (headline)

In [None]:
# NOTE(review): disabled per-sample variant of the title inference. It iterates
# `test_dataset` directly (one item at a time) and adds the batch dimension
# with `unsqueeze(0)`; the batched loop over `test_loader` in the next cell
# covers the same data. Kept only for reference - consider deleting this cell.
# model.eval()
# predictions = []
# with torch.no_grad():
#     for batch in test_dataset:
#         input_ids = batch['input_ids'].to(device)
#         attention_mask = batch['attention_mask'].to(device)
#         outputs = model(input_ids.unsqueeze(0), attention_mask=attention_mask.unsqueeze(0))
#         _, predicted = torch.max(outputs.logits, 1)
#         predictions.append(reverse_mappers[predicted.item()])
# # df['finbert_title_sentiment'] = predictions
# print(predictions)

In [None]:
# Batched FinBERT inference over the article titles (`test_loader`).
model.eval()
predictions = []
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        # Index of the highest logit per row is the predicted class id.
        predicted = torch.argmax(outputs.logits, dim=1)
        predictions.extend(reverse_mappers[class_id] for class_id in predicted.tolist())

# Persist the title predictions. This assignment was commented out, but the
# conflict analysis further down reads df['finbert_title_sentiment'], so a
# fresh "Restart & Run All" would otherwise fail with a KeyError.
df['finbert_title_sentiment'] = predictions
# Show only a preview instead of dumping the whole list.
predictions[:10]

# Batched FinBERT inference over the article bodies (`test_loader2`); the
# predicted label names are written to df['finbert_text_sentiment'].
model.eval()
predictions = []
with torch.no_grad():
    for batch in test_loader2:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        # Keep only the index of the largest logit in each row.
        predicted = torch.max(outputs.logits, 1)[1]
        predictions.extend(
            reverse_mappers[int(class_id)] for class_id in predicted.cpu().numpy()
        )
df['finbert_text_sentiment'] = predictions

# Release cached GPU memory and run a garbage-collection pass.
import gc
torch.cuda.empty_cache()
gc.collect()

# Flair Inference

In [9]:
# Load the DistilBERT sequence classifier stored in ../flair_model together
# with the stock distilbert-base-uncased tokenizer.
# NOTE(review): this rebinds the global `tokenizer` that the FinBERT cell also
# defines; re-running cells out of order changes which tokenizer the name holds.
model2 = DistilBertForSequenceClassification.from_pretrained("../flair_model")
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Move the model to the device and switch to eval mode once, instead of
# repeating `.to(DEVICE)` for every batch.
model2 = model2.to(DEVICE)
model2.eval()


def _predict_label_ids(texts, model, text_tokenizer, batch_size, max_length=128):
    """Return the predicted class index for every entry of `texts`.

    Parameters
    ----------
    texts : iterable of str
        Documents to classify; materialised with `list(texts)`.
    model : sequence-classification model whose output exposes `.logits`.
        Expected to already live on `DEVICE` and be in eval mode.
    text_tokenizer : tokenizer called with truncation and padding enabled.
    batch_size : int
        Number of documents per forward pass.
    max_length : int, default 128
        Truncation length passed to the tokenizer.

    Returns
    -------
    list of int
        One argmax class index per input text, in input order.
    """
    encodings = text_tokenizer(
        list(texts), truncation=True, padding=True, max_length=max_length
    )
    dataset = TensorDataset(
        torch.tensor(encodings['input_ids']),
        torch.tensor(encodings['attention_mask']),
    )
    label_ids = []
    with torch.no_grad():
        for input_ids, attention_mask in DataLoader(dataset, batch_size=batch_size):
            outputs = model(
                input_ids.to(DEVICE),
                attention_mask=attention_mask.to(DEVICE),
            )
            label_ids.extend(torch.argmax(outputs.logits, dim=1).tolist())
    return label_ids


# Predict sentiment for the titles (x) and the article bodies (x2).
# NOTE(review): article bodies are truncated to 128 tokens here, whereas the
# FinBERT pass uses max_len=512 - confirm the difference is intentional.
x_predictions = _predict_label_ids(x, model2, tokenizer, batch_size)
x2_predictions = _predict_label_ids(x2, model2, tokenizer, batch_size)

# Map sentiment index to labels
sentiment_labels = {0: 'negative', 1: 'neutral', 2: 'positive'}

title_sentiment_list = [sentiment_labels[label_id] for label_id in x_predictions]
text_sentiment_list = [sentiment_labels[label_id] for label_id in x2_predictions]

df['flair_title_sentiment'] = title_sentiment_list
df['flair_text_sentiment'] = text_sentiment_list


In [11]:
# Preview the first rows of `df`, including the sentiment columns added by the cells above.
df.head()

Unnamed: 0,stock,title,text,date,time,am_pm,finbert_title_sentiment,finbert_text_sentiment,flair_title_sentiment,flair_text_sentiment
0,AAPL,BofA expects 'strong refresh cycle' for iPhone...,Citing findings from their global smartphone...,2024-02-28,07:49,AM,neutral,positive,positive,positive
1,AAPL,Apple cancels decade-long electric car project...,By Stephen Nellis and Shivansh Tiwary(Reuter...,2024-02-27,15:47,PM,neutral,negative,neutral,negative
2,AAPL,Marketmind: Calm prevails before inflation dat...,A look at the day ahead in European an...,2024-02-28,00:41,AM,neutral,neutral,neutral,neutral
3,AAPL,"Apple Halts Electric Car Project Titan, Shifts...",Quiver Quantitative - In a surprising ...,2024-02-27,15:44,PM,neutral,neutral,neutral,neutral
4,AAPL,Marketmind: US tracking 3%+ growth; Apple down...,A look at the day ahead in U.S. and global...,2024-02-28,06:02,AM,neutral,positive,neutral,positive


In [21]:
def find_conflicts(df, kind, column1, column2):
    """Return the rows of `df` where `column1` and `column2` disagree.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a 'stock' column, the `kind` column and both
        compared columns.
    kind : str
        Name of the text column to keep alongside the labels.
    column1, column2 : str
        Names of the two columns compared element-wise.

    Returns
    -------
    pandas.DataFrame
        The disagreeing rows, restricted to the columns
        ['stock', kind, column1, column2] in that order.
    """
    disagree = df[column1].ne(df[column2])
    wanted_columns = ['stock', kind, column1, column2]
    return df.loc[disagree, wanted_columns]

# Rows where the FinBERT and Flair labels disagree, for titles and for bodies.
title_conflicts = find_conflicts(
    df,
    kind='title',
    column1='finbert_title_sentiment',
    column2='flair_title_sentiment',
)
text_conflicts = find_conflicts(
    df,
    kind='text',
    column1='finbert_text_sentiment',
    column2='flair_text_sentiment',
)
# Display the title-level disagreements.
title_conflicts

Unnamed: 0,stock,title,finbert_title_sentiment,flair_title_sentiment
0,AAPL,BofA expects 'strong refresh cycle' for iPhone...,neutral,positive
5,AAPL,Morgan Stanley says Apple discontinuing its el...,neutral,positive
10,AMC,GameStop shares spike premarket amid strong vo...,negative,positive
14,AMC,Retail traders reignite rally in GameStop shares,negative,neutral
20,AMD,Stock Market Today: S&P 500 in record close as...,negative,positive
21,AMD,Why Is AMD (AMD) Stock Soaring Today,negative,neutral
23,AMD,"Nvidia stock surges after results, lifts other...",neutral,negative
27,AMZN,"Factbox-US, Canadian companies kick off 2024 w...",neutral,negative
30,AMZN,Google Accuses Microsoft of Seeking Cloud Comp...,neutral,negative
35,BA,Exclusive-Allegiant expects fewer aircraft fro...,neutral,negative
