In [2]:
%pip install transformers

Note: you may need to restart the kernel to use updated packages.


In [1]:
import pandas as pd
import scipy
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [2]:
# Load the labelled sentences into a two-column frame. The first line of the
# file is treated as data (see row 0 in the preview), so the column names are
# supplied here; passing `names` already implies `header=None`, which is only
# spelled out for the reader.
# NOTE(review): 'unicode_escape' also interprets backslash escape sequences in
# the text; if the file is plain Latin-1, 'latin-1' may be the safer codec —
# confirm against the raw file before changing it.
data = pd.read_csv(
    'all-data.csv',
    header=None,
    names=['Sentiment', 'Text'],
    encoding='unicode_escape',
)
data.head()

Unnamed: 0,Sentiment,Text
0,neutral,"According to Gran , the company has no plans t..."
1,neutral,Technopolis plans to develop in stages an area...
2,negative,The international electronic industry company ...
3,positive,With the new production plant the company woul...
4,positive,According to the company 's updated strategy f...


In [14]:
# FinBERT checkpoint from the Hugging Face hub: the tokenizer turns raw text
# into model inputs, the model is a sequence classifier on top of them.
tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")

# Options forwarded to every tokenizer call in this notebook: pad, and cut
# inputs off at 512 tokens.
tokenizer_kwargs = dict(padding=True, truncation=True, max_length=512)

# Define a function to predict sentiment
def predict_sentiment(text):
    """Print FinBERT's class probabilities and the top label for one text.

    Uses the notebook-level ``tokenizer``, ``model`` and ``tokenizer_kwargs``
    objects, so the model-loading cell must have been run first.

    Args:
        text: A single sentence/headline to classify.

    Returns:
        None. Two lines are printed: the ``{label: probability}`` mapping and
        a sentence naming the most probable label.
    """
    # Import the submodule explicitly: a bare `import scipy` does not
    # guarantee that `scipy.special` has been loaded on every SciPy version.
    from scipy.special import softmax

    encoded_text = tokenizer(text, return_tensors="pt", **tokenizer_kwargs)
    # Get predictions from the model (no gradients needed for inference)
    with torch.no_grad():
        logits = model(**encoded_text).logits

    # One text in -> one row of logits; drop the batch axis and normalise
    # over the class axis.
    probabilities = softmax(logits.squeeze(0).numpy(), axis=-1)

    # Look each label up by its class index instead of relying on the
    # iteration order of the id2label dict matching the logit order.
    scores = {
        model.config.id2label[class_id]: probability
        for class_id, probability in enumerate(probabilities)
    }
    print(scores)
    print(f"Predicted sentiment for '{text}': {max(scores, key=scores.get)}")


# Spot-check the classifier on a few hand-written headlines: a negated
# positive, a plainly negative one, and one with mixed signals.
for test_text in (
    "The company's stock price is not expected to surge in the coming months.",
    "The company's stock price is going down.",
    "The british pound weakened but stocks rallied.",
):
    predict_sentiment(test_text)


{'positive': 0.45130324, 'negative': 0.09192983, 'neutral': 0.4567669}
Predicted sentiment for 'The company's stock price is not expected to surge in the coming months.': neutral
{'positive': 0.008231973, 'negative': 0.94887114, 'neutral': 0.04289689}
Predicted sentiment for 'The company's stock price is going down.': negative
{'positive': 0.43633714, 'negative': 0.5117515, 'neutral': 0.05191137}
Predicted sentiment for 'The british pound weakened but stocks rallied.': negative


In [4]:
# Score every row of the dataset with FinBERT.
#
# Reuses the `tokenizer`, `model` and `tokenizer_kwargs` created in the
# model-loading cell above instead of loading the checkpoint a second time.
# Texts are sent through the model in batches, which avoids one forward pass
# per row.
BATCH_SIZE = 32  # rows per forward pass; lower this if memory is tight

text = data['Text'].tolist()
preds = []        # predicted label for each row, in dataset order
preds_proba = []  # probability the model assigned to that predicted label

for start in range(0, len(text), BATCH_SIZE):
    batch = text[start:start + BATCH_SIZE]

    # Tokenise the batch into PyTorch tensors; `tokenizer_kwargs` pads the
    # batch and truncates anything longer than 512 tokens.
    input_sequence = tokenizer(batch, return_tensors="pt", **tokenizer_kwargs)

    # Inference only, so skip gradient tracking.
    with torch.no_grad():
        # Raw, unnormalised scores: one row per text, one column per class.
        logits = model(**input_sequence).logits

    # Normalise over the class axis, then take the best class per row.
    probabilities = torch.softmax(logits, dim=-1)
    top_probability, top_class = probabilities.max(dim=-1)

    # Map class indices to label names by key rather than relying on the
    # iteration order of the id2label dict.
    preds.extend(model.config.id2label[class_id] for class_id in top_class.tolist())
    preds_proba.extend(top_probability.tolist())

# Every input row must have exactly one prediction.
assert len(preds) == len(preds_proba) == len(text)

  return self.fget.__get__(instance, owner)()


In [6]:
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    roc_auc_score,
)

# Ground-truth labels, in the same row order as the `preds` list built by the
# scoring cell.
# NOTE(review): if all-data.csv overlaps with the data FinBERT was fine-tuned
# on, this accuracy is optimistic — confirm the dataset's provenance.
y = data['Sentiment'].tolist()

print(f'Accuracy-Score: {accuracy_score(y, preds)}')


Accuracy-Score: 0.8893933140734627
