In [1]:
# Install the Hugging Face Transformers library
!pip install transformers

from transformers import BertTokenizer, BertForSequenceClassification, pipeline
import pandas as pd

Collecting transformers
  Downloading transformers-4.51.1-py3-none-any.whl.metadata (38 kB)
Collecting huggingface-hub<1.0,>=0.30.0 (from transformers)
  Downloading huggingface_hub-0.30.2-py3-none-any.whl.metadata (13 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.1-cp39-abi3-win_amd64.whl.metadata (6.9 kB)
Collecting safetensors>=0.4.3 (from transformers)
  Using cached safetensors-0.5.3-cp38-abi3-win_amd64.whl.metadata (3.9 kB)
Downloading transformers-4.51.1-py3-none-any.whl (10.4 MB)
   ---------------------------------------- 0.0/10.4 MB ? eta -:--:--
   -------------- ------------------------- 3.7/10.4 MB 19.8 MB/s eta 0:00:01
   ------------------------------------ --- 9.4/10.4 MB 24.5 MB/s eta 0:00:01
   ---------------------------------------- 10.4/10.4 MB 23.1 MB/s eta 0:00:00
Downloading huggingface_hub-0.30.2-py3-none-any.whl (481 kB)
Using cached safetensors-0.5.3-cp38-abi3-win_amd64.whl (308 kB)
Downloading tokenizers-0.21.1-cp39-

In [2]:
# FinBERT-tone checkpoint on the Hugging Face Hub; the tokenizer and the
# three-class classification head are both loaded from this same identifier.
model_name = "yiyanghkust/finbert-tone"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=3,
)

# Bundle model and tokenizer into one callable that takes raw text and
# returns a dict with 'label' and 'score' for each input.
finbert_pipeline = pipeline(
    task="sentiment-analysis",
    model=model,
    tokenizer=tokenizer,
)

# Load the dataset
data = pd.read_csv('aggregate.csv')

# Score only rows that actually contain a headline. A blank cell in the CSV is
# read as a float NaN, which the tokenizer cannot process; those rows are
# skipped here and keep NaN in the sentiment columns below.
has_headline = data['headline'].notna()
headlines = data.loc[has_headline, 'headline'].astype(str).tolist()

# Perform sentiment analysis on the headlines.
# truncation=True cuts any unusually long text down to the model's maximum
# input length instead of failing inside the model.
results = finbert_pipeline(headlines, truncation=True)

# Add sentiment results to the dataset, aligned on the index of the rows that
# were scored (rows without a headline are left as NaN).
scored_index = data.index[has_headline.to_numpy()]
data['sentiment_label'] = pd.Series(
    [result['label'] for result in results],
    index=scored_index,
    dtype='object',
)
data['sentiment_score'] = pd.Series(
    [result['score'] for result in results],
    index=scored_index,
    dtype='float64',
)

# The model has three classes: index 0 = Neutral, 1 = Positive, 2 = Negative (generic names LABEL_0, LABEL_1, LABEL_2).
# The pipeline reports the readable class name, so sentiment_label holds "Neutral", "Positive" or "Negative".
# The sentiment_score column is the model's confidence in that predicted label:
# a probability between 0 and 1, where a higher score indicates greater confidence in the assigned label. For example:
# "Positive" with a score of 0.95 means the model is 95% confident that the sentiment is positive.
# "Negative" with a score of 0.85 means the model is 85% confident that the sentiment is negative.

# Persist the scored headlines; index=False keeps the DataFrame's row index
# out of the written file.
output_path = 'sentiment_scored_headlines_FinBERT.csv'
data.to_csv(output_path, index=False)

# Show the first rows as a quick sanity check of the new sentiment columns.
preview = data.head()
print(preview)

vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json:   0%|          | 0.00/533 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/439M [00:00<?, ?B/s]

Device set to use cpu


model.safetensors:   0%|          | 0.00/439M [00:00<?, ?B/s]

                                            headline stock        date  \
0  AI Daily: Analyst sees Apple  Alibaba partners...  aapl  2025-03-03   
1  Apple’s iPhone 16e Is Likely to Underwhelm  Sa...  aapl  2025-03-03   
2  Apple CEO teases ‘something in the Air’ this week  aapl  2025-03-03   
3  Apple’s iPhone ceded market share in China  Eu...  aapl  2025-03-03   
4  Apple (AAPL): New Buy Recommendation for This ...  aapl  2025-03-03   

     open   close sentiment_label  sentiment_score  
0  241.79  238.03        Positive         1.000000  
1  241.79  238.03         Neutral         0.735460  
2  241.79  238.03         Neutral         0.999914  
3  241.79  238.03         Neutral         0.997822  
4  241.79  238.03        Positive         1.000000  
