In [3]:
# FinBERT sentiment Pipeline 
!pip install --upgrade pip --quiet
!pip install "numpy==1.26.4" "transformers==4.36.2" "torch==2.1.2" pandas tqdm --quiet

import torch
import pandas as pd
from tqdm import tqdm
from transformers import AutoTokenizer, BertForSequenceClassification

print("Torch version:", torch.__version__)
print("GPU available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("Using:", torch.cuda.get_device_name(0))

df = pd.read_csv("news_with_prices_ALL.csv")

# Keeping only AAPL, AMZN, MSFT (GOOGL had no article summaries that could be used for finbert model sentiment) 
df = df[df['Stock_symbol'].isin(['AAPL', 'AMZN', 'MSFT'])].copy()

df = df[df['Article'].notna() & (df['Article'].str.strip() != "")]
print("Remaining rows after dropping empty articles:", len(df))

# Define text column for sentiment analysis
df['text_for_sentiment'] = df['Article']

print("\nLoading FinBERT model...")
tokenizer = AutoTokenizer.from_pretrained("yiyanghkust/finbert-tone")
model = BertForSequenceClassification.from_pretrained("yiyanghkust/finbert-tone")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

def get_finbert_sentiment(text):
    """
    Runs FinBERT sentiment inference on a single article.
    Returns a dict with probabilities for positive, neutral, and negative.
    """
    if not isinstance(text, str) or text.strip() == "":
        return {'positive': 0, 'neutral': 0, 'negative': 0}

    # Tokenize & send to GPU
    inputs = tokenizer(text, return_tensors='pt', truncation=True, max_length=256).to(device)

    # Forward pass
    with torch.no_grad():
        outputs = model(**inputs)
        probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
        probs = probs.detach().cpu().tolist()[0]   # âœ… no .numpy()

    labels = ['positive', 'neutral', 'negative']
    return dict(zip(labels, probs))

sentiments = []
for text in tqdm(df['text_for_sentiment'], desc="Running FinBERT sentiment..."):
    sentiments.append(get_finbert_sentiment(text))

# Combine scores with the main dataframe
sent_df = pd.DataFrame(sentiments)
df = pd.concat([df.reset_index(drop=True), sent_df.reset_index(drop=True)], axis=1)

# Create composite FinBERT sentiment score
df['sent_finbert'] = df['positive'] - df['negative']

df.to_csv("news_with_finbert.csv", index=False)
print("\nSaved article-level FinBERT results â†’ 'news_with_finbert.csv'")

print("\nSample FinBERT output:")
print(df[['Stock_symbol', 'positive', 'neutral', 'negative', 'sent_finbert']].head())

print("\nSentiment stats:")
print(df['sent_finbert'].describe())

print("\nAverage sentiment per stock:")
print(df.groupby('Stock_symbol')['sent_finbert'].mean())

daily_sent = (
    df.groupby(['news_date', 'Stock_symbol'])['sent_finbert']
      .mean()
      .reset_index()
      .rename(columns={'sent_finbert': 'daily_sentiment'})
)

daily_sent.to_csv("daily_sentiment.csv", index=False)
print("\nSaved daily sentiment averages â†’ 'daily_sentiment.csv'")

print("\nPreview of daily sentiment:")
print(daily_sent.head())

import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[0m

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[0mâœ… Torch version: 2.1.2+cu121
âœ… GPU available: True
ðŸ’ª Using: NVIDIA A10
Remaining rows after dropping empty articles: 21855

Loading FinBERT model...


Running FinBERT sentiment...: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 21855/21855 [03:22<00:00, 108.05it/s]



âœ… Saved article-level FinBERT results â†’ 'news_with_finbert.csv'

Sample FinBERT output:
  Stock_symbol  positive   neutral      negative  sent_finbert
0         AAPL  0.000019  0.001644  9.983364e-01     -0.998317
1         AAPL  0.982119  0.017686  1.953843e-04      0.981923
2         AAPL  0.001325  0.001397  9.972785e-01     -0.995954
3         AAPL  0.999929  0.000053  1.816657e-05      0.999911
4         AAPL  0.000035  0.999965  1.300703e-07      0.000035

Sentiment stats:
count    21855.000000
mean         0.134152
std          0.639383
min         -1.000000
25%         -0.000038
50%          0.000289
75%          0.853066
max          1.000000
Name: sent_finbert, dtype: float64

Average sentiment per stock:
Stock_symbol
AAPL    0.128690
AMZN    0.155632
MSFT    0.127934
Name: sent_finbert, dtype: float64

âœ… Saved daily sentiment averages â†’ 'daily_sentiment.csv'

Preview of daily sentiment:
    news_date Stock_symbol  daily_sentiment
0  2022-04-26         MSFT        -0