In [None]:
!pip install transformers



In [None]:
# Import libraries
import pandas as pd
from transformers import pipeline
from tqdm.notebook import tqdm  # ✅ Notebook-friendly progress bar

# Load your dataset
df = pd.read_csv('classified_documents_new.csv')
# Missing documents are read as float NaN, which the tokenizer rejects.
# Score them as empty strings so a single bad row cannot abort the run.
texts = df['Document'].fillna('').astype(str).tolist()

# Load FinBERT model and tokenizer
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model="yiyanghkust/finbert-tone",
    tokenizer="yiyanghkust/finbert-tone"
)

# Mapping from label to numeric encoding
label_encoding = {
    'positive': 1,
    'neutral': 0,
    'negative': -1
}

# Create empty lists to store results
sentiment_labels = []
confidence_scores = []

# Predict sentiment with progress bar
for text in tqdm(texts, desc="Generating FinBERT Sentiment", unit="headline", total=len(texts), ncols=100, leave=True):
    # The pipeline does not truncate by default; an input longer than the
    # model's 512-token limit makes the forward pass fail, so cap it here.
    result = sentiment_pipeline(text, truncation=True, max_length=512)[0]
    # Labels are lower-cased before lookup so the casing used by the
    # checkpoint's config does not matter.
    label = result['label'].lower()
    score = result['score']

    sentiment_labels.append(label_encoding[label])
    confidence_scores.append(score)

# Add results to dataframe
df['finbert_sentiment_label'] = sentiment_labels
df['finbert_confidence_score'] = confidence_scores

# Save to new CSV if needed
df.to_csv('news_with_finbert_sentiment.csv', index=False)

print("Sentiment analysis complete.")


Device set to use cpu


Generating FinBERT Sentiment:   0%|                                 | 0/34481 [00:00<?, ?headline/s]

Sentiment analysis complete.


In [None]:
# Install Huggingface if needed
!pip install transformers torch tqdm --quiet

from transformers import pipeline
import pandas as pd
from tqdm.notebook import tqdm

# Load the latest RoBERTa sentiment model
model_name = "cardiffnlp/twitter-roberta-base-sentiment-latest"

classifier = pipeline("sentiment-analysis", model=model_name)

# Load your dataset
df = pd.read_csv('classified_documents_final.csv')  # Update path if needed
# Missing documents are read as float NaN, which the tokenizer rejects.
# Score them as empty strings so a single bad row cannot abort the run.
texts = df['Document'].fillna('').astype(str).tolist()

# Prepare storage
labels = []
scores = []

# Label headlines with progress bar
for text in tqdm(texts, desc="Labeling Sentiment with RoBERTa-Latest", ncols=100):
    # The pipeline does not truncate by default; cap inputs at 512 tokens so
    # an over-long document cannot crash the forward pass.
    result = classifier(text, truncation=True, max_length=512)[0]
    labels.append(result['label'])
    scores.append(result['score'])

# Map generic Hugging Face class ids to sentiment names. Kept for checkpoints
# that expose "LABEL_n"; checkpoints whose config already carries sentiment
# names are handled by _to_sentiment below.
label_mapping = {
    "LABEL_0": "negative",
    "LABEL_1": "neutral",
    "LABEL_2": "positive"
}

numeric_mapping = {
    "negative": -1,
    "neutral": 0,
    "positive": 1
}


def _to_sentiment(raw_label):
    """Normalise a pipeline label to 'negative'/'neutral'/'positive'.

    Accepts both the generic "LABEL_n" ids and labels that are already
    sentiment names in any casing. Anything else becomes "unknown", which
    maps to NaN in the numeric column.
    """
    name = label_mapping.get(raw_label, str(raw_label).lower())
    return name if name in numeric_mapping else "unknown"


# Apply mappings
df['roberta_sentiment_label'] = [_to_sentiment(label) for label in labels]
df['roberta_sentiment_score'] = scores
df['roberta_sentiment_numeric'] = df['roberta_sentiment_label'].map(numeric_mapping)

# Save the result
df.to_csv('roberta_latest_sentiment.csv', index=False)

print("✅ Sentiment labeling complete!")


config.json:   0%|          | 0.00/929 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Device set to use cpu


Labeling Sentiment with RoBERTa-Latest:   0%|                             | 0/25904 [00:00<?, ?it/s]

✅ Sentiment labeling complete!


In [None]:
# Import packages
import numpy as np
import pandas as pd
import torch
from scipy.special import softmax
from tqdm.notebook import tqdm
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Load your data
df = pd.read_csv('classified_documents_final.csv')

# Load model and tokenizer
MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

# Class names listed in output-index order (index 0 -> negative, ...).
# NOTE(review): hard-coded; confirm against model.config.id2label if MODEL changes.
labels = ['negative', 'neutral', 'positive']

# Map label to numeric encoding
label_encoding = {'negative': -1, 'neutral': 0, 'positive': 1}

# Prepare storage for results (one entry per row of df, in order)
sentiment_numeric = []

# Progress bar
for text in tqdm(df['Document'], desc="Generating Roberta Sentiment", ncols=100):
    try:
        encoded_input = tokenizer(text, return_tensors='pt', truncation=True, max_length=512)
        # Inference only: skip autograd bookkeeping for every document.
        with torch.no_grad():
            output = model(**encoded_input)
        # output[0] holds the logits; [0] selects the single item in the batch.
        scores = softmax(output[0][0].numpy())

        # Get best prediction
        best_label = labels[int(np.argmax(scores))]
        sentiment_numeric.append(label_encoding[best_label])

    except Exception as e:
        # Best effort: keep the column aligned with df by defaulting to neutral.
        sentiment_numeric.append(0)
        # str(text) keeps this handler from raising when the offending value
        # is not a string (e.g. a NaN from an empty CSV field).
        print(f"Error with text: {str(text)[:30]}... ({type(e).__name__}: {e})")

# Add numeric label to dataframe
df['roberta_sentiment'] = sentiment_numeric

# Save final dataframe
df.to_csv('roberta_sentiment_final.csv', index=False)

print("✅ Roberta sentiment classification complete and saved!")

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Generating Roberta Sentiment:   0%|                                       | 0/25904 [00:00<?, ?it/s]

✅ Roberta sentiment classification complete and saved!


In [None]:
#Finetuned finbert sentiment
import pandas as pd
from transformers import pipeline
from tqdm.notebook import tqdm  # For notebook-friendly progress

# Load dataset (does not remove any columns)
df = pd.read_csv('classified_documents_final.csv')
# Missing documents are read as float NaN, which the tokenizer rejects.
# Score them as empty strings so a single bad row cannot abort the run.
texts = df['Document'].fillna('').astype(str).tolist()

# Load fine-tuned FinBERT model and tokenizer from the local checkpoint directory
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model="./finbert-finetuned-tech",
    tokenizer="./finbert-finetuned-tech"
)

# Map fine-tuned labels to desired numeric values.
# NOTE(review): assumes the checkpoint's id2label uses these sentiment names
# (any casing); a checkpoint exporting "LABEL_n" would raise KeyError below.
label_encoding = {
    'positive': 1,
    'neutral': 0,
    'negative': -1
}

# Prepare result lists
sentiment_labels = []
confidence_scores = []

# Generate sentiment predictions
for text in tqdm(texts, desc="Generating Sentiment (Fine-Tuned)", unit="headline"):
    # The pipeline does not truncate by default; cap inputs at 512 tokens so
    # an over-long document cannot crash the forward pass.
    result = sentiment_pipeline(text, truncation=True, max_length=512)[0]
    label = result['label'].lower()
    score = result['score']

    sentiment_labels.append(label_encoding[label])
    confidence_scores.append(score)

# Append new columns to existing DataFrame
df['finetuned_sentiment_label'] = sentiment_labels
df['finetuned_confidence_score'] = confidence_scores

# Optionally save to a new CSV
df.to_csv('news_with_finetuned_sentiment.csv', index=False)

print("Sentiment analysis using fine-tuned FinBERT complete.")


Device set to use cpu


Generating Sentiment (Fine-Tuned):   0%|          | 0/25904 [00:00<?, ?headline/s]

Sentiment analysis using fine-tuned FinBERT complete.
