# Relevant Imports

In [1]:
# Cell 1: Relevant Imports
# We need the same imports as in the RoBERTa_base.ipynb for sentiment annotation
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TextClassificationPipeline
# Make sure the NLTK 'stopwords' corpus is available locally; SentimentAnnotator.preprocess_text
# reads the English list from it via stopwords.words('english').
nltk.download('stopwords', quiet=True)  # Ensure stopwords are downloaded


True

In [2]:
# Cell 2: Define the SentimentAnnotator Class
# This is directly from the RoBERTa_base.ipynb - no changes needed
class SentimentAnnotator:
    """
    Scores text sentiment with a Hugging Face sequence-classification model.

    Each text is cleaned by ``preprocess_text``, classified by a
    ``TextClassificationPipeline`` that returns a score for every label, and
    reduced to one signed value: ``(positive - negative) * (1 - neutral)``.
    """

    def __init__(self, model_name='cardiffnlp/twitter-roberta-base-sentiment-latest', max_length=512):
        """
        Initializes the SentimentAnnotator with the specified model.

        Args:
            model_name: Model identifier handed to ``from_pretrained`` for both
                the tokenizer and the classification model.
            max_length: Maximum number of tokens per text; longer inputs are
                truncated when the pipeline is called in ``annotate``.
        """
        self.device = self._get_device()
        self.max_length = max_length
        # English stop words are loaded on first use and then reused (see _get_stop_words).
        self._stop_words = None
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name)
        self.model.to(self.device)
        self.pipeline = TextClassificationPipeline(
            model=self.model,
            tokenizer=self.tokenizer,
            # Pass the torch.device itself. torch.device('mps').index is None, so
            # forwarding `.index` would leave device placement to the pipeline's
            # own (version-dependent) default instead of the device chosen here.
            device=self.device,
            top_k=None,  # Return scores for all labels
            batch_size=32
        )

    def _get_device(self):
        """
        Determines the device to run on: MPS if available, else CUDA, else CPU.

        Returns:
            torch.device: The selected device. A message naming it is printed.
        """
        if torch.backends.mps.is_available():
            print("Using Apple's MPS backend.")
            return torch.device('mps')
        if torch.cuda.is_available():
            print("MPS backend not available. Using CUDA GPU.")
            return torch.device('cuda')
        print("MPS backend not available. Using CPU.")
        return torch.device('cpu')

    def _get_stop_words(self):
        """
        Returns the English stop-word set, loading it from NLTK only once.
        """
        # getattr keeps this working for instances created without __init__.
        cached = getattr(self, '_stop_words', None)
        if cached is None:
            cached = frozenset(stopwords.words('english'))
            self._stop_words = cached
        return cached

    def preprocess_text(self, text):
        """
        Preprocesses the input text by cleaning and normalizing it.

        Steps: strip, lowercase, remove every character that is neither a word
        character nor whitespace, collapse whitespace, drop English stop words.

        Args:
            text: Any value; non-strings are converted with ``str``.

        Returns:
            str: The cleaned text, or ``''`` for missing (None / NaN / NA) or
            empty input.
        """
        # Missing values must be caught before str(): str(None) and str(nan)
        # would otherwise be scored as the literal words 'none' / 'nan'.
        if pd.api.types.is_scalar(text) and pd.isna(text):
            return ''

        text = str(text).strip()
        if not text:
            return ''

        text = text.lower()

        # Remove punctuation
        text = re.sub(r'[^\w\s]', '', text)

        # split()/join both collapses repeated whitespace and filters stop words.
        # NOTE(review): NLTK's English list contains negations such as 'not' and
        # 'no'; dropping them can change a sentence's sentiment - confirm intended.
        stop_words = self._get_stop_words()
        return ' '.join(word for word in text.split() if word not in stop_words)

    def annotate(self, texts):
        """
        Annotates a list of texts with sentiment scores.

        Args:
            texts: Iterable of raw texts (missing values allowed).

        Returns:
            The pipeline output for the preprocessed texts: one list of
            ``{'label', 'score'}`` dicts per text (top_k=None is set in
            __init__). An empty input returns ``[]`` without calling the model.
        """
        preprocessed_texts = [self.preprocess_text(text) for text in texts]
        if not preprocessed_texts:
            return []

        # Truncate so that long inputs (e.g. full article bodies) cannot exceed
        # the model's maximum sequence length.
        return self.pipeline(preprocessed_texts, truncation=True, max_length=self.max_length)

    @staticmethod
    def _format_score(sentiment_score):
        """
        Formats a score as a string rounded to two decimals, never '-0.00'.
        """
        # Values that would round to zero are written as "0.00" explicitly so a
        # tiny negative score does not render as "-0.00".
        if abs(sentiment_score) < 0.005:
            return "0.00"
        return f"{sentiment_score:.2f}"

    def annotate_dataframe(self, df, text_column, output_column='sentiment_score'):
        """
        Adds a sentiment-score column to ``df`` based on ``text_column``.

        The score is ``(positive - negative) * (1 - neutral)``, stored as a
        string rounded to two decimal places.

        Args:
            df: DataFrame to annotate. It is modified in place.
            text_column: Name of the column holding the texts to score.
            output_column: Name of the column that receives the scores.

        Returns:
            The same ``df`` object, with ``output_column`` added or overwritten.
        """
        texts = df[text_column].tolist()
        annotations = self.annotate(texts)

        scores = []
        for annotation in annotations:
            # Convert list of dicts to a dict keyed by lower-cased label so that
            # differently cased labels ('Positive', 'NEGATIVE') are still matched.
            score_dict = {str(item['label']).lower(): item['score'] for item in annotation}

            # A label the model does not emit contributes 0.
            negative_score = score_dict.get('negative', 0)
            neutral_score = score_dict.get('neutral', 0)
            positive_score = score_dict.get('positive', 0)

            # Polarity is damped by how neutral the model considers the text.
            sentiment_score = (positive_score - negative_score) * (1 - neutral_score)
            scores.append(self._format_score(sentiment_score))

        # Add the formatted sentiment scores to the DataFrame
        df[output_column] = scores
        return df


In [3]:
# Cell 3: Load the Cleaned CSV File
# The cleaned news file 'cleaned_sp500_news_2024_AAPL.csv' is read from the current directory
df = pd.read_csv(filepath_or_buffer='cleaned_sp500_news_2024_AAPL.csv')
# Preview the first five rows as this cell's output
df.head(5)


Unnamed: 0,date,title,content
0,06/01/2024,Barron's Weekend Stock Picks: Abercrombie & Fi...,Benzinga reviews this weekend's top stories co...
1,06/01/2024,"What That Famous Investing Quote About Bulls, ...",Motley Fool co-founder David Gardner responds ...
2,07/01/2024,"Apple's Busy Week: Tech Standoff, Legal Battle...","It was an eventful week for Apple Inc. AAPL, w..."
3,07/01/2024,Stocks Decline And Fed Rate Cuts Imperiled To ...,"After stellar gains of 107% in 2023, the Magni..."
4,07/01/2024,"The Last Time Apple Spent This Much Money, It ...",Spending on research and development has nearl...


In [4]:
# Cell 4: Initialize the Annotator and Annotate the DataFrame
# We'll annotate based on the 'title' column, as in the RoBERTa_base.ipynb examples (which used 'title' for sentiment)
# You can change to 'content' if preferred by updating text_column='content'
annotator = SentimentAnnotator()
# annotate_dataframe writes the score column into the frame it receives, so pass a copy:
# `df` then stays exactly as loaded and `annotated_df` is an independent frame.
annotated_df = annotator.annotate_dataframe(df.copy(), text_column='title', output_column='sentiment_score')
# Display the results (first few rows)
annotated_df.head()


Using Apple's MPS backend.


Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use mps:0


model.safetensors:   0%|          | 0.00/501M [00:00<?, ?B/s]

Unnamed: 0,date,title,content,sentiment_score
0,06/01/2024,Barron's Weekend Stock Picks: Abercrombie & Fi...,Benzinga reviews this weekend's top stories co...,0.01
1,06/01/2024,"What That Famous Investing Quote About Bulls, ...",Motley Fool co-founder David Gardner responds ...,-0.42
2,07/01/2024,"Apple's Busy Week: Tech Standoff, Legal Battle...","It was an eventful week for Apple Inc. AAPL, w...",0.02
3,07/01/2024,Stocks Decline And Fed Rate Cuts Imperiled To ...,"After stellar gains of 107% in 2023, the Magni...",-0.43
4,07/01/2024,"The Last Time Apple Spent This Much Money, It ...",Spending on research and development has nearl...,0.37


In [5]:
# Cell 5: Optional - Handle Missing Values or Duplicates
# First discard rows that contain any missing value, then discard fully duplicated rows
annotated_df = annotated_df.dropna().drop_duplicates()
# Print a summary (index, columns, non-null counts, dtypes) of the resulting frame
annotated_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6977 entries, 0 to 6976
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   date             6977 non-null   object
 1   title            6977 non-null   object
 2   content          6977 non-null   object
 3   sentiment_score  6977 non-null   object
dtypes: object(4)
memory usage: 218.2+ KB


In [6]:
# Cell 6: Save the Annotated Data to a New CSV File
output_filename = 'annotated_sp500_news_2024_AAPL.csv'
# index=False leaves the DataFrame index out of the written file
annotated_df.to_csv(path_or_buf=output_filename, index=False)
print(f"Annotated data saved to {output_filename}")


Annotated data saved to annotated_sp500_news_2024_AAPL.csv
