# 1. Relevant Imports

In [1]:
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TextClassificationPipeline

# 2. Initialize RoBERTa model

In [4]:
class SentimentAnnotator:
    def __init__(self, model_name='cardiffnlp/twitter-roberta-base-sentiment-latest'):
        """
        Initializes the SentimentAnnotator with the specified model.
        Utilizes Apple's MPS backend if available.
        """
        self.device = self._get_device()
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name)
        self.model.to(self.device)
        self.pipeline = TextClassificationPipeline(
            model=self.model,
            tokenizer=self.tokenizer,
            device=self.device.index if self.device.type != 'cpu' else -1,
            top_k=None,  # Return scores for all labels
            batch_size=32
        )

    def _get_device(self):
        """
        Determines the appropriate device to use (MPS if available, else CPU).
        """
        if torch.backends.mps.is_available():
            device = torch.device('mps')
            print("Using Apple's MPS backend.")
        else:
            device = torch.device('cpu')
            print("MPS backend not available. Using CPU.")
        return device

    def preprocess_text(self, text):
        """
        Preprocesses the input text by cleaning and normalizing it.
        """
        # Ensure required NLTK data is downloaded
        nltk.download('stopwords', quiet=True)
    
        # Convert to string and strip leading/trailing whitespace
        text = str(text).strip()
    
        # Handle missing or empty text
        if not text:
            return ''
    
        # Convert to lowercase
        text = text.lower()
        
        # Remove punctuation
        text = re.sub(r'[^\w\s]', '', text)
        
        # Remove extra whitespace
        text = ' '.join(text.split())
        
        # Remove stop words
        stop_words = set(stopwords.words('english'))
        text = ' '.join([word for word in text.split() if word not in stop_words])
        
        return text

    def annotate(self, texts):
        """
        Annotates a list of texts with sentiment scores.
        """
        # Preprocess texts
        preprocessed_texts = [self.preprocess_text(text) for text in texts]

        # Run the pipeline
        annotations = self.pipeline(preprocessed_texts)
        return annotations

    def annotate_dataframe(self, df, text_column, output_column='sentiment_score'):
        texts = df[text_column].tolist()
        annotations = self.annotate(texts)
    
        # Extract sentiment scores
        scores = []
        for annotation in annotations:
            # Convert list of dicts to a dict for easier access
            score_dict = {item['label']: item['score'] for item in annotation}
    
            # Correct label mapping for Twitter RoBERTa model
            negative_score = score_dict.get('negative', 0)  # 'negative' label for negative sentiment
            neutral_score = score_dict.get('neutral', 0)    # 'neutral' label for neutral sentiment
            positive_score = score_dict.get('positive', 0)  # 'positive' label for positive sentiment
    
            # Calculate sentiment score considering neutral values
            sentiment_score = (positive_score - negative_score) * (1 - neutral_score)
            
            # Format the score to two decimal places without rounding
            # If sentiment_score is close to zero, set it explicitly to "0.00" to avoid "-0.00"
            if abs(sentiment_score) < 0.005:
                formatted_score = "0.00"
            else:
                formatted_score = f"{sentiment_score:.2f}"
    
            scores.append(formatted_score)

        # Add the formatted sentiment scores to the DataFrame
        df[output_column] = scores
        return df


# 3. Annotation for different datsets

## a. MRK dataset

In [5]:
# Example usage
if __name__ == '__main__':
    # Read the MRK_text.csv file
    df = pd.read_csv('../dataset_final/Daily_Financial_News/MRK/MRK_text.csv')

    # Initialize the annotator
    annotator = SentimentAnnotator()

    # Annotate the DataFrame using the 'title' column
    annotated_df = annotator.annotate_dataframe(df, text_column='title', output_column='sentiment_score')

    # Display the results
    print(annotated_df[['title', 'sentiment_score']])

    # Optionally, save the annotated DataFrame to a new CSV file
    annotated_df.to_csv('../dataset_final/Daily_Financial_News/MRK/MRK_text_annotated.csv', index=False)

Using Apple's MPS backend.


Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


                                                  title sentiment_score
0     Shares of several healthcare companies are tra...           -0.31
1     Johnson & Johnson To Start Coronavirus Vaccine...            0.01
2     The Daily Biotech Pulse: Keytruda Setback For ...           -0.38
3     Merck Announces That The Phase 3 KEYNOTE-361 T...            0.00
4     The Week Ahead In Biotech: Viela FDA Decision,...            0.01
...                                                 ...             ...
3329  BenchmarkJournal.com Free Analyst Review for A...            0.00
3330  Trends in the U.K. and Irish Pharmaceutical an...            0.00
3331  ParagonReport.com Complimentary Market Update ...            0.02
3332  ParagonReport.com Complimentary Market Update ...            0.01
3333  Wall Street News Alert:  Stocks This Morning: ...            0.01

[3334 rows x 2 columns]


## b. MS dataset

In [6]:
# Example usage
if __name__ == '__main__':
    # Read the MS_text.csv file
    df = pd.read_csv('../dataset_final/Daily_Financial_News/MS/MS_text.csv')

    # Initialize the annotator
    annotator = SentimentAnnotator()

    # Annotate the DataFrame using the 'title' column
    annotated_df = annotator.annotate_dataframe(df, text_column='title', output_column='sentiment_score')

    # Display the results
    print(annotated_df[['title', 'sentiment_score']])

    # Optionally, save the annotated DataFrame to a new CSV file
    annotated_df.to_csv('../dataset_final/Daily_Financial_News/MS/MS_text_annotated.csv', index=False)

Using Apple's MPS backend.


Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


                                                  title sentiment_score
0          Price Over Earnings Overview: Morgan Stanley            0.00
1     Shares of several financial service companies ...           -0.22
2     Goldman Sachs Employees Returning To Their Des...            0.01
3     Shares of several financial services companies...            0.00
4     Morgan Stanley CEO James Gorman Says Markets B...            0.04
...                                                 ...             ...
3237  4Q Profit For Morgan Stanley (MS) After Huge L...           -0.49
3238  U.S Futures Slip Despite Optimism in the Econo...           -0.05
3239  Company News for January 20, 2010 - Corporate ...            0.00
3240  Top 5 Stocks To Focus On Today (MS, BAC, MTB, ...            0.02
3241                 Banks Paid A TARP Premium (GS, MS)            0.00

[3242 rows x 2 columns]


## c. MU dataset

In [7]:
# Example usage
if __name__ == '__main__':
    # Read the MU_text.csv file
    df = pd.read_csv('../dataset_final/Daily_Financial_News/MU/MU_text.csv')

    # Initialize the annotator
    annotator = SentimentAnnotator()

    # Annotate the DataFrame using the 'title' column
    annotated_df = annotator.annotate_dataframe(df, text_column='title', output_column='sentiment_score')

    # Display the results
    print(annotated_df[['title', 'sentiment_score']])

    # Optionally, save the annotated DataFrame to a new CSV file
    annotated_df.to_csv('../dataset_final/Daily_Financial_News/MU/MU_text_annotated.csv', index=False)

Using Apple's MPS backend.


Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


                                                  title sentiment_score
0     Hearing Wedbush Downgrades Micron On Valuation...           -0.06
1     Many Smartphones Expected to Come with 256GB o...            0.10
2     Shares of several companies in the broader tec...            0.59
3     Shares of several technology companies are tra...            0.62
4     Micron Launches Robot Design Challenge to Acce...            0.01
...                                                 ...             ...
3139  CEOWORLD Most Actives Technology Stocks by vol...            0.00
3140  CEOWORLD Technology Stocks Watch on 4/27/11 (T...            0.03
3141  Hot Stocks to Buy on 4/25/11 and April 26, 201...            0.01
3142                   Earnings Preview: Sandisk (SNDK)            0.01
3143  Benzinga's Top ETF Decliners, April 20th (SOXS...            0.01

[3144 rows x 2 columns]


## d. QQQ dataset

In [8]:
# Example usage
if __name__ == '__main__':
    # Read the QQQ_text.csv file
    df = pd.read_csv('../dataset_final/Daily_Financial_News/QQQ/QQQ_text.csv')

    # Initialize the annotator
    annotator = SentimentAnnotator()

    # Annotate the DataFrame using the 'title' column
    annotated_df = annotator.annotate_dataframe(df, text_column='title', output_column='sentiment_score')

    # Display the results
    print(annotated_df[['title', 'sentiment_score']])

    # Optionally, save the annotated DataFrame to a new CSV file
    annotated_df.to_csv('../dataset_final/Daily_Financial_News/QQQ/QQQ_text_annotated.csv', index=False)

Using Apple's MPS backend.


Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


                                                  title sentiment_score
0                   Afternoon Market Stats in 5 Minutes            0.01
1                     Morning Market Stats in 5 Minutes            0.01
2                   Afternoon Market Stats in 5 Minutes            0.01
3     Cramer: NASDAQ Rebound Makes Sense As It Bette...            0.59
4                     Morning Market Stats in 5 Minutes            0.01
...                                                 ...             ...
3095  Stocks Grind Higher On Wednesday (SPY, QQQQ, G...            0.04
3096    Doug Kass Shorting Broader ETFs (IWM, SPY, QQQ)           -0.01
3097    Reminder: QQQQ Changes To QQQ Today (QQQQ, QQQ)            0.00
3098  PowerShares QQQ Ticker to Change from ‘QQQQ' t...            0.00
3099  ETFs To Watch March 17, 2011 (CZI, EZJ, TMF, VNM)            0.00

[3100 rows x 2 columns]


## e. NVDA dataset

In [9]:
# Example usage
if __name__ == '__main__':
    # Read the NVDA_text.csv file
    df = pd.read_csv('../dataset_final/Daily_Financial_News/NVDA/NVDA_text.csv')

    # Initialize the annotator
    annotator = SentimentAnnotator()

    # Annotate the DataFrame using the 'title' column
    annotated_df = annotator.annotate_dataframe(df, text_column='title', output_column='sentiment_score')

    # Display the results
    print(annotated_df[['title', 'sentiment_score']])

    # Optionally, save the annotated DataFrame to a new CSV file
    annotated_df.to_csv('../dataset_final/Daily_Financial_News/NVDA/NVDA_text_annotated.csv', index=False)

Using Apple's MPS backend.


Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


                                                  title sentiment_score
0     Shares of several technology companies are tra...            0.04
1                   Afternoon Market Stats in 5 Minutes            0.01
2                     Morning Market Stats in 5 Minutes            0.01
3     Shares of several technology companies are tra...            0.37
4                   Afternoon Market Stats in 5 Minutes            0.01
...                                                 ...             ...
3128  J.P. Morgan Upgrades NVIDIA Corporation To Neu...            0.01
3129       JP Morgan Upgrades NVIDIA To Neutral, $21 PT            0.01
3130  Goldman Sachs Gives Color On Semiconductors (N...            0.01
3131  Auriga Still Not Sure Where Reality Lies For N...           -0.29
3132                        Nvidia Goes Negative (NVDA)           -0.04

[3133 rows x 2 columns]


# FinSen Dataset

In [10]:
# Example usage
if __name__ == '__main__':
    # Read the FinSen_text.csv file
    df = pd.read_csv('../dataset_final/FinSen_S&P500/FinSen_text.csv')

    # Initialize the annotator
    annotator = SentimentAnnotator()

    # Annotate the DataFrame using the 'Content' column
    annotated_df = annotator.annotate_dataframe(df, text_column='Content', output_column='sentiment_score')

    # Display the results
    print(annotated_df[['Content', 'sentiment_score']])

    # Optionally, save the annotated DataFrame to a new CSV file
    annotated_df.to_csv('../dataset_final/FinSen_S&P500/FinSen_text_annotated.csv', index=False)

Using Apple's MPS backend.


Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


                                                 Content sentiment_score
0      TSX Slightly Down, Books Weekly GainsUnited St...            0.13
1      UnitedHealth Hits 4-week HighUnited States sto...            0.62
2      Cisco Systems Hits 4-week LowUnited States sto...           -0.43
3      AT&T Hits All-time LowUnited States stocksAT&T...           -0.28
4      Microsoft Hits 4-week HighUnited States stocks...            0.60
...                                                  ...             ...
15529  United States GDP Rises 0.6 percent in the fir...            0.02
15530  Consumer Price Index 2.6 percent higher than i...            0.00
15531  U.S. Federal Reserve Kept Rates Unchanged at 5...           -0.01
15532  Trade Deficit Increases in March 2007United St...            0.00
15533  Blackstone boosts IPO after Beijing takes $3bn...            0.03

[15534 rows x 2 columns]
