In [1]:
from datasets import load_dataset
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from nltk.stem import WordNetLemmatizer
import re
import emoji
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

In [2]:
# Load the dataset
dataset = load_dataset('zeroshot/twitter-financial-news-sentiment')

In [3]:
train_dataset = dataset['train']
test_dataset = dataset['validation']

In [4]:
# Download necessary NLTK data files
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

# Define stop words and lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Define a set of financial slang terms and their replacements
financial_slang = {
    'bullish': 'positive',
    'bearish': 'negative',
    'moon': 'high',
    'bagholder': 'investor',
    'whale': 'large investor',
    # Add more financial slang as needed
}

def preprocess(text):
    # Remove URLs, mentions, and hashtags
    text = re.sub(r"http\S+|www\S+|https\S+|@\S+|#\S+", '', text, flags=re.MULTILINE)
    
    # Replace financial slang
    for term, replacement in financial_slang.items():
        text = re.sub(r'\b' + term + r'\b', replacement, text)
    
    # Tokenization with TweetTokenizer to handle emojis
    tokenizer = TweetTokenizer()
    tokens = tokenizer.tokenize(text)
    
    # Replace emojis with text
    tokens = [emoji.demojize(token) for token in tokens]
    
    # Lowercasing and removing stopwords
    tokens = [word.lower() for word in tokens if word.lower() not in stop_words and word.isalpha()]
    
    # Lemmatization
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return ' '.join(tokens)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sardorbek/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/sardorbek/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/sardorbek/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/sardorbek/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [5]:
# Apply preprocessing
train_dataset = train_dataset.map(lambda x: {'text': preprocess(x['text'])})
test_dataset = test_dataset.map(lambda x: {'text': preprocess(x['text'])})

In [6]:
train_dataset[0]

{'text': 'bynd jpmorgan reel expectation beyond meat', 'label': 0}

In [7]:
test_dataset[0]

{'text': 'ally ally financial pull outlook', 'label': 0}

In [8]:
# Extract features using TF-IDF
vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(train_dataset['text'])
X_test = vectorizer.transform(test_dataset['text'])

y_train = train_dataset['label']
y_test = test_dataset['label']

In [9]:
# Naive Bayes
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train, y_train)
nb_predictions = nb_classifier.predict(X_test)

# Logistic Regression
lr_classifier = LogisticRegression(max_iter=1000)
lr_classifier.fit(X_train, y_train)
lr_predictions = lr_classifier.predict(X_test)

In [10]:

# Naive Bayes Evaluation
nb_acc = accuracy_score(y_test, nb_predictions)
nb_prec = precision_score(y_test, nb_predictions, average='macro')
nb_rec = recall_score(y_test, nb_predictions, average='macro')
nb_f1 = f1_score(y_test, nb_predictions, average='macro')

print(f'Naive Bayes - Accuracy: {nb_acc}, Precision: {nb_prec}, Recall: {nb_rec}, F1 Score: {nb_f1}')
print(classification_report(y_test, nb_predictions))

# Logistic Regression Evaluation
lr_acc = accuracy_score(y_test, lr_predictions)
lr_prec = precision_score(y_test, lr_predictions, average='macro')
lr_rec = recall_score(y_test, lr_predictions, average='macro')
lr_f1 = f1_score(y_test, lr_predictions, average='macro')

print(f'Logistic Regression - Accuracy: {lr_acc}, Precision: {lr_prec}, Recall: {lr_rec}, F1 Score: {lr_f1}')
print(classification_report(y_test, lr_predictions))

Naive Bayes - Accuracy: 0.7579564489112228, Precision: 0.7994742889587583, Recall: 0.5398218797921374, F1 Score: 0.5801131885563271
              precision    recall  f1-score   support

           0       0.88      0.21      0.34       347
           1       0.77      0.43      0.55       475
           2       0.75      0.98      0.85      1566

    accuracy                           0.76      2388
   macro avg       0.80      0.54      0.58      2388
weighted avg       0.77      0.76      0.72      2388

Logistic Regression - Accuracy: 0.7918760469011725, Precision: 0.7809560273587314, Recall: 0.6303712988936433, F1 Score: 0.6762636597708379
              precision    recall  f1-score   support

           0       0.77      0.40      0.52       347
           1       0.78      0.54      0.64       475
           2       0.80      0.96      0.87      1566

    accuracy                           0.79      2388
   macro avg       0.78      0.63      0.68      2388
weighted avg       0.

## Method Implementation and Rationale

I decided to implement two different classifiers: Naive Bayes and Logistic Regression.

These methods were chosen based on their effectiveness in handling text data and their straightforward imple- mentation. Naive Bayes: This classifier is based on Bayes’ theorem with the assumption of independence between features. It is particularly effective for text classification due to its simplicity and efficiency in high-dimensional spaces (Prabha et al., 2022). Logistic Regression: This model is a linear classifier that predicts the probability of a class label based on the logistic function. It is well-suited for binary and multiclass classification problems and can handle large feature spaces (Wiley & Pace, 2015).
Strengths:
• Naive Bayes: Efficient, requires minimal training data, and performs well with high-dimensional data (Prabha et al., 2022).
• Logistic Regression: Simple, interpretable, and can capture non-linear relationships with appropri- ate feature engineering (Wiley & Pace, 2015).
Limitations:
• Naive Bayes: The independence assumption rarely holds true for text data, which can limit its effectiveness (Prabha et al., 2022).
• Logistic Regression: Assumes a linear relationship between features and the log-odds of the out- come, which may not always be appropriate (Wiley & Pace, 2015).
To prepare the text data for classification, I implemented several preprocessing steps to clean and nor- malize the text:
1. Removing Noise: URLs, mentions, and hashtags were removed to eliminate irrelevant information.
3
2. Replacing Financial Slang: Specific financial terms were replaced with more general terms to standardize the language.
3. Tokenization: We used TweetTokenizer to handle Twitter-specific formatting and emojis.
4. Replacing Emojis with Text: Emojis were converted to text descriptions to retain their sentiment information.
5. Lowercasing and Removing Stopwords: Standard text preprocessing steps to normalize the text.
6. Lemmatization: Converted words to their base forms to reduce dimensionality and handle different word forms.
Features Used:
TF-IDF Vectorization: This technique was used to transform the text data into numerical features. By setting the maximum number of features to 5000, we captured the most significant terms in the corpus. The chosen preprocessing steps and features aimed to handle the noisy and unstructured nature of tweets, especially those related to financial news. By normalizing the text and focusing on significant terms, we expected the models to perform better in capturing the sentiment of the tweets.

TF-IDF Vectorization: This technique was used to transform the text data into numerical features. By setting the maximum number of features to 5000, we captured the most significant terms in the corpus. The chosen preprocessing steps and features aimed to handle the noisy and unstructured nature of tweets, especially those related to financial news. By normalizing the text and focusing on significant terms, we expected the models to perform better in capturing the sentiment of the tweets.

## Implementation Details and Testing Procedure

The dataset was loaded from the Huggingface datasets repository and split into training and test sets. The training set was used to train the models, and the test set was used to evaluate their performance.

Implementation challenges:

Data Augmentation Challenges: The initial implementation included synonym replacement and back translation for data augmentation. However, these methods proved time-consuming and were ultimately removed to speed up the processing.

Translation API Issues: 
The use of the googletrans library for back translation caused errors due to changes in the Google Translate API, necessitating a switch to the deep-translator library. This change helped mitigate translation errors but still posed time constraints.

Performance Trade-offs: 

Finding a balance between processing time and performance improvements was challenging. Advanced data augmentation techniques were deemed too time-intensive given the constraints.

Model Performance: 

Handling class imbalance effectively remained a challenge, with Naive Bayes strug- gling more than Logistic Regression in this aspect.

## Evaluation, Interpretation, and Discussion of Results

The Naive Bayes classifier showed decent performance but struggled with class imbalance, particularly for the bearish class. The Logistic Regression classifier performed better overall, especially for the bullish and neutral classes. The higher recall and F1 score indicate better handling of imbalanced classes.

The sentiment classifier for Twitter Financial News was successfully implemented using Naive Bayes and Logistic Regression models. While both models showed reasonable performance, Logistic Regression outperformed Naive Bayes in terms of accuracy, precision, recall, and F1 score. Potential improvements were identified to further enhance the classifier’s performance, including advanced feature engineering, using transformer-based models, hyperparameter tuning, cross-validation, and thorough error analysis.

Possible Areas for Improvement:

Transformer-based Models: Transformer-based models such as BERT for sentiment analysis in financial texts can provide substantial improvements. These models leverage deep contextual understanding and have shown superior performance across various NLP tasks. Lengkeek and Frasincar (2023) highlight how hierarchical language models that use BERT can enhance aspect-based sentiment analysis, making them well-suited for the complex and context-sensitive nature of financial news sentiment classification. Additionally, the use of transformer-based architectures allows for capturing long-range dependencies and intricate sentiment cues that simpler models might miss.
Cross-Validation: Implementing k-fold cross-validation can ensure that the model’s performance is robust and generalizable across different subsets of the dataset. Aghbalou et al. (2022) discuss the effectiveness of k-fold cross-validation in preventing overfitting and providing a more reliable estimate of model per- formance. This method ensures that the model is trained and evaluated on multiple splits of the data, leading to a more comprehensive understanding of its strengths and weaknesses.
