In [16]:
!pip install emoji wordsegment


Collecting wordsegment
  Downloading wordsegment-1.3.1-py2.py3-none-any.whl.metadata (7.7 kB)
Downloading wordsegment-1.3.1-py2.py3-none-any.whl (4.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.8/4.8 MB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0mm
[?25hInstalling collected packages: wordsegment
Successfully installed wordsegment-1.3.1


In [18]:
"""
# Sentiment Analysis for Financial Social Media

## 1. Introduction
- Goal: Compare sentiment models (VADER, FinBERT, DeBERTa) on financial social media data.
- Data Sources: Reddit, Twitter, Financial News.
- Focus: Preprocessing, Model Benchmarking, Sarcasm Detection.
"""

import re
import emoji
from wordsegment import load, segment

In [20]:
load()  # Load segmentation for hashtags

In [22]:
# Example Preprocessing Function
def preprocess_text(text):
    text = emoji.demojize(text)
    text = re.sub(r'\$\w+', '<TICKER>', text)
    text = re.sub(r'#(\w+)', lambda m: ' '.join(segment(m.group(1))), text)
    text = re.sub(r'http\S+', '', text)  # remove URLs
    text = re.sub(r'@[A-Za-z0-9_]+', '', text)  # remove mentions
    return text

# Example usage
example_text = "I love $TSLA 🚀🚀🚀 #BullMarket"
print(preprocess_text(example_text))

I love <TICKER> :rocket::rocket::rocket: bull market


In [96]:
# VADER Example
from nltk.sentiment import SentimentIntensityAnalyzer
import nltk

nltk.download('vader_lexicon')
sia = SentimentIntensityAnalyzer()

text_vader = "Tesla stock is going to the moon! 🚀"
vader_result = sia.polarity_scores(text_vader)
vader_sentiment = vader_result['compound']
print("VADER Sentiment:", vader_result)

VADER Sentiment: {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/rickliu/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [100]:
# FinBERT Example
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")

inputs = tokenizer(text_vader, return_tensors="pt", padding=True, truncation=True)
outputs = model(**inputs)
probs = torch.nn.functional.softmax(outputs.logits, dim=-1)

finbert_labels = ["Negative", "Neutral", "Positive"]
finbert_pred = finbert_labels[torch.argmax(probs)]
print("FinBERT Sentiment:", finbert_pred, probs)



FinBERT Sentiment: Positive tensor([[0.0558, 0.0373, 0.9068]], grad_fn=<SoftmaxBackward0>)


In [60]:
!pip install sentencepiece protobuf
!pip install --upgrade tokenizers

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting tokenizers
  Downloading tokenizers-0.21.2-cp39-abi3-macosx_11_0_arm64.whl.metadata (6.8 kB)
Downloading tokenizers-0.21.2-cp39-abi3-macosx_11_0_arm64.whl (2.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.7/2.7 MB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0mm
[?25hInstalling collected packages: tokenizers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.19.1
    Uninstalling tokenizers-0.19.1:
      Successfully uninstalled tokenizers-0.19.1
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
transformers 4.44.2 requires tokenizers<0.20,>=0.19, but you have tokenizers 0.21.2 which is incompatible.[0m[31m
[0mSuccessfully installed tokenizers-0.21.2


In [102]:
# DeBERTa Example (Updated to v1 base)
tokenizer_deberta = AutoTokenizer.from_pretrained("microsoft/deberta-base")
model_deberta = AutoModelForSequenceClassification.from_pretrained("microsoft/deberta-base", num_labels=3)

inputs_deberta = tokenizer_deberta(text_vader, return_tensors="pt")
outputs_deberta = model_deberta(**inputs_deberta)
probs_deberta = torch.nn.functional.softmax(outputs_deberta.logits, dim=-1)

deberta_labels = ["Negative", "Neutral", "Positive"]
deberta_pred = deberta_labels[torch.argmax(probs_deberta)]
print("DeBERTa Sentiment:", deberta_pred, probs_deberta)

Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DeBERTa Sentiment: Negative tensor([[0.3486, 0.3415, 0.3099]], grad_fn=<SoftmaxBackward0>)


In [106]:
comparison_data = {
    "Model": ["VADER", "FinBERT", "DeBERTa"],
    "Sentiment": [f"Compound: {vader_sentiment:.3f}", finbert_pred, deberta_pred],
    "Probabilities": [
        str(vader_result),
        [round(float(p), 3) for p in probs.detach().numpy().flatten()],
        [round(float(p), 3) for p in probs_deberta.detach().numpy().flatten()]
    ]
}

comparison_df = pd.DataFrame(comparison_data)
from IPython.display import display

display(comparison_df)

Unnamed: 0,Model,Sentiment,Probabilities
0,VADER,Compound: 0.000,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound..."
1,FinBERT,Positive,"[0.056, 0.037, 0.907]"
2,DeBERTa,Negative,"[0.349, 0.341, 0.31]"


In [112]:
from transformers import pipeline

# Use sentiment analysis as placeholder since no public sarcasm model is available
sarcasm_detector = pipeline("sentiment-analysis")

text_sarcasm = "Oh great, another fantastic earnings miss. Just what we needed."
sarcasm_result = sarcasm_detector(text_sarcasm)
print("Sarcasm Detection (placeholder sentiment):", sarcasm_result)

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


Sarcasm Detection (placeholder sentiment): [{'label': 'POSITIVE', 'score': 0.9996737241744995}]
