In [1]:
import pandas as pd

In [2]:
aapl_df = pd.read_csv("data/formatted_csv/AAPL_merged.csv")
amzn_df = pd.read_csv("data/formatted_csv/AMZN_merged.csv")
msft_df = pd.read_csv("data/formatted_csv/MSFT_merged.csv")
tsla_df = pd.read_csv("data/formatted_csv/TSLA_merged.csv")

df = pd.concat([aapl_df, amzn_df])
df = pd.concat([df, msft_df])
df = pd.concat([df, tsla_df])

df.head()

Unnamed: 0,date,open,close,movement,headline,label,label_num
0,2023-01-03,128.613993,123.470619,-5.143374,markets close mostly lower again rost panw rep...,negative,-1
1,2023-01-03,128.613993,123.470619,-5.143374,gap plummets on earnings miss cuts fullyear fo...,negative,-1
2,2023-01-03,128.613993,123.470619,-5.143374,billionaire ken fisher is selling these 10 stocks,negative,-1
3,2023-01-03,128.613993,123.470619,-5.143374,corning net income drops 13 shares fall,negative,-1
4,2023-01-03,128.613993,123.470619,-5.143374,internet explorer shutdown to cause japan prob...,negative,-1


In [3]:
df.shape

(104000, 7)

In [4]:
df.columns

Index(['date', 'open', 'close', 'movement', 'headline', 'label', 'label_num'], dtype='object')

In [5]:
df.to_dict("records")[:2]

[{'date': '2023-01-03',
  'open': 128.61399342864948,
  'close': 123.47061920166016,
  'movement': -5.143374226989323,
  'headline': 'markets close mostly lower again rost panw report earnings',
  'label': 'negative',
  'label_num': -1},
 {'date': '2023-01-03',
  'open': 128.61399342864948,
  'close': 123.47061920166016,
  'movement': -5.143374226989323,
  'headline': 'gap plummets on earnings miss cuts fullyear forecast',
  'label': 'negative',
  'label_num': -1}]

In [6]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/varshinibalaji/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [7]:
from tqdm import tqdm

vader = SentimentIntensityAnalyzer()

def map_sentiment(score, label):
    # score ranges [-1, 1]
    if label == "negative":
        # scaling -ve scores to [-5, -1]
        return -5 + 4 * (score + 1) / 1 
    else:
        return 1 + 4 * score / 1  # scaling score to 1 → 5

records = df.to_dict("records")
results = []

for record in tqdm(records):
    text = record["headline"]
    label = record["label"]
    
    # vader sentiment analysis
    sentiment_scores = vader.polarity_scores(text)
    compound = sentiment_scores["compound"]
    
    # scale vader score to -5 to -1 and 1 to 5
    sentiment_score = map_sentiment(compound, label)
    
    results.append({"headline": text, "vader_compound": compound, "custom_score": sentiment_score})

100%|██████████| 104000/104000 [00:05<00:00, 19301.95it/s]


In [8]:
results_vader_df = pd.DataFrame(results)

In [9]:
results_vader_df['custom_score'].describe()

count    104000.000000
mean          1.133785
std           1.609657
min          -4.533600
25%           1.000000
50%           1.000000
75%           2.272800
max           4.851200
Name: custom_score, dtype: float64

Using BERT to perform sentiment classification

In [10]:
import transformers
from transformers import BertModel, BertTokenizer, get_linear_schedule_with_warmup
from torch.optim import AdamW
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader


  from .autonotebook import tqdm as notebook_tqdm


In [11]:
# Set the model name
MODEL_NAME = 'bert-base-cased'

# Build a BERT based tokenizer
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

In [13]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the data
def tokenize_function(text):
    return tokenizer(text, padding='max_length', truncation=True, max_length=128)

data = results_vader_df
data['tokenized'] = data['headline'].apply(tokenize_function)

Using `https://huggingface.co/nlptown/bert-base-multilingual-uncased-sentiment` for sentiment scoring - which is finetuned on thousands of product review to predict sentiment - with labels

In [15]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import torch.nn.functional as F

model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

def get_sentiment_score(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True)
    outputs = model(**inputs)
    probs = F.softmax(outputs.logits, dim=1)
    stars = torch.arange(1, 6)  # 1 to 5 stars
    rating = torch.sum(probs * stars, dim=1).item()  # Expected value
    # Map 1–5 stars to -5 to +5
    # return (rating - 3) * 2.5  # e.g., 1→-5, 3→0, 5→+5
    return rating


# Test
text = "this is the worst thing in the world"
score = get_sentiment_score(text)
print(f"Score: {score:.2f}")

Score: 1.05


In [21]:
def scale_score(score, label):
    if label == "negative":
        # Map [1, 5] to [-5, -1]
        return -5 + (score - 1) * 2
    else:
        # Map [1, 5] to [1, 5]
        return -1 + score * 2


text = "i love this, this is the best thing ever"
score = get_sentiment_score(text)
print(f"Score: {score:.2f}")
scale_score(score, "negative")

Score: 4.96


2.910137176513672

In [26]:
results_vader_df.drop_duplicates("headline", inplace = True)

In [134]:
from tqdm import tqdm
import polars as pl

results_pdf = pl.from_pandas(results_df[['headline']])

# Config
CHECKPOINT_EVERY = 1000
OUT_DIR = "results2"

# Convert to list
headlines = results_pdf["headline"].to_list()
scores = []

# Process row-by-row with progress
for i, h in enumerate(tqdm(headlines, desc="Scoring headlines")):
    scores.append(get_sentiment_score(h))

    # Every N rows: checkpoint
    if (i + 1) % CHECKPOINT_EVERY == 0 or (i + 1) == len(headlines):
        partial_df = results_pdf[:i+1].with_columns(pl.Series("score", scores))
        partial_df.write_parquet(f"{OUT_DIR}/sentiment_progress_up_to_row_{i+1}.parquet")

Scoring headlines: 100%|██████████| 25936/25936 [35:05<00:00, 12.32it/s]   


In [None]:
from tqdm import tqdm
import polars as pl

results_pdf = pl.from_pandas(results_df[['headline']])

# Config
CHECKPOINT_EVERY = 1000
OUT_DIR = "results2"

# Convert to list
headlines = results_pdf["headline"].to_list()
scores = []

# Process row-by-row with progress
for i, h in enumerate(tqdm(headlines, desc="Scoring headlines")):
    scores.append(get_sentiment_score(h))

    # Every N rows: checkpoint
    if (i + 1) % CHECKPOINT_EVERY == 0 or (i + 1) == len(headlines):
        partial_df = results_pdf[:i+1].with_columns(pl.Series("score", scores))
        partial_df.write_parquet(f"{OUT_DIR}/sentiment_progress_up_to_row_{i+1}.parquet")

Scoring headlines: 100%|██████████| 104000/104000 [3:53:01<00:00,  7.44it/s]


In [24]:
partial_df = pd.read_parquet(f"results2/sentiment_progress_up_to_row_25936.parquet")
partial_df.columns = ['headline', 'bert_score']

In [29]:
results_vader_df.columns = ['headline', 'vader_compound', 'vader_score', 'tokenized']

In [31]:
results_merge = partial_df.merge(results_vader_df, on = "headline", how = "left")

In [44]:
results_merge_df = df.drop_duplicates("headline")[['headline', 'label']].merge(results_merge, on = "headline", how = "inner")

In [40]:
results_merge_df

Unnamed: 0,headline,label,bert_score,vader_compound,vader_score,tokenized
0,markets close mostly lower again rost panw rep...,negative,2.040362,-0.2960,-2.1840,"[input_ids, token_type_ids, attention_mask]"
1,gap plummets on earnings miss cuts fullyear fo...,negative,2.325796,-0.4215,-2.6860,"[input_ids, token_type_ids, attention_mask]"
2,billionaire ken fisher is selling these 10 stocks,negative,4.089379,0.0000,-1.0000,"[input_ids, token_type_ids, attention_mask]"
3,corning net income drops 13 shares fall,negative,1.589967,0.2960,0.1840,"[input_ids, token_type_ids, attention_mask]"
4,internet explorer shutdown to cause japan prob...,negative,1.612979,-0.4019,-2.6076,"[input_ids, token_type_ids, attention_mask]"
...,...,...,...,...,...,...
25931,boston scientific bsx q1 earnings top 2022 rev...,positive,4.651347,0.2023,1.8092,"[input_ids, token_type_ids, attention_mask]"
25932,producer sentiment improves with strengthened ...,positive,2.892199,-0.2263,0.0948,"[input_ids, token_type_ids, attention_mask]"
25933,equinix declares quarterly dividend on its com...,positive,2.964839,0.0000,1.0000,"[input_ids, token_type_ids, attention_mask]"
25934,featurelove island dumps fast fashion for seco...,positive,3.219742,-0.4019,-0.6076,"[input_ids, token_type_ids, attention_mask]"


In [45]:
def scale_score(score, label):
    if label == "negative":
        # Map [1, 5] → [-5, -1]
        return -5 + 4 * (score - 1) / 4
    else:
        # Keep [1, 5] as is
        return score

results_merge_df['bert_score_scaled'] = results_merge_df.apply(lambda x: scale_score(x['bert_score'], x['label']), axis = 1)

In [46]:
results_merge_df['bert_score_scaled'].describe()

count    25936.000000
mean         2.266206
std          2.867533
min         -4.884063
25%          2.146257
50%          3.447841
75%          4.144660
max          4.978554
Name: bert_score_scaled, dtype: float64

In [47]:
results_merge_df.groupby('label', dropna = False).size()

label
negative     4802
neutral      8725
positive    12409
dtype: int64

In [51]:
results_merge_df.columns

Index(['headline', 'label', 'bert_score', 'vader_compound', 'vader_score',
       'tokenized', 'bert_score_scaled'],
      dtype='object')

In [54]:
import polars as pl

final_df = results_merge_df[['headline', 'label', 'vader_score','bert_score_scaled']]
final_df.columns = ['headline', 'label', 'vader_score','bert_score']
results_merged_pdf = pl.from_pandas(final_df)
results_merged_pdf

headline,label,vader_score,bert_score
str,str,f64,f64
"""markets close mostly lower aga…","""negative""",-2.184,-3.959638
"""gap plummets on earnings miss …","""negative""",-2.686,-3.674204
"""billionaire ken fisher is sell…","""negative""",-1.0,-1.910621
"""corning net income drops 13 sh…","""negative""",0.184,-4.410033
"""internet explorer shutdown to …","""negative""",-2.6076,-4.387021
…,…,…,…
"""boston scientific bsx q1 earni…","""positive""",1.8092,4.651347
"""producer sentiment improves wi…","""positive""",0.0948,2.892199
"""equinix declares quarterly div…","""positive""",1.0,2.964839
"""featurelove island dumps fast …","""positive""",-0.6076,3.219742


#### comparing vader score and bert score

In [60]:
df2 = results_merged_pdf.with_columns(
    (pl.col("bert_score") > 0).alias("positive_score")
)
print(df2.columns)  # now includes “positive_score”


grouped_df_bert = (
    df2
    .group_by(["label", "positive_score"])
    .agg([
        pl.count().alias("count"),
        pl.col("bert_score").mean().alias("avg_score")
    ])
)

['headline', 'label', 'vader_score', 'bert_score', 'positive_score']


(Deprecated in version 0.20.5)
  pl.count().alias("count"),


In [58]:
df3 = results_merged_pdf.with_columns(
    (pl.col("vader_score") > 0).alias("positive_score")
)
print(df2.columns)  # now includes “positive_score”


grouped_df_vader = (
    df2
    .group_by(["label", "positive_score"])
    .agg([
        pl.count().alias("count"),
        pl.col("vader_score").mean().alias("avg_score")
    ])
)

['headline', 'label', 'vader_score', 'bert_score', 'positive_score']


(Deprecated in version 0.20.5)
  pl.count().alias("count"),


In [61]:
grouped_df_bert

label,positive_score,count,avg_score
str,bool,u32,f64
"""positive""",True,12409,3.485661
"""neutral""",True,8725,3.693619
"""negative""",False,4802,-3.478568


In [62]:
grouped_df_vader

label,positive_score,count,avg_score
str,bool,u32,f64
"""positive""",True,12409,1.655822
"""negative""",False,4802,-1.188264
"""neutral""",True,8725,1.665412
