In [4]:
import pandas as pd

In [5]:
aapl_df = pd.read_csv("data/formatted_csv/AAPL_merged.csv")
amzn_df = pd.read_csv("data/formatted_csv/AMZN_merged.csv")
msft_df = pd.read_csv("data/formatted_csv/MSFT_merged.csv")
tsla_df = pd.read_csv("data/formatted_csv/TSLA_merged.csv")

df = pd.concat([aapl_df, amzn_df])
df = pd.concat([df, msft_df])
df = pd.concat([df, tsla_df])

df.head()

Unnamed: 0,date,open,close,movement,headline,label,label_num
0,2023-01-03,128.613993,123.470619,-5.143374,markets close mostly lower again rost panw rep...,negative,-1
1,2023-01-03,128.613993,123.470619,-5.143374,gap plummets on earnings miss cuts fullyear fo...,negative,-1
2,2023-01-03,128.613993,123.470619,-5.143374,billionaire ken fisher is selling these 10 stocks,negative,-1
3,2023-01-03,128.613993,123.470619,-5.143374,corning net income drops 13 shares fall,negative,-1
4,2023-01-03,128.613993,123.470619,-5.143374,internet explorer shutdown to cause japan prob...,negative,-1


In [6]:
df.shape

(104000, 7)

In [7]:
df.columns

Index(['date', 'open', 'close', 'movement', 'headline', 'label', 'label_num'], dtype='object')

In [8]:
df.to_dict("records")[:2]

[{'date': '2023-01-03',
  'open': 128.61399342864948,
  'close': 123.47061920166016,
  'movement': -5.143374226989323,
  'headline': 'markets close mostly lower again rost panw report earnings',
  'label': 'negative',
  'label_num': -1},
 {'date': '2023-01-03',
  'open': 128.61399342864948,
  'close': 123.47061920166016,
  'movement': -5.143374226989323,
  'headline': 'gap plummets on earnings miss cuts fullyear forecast',
  'label': 'negative',
  'label_num': -1}]

In [9]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/varshinibalaji/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [10]:
from tqdm import tqdm

vader = SentimentIntensityAnalyzer()

def map_sentiment(score, label):
    # score ranges [-1, 1]
    if label == "negative":
        # scaling -ve scores to [-5, -1]
        return -5 + 4 * (score + 1) / 1 
    else:
        return 1 + 4 * score / 1  # scaling score to 1 → 5

records = df.to_dict("records")
results = []

for record in tqdm(records):
    text = record["headline"]
    label = record["label"]
    
    # vader sentiment analysis
    sentiment_scores = vader.polarity_scores(text)
    compound = sentiment_scores["compound"]
    
    # scale vader score to -5 to -1 and 1 to 5
    sentiment_score = map_sentiment(compound, label)
    
    results.append({"headline": text, "vader_compound": compound, "custom_score": sentiment_score})

100%|██████████| 104000/104000 [00:09<00:00, 11164.18it/s]


In [11]:
results_df = pd.DataFrame(results)

In [12]:
results_df['custom_score'].describe()

count    104000.000000
mean          1.133785
std           1.609657
min          -4.533600
25%           1.000000
50%           1.000000
75%           2.272800
max           4.851200
Name: custom_score, dtype: float64

Using BERT to perform sentiment classification

In [13]:
import transformers
from transformers import BertModel, BertTokenizer, get_linear_schedule_with_warmup
from torch.optim import AdamW
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader


In [14]:
# Set the model name
MODEL_NAME = 'bert-base-cased'

# Build a BERT based tokenizer
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

In [15]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the data
def tokenize_function(text):
    return tokenizer(text, padding='max_length', truncation=True, max_length=128)

data = results_df
data['tokenized'] = data['headline'].apply(tokenize_function)

Using `https://huggingface.co/nlptown/bert-base-multilingual-uncased-sentiment` for sentiment scoring - which is finetuned on thousands of product review to predict sentiment - with labels

In [130]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import torch.nn.functional as F

model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

def get_sentiment_score(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True)
    outputs = model(**inputs)
    probs = F.softmax(outputs.logits, dim=1)
    stars = torch.arange(1, 6)  # 1 to 5 stars
    rating = torch.sum(probs * stars, dim=1).item()  # Expected value
    # Map 1–5 stars to -5 to +5
    # return (rating - 3) * 2.5  # e.g., 1→-5, 3→0, 5→+5
    return rating


# Test
text = "this is the worst thing in the world"
score = get_sentiment_score(text)
print(f"Score: {score:.2f}")

Score: 1.05


In [133]:
results_df.drop_duplicates("headline", inplace = True)

In [134]:
from tqdm import tqdm
import polars as pl

results_pdf = pl.from_pandas(results_df[['headline']])

# Config
CHECKPOINT_EVERY = 1000
OUT_DIR = "results2"

# Convert to list
headlines = results_pdf["headline"].to_list()
scores = []

# Process row-by-row with progress
for i, h in enumerate(tqdm(headlines, desc="Scoring headlines")):
    scores.append(get_sentiment_score(h))

    # Every N rows: checkpoint
    if (i + 1) % CHECKPOINT_EVERY == 0 or (i + 1) == len(headlines):
        partial_df = results_pdf[:i+1].with_columns(pl.Series("score", scores))
        partial_df.write_parquet(f"{OUT_DIR}/sentiment_progress_up_to_row_{i+1}.parquet")

Scoring headlines: 100%|██████████| 25936/25936 [35:05<00:00, 12.32it/s]   


In [None]:
from tqdm import tqdm
import polars as pl

results_pdf = pl.from_pandas(results_df[['headline']])

# Config
CHECKPOINT_EVERY = 1000
OUT_DIR = "results2"

# Convert to list
headlines = results_pdf["headline"].to_list()
scores = []

# Process row-by-row with progress
for i, h in enumerate(tqdm(headlines, desc="Scoring headlines")):
    scores.append(get_sentiment_score(h))

    # Every N rows: checkpoint
    if (i + 1) % CHECKPOINT_EVERY == 0 or (i + 1) == len(headlines):
        partial_df = results_pdf[:i+1].with_columns(pl.Series("score", scores))
        partial_df.write_parquet(f"{OUT_DIR}/sentiment_progress_up_to_row_{i+1}.parquet")

Scoring headlines: 100%|██████████| 104000/104000 [3:53:01<00:00,  7.44it/s]


In [60]:
df.drop_duplicates(["headline", "label"], inplace=True)

In [61]:
results_pdf = pl.from_pandas(df)
results_pdf.shape

(25948, 7)

In [63]:
bert_score_df = partial_df.unique(subset=["headline"])
bert_score_df.shape

(25936, 2)

In [67]:
bert_score_df

headline,score
str,f64
"""is weakness in archerdanielsmi…",-2.910015
"""resonant capital advisors llc …",-1.584835
"""carnegie mellon university ele…",1.900832
"""iron mountain irm outpaces sto…",3.282338
"""initial claims drop to 52year …",-1.15844
…,…
"""apple aapl boosts content port…",3.528634
"""dte energy dte q1 earnings sur…",0.934064
"""ge completes engine test with …",3.619068
"""scoop up 3 trucking stocks wit…",3.345572


In [64]:
results_merged_pdf = results_pdf.join(bert_score_df, on="headline", how="inner")

In [68]:
results_merged_pdf

date,open,close,movement,headline,label,label_num,score
str,f64,f64,f64,str,str,i64,f64
"""2023-01-03""",128.613993,123.470619,-5.143374,"""markets close mostly lower aga…","""negative""",-1,-2.399095
"""2023-01-03""",128.613993,123.470619,-5.143374,"""gap plummets on earnings miss …","""negative""",-1,-1.685511
"""2023-01-03""",128.613993,123.470619,-5.143374,"""billionaire ken fisher is sell…","""negative""",-1,2.723448
"""2023-01-03""",128.613993,123.470619,-5.143374,"""corning net income drops 13 sh…","""negative""",-1,-3.525082
"""2023-01-03""",128.613993,123.470619,-5.143374,"""internet explorer shutdown to …","""negative""",-1,-3.467552
…,…,…,…,…,…,…,…
"""2024-05-28""",190.618535,189.105621,-1.512914,"""boston scientific bsx q1 earni…","""positive""",1,4.128368
"""2024-05-28""",190.618535,189.105621,-1.512914,"""producer sentiment improves wi…","""positive""",1,-0.269502
"""2024-05-28""",190.618535,189.105621,-1.512914,"""equinix declares quarterly div…","""positive""",1,-0.087903
"""2024-05-28""",190.618535,189.105621,-1.512914,"""featurelove island dumps fast …","""positive""",1,0.549356


In [102]:
pd.DataFrame(results_merged_pdf).groupby(5).size()

5
negative     4802
neutral      8725
positive    12421
dtype: int64

In [84]:
df2 = results_merged_pdf.with_columns(
    (pl.col("score") > 0).alias("positive_score")
)
print(df2.columns)  # now includes “positive_score”


grouped_df = (
    df2
    .group_by(["label", "positive_score"])
    .agg([
        pl.count().alias("count"),
        pl.col("score").mean().alias("avg_score")
    ])
)

['date', 'open', 'close', 'movement', 'headline', 'label', 'label_num', 'score', 'positive_score']


(Deprecated in version 0.20.5)
  pl.count().alias("count"),


In [97]:
grouped_df

label,positive_score,count,avg_score
str,bool,u32,f64
"""positive""",True,8905,2.31511
"""negative""",False,3601,-2.20482
"""negative""",True,1201,1.827105
"""neutral""",False,1991,-1.575204
"""neutral""",True,6734,2.712473
"""positive""",False,3516,-1.572661


In [108]:
results_df.columns = ['headline', 'vader_compound', 'vader_score', 'tokenized']

vader_df = results_df.copy()

vader_df.head()

Unnamed: 0,headline,vader_compound,vader_score,tokenized
0,markets close mostly lower again rost panw rep...,-0.296,-2.184,"[input_ids, token_type_ids, attention_mask]"
1,gap plummets on earnings miss cuts fullyear fo...,-0.4215,-2.686,"[input_ids, token_type_ids, attention_mask]"
2,billionaire ken fisher is selling these 10 stocks,0.0,-1.0,"[input_ids, token_type_ids, attention_mask]"
3,corning net income drops 13 shares fall,0.296,0.184,"[input_ids, token_type_ids, attention_mask]"
4,internet explorer shutdown to cause japan prob...,-0.4019,-2.6076,"[input_ids, token_type_ids, attention_mask]"


In [111]:
bert_score_df.columns = ['headline', 'bert_score']
bert_score_df.head()

headline,bert_score
str,f64
"""is weakness in archerdanielsmi…",-2.910015
"""resonant capital advisors llc …",-1.584835
"""carnegie mellon university ele…",1.900832
"""iron mountain irm outpaces sto…",3.282338
"""initial claims drop to 52year …",-1.15844


In [114]:
bert_score_pdf = bert_score_df

In [117]:
bert_score_df = pd.DataFrame(bert_score_df)
bert_score_df.columns = ['headline', 'bert_score']
bert_score_df.head()

Unnamed: 0,headline,bert_score
0,is weakness in archerdanielsmidland company ny...,-2.910015
1,resonant capital advisors llc buys marqeta inc...,-1.584835
2,carnegie mellon university elects ansys ceo aj...,1.900832
3,iron mountain irm outpaces stock market gains ...,3.282338
4,initial claims drop to 52year low top 5 staffi...,-1.15844


In [119]:
headline_sentiment_df = vader_df.merge(bert_score_df, on = "headline", how = "inner")

comparing vader score and bert score

In [121]:
print(f"{len(headline_sentiment_df.loc[((headline_sentiment_df['vader_score']>0)&(headline_sentiment_df['bert_score']>0))])}")
print(f"{len(headline_sentiment_df.loc[((headline_sentiment_df['vader_score']<0)&(headline_sentiment_df['bert_score']<0))])}")
print(f"{len(headline_sentiment_df.loc[((headline_sentiment_df['vader_score']>0)&(headline_sentiment_df['bert_score']<0))])}")
print(f"{len(headline_sentiment_df.loc[((headline_sentiment_df['vader_score']<0)&(headline_sentiment_df['bert_score']>0))])}")

62292
15212
20996
5108


In [124]:
df.head()

Unnamed: 0,date,open,close,movement,headline,label,label_num
0,2023-01-03,128.613993,123.470619,-5.143374,markets close mostly lower again rost panw rep...,negative,-1
1,2023-01-03,128.613993,123.470619,-5.143374,gap plummets on earnings miss cuts fullyear fo...,negative,-1
2,2023-01-03,128.613993,123.470619,-5.143374,billionaire ken fisher is selling these 10 stocks,negative,-1
3,2023-01-03,128.613993,123.470619,-5.143374,corning net income drops 13 shares fall,negative,-1
4,2023-01-03,128.613993,123.470619,-5.143374,internet explorer shutdown to cause japan prob...,negative,-1


In [127]:
headline_sentiment_label_df = headline_sentiment_df.merge(df[['headline', 'label']], on = "headline", how = "inner")
headline_sentiment_label_df.head()

Unnamed: 0,headline,vader_compound,vader_score,tokenized,bert_score,label
0,markets close mostly lower again rost panw rep...,-0.296,-2.184,"[input_ids, token_type_ids, attention_mask]",-2.399095,negative
1,gap plummets on earnings miss cuts fullyear fo...,-0.4215,-2.686,"[input_ids, token_type_ids, attention_mask]",-1.685511,negative
2,billionaire ken fisher is selling these 10 stocks,0.0,-1.0,"[input_ids, token_type_ids, attention_mask]",2.723448,negative
3,corning net income drops 13 shares fall,0.296,0.184,"[input_ids, token_type_ids, attention_mask]",-3.525082,negative
4,internet explorer shutdown to cause japan prob...,-0.4019,-2.6076,"[input_ids, token_type_ids, attention_mask]",-3.467552,negative


In [128]:
headline_sentiment_label_df.loc[((headline_sentiment_label_df['vader_score']>0)&(headline_sentiment_label_df['bert_score']<0))].groupby("label").size()


label
negative     1996
neutral      6568
positive    12440
dtype: int64

In [None]:
headline_sentiment_label_df.loc[((headline_sentiment_label_df['vader_score']<0)&(headline_sentiment_label_df['bert_score']>0))].groupby("label").size()

label
negative    3584
neutral      732
positive     792
dtype: int64