In [2]:
# General DS libs
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt

In [3]:
reddit_df = pd.read_csv("../data/raw_reddit_data.csv")
reddit_df.head()

Unnamed: 0,id,title,selftext,comments,score,created_utc,url,num_comments,flair,is_self
0,1jwz2d3,Bitcoin Newcomers FAQ - Please read!,# Welcome to the /r/Bitcoin Newcomers FAQ\n\nY...,Please feel free to make [constructive edits](...,95,1744402000.0,https://www.reddit.com/r/Bitcoin/comments/1jwz...,19,,True
1,1lpofh3,"Daily Discussion, July 02, 2025",Please utilize this sticky thread for all gene...,107k stablecoin || I'm currently thinking abou...,23,1751437000.0,https://www.reddit.com/r/Bitcoin/comments/1lpo...,18,,True
2,1lpsaad,"HISTORY: 11 years ago today, the U.S. governme...",,Did he hold on to them? || They piss away $3 b...,272,1751452000.0,https://i.redd.it/kpf0ix5rtfaf1.png,9,,False
3,1lp2u2t,This guy bought a 100 BTC gold Casascius bar b...,,That’s insane but it’s a win win for him. $500...,7556,1751379000.0,https://i.redd.it/uewmioq0u9af1.jpeg,460,,False
4,1lptdcf,They will never admit it,,25 cards deep in denial,95,1751456000.0,https://i.redd.it/n3odwchl4gaf1.jpeg,1,,False


# 1. Mandatory Features

## 1.1. Separation of comments

The approach here will be to create another table containing only the post ids & the list of comments associated with it

In [4]:
reddit_comment_df = reddit_df[["id", "comments"]].copy()
reddit_comment_df.head()

Unnamed: 0,id,comments
0,1jwz2d3,Please feel free to make [constructive edits](...
1,1lpofh3,107k stablecoin || I'm currently thinking abou...
2,1lpsaad,Did he hold on to them? || They piss away $3 b...
3,1lp2u2t,That’s insane but it’s a win win for him. $500...
4,1lptdcf,25 cards deep in denial


In [5]:
print(f'comments for post no. 0:\n{reddit_comment_df["comments"][0]}\n\n\n')
print(f'comments for post no. 1:\n{reddit_comment_df["comments"][1]}\n\n\n')
print(f'comments for post no. 2:\n{reddit_comment_df["comments"][2]}\n\n\n')

comments for post no. 0:
Please feel free to make [constructive edits](https://www.reddit.com/r/BitcoinWiki/wiki/rbitcoin_sticky) to the document, they will be implemented pending mod review.  Also if you have any beginner questions regarding bitcoin feel free to post them in the comments below, several community members are happy to help answer them.  Note that this thread will be moderated and non-constructive feedback will be removed.  Thanks and welcome to Bitcoin!



comments for post no. 1:
107k stablecoin || I'm currently thinking about moving my old workplace pension, probably around 50% of my total pensions into MSTR. I'm in my early 30s, so this represents about 10 years of workplace contributions.

If I could invest directly into Bitcoin, I wouldn't think twice, but since I'm in the UK, MSTR is the only real Bitcoin exposure I can get within a SIPP.

That said, I'm aware we're quite far along in the current cycle (assuming cycles continue to repeat). MSTR isn't true Bitcoin,

Comments are currently separated by the symbol " || ". It's likely better if it's stored in json dict format instead, but I'll manage with this for now

In [6]:
# Drop columns where comments is NaN
reddit_comment_df = reddit_comment_df.dropna()
reddit_comment_df.reset_index(inplace=True, drop=True)
reddit_comment_df

Unnamed: 0,id,comments
0,1jwz2d3,Please feel free to make [constructive edits](...
1,1lpofh3,107k stablecoin || I'm currently thinking abou...
2,1lpsaad,Did he hold on to them? || They piss away $3 b...
3,1lp2u2t,That’s insane but it’s a win win for him. $500...
4,1lptdcf,25 cards deep in denial
5,1lpqp6p,astrology for men || pretty safe to say nobody...
6,1lphta8,17% is crazy dude \n\nThe time to take out a l...
7,1lps839,When you give anything that can possibly sligh...
8,1lpo231,The visual representation nobody needed lmfao ...
9,1lpisij,"Yes, but make it a regular thing. || It depend..."


In [7]:
reddit_comment_df["comments"] = reddit_comment_df["comments"].apply(lambda x: x.split(' || '))
reddit_comment_df.head()

Unnamed: 0,id,comments
0,1jwz2d3,[Please feel free to make [constructive edits]...
1,1lpofh3,"[107k stablecoin, I'm currently thinking about..."
2,1lpsaad,"[Did he hold on to them?, They piss away $3 bi..."
3,1lp2u2t,[That’s insane but it’s a win win for him. $50...
4,1lptdcf,[25 cards deep in denial]


In [8]:
reddit_comment_df = reddit_comment_df.explode("comments")
reddit_comment_df.reset_index(drop=True, inplace=True)

In [9]:
reddit_comment_df.head(30)

Unnamed: 0,id,comments
0,1jwz2d3,Please feel free to make [constructive edits](...
1,1lpofh3,107k stablecoin
2,1lpofh3,I'm currently thinking about moving my old wor...
3,1lpofh3,"Just a reminder, everything is going to zero a..."
4,1lpofh3,"Self perpetuating trend right now, when will i..."
5,1lpofh3,Now that's a v-shaped recovery
6,1lpofh3,Bought some of those cheap sats
7,1lpofh3,🚨 **BREAKING NEWS EVERYONE** 🚨\n\n‼️ **1 BTC i...
8,1lpofh3,"Could be a big day, seller exhaustion."
9,1lpofh3,Another good start before the burgers dump


In [10]:
reddit_comment_df["comments"][7]

'🚨 **BREAKING NEWS EVERYONE** 🚨\n\n‼️ **1 BTC is still = 1 BTC!!** ‼️\n\nThis important announcement was just released today! What will you do with this new information?! 🤔⏳'

## 1.2. Data Cleaning

In [11]:
import re
import contractions
import emoji

def clean_text(text):
    """# General data cleaning function for subreddit texts, including post's title, body and comments"""
    # If text is NaN, leave it be
    if pd.isna(text):
        return text

    # Replace multiple whitespaces with just one
    text = re.sub(r"\s+", " ", text)

    # Convert all emojis to textual representation
    text = emoji.demojize(text)

    # Replace URLs with tag <URL>
    text = re.sub(r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+", 
                  "<URL>", text)
    
	# Expand contractions in the text
    text = contractions.fix(text)

    return text

In [12]:
reddit_df["cleaned_title"] = reddit_df["title"].apply(clean_text)
reddit_df["cleaned_selftext"] = reddit_df["selftext"].apply(clean_text)
reddit_comment_df["cleaned_comments"] = reddit_comment_df["comments"].apply(clean_text)

In [13]:
reddit_df[["selftext", "cleaned_selftext"]].head(10)

Unnamed: 0,selftext,cleaned_selftext
0,# Welcome to the /r/Bitcoin Newcomers FAQ\n\nY...,# Welcome to the /r/Bitcoin Newcomers FAQ You ...
1,Please utilize this sticky thread for all gene...,Please utilize this sticky thread for all gene...
2,,
3,,
4,,
5,Chart above showing we are heading there!!,Chart above showing we are heading there!!
6,I have been collecting Bitcoin by DCA'ing ever...,I have been collecting Bitcoin by DCA'ing ever...
7,,
8,,
9,I barely have any savings. But I just paid off...,I barely have any savings. But I just paid off...


In [14]:
reddit_comment_df[["comments", "cleaned_comments"]].head(10)

Unnamed: 0,comments,cleaned_comments
0,Please feel free to make [constructive edits](...,Please feel free to make [constructive edits](...
1,107k stablecoin,107k stablecoin
2,I'm currently thinking about moving my old wor...,I am currently thinking about moving my old wo...
3,"Just a reminder, everything is going to zero a...","Just a reminder, everything is going to zero a..."
4,"Self perpetuating trend right now, when will i...","Self perpetuating trend right now, when will i..."
5,Now that's a v-shaped recovery,Now that is a v-shaped recovery
6,Bought some of those cheap sats,Bought some of those cheap sats
7,🚨 **BREAKING NEWS EVERYONE** 🚨\n\n‼️ **1 BTC i...,:police_car_light: **BREAKING NEWS EVERYONE** ...
8,"Could be a big day, seller exhaustion.","Could be a big day, seller exhaustion."
9,Another good start before the burgers dump,Another good start before the burgers dump


## 1.3. Run the sentiment analyzer

In [15]:
import torch

In [16]:
# This is to delete CUDA cache
try:
    del tokenizer
except:
    print("tokenizer already deleted")

try:
    del model
except:
    print("model already deleted")

import gc
torch.cuda.empty_cache()
gc.collect()

tokenizer already deleted
model already deleted


20

In [17]:
# Define label-id mappings
label2id = {"negative": 0, "neutral": 1, "positive": 2}
id2label = {0: "negative", 1: "neutral", 2: "positive"}

In [18]:
from transformers import BertForSequenceClassification, BertTokenizerFast, BertConfig
model_name = "ProsusAI/finbert"

tokenizer = BertTokenizerFast.from_pretrained(
    model_name,
    use_fast=True,
    padding_side="left",
    padding_token='[PAD]',
)

config = BertConfig.from_pretrained(
    model_name,
    num_labels=3,
    label2id=label2id,
    id2label=id2label,
)

model = BertForSequenceClassification.from_pretrained(
    model_name,
    config=config,
    device_map="auto",
)

  from .autonotebook import tqdm as notebook_tqdm


In [19]:
from transformers import pipeline

finbert_classifier = pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
    top_k=3
)

Device set to use cuda:0


In [26]:
# We only want to apply sentiment analysis on non-na entries
non_na_selftext = reddit_df[~pd.isna(reddit_df["cleaned_selftext"])]

In [27]:
# Apply pipeline on title, selftext and comments
title_predicted_sentiments = finbert_classifier(
    reddit_df["cleaned_title"].to_list(),
    truncation=True,
    max_length=512,
)
selftext_predicted_sentiments = finbert_classifier(
    non_na_selftext["cleaned_selftext"].to_list(),
    truncation=True,
    max_length=512,
)
comments_predicted_sentiments = finbert_classifier(
    reddit_comment_df["cleaned_comments"].to_list(),
    truncation=True,
    max_length=512,
)

In [28]:
selftext_predicted_sentiments

[[{'label': 'positive', 'score': 0.9041252732276917},
  {'label': 'negative', 'score': 0.07342976331710815},
  {'label': 'neutral', 'score': 0.022444985806941986}],
 [{'label': 'positive', 'score': 0.9338553547859192},
  {'label': 'neutral', 'score': 0.03885916620492935},
  {'label': 'negative', 'score': 0.0272854994982481}],
 [{'label': 'positive', 'score': 0.6091300249099731},
  {'label': 'negative', 'score': 0.3599986433982849},
  {'label': 'neutral', 'score': 0.030871327966451645}],
 [{'label': 'positive', 'score': 0.8393078446388245},
  {'label': 'negative', 'score': 0.141819566488266},
  {'label': 'neutral', 'score': 0.018872568383812904}],
 [{'label': 'positive', 'score': 0.7520480751991272},
  {'label': 'neutral', 'score': 0.15562866628170013},
  {'label': 'negative', 'score': 0.09232325851917267}],
 [{'label': 'positive', 'score': 0.5785338878631592},
  {'label': 'negative', 'score': 0.3793378472328186},
  {'label': 'neutral', 'score': 0.04212823882699013}],
 [{'label': 'posit

In [30]:
# Attach the scores back the the dataframe where they come from
reddit_df = pd.concat([reddit_df, pd.DataFrame({"title_score_dict" : title_predicted_sentiments})], axis=1)
non_na_selftext = pd.concat([non_na_selftext, pd.DataFrame({"selftext_score_dict" : selftext_predicted_sentiments})], axis=1)
reddit_comment_df = pd.concat([reddit_comment_df, pd.DataFrame({"comments_score_dict" : comments_predicted_sentiments})], axis=1)

In [31]:
# Left join reddit_df and non_na_selftext based on post id
reddit_df = pd.merge(reddit_df, non_na_selftext[["id", "selftext_score_dict"]],
                     how="left",
                     on="id")

In [32]:
reddit_df.head()

Unnamed: 0,id,title,selftext,comments,score,created_utc,url,num_comments,flair,is_self,cleaned_title,cleaned_selftext,title_score_dict,selftext_score_dict
0,1jwz2d3,Bitcoin Newcomers FAQ - Please read!,# Welcome to the /r/Bitcoin Newcomers FAQ\n\nY...,Please feel free to make [constructive edits](...,95,1744402000.0,https://www.reddit.com/r/Bitcoin/comments/1jwz...,19,,True,Bitcoin Newcomers FAQ - Please read!,# Welcome to the /r/Bitcoin Newcomers FAQ You ...,"[{'label': 'positive', 'score': 0.942112445831...","[{'label': 'positive', 'score': 0.904125273227..."
1,1lpofh3,"Daily Discussion, July 02, 2025",Please utilize this sticky thread for all gene...,107k stablecoin || I'm currently thinking abou...,23,1751437000.0,https://www.reddit.com/r/Bitcoin/comments/1lpo...,18,,True,"Daily Discussion, July 02, 2025",Please utilize this sticky thread for all gene...,"[{'label': 'positive', 'score': 0.938369214534...","[{'label': 'positive', 'score': 0.933855354785..."
2,1lpsaad,"HISTORY: 11 years ago today, the U.S. governme...",,Did he hold on to them? || They piss away $3 b...,272,1751452000.0,https://i.redd.it/kpf0ix5rtfaf1.png,9,,False,"HISTORY: 11 years ago today, the YOU.S. govern...",,"[{'label': 'positive', 'score': 0.934455394744...",
3,1lp2u2t,This guy bought a 100 BTC gold Casascius bar b...,,That’s insane but it’s a win win for him. $500...,7556,1751379000.0,https://i.redd.it/uewmioq0u9af1.jpeg,460,,False,This guy bought a 100 BTC gold Casascius bar b...,,"[{'label': 'positive', 'score': 0.899448633193...",
4,1lptdcf,They will never admit it,,25 cards deep in denial,95,1751456000.0,https://i.redd.it/n3odwchl4gaf1.jpeg,1,,False,They will never admit it,,"[{'label': 'positive', 'score': 0.817370116710...",


In [33]:
reddit_comment_df.head()

Unnamed: 0,id,comments,cleaned_comments,comments_score_dict
0,1jwz2d3,Please feel free to make [constructive edits](...,Please feel free to make [constructive edits](...,"[{'label': 'positive', 'score': 0.921246707439..."
1,1lpofh3,107k stablecoin,107k stablecoin,"[{'label': 'positive', 'score': 0.908650517463..."
2,1lpofh3,I'm currently thinking about moving my old wor...,I am currently thinking about moving my old wo...,"[{'label': 'positive', 'score': 0.897661805152..."
3,1lpofh3,"Just a reminder, everything is going to zero a...","Just a reminder, everything is going to zero a...","[{'label': 'positive', 'score': 0.702814579010..."
4,1lpofh3,"Self perpetuating trend right now, when will i...","Self perpetuating trend right now, when will i...","[{'label': 'positive', 'score': 0.872654676437..."


In [35]:
# Save the df for future work
reddit_df.to_csv("sentiment_analyzed_reddit_df.csv", index=False)
reddit_comment_df.to_csv("sentiment_analyzed_reddit_df_comments.csv", index=False)

## 1.4. Apply weights

## 1.5. Aggregate into a score

## Draft

In [None]:
# Code adapted from https://www.geeksforgeeks.org/python/python-sentiment-analysis-using-vader/
def sentiment_scores(sentence):
    sid_obj = SentimentIntensityAnalyzer()
    sentiment_dict = sid_obj.polarity_scores(sentence)

    print(f"Sentiment Scores: {sentiment_dict}")
    print(f"Negative Sentiment: {sentiment_dict['neg']*100}%")
    print(f"Neutral Sentiment: {sentiment_dict['neu']*100}%")
    print(f"Positive Sentiment: {sentiment_dict['pos']*100}%")

    if sentiment_dict["compound"] >= 0.05:
        print("Overall Sentiment: Positive")
    elif sentiment_dict["compound"] <= -0.05:
        print("Overall Sentiment: Negative")
    else:
        print("Overall Sentiment: Neutral")

In [6]:
sid_obj = SentimentIntensityAnalyzer()

In [None]:
reddit_df["title_scores"] = reddit_df["title"].apply(
    lambda x: sid_obj.polarity_scores(x)
)

In [8]:
reddit_df.head()

Unnamed: 0,id,title,selftext,comments,score,created_utc,url,num_comments,flair,is_self,title_scores
0,1jwz2d3,Bitcoin Newcomers FAQ - Please read!,# Welcome to the /r/Bitcoin Newcomers FAQ\n\nY...,Please feel free to make [constructive edits](...,95,1744402000.0,https://www.reddit.com/r/Bitcoin/comments/1jwz...,19,,True,"{'neg': 0.0, 'neu': 0.659, 'pos': 0.341, 'comp..."
1,1lpofh3,"Daily Discussion, July 02, 2025",Please utilize this sticky thread for all gene...,107k stablecoin || I'm currently thinking abou...,23,1751437000.0,https://www.reddit.com/r/Bitcoin/comments/1lpo...,18,,True,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound..."
2,1lpsaad,"HISTORY: 11 years ago today, the U.S. governme...",,Did he hold on to them? || They piss away $3 b...,272,1751452000.0,https://i.redd.it/kpf0ix5rtfaf1.png,9,,False,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound..."
3,1lp2u2t,This guy bought a 100 BTC gold Casascius bar b...,,That’s insane but it’s a win win for him. $500...,7556,1751379000.0,https://i.redd.it/uewmioq0u9af1.jpeg,460,,False,"{'neg': 0.0, 'neu': 0.892, 'pos': 0.108, 'comp..."
4,1lptdcf,They will never admit it,,25 cards deep in denial,95,1751456000.0,https://i.redd.it/n3odwchl4gaf1.jpeg,1,,False,"{'neg': 0.285, 'neu': 0.715, 'pos': 0.0, 'comp..."


In [9]:
reddit_df["overall_score"] = reddit_df["title_scores"].apply(lambda x: x["compound"])

In [10]:
reddit_df["overall_score"]

0     0.3802
1     0.0000
2     0.0000
3     0.4404
4    -0.1511
5     0.0000
6     0.0000
7     0.0000
8     0.0000
9     0.4215
10    0.0000
11    0.0000
12    0.3595
13    0.4926
14    0.2023
15    0.0000
16    0.2500
17    0.0000
18    0.0000
19    0.0000
20    0.2263
21    0.0772
22    0.0000
23    0.5719
24    0.0000
25    0.4404
26    0.3612
27    0.0000
28    0.0000
29    0.0000
30    0.0000
31   -0.3561
32    0.0000
33    0.0000
34    0.0000
35    0.1779
36    0.0000
37    0.0000
38   -0.4215
39    0.0000
40    0.4404
41    0.4215
42    0.4404
43    0.0000
44    0.0000
45    0.0000
46    0.0000
47    0.5267
48    0.0000
49    0.0000
Name: overall_score, dtype: float64