## Getting API Keys

In [2]:
from dotenv import load_dotenv
import os

# Merge variables from a local .env file into the process environment.
load_dotenv()

# Reddit API credentials; each is None when its variable is not set.
REDDIT_ID = os.environ.get("REDDIT_CLIENT_ID")
REDDIT_SECRET = os.environ.get("REDDIT_CLIENT_SECRET")

## 📦 Import required libraries

In [3]:
# 📚 Imports
import praw
import pandas as pd
import re
import random
import time
from datetime import datetime, timedelta
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize
import nltk
# Fetch NLTK's "punkt" data package (presumably for word_tokenize, imported
# above — TODO confirm it is still needed; no visible cell calls it).
nltk.download("punkt")

[nltk_data] Downloading package punkt to /home/codespace/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## 🔑 Reddit API Setup (Read-only mode)

In [4]:
# Build the Reddit client from the credentials read from the environment.
# Only an app id/secret and a user agent are supplied — no username/password.
reddit = praw.Reddit(
    user_agent="youtube_to_reddit_sentiment",
    client_secret=REDDIT_SECRET,
    client_id=REDDIT_ID,
)

## 📥 Load video titles


In [5]:
# CSV of YouTube video data; a later cell reads its "video_title" column to
# build the Reddit search queries.
youtube_df = pd.read_csv("../data/youtube_data.csv")

# Search window handed to Reddit search as `time_filter`. PRAW accepts only
# preset ranges (e.g. "week" or "month"), not arbitrary start/end dates.
time_filter = "month"

## 🧹 Clean text

In [6]:
def clean_text(text):
    """Normalise raw text into lowercase ASCII words separated by single spaces.

    Steps, in order: remove URLs (anything starting with "http" up to the next
    whitespace), remove every character that is not an ASCII letter, digit or
    whitespace, collapse whitespace runs to one space, trim both ends, and
    lowercase.

    Args:
        text: The raw string to clean. Non-string values (e.g. None or a NaN
            from a DataFrame) are treated as empty text.

    Returns:
        The cleaned string; "" when `text` is not a string or nothing remains.
    """
    # Missing values would otherwise make re.sub raise a TypeError.
    if not isinstance(text, str):
        return ""
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)
    # Removing URLs/punctuation leaves double and trailing spaces behind
    # (e.g. "mario & sonic" -> "mario  sonic"); tidy them up.
    text = re.sub(r"\s+", " ", text).strip()
    return text.lower()

## 📊 Sentiment analysis


In [7]:
# Single shared analyzer instance, reused by every call below.
analyzer = SentimentIntensityAnalyzer()


def analyze_sentiment(text):
    """Return the "compound" entry of the analyzer's polarity scores for `text`."""
    scores = analyzer.polarity_scores(text)
    return scores["compound"]

In [8]:
# 😃 Convert score to emoji
def sentiment_emoji(score, threshold=0.5):
    """Map a sentiment score to an emoji label.

    Args:
        score: Numeric sentiment score (this notebook passes the value
            returned by analyze_sentiment).
        threshold: Magnitude at which a score stops counting as neutral.
            Defaults to 0.5, the cut-off this notebook has always used.

    Returns:
        "😃" when score >= threshold, "😠" when score <= -threshold,
        otherwise "😐".
    """
    if score >= threshold:
        return "😃"
    if score <= -threshold:
        return "😠"
    return "😐"

## 🧠 Reddit Post + Comment collector


In [9]:
%%time
# Collection limits, named so they are easy to find and tune.
POSTS_PER_TOPIC = 5     # max search results requested per video title
COMMENTS_PER_POST = 3   # leading top-level comments kept per post

# One dict per collected comment; post-level fields are repeated on each.
all_data = []

# dropna(): a missing title would otherwise be sent to Reddit as the query.
for topic in youtube_df["video_title"].dropna().unique():
    posts = reddit.subreddit("all").search(
        query=topic, limit=POSTS_PER_TOPIC, time_filter=time_filter
    )

    for post in posts:
        # Score the title once and reuse the value for both columns.
        post_sentiment = analyze_sentiment(clean_text(post.title))
        post_data = {
            "youtube_title": topic,
            "reddit_post_title": post.title,
            "post_score": post.score,
            # NOTE(review): for link posts this appears to be the linked page
            # (the sample output shows a youtube.com URL), not the Reddit
            # thread — consider post.permalink if the thread URL is wanted.
            "post_url": post.url,
            "post_created": pd.to_datetime(post.created_utc, unit="s"),
            "post_sentiment": post_sentiment,
            "post_sentiment_emoji": sentiment_emoji(post_sentiment),
        }

        # limit=0 strips the "load more comments" placeholders without
        # fetching them, so the slice below only yields real comments.
        post.comments.replace_more(limit=0)

        # Keep the first few top-level comments. A post with no comments
        # contributes no rows at all, because rows are only added here.
        for comment in post.comments[:COMMENTS_PER_POST]:
            cleaned = clean_text(comment.body)
            comment_sentiment = analyze_sentiment(cleaned)
            all_data.append({
                **post_data,
                "comment": comment.body,
                "cleaned_comment": cleaned,
                "comment_sentiment": comment_sentiment,
                "comment_sentiment_emoji": sentiment_emoji(comment_sentiment),
                "comment_author": str(comment.author),
                "comment_score": comment.score
            })

CPU times: user 2.04 s, sys: 149 ms, total: 2.19 s
Wall time: 1min 32s


### This is how our data looks in `semi-structured` form

In [15]:
# Peek at the first two collected records (plain dicts, one per comment).
all_data[:2]

[{'youtube_title': 'The Elder Scrolls IV: Oblivion Remastered - Full Reveal Stream',
  'reddit_post_title': 'Possessed Love Season 2 - Episode 9 - 250422',
  'post_score': 6,
  'post_url': 'https://www.reddit.com/r/koreanvariety/comments/1k5cfsl/possessed_love_season_2_episode_9_250422/',
  'post_created': Timestamp('2025-04-22 17:40:21'),
  'post_sentiment': 0.6369,
  'post_sentiment_emoji': '😃',
  'comment': 'why did kangwon leave? weird 🤔🤔',
  'cleaned_comment': 'why did kangwon leave weird ',
  'comment_sentiment': -0.2263,
  'comment_sentiment_emoji': '😐',
  'comment_author': '009763',
  'comment_score': 1},
 {'youtube_title': 'The Elder Scrolls IV: Oblivion Remastered - Full Reveal Stream',
  'reddit_post_title': 'Possessed Love Season 2 - Episode 9 - 250422',
  'post_score': 6,
  'post_url': 'https://www.reddit.com/r/koreanvariety/comments/1k5cfsl/possessed_love_season_2_episode_9_250422/',
  'post_created': Timestamp('2025-04-22 17:40:21'),
  'post_sentiment': 0.6369,
  'post_s

### This is how our data looks when `structured` into a DataFrame

In [11]:
# One row per collected comment; the post-level fields repeat on every row
# belonging to the same post.
df = pd.DataFrame(all_data)
df

Unnamed: 0,youtube_title,reddit_post_title,post_score,post_url,post_created,post_sentiment,post_sentiment_emoji,comment,cleaned_comment,comment_sentiment,comment_sentiment_emoji,comment_author,comment_score
0,The Elder Scrolls IV: Oblivion Remastered - Fu...,Possessed Love Season 2 - Episode 9 - 250422,6,https://www.reddit.com/r/koreanvariety/comment...,2025-04-22 17:40:21,0.6369,😃,why did kangwon leave? weird 🤔🤔,why did kangwon leave weird,-0.2263,😐,009763,1
1,The Elder Scrolls IV: Oblivion Remastered - Fu...,Possessed Love Season 2 - Episode 9 - 250422,6,https://www.reddit.com/r/koreanvariety/comment...,2025-04-22 17:40:21,0.6369,😃,I saw the spoilers and just wondering why did ...,i saw the spoilers and just wondering why did ...,0.6280,😃,dreamstorming,1
2,The Elder Scrolls IV: Oblivion Remastered - Fu...,Possessed Love Season 2 - Episode 9 - 250422,6,https://www.reddit.com/r/koreanvariety/comment...,2025-04-22 17:40:21,0.6369,😃,Yugyeong really said- if you're not giving me ...,yugyeong really said if youre not giving me an...,-0.0351,😐,Rare-Counter-8772,1
3,The Elder Scrolls IV: Oblivion Remastered - Of...,The Elder Scrolls IV: Oblivion Remastered - Of...,3739,https://www.youtube.com/watch?v=wFJ3PZuAjK4,2025-04-22 15:15:16,0.0000,😐,"Wow, that looks absolutely great. A *huge* gra...",wow that looks absolutely great a huge graphic...,0.9716,😃,TheVoidDragon,1239
4,The Elder Scrolls IV: Oblivion Remastered - Of...,The Elder Scrolls IV: Oblivion Remastered - Of...,3739,https://www.youtube.com/watch?v=wFJ3PZuAjK4,2025-04-22 15:15:16,0.0000,😐,My goodness. I'd never expected this. They wen...,my goodness id never expected this they went a...,0.1779,😐,Walgreens_Security,620
...,...,...,...,...,...,...,...,...,...,...,...,...,...
182,Squad is IN DANGER! 😳 #SquadVsMonsters,"[USA][H] Over a hundred cib and new 3DS games,...",3,https://www.reddit.com/r/GameSale/comments/1jp...,2025-04-02 12:42:01,0.0000,😐,"Would you do $65 shipped for Pokemon SMD, rumb...",would you do 65 shipped for pokemon smd rumble...,0.0000,😐,cheezytacos2,1
183,Squad is IN DANGER! 😳 #SquadVsMonsters,"[USA][H] Over a hundred cib and new 3DS games,...",3,https://www.reddit.com/r/GameSale/comments/1jp...,2025-04-02 12:42:01,0.0000,😐,"Can you do Mario & Sonic at London, Mario and ...",can you do mario sonic at london mario and so...,-0.1779,😐,Strangy1234,1
184,Squad is IN DANGER! 😳 #SquadVsMonsters,"Is it just me, or is there a consensus that '9...",5,https://www.reddit.com/r/90sHipHop/comments/1j...,2025-03-29 11:26:58,0.6369,😃,Just you. 93 or 94.,just you 93 or 94,0.0000,😐,BobbyR123,15
185,Squad is IN DANGER! 😳 #SquadVsMonsters,"Is it just me, or is there a consensus that '9...",5,https://www.reddit.com/r/90sHipHop/comments/1j...,2025-03-29 11:26:58,0.6369,😃,I feel that 93 was the biggest shift in the music,i feel that 93 was the biggest shift in the music,0.0000,😐,random_name23631,14


## Checking Downvoted Comments

In [12]:
# Select the downvoted comments once. Previously this expression's result was
# discarded (it was not the cell's last line, so nothing was displayed) and
# the loop re-applied the same filter by hand.
downvoted = df.loc[df["comment_score"] < 0, ["comment", "comment_score"]]

# Print each downvoted comment in full, followed by its score.
for comment, comment_score in zip(downvoted["comment"], downvoted["comment_score"]):
    print(f"Comment:\n{comment}", f"Comment Score:\n{comment_score}", sep="\n\n")

## 💾 Save to CSV

In [13]:
# Persist the collected rows; index=False keeps the row index out of the CSV.
df.to_csv("../data/reddit_data.csv", index=False)