In [7]:
import feedparser
import pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime
import time

# -------- CONFIG --------
KEYWORD = "Spotify"
SUBREDDITS = ["spotify", "truespotify"]
POST_LIMIT_PER_SUB = 50   # RSS is smaller but high quality

OUTPUT_PATH = "../data/raw_reddit_spotify.csv"
# ------------------------

def clean_html(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text(separator=" ").strip()

def collect_data():
    rows = []

    for subreddit in SUBREDDITS:
        print(f"Fetching RSS from r/{subreddit}")
        feed_url = f"https://www.reddit.com/r/{subreddit}/search.rss?q={KEYWORD}&restrict_sr=1"
        feed = feedparser.parse(feed_url)

        for entry in feed.entries[:POST_LIMIT_PER_SUB]:
            text = f"{entry.title} {clean_html(entry.get('summary', ''))}"

            if len(text) < 20:
                continue

            rows.append({
                "platform": "reddit",
                "subreddit": subreddit,
                "created_utc": datetime(*entry.published_parsed[:6]),
                "text": text
            })

        time.sleep(1)

    return pd.DataFrame(rows)

if __name__ == "__main__":
    df = collect_data()
    df.drop_duplicates(subset="text", inplace=True)
    df.to_csv(OUTPUT_PATH, index=False)
    print(f"Saved {len(df)} rows to {OUTPUT_PATH}")

Fetching RSS from r/spotify
Fetching RSS from r/truespotify
Saved 50 rows to ../data/raw_reddit_spotify.csv
