# **IMPORTANT**
Run the cell below first; the block that collects the data will not work until it has been run.


In [1]:
from dotenv import load_dotenv
import os

# Load variables from the .env file into the environment so the
# data-collection cell can read the Reddit credentials with os.getenv
load_dotenv()


True

In [3]:
import praw
import pandas as pd
import datetime
from dotenv import load_dotenv
import os

# Populate the process environment from the .env file
load_dotenv()

# PRAW client shared by the rest of this cell; all three settings are
# taken from environment variables (None is passed for any that are unset).
reddit = praw.Reddit(
    user_agent=os.environ.get("REDDIT_USER_AGENT"),
    client_secret=os.environ.get("REDDIT_CLIENT_SECRET"),
    client_id=os.environ.get("REDDIT_CLIENT_ID"),
)

# === Function to fetch posts from a subreddit ===
def fetch_reddit_posts(subreddit_name, max_posts=1000, time_filter='year', client=None):
    """
    Fetch the top posts of a subreddit into a DataFrame.

    Parameters:
        subreddit_name (str): Name of the subreddit, without the "r/" prefix.
        max_posts (int): Maximum number of posts to request (up to 1000).
            The listing may yield fewer posts than requested.
        time_filter (str): 'all', 'year', 'month', 'week', 'day', 'hour'.
        client: Optional object exposing ``subreddit(name)``. When omitted,
            the module-level ``reddit`` client is used.

    Returns:
        DataFrame: One row per post with the columns subreddit, id, title,
        author, score, num_comments, created_utc, selftext, url, permalink.
        The columns are present even when no posts were returned.
    """
    # Fixed schema so an empty listing still produces the expected columns.
    columns = [
        "subreddit",
        "id",
        "title",
        "author",
        "score",
        "num_comments",
        "created_utc",
        "selftext",
        "url",
        "permalink",
    ]

    if client is None:
        client = reddit
    subreddit = client.subreddit(subreddit_name)
    posts = []

    for submission in subreddit.top(limit=max_posts, time_filter=time_filter):
        # submission.created_utc is an epoch timestamp. Convert it in UTC
        # (a bare fromtimestamp() would use the machine's local timezone and
        # make the "created_utc" column wrong), then drop tzinfo so the
        # column stays a naive datetime as before.
        created_utc = datetime.datetime.fromtimestamp(
            submission.created_utc, tz=datetime.timezone.utc
        ).replace(tzinfo=None)
        posts.append({
            "subreddit": subreddit_name,
            "id": submission.id,
            "title": submission.title,
            # str() keeps the column textual; a missing author becomes "None".
            "author": str(submission.author),
            "score": submission.score,
            "num_comments": submission.num_comments,
            "created_utc": created_utc,
            "selftext": submission.selftext,
            "url": submission.url,
            "permalink": f"https://reddit.com{submission.permalink}"
        })

    print(f"Fetched {len(posts)} posts from r/{subreddit_name}")
    return pd.DataFrame(posts, columns=columns)


# === Parameters ===
# Subreddits to collect from; rows in the combined output follow this order.
subreddits = [
    "stocks",
    "stockstobuytoday",
    "stocksandtrading",
    "wallstreetbets",
    "options",
    "pennystocks",
    "investing",
    "stockmarket",
    "robinhood",
    "fidelity",
    "finance",
    "economics",
    "personalfinance",
]
max_posts_per_subreddit = 1000
time_filter = "year"  # Options: 'all', 'year', 'month', 'week', 'day', 'hour'

# === Fetch all posts and combine ===
# One DataFrame per subreddit, stacked into a single frame with a fresh index.
all_posts_df = pd.concat(
    list(
        map(
            lambda name: fetch_reddit_posts(
                name,
                max_posts=max_posts_per_subreddit,
                time_filter=time_filter,
            ),
            subreddits,
        )
    ),
    ignore_index=True,
)

# === Save to CSV ===
# The file name carries the current local time so repeated runs do not
# overwrite each other's output.
timestamp = f"{datetime.datetime.now():%Y%m%d_%H%M%S}"
csv_filename = f"reddit_posts_combined_{timestamp}.csv"
all_posts_df.to_csv(csv_filename, index=False)
print(f"All data saved to: {csv_filename}")


Fetched 1000 posts from r/stocks
Fetched 1000 posts from r/stockstobuytoday
Fetched 261 posts from r/stocksandtrading
Fetched 1000 posts from r/wallstreetbets
Fetched 1000 posts from r/options
Fetched 999 posts from r/pennystocks
Fetched 1000 posts from r/investing
Fetched 1000 posts from r/stockmarket
Fetched 400 posts from r/robinhood
Fetched 476 posts from r/fidelity
Fetched 125 posts from r/finance
Fetched 1000 posts from r/economics
Fetched 1000 posts from r/personalfinance
All data saved to: reddit_posts_combined_20250423_140014.csv
