In [None]:
import time
from datetime import datetime, timezone

import pandas as pd

from reddit_client import redditClient

# Build the Reddit client through the project's wrapper, then resolve each
# target community name to a subreddit handle once, up front.
reddit = redditClient()
subreddit_names = ["OnePiece", "anime", "manga", "animesuggestions"]
subreddits = list(map(reddit.subreddit, subreddit_names))

# Accumulators shared by the collection steps below: row dicts for posts and
# comments, plus the set of submission ids already recorded (for dedupe).
posts, comments = [], []
post_ids = set()

# Listing endpoints to query. The "basic" ones take no time window; the
# others are queried once per entry in time_filters.
sort_types_basic = ['hot', 'new', 'rising']
sort_types_with_time = ['top', 'controversial']
time_filters = ['hour', 'day', 'week', 'month', 'year', 'all']

# Stop gathering once this many unique posts have been recorded.
MAX_POSTS = 20000
# Size requested from each listing call.
# NOTE(review): PRAW documents that most Reddit listings hold at most about
# 1000 items, so a single listing may return fewer than this.
LISTING_LIMIT = 2000


def _post_record(submission):
    """Return the flat row dict stored in ``posts`` for one submission."""
    return {
        "id": submission.id,
        "subreddit": str(submission.subreddit),
        "title": submission.title,
        "score": submission.score,
        "upvote_ratio": submission.upvote_ratio,
        "num_comments": submission.num_comments,
        # Naive datetime in UTC: the same value the deprecated
        # datetime.utcfromtimestamp() produced, so the CSV is unchanged.
        "created_utc": datetime.fromtimestamp(
            submission.created_utc, tz=timezone.utc
        ).replace(tzinfo=None),
        "selftext": submission.selftext,
        "author": str(submission.author),
        "url": submission.url,
    }


def _listing_plan():
    """Yield ``(sort, label, extra_kwargs)`` for every listing to query.

    The plain sorts come first, followed by each time-filtered sort combined
    with every time filter. ``label`` is the text used in failure messages.
    """
    for sort in sort_types_basic:
        yield sort, sort, {}
    for sort in sort_types_with_time:
        for tf in time_filters:
            yield sort, f"{sort}:{tf}", {"time_filter": tf}


def _collect_listing(subreddit, sort, label, extra_kwargs):
    """Append the not-yet-seen submissions of one listing to ``posts``.

    Iteration stops as soon as ``MAX_POSTS`` is reached. Any exception raised
    while requesting or reading the listing is printed and the listing is
    abandoned, so the remaining listings still get their turn.
    """
    try:
        submissions = getattr(subreddit, sort)(limit=LISTING_LIMIT, **extra_kwargs)
        for submission in submissions:
            if submission.id in post_ids:
                continue
            # Build the row before marking the id as seen, so a submission
            # whose row could not be built may still be picked up by a
            # later listing.
            record = _post_record(submission)
            post_ids.add(submission.id)
            posts.append(record)
            if len(posts) >= MAX_POSTS:
                break
    except Exception as e:
        print(f"Failed on {label} for {subreddit.display_name}: {e}")


print("collecting posts...")

for subreddit in subreddits:
    for sort, label, extra_kwargs in _listing_plan():
        # Check before requesting the listing: once the cap is hit, no
        # further listing is fetched and no post beyond the cap is added.
        if len(posts) >= MAX_POSTS:
            break
        _collect_listing(subreddit, sort, label, extra_kwargs)
    if len(posts) >= MAX_POSTS:
        break

print(f"collected {len(posts)} posts.")

# Walk the collected posts in order and gather comments until the cap is hit.
MAX_COMMENTS = 40000
print("collecting comments...")

for post_data in posts:
    if len(comments) >= MAX_COMMENTS:
        break
    try:
        submission = reddit.submission(id=post_data["id"])
        # Per PRAW's documentation, limit=0 removes the "load more comments"
        # placeholders without fetching them, so the loop below only sees
        # real comment objects.
        submission.comments.replace_more(limit=0)
        for comment in submission.comments:
            if len(comments) >= MAX_COMMENTS:
                break
            comments.append({
                "post_id": submission.id,
                "comment_id": comment.id,
                "body": comment.body,
                "score": comment.score,
                "author": str(comment.author),
                # Naive datetime in UTC: the same value the deprecated
                # datetime.utcfromtimestamp() produced, so the CSV is unchanged.
                "created_utc": datetime.fromtimestamp(
                    comment.created_utc, tz=timezone.utc
                ).replace(tzinfo=None),
            })
        # Brief pause between posts to space out the API requests.
        time.sleep(0.5)
    except Exception as e:
        # Report the failure and move on to the next post.
        print(f"Failed to fetch comments for post {post_data['id']}: {e}")

print(f"collected {len(comments)} comments.")

# Write each table to its CSV file, posts first, without the index column.
for rows, csv_path in (
    (posts, "anime_15k_posts.csv"),
    (comments, "anime_40k_comments.csv"),
):
    pd.DataFrame(rows).to_csv(csv_path, index=False)

print("data saved")


Version 7.7.1 of praw is outdated. Version 7.8.1 was released Friday October 25, 2024.


Starting post collection...


  "created_utc": datetime.utcfromtimestamp(submission.created_utc),
  "created_utc": datetime.utcfromtimestamp(submission.created_utc),


Collected 19252 unique posts.
Collecting comments...


  "created_utc": datetime.utcfromtimestamp(comment.created_utc),


Collected 40000 comments.
Data saved to anime_15k_posts.csv and anime_40k_comments.csv.
