In [18]:
import os
import json
import praw
import time
from datetime import datetime

# API credentials
#
# Secrets are read from the environment so they never end up in the notebook
# (or in version control). Set REDDIT_CLIENT_ID and REDDIT_CLIENT_SECRET before
# starting the kernel; a missing variable raises KeyError right here instead of
# failing later on the first API call.
# NOTE(review): the client id/secret that used to be hardcoded in this cell
# should be treated as leaked and regenerated in the Reddit app settings.

reddit = praw.Reddit(
    client_id=os.environ['REDDIT_CLIENT_ID'],
    client_secret=os.environ['REDDIT_CLIENT_SECRET'],
    # The user agent is not secret; it can be overridden but keeps its old default.
    user_agent=os.environ.get('REDDIT_USER_AGENT', 'test by u/Inner-Astronomer3735')
)

# constants
max_depth = 30        # comment levels deeper than this are not fetched
max_comments = 30000  # posts reporting at least this many comments are skipped
base_folder = 'data'  # root directory for all downloaded JSON files

# Create the output root up front. exist_ok=True makes re-running the cell safe
# and avoids the check-then-create race of a separate os.path.exists() test.
os.makedirs(base_folder, exist_ok=True)

def fetch_comments(comment_forest, depth=0):
    """Recursively turn a level of a comment tree into a list of plain dicts.

    Every regular comment becomes a dict holding its author (converted with
    str()), body, score and a "replies" list built by recursing one level
    deeper. MoreComments placeholders are expanded with an extra API call and
    whatever they return is processed at the same depth as the placeholder.

    Args:
        comment_forest: iterable of comment objects (possibly mixed with
            praw.models.MoreComments placeholders) for one level of the tree.
        depth: nesting level of comment_forest; the top level is 0.

    Returns:
        A list of comment dicts. Empty when depth is greater than max_depth.
    """
    # Do not descend past the configured depth limit.
    if depth > max_depth:
        return []

    collected = []

    for node in comment_forest:

        if not isinstance(node, praw.models.MoreComments):
            # Ordinary comment: record it together with its processed replies.
            collected.append({
                "author": str(node.author),
                "body": node.body,
                "score": node.score,
                "replies": fetch_comments(node.replies, depth + 1),
            })
        else:
            # When there are too many comments the API collapses some of them
            # into MoreComments objects; expand those to get the real comments.
            # The expansion is an API call, hence the short pause for the rate limit.
            # NOTE(review): assumes .comments() yields comments belonging to this
            # same level (depth is not incremented) — confirm against PRAW docs.
            time.sleep(0.05)
            expanded = node.comments()
            collected.extend(fetch_comments(expanded, depth))

    return collected

# Takes the actual post object (not a post_id, as an earlier version did).
# The timestamp becomes part of the output directory name, for organization.
def fetch_post(post, rank, timestamp):
    """Download one post with its comment tree and save it as a JSON file.

    The file is written to <base_folder>/<subreddit>/<timestamp>/<post id>.json;
    missing directories are created.

    Args:
        post: post object to save.
        rank: position of the post in the listing it came from; stored as-is.
        timestamp: string used as the name of the per-run directory.
    """
    # Walk the comment tree first, before reading any other post attribute.
    comments_processed = fetch_comments(post.comments)

    # Resolve (and create if needed) the output location.
    subreddit_name = post.subreddit.display_name
    timestamp_folder = os.path.join(base_folder, subreddit_name, timestamp)
    if not os.path.exists(timestamp_folder):
        os.makedirs(timestamp_folder)
    post_file = os.path.join(timestamp_folder, f'{post.id}.json')

    # Collect everything that goes into the file.
    post_data = {
        "title": post.title,
        "author": str(post.author),
        "subreddit": post.subreddit.display_name,
        "rank": rank,
        "score": post.score,
        "upvote_ratio": post.upvote_ratio,
        "num_comments (reported by reddit)": post.num_comments,
        "url": post.url,
        "id": post.id,
        "selftext": post.selftext,
        "comments": comments_processed,
    }

    with open(post_file, "w", encoding="utf-8") as handle:
        json.dump(post_data, handle, indent=4)

# NOTE: n_posts apparently has to be less than 100 (observed, not verified).
def fetch_subreddit(subreddit_name, n_posts):
    """Save the current "hot" posts of a subreddit, one JSON file per post.

    Posts whose reported comment count is max_comments or more are skipped and
    reported on stdout. Progress is printed after the 1st, 11th, 21st, ...
    downloaded post.

    Args:
        subreddit_name: name of the subreddit, without the "r/" prefix.
        n_posts: how many posts to request from the "hot" listing.
    """
    subreddit = reddit.subreddit(subreddit_name)

    # Freeze the timestamp once, at the start, so every post of this run lands
    # in the same directory no matter how long the download takes.
    timestamp = datetime.now().strftime("date %m-%d-%Y time %H-%M")

    n_downloaded = 0
    n_skipped = 0

    # Open question from the original author: should this be top or hot?
    # The enumeration index is assumed to match the post's actual rank
    # (a quick manual check suggested it does). It starts at 0.
    hot_posts = subreddit.hot(limit=n_posts)
    for rank, post in enumerate(hot_posts):

        # Guard: very large threads are not downloaded.
        if post.num_comments >= max_comments:
            print(f'Skipped post with id {post.id} because it has {post.num_comments} comments. (max_comments set to {max_comments}).')
            n_skipped += 1
            print(f'Skipped count: {n_skipped}')
            continue

        fetch_post(post, rank, timestamp)
        n_downloaded += 1
        if n_downloaded % 10 == 1:
            print(f'Downloaded {n_downloaded} posts from {subreddit}.')

In [20]:
# test
# Rough cost observed so far: about 200kb and ~1min per 1000 comments.
# (There seems to be such a thing as "authenticating an API user", which might
# improve rate limits but takes some weeks to get approved.)
n = 60

# general (earlier runs, kept for reference)
# fetch_subreddit('politics',n) ran, got 41, then too many requests error.
# fetch_subreddit('PoliticalDiscussion',n) got 61.

# Subreddits are fetched one after another, in exactly this order.
subreddits_to_fetch = (
    # left
    'communism',
    'socialism',
    'LateStageCapitalism',
    # right
    'Conservative',
    'Libertarian',
    'Anarcho_Capitalism',
    # possibly uncontroversial?
    'MadeMeSmile',
    'DogTraining',
    'ADHD',
    'stopdrinking',
)

for subreddit_name in subreddits_to_fetch:
    fetch_subreddit(subreddit_name, n)

Downloaded 1 posts from communism.
Downloaded 11 posts from communism.
Downloaded 21 posts from communism.
Downloaded 31 posts from communism.
Downloaded 41 posts from communism.
Downloaded 51 posts from communism.
Downloaded 1 posts from socialism.
Downloaded 11 posts from socialism.
Downloaded 21 posts from socialism.
Downloaded 31 posts from socialism.
Downloaded 41 posts from socialism.
Downloaded 51 posts from socialism.
Downloaded 1 posts from LateStageCapitalism.
Downloaded 11 posts from LateStageCapitalism.
Downloaded 21 posts from LateStageCapitalism.
Downloaded 31 posts from LateStageCapitalism.
Downloaded 41 posts from LateStageCapitalism.
Downloaded 51 posts from LateStageCapitalism.
Downloaded 1 posts from Conservative.
Downloaded 11 posts from Conservative.
Downloaded 21 posts from Conservative.
Downloaded 31 posts from Conservative.
Downloaded 41 posts from Conservative.
Downloaded 51 posts from Conservative.
Downloaded 1 posts from Libertarian.
Downloaded 11 posts from 