In [None]:
import os
import json
import praw
import time
from datetime import datetime

# API credentials
#
# Secrets are read from the environment instead of being written into the
# notebook, so they are not stored or shared together with the code.
# Set REDDIT_CLIENT_ID and REDDIT_CLIENT_SECRET before starting the kernel;
# a missing variable raises KeyError here rather than failing later on the
# first request.
# NOTE(review): the credentials that were previously hardcoded in this cell
# should be treated as leaked and regenerated.

reddit = praw.Reddit(
    client_id=os.environ['REDDIT_CLIENT_ID'],
    client_secret=os.environ['REDDIT_CLIENT_SECRET'],
    user_agent=os.environ.get('REDDIT_USER_AGENT', 'test by u/Inner-Astronomer3735')
)

# constants
max_depth = 45        # fetch_comments returns [] for reply levels deeper than this
max_comments = 30000  # fetch_subreddit skips posts reporting this many comments or more

def smart_limit_reset_pause(n = 5):
    '''
    Sleep until the rate limit resets if fewer than n requests are left.

    Reads the rate limit info from reddit.auth.limits. When the remaining
    request count is below n, sleeps until the reported reset timestamp
    plus a 10 second buffer.

    n: minimum number of remaining requests below which we pause.

    Returns None. Does nothing when the rate limit info is not available yet.
    '''
    limits = reddit.auth.limits
    remaining = limits.get('remaining')
    reset_timestamp = limits.get('reset_timestamp')

    # Per the PRAW documentation these values are None until a request has been
    # issued; comparing None with an int would raise TypeError, so there is
    # nothing to decide on yet.
    if remaining is None or remaining >= n:
        return

    # We are close to hitting the limit, but without a reset timestamp we
    # cannot tell how long to wait.
    if reset_timestamp is None:
        return

    # The limit resets every 10 minutes on the hour (i.e. 3:00, 3:10, 3:20... etc).
    # The extra 10 seconds are a safety buffer after the reported reset time.
    seconds_until_reset = reset_timestamp + 10 - time.time()
    # Sometimes it comes out negative, which would throw an error in the time.sleep() call.
    seconds_until_reset = max(seconds_until_reset, 0)

    minutes_until_reset = seconds_until_reset/60

    print(f'Sleeping for {minutes_until_reset:.2f} minutes, as we are close to hitting rate limit.')

    time.sleep(seconds_until_reset)



def fetch_comments(comment_forest, depth=0):
    """
    Recursively convert a comment forest into a list of nested dictionaries.

    Each regular comment becomes a dict with the keys "author", "body",
    "score" and "replies", where "replies" is the same structure built from
    the comment's own replies one level deeper. MoreComments placeholders are
    expanded through their .comments() call and the result is processed at
    the current depth.

    comment_forest: iterable of comments and/or MoreComments objects.
    depth: current nesting level; levels beyond max_depth yield an empty list.

    Returns a list of dictionaries (possibly empty).
    """
    # Check the rate limit on every level before doing anything else.
    smart_limit_reset_pause()

    if depth > max_depth:
        return []

    collected = []

    for node in comment_forest:

        if not isinstance(node, praw.models.MoreComments):
            # Ordinary comment: record it together with its (recursively fetched) replies.
            collected.append({
                "author": str(node.author),
                "body": node.body,
                "score": node.score,
                "replies": fetch_comments(node.replies, depth + 1),
            })
            continue

        # When there are too many comments the API collapses some of them into
        # MoreComments objects. Expanding one is done via .comments(), so check
        # the rate limit first.
        smart_limit_reset_pause()
        expanded = node.comments()
        collected.extend(fetch_comments(expanded, depth))

    return collected

# The input is a post object (not a post_id like it used to be, but the actual post object)
# Timestamp is used as part of the directory title, for organization
def fetch_post(post, rank, timestamp, target_base_directory):
    """
    Download one post with its comments and save it as a JSON file.

    The file is written to
    <target_base_directory>/<subreddit display name>/<timestamp>/<post id>.json

    post: the post object itself (not a post id).
    rank: rank of the post, stored as-is in the output.
    timestamp: string used as the name of the run folder.
    target_base_directory: root directory under which the files are saved.

    Returns None.
    """
    comment_forest = post.comments
    comments_processed = fetch_comments(comment_forest)

    # create file path
    subreddit_name = post.subreddit.display_name
    timestamp_folder = os.path.join(target_base_directory, subreddit_name, timestamp)

    # exist_ok avoids the check-then-create race of os.path.exists + os.makedirs
    # (e.g. when two runs target the same folder at the same time).
    os.makedirs(timestamp_folder, exist_ok=True)

    post_file = os.path.join(timestamp_folder, f'{post.id}.json')

    # gather post data
    post_data = {
                "title": post.title,
                "author": str(post.author),
                "subreddit": subreddit_name,
                "rank" : rank,
                "score": post.score,
                "upvote_ratio": post.upvote_ratio,
                "num_comments (reported by reddit)": post.num_comments,
                "url": post.url,
                "id": post.id,
                "selftext": post.selftext,
                "comments": comments_processed
            }

    with open(post_file, "w", encoding="utf-8") as file:
        json.dump(post_data, file, indent=4)

def fetch_subreddit(subreddit_name, n_posts, target_base_directory = 'data', max_runtime = 600):
    """
    Download up to n_posts posts from the "hot" listing of one subreddit.

    Every post with fewer than max_comments comments is saved via fetch_post;
    the others are skipped. The loop stops early once more than max_runtime
    seconds have passed since the function started (checked before each post).

    subreddit_name: name of the subreddit to fetch.
    n_posts: limit passed to the hot listing.
    target_base_directory: root directory handed to fetch_post.
    max_runtime: maximum runtime in seconds before the fetch is interrupted.

    Returns None; progress is reported with print.
    """
    skipped = 0
    downloaded = 0

    subreddit = reddit.subreddit(subreddit_name)

    # Timestamp for the directory name.
    # We freeze this timestamp as soon as we run the function
    # so that it doesn't change the folder name even if this takes time to run.
    # I.e., this is the stamp of when we began this scraping run (any individual file was probably downloaded later).
    timestamp = datetime.now().strftime("date_%m-%d-%Y_time_%H-%M")

    # start time tracking (this is to interrupt the function if it runs for too long)
    start_time = time.time()

    # TODO: Do we want top or hot?
    # The enumeration index is stored as the rank; note that it starts at 0.
    for rank, post in enumerate(subreddit.hot(limit=n_posts)):

        if time.time() - start_time > max_runtime:
            # Report the limit that was actually configured instead of a fixed "10 minutes".
            print(f"Stopping fetch for {subreddit_name} as it exceeded {max_runtime / 60:g} minutes.")
            break

        if post.num_comments < max_comments:

            fetch_post(post, rank, timestamp, target_base_directory)
            downloaded += 1
            # Progress line on the 1st, 11th, 21st, ... downloaded post.
            if downloaded % 10 == 1:
                time_now = datetime.now().strftime("time %H:%M")
                print(f'Downloaded {downloaded} posts from {subreddit} so far ({time_now}).')

        else:
            print(f'Skipped post with id {post.id} because it has {post.num_comments} comments. (max_comments set to {max_comments}).')
            skipped += 1
            print(f'Skipped count: {skipped}')

    time_now = datetime.now().strftime("time %H:%M")
    print(f'Downloaded {downloaded} posts from {subreddit} in total ({time_now}).')

In [19]:
# Number of posts requested from each subreddit.
n = 500

# Subreddits to scrape, one list per category.
# Currently left out of left_leaning: 'communism', 'LateStageCapitalism', 'Marxism'.
left_leaning = ['socialism', 'Anarchism', 'progressive']

right_leaning = [
    'Conservative', 'Libertarian', 'Anarcho_Capitalism', 'Republican',
    'AskTrumpSupporters',  # is this actually right leaning? might have a lot of "passers-by"
    'RedState',
]

uncontroversial_or_mixed = [
    'Politics', 'news', 'MadeMeSmile', 'DogTraining',
    'ADHD', 'stopdrinking', 'aww', 'wholesomememes',
]

# One flat list with every category, in the order the categories are declared above.
all_subreddits = [*left_leaning, *right_leaning, *uncontroversial_or_mixed]

import pandas as pd
from datetime import datetime

# Moment the scraping run started; used further down to name the exceptions file.
scrape_start_time = datetime.now()

# Column-wise log of every exception raised while fetching a subreddit.
exceptions_data = {
    column: [] for column in ("exception_type", "exception_message", "subreddit")
}

exception_counter = 0

# Fetch the subreddits one after another. A failure in one subreddit is logged
# and the run continues with the next one.
for subreddit in all_subreddits:
    try:
        fetch_subreddit(subreddit, n)
    except Exception as e:
        exception_counter += 1

        exceptions_data["subreddit"].append(subreddit)
        exceptions_data["exception_message"].append(str(e))
        exceptions_data["exception_type"].append(type(e).__name__)

        # Only report every third exception to keep the output short.
        if exception_counter % 3 == 0:
            print(f'{exception_counter} exceptions so far. Latest: {str(e)}, at subreddit = {subreddit}')

# Save the log only if there actually were some exceptions.
if not exception_counter:
    print("No exceptions raised!")
else:
    timestamp_str = scrape_start_time.strftime('%Y-%m-%d_%H-%M-%S')
    filename = f"exceptions_{timestamp_str}.csv"
    exceptions_df = pd.DataFrame(exceptions_data)
    exceptions_df.to_csv(filename, index=False)


Downloaded 1 posts from socialism so far (time 17:08).
Downloaded 11 posts from socialism so far (time 17:08).
Downloaded 21 posts from socialism so far (time 17:08).
Downloaded 31 posts from socialism so far (time 17:08).
Downloaded 41 posts from socialism so far (time 17:08).
Downloaded 51 posts from socialism so far (time 17:08).
Downloaded 61 posts from socialism so far (time 17:08).
Downloaded 71 posts from socialism so far (time 17:08).
Downloaded 81 posts from socialism so far (time 17:08).
Downloaded 91 posts from socialism so far (time 17:08).
Downloaded 101 posts from socialism so far (time 17:08).
Downloaded 111 posts from socialism so far (time 17:09).
Downloaded 121 posts from socialism so far (time 17:09).
Downloaded 131 posts from socialism so far (time 17:09).
Downloaded 141 posts from socialism so far (time 17:09).
Downloaded 151 posts from socialism so far (time 17:09).
Downloaded 161 posts from socialism so far (time 17:09).
Downloaded 171 posts from socialism so far