In [1]:
import copy
import functools
import getpass
import json
import random
import re
import time
import warnings

import bs4
import markdown
import pandas as pd
import praw
import prawcore
from praw.exceptions import DuplicateReplaceException
from tqdm import tqdm

In [2]:
# Reddit API credentials are read from a local key file holding one value per
# line, in this order: client id, client secret, username, password, user agent.
# If the file cannot be opened, the values are prompted for interactively.
credential_file = "credentials.key"

try:
    with open(credential_file, 'r') as f:
        creds = f.read().split('\n')
    # Only the first five lines are used; a shorter file raises ValueError here.
    personal, secret, username, password, user_agent = creds[:5]
except IOError:
    print("You didn't create a credential file! Please see sample_credentials.key")
    print("Then go to http://www.storybench.org/how-to-scrape-reddit-with-python/")
    print("And register a new app named fastai_reddit in your reddit account.")
    print("And insert the values into sample_credentials.key and save it as {}.".format(credential_file))
    personal = getpass.getpass("Personal key?")
    secret = getpass.getpass("Secret key?")
    # Fixed: this line used `-` instead of `=`, which raised NameError
    # (username was never defined) as soon as the fallback path ran.
    username = getpass.getpass("Username?")
    password = getpass.getpass("Password?")
    user_agent = getpass.getpass("User Agent?")

def noquotes(text):
    """
    Reduce raw Reddit markdown to plain text.

    Despite the name (it began as a markdown-quote stripper), this is the
    general-purpose text cleaner for the notebook. Steps, in order:

    1. Delete every span that starts at a ``>`` and runs through the end of
       that line (or the end of the string).
    2. Delete literal backslash-n sequences, then any remaining backslashes.
    3. Render the remainder from markdown to HTML.
    4. Parse the HTML with the lxml parser and concatenate all text nodes.

    Returns the concatenated text as a single string.
    """
    # Approach taken from:
    # https://stackoverflow.com/questions/761824/python-how-to-convert-markdown-formatted-text-to-text
    without_quotes = re.sub(">.+?(\n|$)", "", text)
    without_escapes = without_quotes.replace("\\n", "")
    without_escapes = without_escapes.replace("\\", "")

    rendered = markdown.markdown(without_escapes)
    soup = bs4.BeautifulSoup(rendered, 'lxml')
    text_nodes = soup.find_all(text=True)

    return ''.join(text_nodes)

# Module-level Reddit API client built from the credential values loaded above;
# `run` looks subreddits up through this instance.
reddit = praw.Reddit(
    client_id=personal,
    client_secret=secret,
    user_agent=user_agent,
    username=username,
    password=password,
)

In [3]:
def handle_api_errors(func):
    """
    Decorator: call ``func`` and, on a Reddit API/server error, retry it.

    When ``func`` raises ``praw.exceptions.APIException`` or
    ``prawcore.exceptions.ServerError`` the error is printed, the wrapper
    sleeps for 60 seconds and calls ``func`` again with the same arguments.
    There is no retry limit: the loop only ends when ``func`` returns or
    raises some other exception, which propagates unchanged.

    Returns a wrapper with the same signature and metadata as ``func``.
    """
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        while True:
            try:
                return func(*args, **kwargs)
            # praw.exceptions does not define ServerError (server-side failures
            # surface as prawcore.exceptions.ServerError). Listing it here made
            # the except clause itself raise AttributeError whenever func raised
            # anything, so nothing was ever retried.
            except (praw.exceptions.APIException, prawcore.exceptions.ServerError) as e:
                print(f'API Error: {e}')
                print(f'Retrying in 60 seconds...')
                time.sleep(60)
    return wrapper

def skip_on_assertion_error(func):
    """
    Decorator: downgrade an ``AssertionError`` raised by ``func`` to a warning.

    The wrapper returns whatever ``func`` returns. If ``func`` raises
    ``AssertionError`` a warning carrying the assertion message is issued and
    the wrapper returns ``None``. Any other exception propagates unchanged.
    """
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        try:
            # Fixed: the result used to be discarded, so every decorated
            # function appeared to return None.
            return func(*args, **kwargs)
        except AssertionError as e:
            warnings.warn(f"AssertionError encountered: {e}")
            return None
    return wrapper

In [4]:
@handle_api_errors
def traverse_replies(comment, parent=None, instruction="Reply to this Reddit post"):
    """
    Turn a comment and everything beneath it into a flat list of records.

    Each record is a dict with the keys ``instruction``, ``input`` (the
    cleaned text this comment replies to), ``output`` (this comment's cleaned
    body) and ``score``.

    Parameters:
        comment: the comment to start from; its ``body``, ``score`` and
            ``replies`` attributes are read.
        parent: text stored as ``input`` for ``comment`` itself. Replies found
            below receive this comment's cleaned body as their ``input``.
        instruction: value stored under ``instruction`` in every record.

    Returns:
        A list of record dicts: this comment first, then its descendants in
        depth-first order.

    ``MoreComments`` placeholders among the replies are expanded on a
    best-effort basis: if expansion or traversal of a placeholder fails, a
    warning is issued and the records gathered so far are kept.
    """
    # Clean the body once: it is this record's output and also the input of
    # every direct reply (it used to be re-parsed for each reply).
    body = noquotes(comment.body)
    results = [{"instruction": instruction, "input": parent, "output": body, "score": comment.score}]

    for reply in comment.replies:
        if not isinstance(reply, praw.models.MoreComments):
            results.extend(traverse_replies(reply, parent=body, instruction=instruction))
            continue

        # Replace the MoreComments placeholder with the comments it stands for.
        try:
            children = reply.comments()
            # comments() may hand back a CommentForest that itself still holds
            # placeholders; only a forest has replace_more().
            if isinstance(children, praw.models.comment_forest.CommentForest):
                children.replace_more(limit=None)
            for child in children:
                results.extend(traverse_replies(child, parent=body, instruction=instruction))
        except Exception as e:
            # Best effort: report the failure instead of hiding it, then move on.
            warnings.warn(f"Skipping MoreComments that could not be expanded: {e!r}")

    return results

In [5]:
def run(sub, reply_instruction="Reply to this Reddit post", create_instruction="Create a Reddit post", limit=1000):
    """
    Scrape one subreddit into instruction-style records and save them as JSON.

    Posts are gathered from the ``top``, ``new``, ``rising`` and
    ``controversial`` listings (up to ``limit`` each) and de-duplicated.
    Posts without comments are skipped. Every remaining post yields one
    "create" record plus one "reply" record per comment reached through
    ``traverse_replies``.

    Parameters:
        sub: subreddit name; also used as the output file stem.
        reply_instruction: ``instruction`` value for comment records.
        create_instruction: ``instruction`` value for post records.
        limit: maximum number of posts requested from each listing.

    Returns:
        The list of record dicts, which is also written to ``<sub>.json`` in
        the current working directory.

    Failures while expanding or traversing a single comment thread are
    reported with a warning and that thread is skipped; they do not stop the
    run.
    """
    subreddit = reddit.subreddit(sub)
    listings = (subreddit.top, subreddit.new, subreddit.rising, subreddit.controversial)
    # A set drops submissions that show up in more than one listing.
    posts = list({post for listing in listings for post in listing(limit=limit)})

    output = []
    for p in tqdm(posts):
        # Iterate the TOP-LEVEL comments only. The previous code walked
        # p.comments.list(), the flattened tree, and recursed from every entry,
        # so each nested comment was emitted again for every ancestor and once
        # more with the post text as its parent.
        top_level = list(p.comments)
        if not top_level:
            continue

        try:
            selftext = noquotes(p.selftext)
        except AttributeError:
            selftext = ""
        posttext = "[Title:] {}\n[Selftext:]\n{}".format(p.title, selftext)
        output.append({"instruction": create_instruction, "input": "", "output": posttext, "score": p.score})

        for reply in top_level:
            if isinstance(reply, praw.models.MoreComments):
                try:
                    expanded = reply.comments()
                    # Only a CommentForest has replace_more(); calling it on a
                    # plain list raised AttributeError and dropped the batch.
                    if isinstance(expanded, praw.models.comment_forest.CommentForest):
                        expanded.replace_more(limit=None)
                    roots = list(expanded)
                    # NOTE(review): assumes every comment returned here is a direct
                    # reply to the post -- confirm against the PRAW documentation.
                except Exception as e:
                    warnings.warn("Exception encountered when expanding MoreComments: {!r}".format(e))
                    continue
            else:
                roots = [reply]

            for root in roots:
                try:
                    records = traverse_replies(root, posttext)
                except Exception as e:
                    warnings.warn("Exception encountered when traversing replies: {!r}".format(e))
                    continue
                # Honour reply_instruction, which was previously accepted but ignored.
                for record in records:
                    record["instruction"] = reply_instruction
                output.extend(records)

    with open("{}.json".format(sub), 'w') as f:
        json.dump(output, f)
    return output

In [6]:
# Scrape each listed subreddit in turn; a failure in one is printed and does
# not stop the remaining ones. The records returned by run() are not kept here
# (run() also writes them to "<sub>.json").
subreddits = ['berkeley']
for sub in subreddits:
    try:
        print(sub)
        run(sub)
    except Exception as e:
        failure = "{}, {}".format(sub, e)
        print(failure)

berkeley


100%|████████████████████████████████████████████████████████████████████████████| 2962/2962 [3:05:06<00:00,  3.75s/it]
