In [1]:
import praw
import re
import random
import bs4, markdown
from tqdm import tqdm
import copy
import pandas as pd
import time
import json
import warnings
from praw.exceptions import DuplicateReplaceException
import prawcore

In [2]:
# Path of the plain-text credential file: one value per line, in the order
# personal key, secret key, username, password, user agent.
credential_file = "credentials.key"

try:
    with open(credential_file, 'r') as f:
        creds = f.read().splitlines()
    # Unpacking raises ValueError when the file has fewer than five lines, so a
    # truncated file falls through to the interactive prompts instead of
    # crashing with an IndexError half-way through the assignments.
    personal, secret, username, password, user_agent = [line.strip() for line in creds[:5]]
except (IOError, ValueError) as e:
    # IOError: file missing or unreadable. ValueError: file is incomplete.
    print("Could not load credentials from {}: {}".format(credential_file, e))
    print("You didn't create a credential file! Please see sample_credentials.key")
    print("Then go to http://www.storybench.org/how-to-scrape-reddit-with-python/")
    print("And register a new app named fastai_reddit in your reddit account.")
    print("And insert the values into sample_credentials.key and save it as {}.".format(credential_file))
    # Fall back to prompting so that secrets never need to be typed into a cell.
    import getpass
    personal = getpass.getpass("Personal key?")
    secret = getpass.getpass("Secret key?")
    # Was `username - getpass.getpass(...)`: a subtraction on an undefined name
    # (NameError) rather than an assignment.
    username = getpass.getpass("Username?")
    password = getpass.getpass("Password?")
    user_agent = getpass.getpass("User Agent?")

def noquotes(text):
    """
    Convert raw Reddit markdown into plain text.

    This function first started out as a way to remove markdown quotes from raw
    Reddit markdown text, but now it's more of a general-purpose text parser;
    the name hasn't changed.

    Steps, in order:
      1. Drop quoted lines (lines that begin with ``>``).
      2. Remove literal backslash-n sequences and any remaining backslashes.
      3. Render the markdown to HTML and keep only the text nodes.

    Parameters
    ----------
    text : str
        Raw markdown. Empty or ``None`` input yields an empty string.

    Returns
    -------
    str
        The text content with quotes and markdown formatting removed.
    """
    if not text:
        return ""

    # Only strip '>' when it opens a line (a markdown blockquote, optionally
    # indented by up to three spaces). The previous unanchored pattern also
    # matched '>' in the middle of ordinary text and deleted the rest of the line.
    unquoted = re.sub(r"^ {0,3}>[^\n]*\n?", "", text, flags=re.MULTILINE)
    cleaned = unquoted.replace("\\n", "").replace("\\", "")

    # https://stackoverflow.com/questions/761824/python-how-to-convert-markdown-formatted-text-to-text
    html = markdown.markdown(cleaned)
    soup = bs4.BeautifulSoup(html, 'lxml')
    # find_all(string=True) is the current spelling of findAll(text=True).
    return ''.join(soup.find_all(string=True))

# Module-level PRAW client built from the credentials loaded above; the
# functions get_parent() and run() read this global.
reddit = praw.Reddit(
    client_id=personal,
    client_secret=secret,
    user_agent=user_agent,
    username=username,
    password=password,
)

In [14]:
def get_parent(comment_id):
    """
    Look up what a comment was replying to.

    Parameters
    ----------
    comment_id : str
        ID of the comment whose parent should be fetched through the
        module-level ``reddit`` client.

    Returns
    -------
    dict or None
        ``{"title": <post title>, "text": <selftext, or "" for non-self posts>}``
        when the parent is a submission; ``{"title": None, "text": <cleaned body>}``
        when the parent is a comment (body passed through ``noquotes``);
        ``None`` when the parent is neither.
    """
    parent = reddit.comment(comment_id).parent()

    # Replying to another comment: there is no title, only the cleaned body.
    if isinstance(parent, praw.models.Comment):
        return {"title": None, "text": noquotes(parent.body)}

    # Anything that is not a submission either has no usable parent.
    if not isinstance(parent, praw.models.Submission):
        return None

    # Replying to a post: the selftext is only used for self posts.
    post_title = parent.title
    post_body = parent.selftext if parent.is_self else ""
    return {"title": post_title, "text": post_body}

In [15]:
def getall(x, limit = None):
    """
    Collect the union of a listing's ``top``, ``new`` and ``controversial`` items.

    Parameters
    ----------
    x : object
        Anything exposing ``top``, ``new`` and ``controversial`` methods that
        accept a ``limit`` keyword and return an iterable of hashable items
        (here: a redditor's ``comments`` or ``submissions`` listing).
    limit : int or None
        Forwarded unchanged to each of the three listing calls.

    Returns
    -------
    list
        Each distinct item once, in first-seen order (top, then new, then
        controversial). The previous ``list(set(...))`` returned the same items
        in arbitrary order, which made the generated dataset order
        irreproducible between runs.
    """
    combined = []
    for listing in (x.top, x.new, x.controversial):
        combined.extend(listing(limit = limit))
    # dict keys keep insertion order, giving de-duplication without shuffling.
    return list(dict.fromkeys(combined))

In [22]:
def run(username, reply_instruction="Reply to this Reddit post", create_instruction="Create a Reddit post"):
    """
    Build an instruction-style dataset from one redditor's comments and posts.

    For every comment, the parent (post or comment) becomes the ``input`` and
    the cleaned comment body becomes the ``output``. For every post, the
    ``input`` is empty and the ``output`` is the title plus cleaned selftext.
    The records are written to ``"<username>_user.json"`` in the current
    working directory and also returned.

    Parameters
    ----------
    username : str
        Reddit account to scrape (also used for the output file name).
    reply_instruction : str
        ``instruction`` value stored on records built from comments.
    create_instruction : str
        ``instruction`` value stored on records built from posts.

    Returns
    -------
    list of dict
        Records with the keys ``instruction``, ``input`` and ``output``.
    """
    post_template = "[Title:] {}\n[Selftext:]\n{}"
    output = []

    redditor = reddit.redditor(username)
    comms = getall(redditor.comments)
    print(len(comms))
    posts = getall(redditor.submissions)
    print(len(posts))

    for c in tqdm(comms):
        parent = get_parent(c.id)
        if parent is None:
            # get_parent() returns None when the parent is neither a post nor a
            # comment. Skip it rather than raise AttributeError on `.get` and
            # lose every record gathered so far.
            warnings.warn("Skipping comment {}: no usable parent found".format(c.id))
            continue
        title = parent.get('title')
        text = parent.get('text')
        # Comment parents carry no title, so their text is used as-is.
        posttext = post_template.format(title, text) if title else text
        output.append({"instruction": reply_instruction, "input": posttext, "output": noquotes(c.body)})

    for p in tqdm(posts):
        title = p.title
        try:
            selftext = noquotes(p.selftext)
        except AttributeError:
            # Treat a post without a selftext attribute as having no body.
            selftext = ""
        output.append({"instruction": create_instruction, "input": "", "output": post_template.format(title, selftext)})

    with open("{}_user.json".format(username), 'w') as f:
        json.dump(output, f)
    return output

In [29]:
# Build and save the dataset for the Reddit user "ragnarkar". The recorded
# progress output below shows the comment loop taking about 2.5 hours for
# 2514 comments, so expect a full re-run of this cell to be slow.
g = run("ragnarkar")

2514
669


100%|████████████████████████████████████████████████████████████████████████████| 2514/2514 [2:30:46<00:00,  3.60s/it]
100%|██████████████████████████████████████████████████████████████████████████████| 669/669 [00:00<00:00, 1140.36it/s]
