In [1]:
import numpy as np
import pandas as pd
import praw
from tqdm import tqdm
import py2neo as pn

In [2]:
submissions = [
    'https://www.reddit.com/r/sanfrancisco/comments/bs5f69/just_had_the_elementary_school_lottery_explained/',
    'https://www.reddit.com/r/sanfrancisco/comments/7r3cy3/how_the_san_francisco_school_lottery_works_and/',
    'https://www.reddit.com/r/sanfrancisco/comments/4ah4no/fuck_the_sf_school_lottery_thats_all/',
    'https://www.reddit.com/r/sanfrancisco/comments/b5kbse/how_the_student_assignment_system_works_sfusd/',
    'https://www.reddit.com/r/sanfrancisco/comments/9hh9z8/two_sf_school_board_members_to_introduce/',
    'https://www.reddit.com/r/sanfrancisco/comments/4646v8/experience_with_enrolling_in_sfusd_school/',
    'https://www.reddit.com/r/sanfrancisco/comments/a5nrej/sf_school_board_plans_to_replace_muchcriticized/',
    'https://www.reddit.com/r/sanfrancisco/comments/bhcxhb/san_francisco_had_an_ambitious_plan_to_tackle/',
    'https://www.reddit.com/r/sanfrancisco/comments/5e5834/i_made_a_website_of_sf_elementary_school_test/',
    'https://www.reddit.com/r/sanfrancisco/comments/cg5coh/sfusd_kindergarten/'
]

Connect to Reddit

In [3]:
reddit = praw.Reddit(client_id='8bgMudNSu0bC6Q',
                     client_secret='4evDbSmLfVBsuV6X8hoX8XHhaCA',
                     user_agent='testscript by /u/catlady900')

Connect to DB

In [4]:
g = pn.Graph(auth=("neo4j", "paBa%Wp36^"))

Generate queries to add data

In [7]:
def setup_db():
    query = "CREATE CONSTRAINT ON (n:User) ASSERT n.id IS UNIQUE;\
    CREATE CONSTRAINT ON (n:Submission) ASSERT n.id IS UNIQUE;\
    CREATE CONSTRAINT ON (n:Comment) ASSERT n.id IS UNIQUE;\
    CREATE CONSTRAINT ON (n:Subreddit) ASSERT n.id IS UNIQUE;"

In [48]:
def get_attr_dict(obj, atts):
    if obj is None: return None
    attrs = dict()
    for v in atts:
        if hasattr(obj, v):
            attrs[v] = getattr(obj, v)
    return attrs

def get_submission_dict(submission):
    return get_attr_dict(
        submission,
        ["id", "text", "url", "score", "upvote_ratio"]
    )
    
def get_user_dict(author):
    return get_attr_dict(
        author,
        ["id","name","comment_karma","created","link_karma"]
    )

def get_subreddit_dict(subreddit):
    return get_attr_dict(
        subreddit,
        ["id", "display_name", "descr", "created", "subscribers"]
    )

def get_comment_dict(comment):
    return get_attr_dict(
        comment,
        ["id", 'text', "score", "url"]
    )

In [22]:
def add_subreddit(subr, graph = g):
    if subr is None: 
        return None
    srs = graph.nodes.match("Subreddit", id=subr.id)
    if len(srs) > 0:
        return srs.first()
    subreddit = pn.Node("Subreddit", **get_subreddit_dict(subr))
    t = graph.begin()
    t.create(subreddit)
    t.commit()
    return subreddit
    
def add_user(user, graph = g):
    if user is None:
        return None
    usrs = graph.nodes.match("User", name= user.name)
    if len(usrs) > 0:
        return usrs.first()
    user = pn.Node("User", **get_user_dict(user))
    db = graph.begin()
    db.create(user)
    db.commit()
    return user
    
def add_submission(subm, graph = g):
    if subm is None:
        return
    if len(graph.nodes.match("Submission", id=subm.id)) > 0:
        return
    
    subreddit = graph.nodes.match("Subreddit", id=subm.subreddit.id)
    if len(subreddit) == 0:
        subreddit = add_subreddit(reddit.subreddit(subm.subreddit.display_name))
    else:
        subreddit = subreddit.first()
    
    if subm.author is not None:
        user = graph.nodes.match("User", name = subm.author.name)
        if len(user) == 0:
            user = add_user(reddit.redditor(subm.author.name))
        else:
            user = user.first()
    
    submission = pn.Node("Submission", **get_submission_dict(subm))
    if subm.author is not None:
        posted = pn.Relationship(user, "POSTED", submission)
    posted_on = pn.Relationship(submission, "POSTED_ON", subreddit)
    db = graph.begin()
    db.create(submission)
    if subm.author is not None:
        db.create(posted)
    db.create(posted_on)
    db.commit()
    return submission

def add_comment(comment, graph = g):
    if comment is None:
        return
    if len(graph.nodes.match("Comment", id = comment.id)) > 0:
        return
    
    if comment.author is not None:
        user = graph.nodes.match("User", name = comment.author.name)
        if len(user) == 0:
            user = add_user(reddit.redditor(comment.author.name))
        else:
            user = user.first()

    parent_id = comment.parent_id[3:]
 
    if comment.parent_id == comment.link_id:
        parent = graph.nodes.match("Submission", id = parent_id)
        if len(parent) == 0:
            parent = add_submission(reddit.submission(id = parent_id))
        else:
            parent = parent.first()
    else:
        parent = graph.nodes.match("Comment", id = parent_id)
        if len(parent) == 0:
            parent = add_comment(reddit.comment(id = parent_id))
        else:
            parent = parent.first()
    
    c = pn.Node("Comment", **get_comment_dict(comment))
    if comment.author is not None:
        posted = pn.Relationship(user, "POSTED", c)
    posted_on = pn.Relationship(c, "REPLY_TO", parent)
    db = graph.begin()
    db.create(c)
    if comment.author is not None:
        db.create(posted)
    db.create(posted_on)
    db.commit()
    return c

In [28]:
sf = reddit.subreddit("sanfrancisco")

In [56]:
srs = g.nodes.match("Subreddit", id=sf.id)

In [70]:
submission = reddit.submission(url = submissions[3])

In [11]:
sbs = g.nodes.match("Submission", id=submission.id)

In [13]:
sb1 = sbs.first()

In [51]:
add_subreddit(sf)

(_21:Subreddit {created: 1201272790.0, display_name: 'sanfrancisco', id: '2qh3u', subscribers: 147049})

In [34]:
get_subreddit_dict(sf)

{'created': 1558659585.0}

In [69]:
get_submission_dict(reddit.submission(url = submissions[3]))

{'id': 'b5kbse',
 'url': 'https://www.youtube.com/watch?v=-CQ-sZyFLz4',
 'score': 2,
 'upvote_ratio': 0.63}

In [71]:
comments = submission.comments.list()

In [84]:
get_comment_dict(comments[4])

{'id': 'ejf6xkm', 'score': 1}

In [86]:
get_comment_dict(reddit.comment(id = "ejf6xkm"))

{'id': 'ejf6xkm', 'score': 1}

In [68]:
write_submission(reddit.submission(url = submissions[3]))

In [123]:
mike = reddit.redditor('instant_michael')
print(mike.name)

instant_michael


In [124]:
vars(mike)

{'_reddit': <praw.reddit.Reddit at 0x11b382b10>,
 '_fetched': False,
 '_listing_use_sort': True,
 'name': 'instant_michael'}

In [130]:
write_submission(db, reddit.submission(url = submissions[2]))

In [63]:
def write_submission(submission): 
    # Add submission and subreddit
    add_submission(submission)
    
    # Replace any "load more comments"
    submission.comments.replace_more(limit = None)
    
    # Add Comments and Replies
    for comment in submission.comments.list():
        add_comment(comment)

In [131]:
for i in tqdm(range(len(submissions))):
    write_submission(reddit.submission(url = submissions[i]))


  0%|          | 0/7 [00:00<?, ?it/s][A
 14%|█▍        | 1/7 [00:10<01:02, 10.37s/it][A
 29%|██▊       | 2/7 [02:35<04:13, 50.77s/it][A
 43%|████▎     | 3/7 [02:48<02:38, 39.55s/it][A
 57%|█████▋    | 4/7 [03:17<01:48, 36.30s/it][A
 71%|███████▏  | 5/7 [07:04<03:06, 93.41s/it][A
 86%|████████▌ | 6/7 [07:28<01:12, 72.71s/it][A
100%|██████████| 7/7 [07:36<00:00, 65.28s/it]
