In [3]:
from datetime import datetime, timedelta, timezone
from hashlib import sha256
import json
import os
import sys
import time

import numpy as np
import pandas as pd
import praw
import pytz



In [4]:
# Home directory, resolved portably: expanduser honours $HOME when it is set
# and falls back to the platform's notion of home when it is not, whereas
# os.environ['HOME'] raises KeyError in that case.
HOME = os.path.expanduser('~')
KEYS = f'{HOME}/config/db.json'  # local credentials file
ENV = 'dev'

# Timezone-aware current time in UTC. datetime.utcnow() is deprecated; this
# yields the same aware value (tzinfo=pytz.utc) without the naive intermediate.
NOW = datetime.now(pytz.utc)
TODAY = NOW.date()

# Working directories, relative to the notebook's location.
DATA_DIR = '../data'
SQL_DIR = '../sql'
OUT_DIR = '../output'

In [5]:
# Read the local keys: parse the JSON credentials file at KEYS.
# The rest of the notebook expects a top-level 'reddit' object holding
# 'client-id', 'client-secret' and 'user-agent'.
with open(KEYS, encoding='utf-8') as keys_file:
    config = json.load(keys_file)

In [8]:
# Build the PRAW client from the 'reddit' section of the loaded config.
reddit = praw.Reddit(
    client_id=config['reddit']['client-id'],
    client_secret=config['reddit']['client-secret'],
    user_agent=config['reddit']['user-agent'],
)

Version 7.2.0 of praw is outdated. Version 7.4.0 was released Friday July 30, 2021.


In [9]:
# Quick check that the client works: request up to 10 entries from the
# "hot" listing of r/networking and print each post's title.
hot_networking_posts = reddit.subreddit('networking').hot(limit=10)

for post in hot_networking_posts:
    print(post.title)

Blogpost Friday!
Moronic Monday!
Alternative for Cisco ACI
Best pass through crimper?
ASN Usage for Independent Sites
OSPF design for Branch Office / Datacentre connectivity
Cisco L2VPN xconnect to Huawei L2VPN VSI
Creating a multi simcard network that is mobile
Automation Optimization - simultaneous netmiko ssh connections
Fiber distribution panel


In [13]:
def get_subreddits(path=None):
    """Load the list of subreddit names to process.

    Reads a CSV that has a ``subreddits`` column and strips a leading ``r/``
    from each entry, so both ``r/networking`` and ``networking`` are accepted.

    Args:
        path: CSV file to read. Defaults to ``subreddits.csv`` inside
            ``DATA_DIR``.

    Returns:
        pandas.Series (named ``subreddits``) of bare subreddit names.

    Raises:
        FileNotFoundError: if the CSV does not exist.
        KeyError: if the CSV has no ``subreddits`` column.
    """
    if path is None:
        path = DATA_DIR + '/subreddits.csv'
    names = pd.read_csv(path)['subreddits']
    # Anchor the pattern so only the prefix is removed (an unanchored 'r/'
    # would also eat matches inside a name), and pass regex=True explicitly
    # because the default of this flag differs between pandas versions.
    return names.str.replace(r'^r/', '', regex=True)

In [14]:
# Load the subreddit names; the bare name on the last line displays the
# resulting Series as the cell output.
subreddits = get_subreddits()
subreddits

0         networking
1           sysadmin
2             devops
3      cybersecurity
4    MachineLearning
5        programming
6           hardware
Name: subreddits, dtype: object

In [15]:
def _convert_epoch_to_datetime(epoch_time):
    return datetime.utcfromtimestamp(epoch_time).strftime('%Y-%m-%d %H:%M:%S')

In [16]:
def _make_hash(*var_args):
        args = [str(arg) for arg in var_args]
        hashed = sha256(''.join(args).encode('utf-8')).hexdigest()
        return hashed

In [17]:
def _delete_fields(comment_obj):
    fields_to_del = ['_reddit','_replies','awarders',
                    'user_reports','treatment_tags',
                    'all_awardings','author_flair_richtext',
                    'gildings', 'mod_reports']
    for field in fields_to_del:
        if field in comment_obj.keys():
            del comment_obj[field]
    return comment_obj
    

In [18]:
def _stringify_fields(comment_instance, comment_obj):
    comment_obj.update({'author' : str(comment_instance.author)})
    # comment_obj.update({'_submission': str(comment_instance.submission.title)})
    comment_obj.update({'subreddit': str(comment_instance.subreddit)})
    return comment_obj

In [19]:
def _update_comment(comment_obj):
    """Add the derived ``sha_id`` and ``created_at`` keys to ``comment_obj``.

    ``sha_id`` is a hash over the comment's id, author, body, parent id,
    subreddit and subreddit id; ``created_at`` is ``created_utc`` rendered
    as text. The dict is modified in place and returned.

    Raises:
        KeyError: if any of the keys read here is missing.
    """
    hashed_keys = ('id', 'author', 'body',
                   'parent_id', 'subreddit', 'subreddit_id')
    identity = [comment_obj[key] for key in hashed_keys]
    # The list is handed over as ONE positional argument, so the digest is
    # taken over the list's str() form rather than the joined field values.
    comment_obj['sha_id'] = _make_hash(identity)
    created_at = _convert_epoch_to_datetime(comment_obj['created_utc'])
    comment_obj['created_at'] = created_at
    return comment_obj

In [25]:
# reddit comment model: https://praw.readthedocs.io/en/latest/code_overview/models/comment.html
def get_subreddit_comments(subreddit_name, limit=10):
    """Collect comments from one subreddit as plain dictionaries.

    For every comment yielded by ``subreddit.comments(limit=limit)`` the
    instance attributes are copied into a dict, author/subreddit are turned
    into strings, unwanted fields are dropped and ``sha_id``/``created_at``
    are added. Progress and each comment's submission title are printed.

    Args:
        subreddit_name: subreddit name without the ``r/`` prefix.
        limit: maximum number of comments to request (default 10).

    Returns:
        list of dicts, one per comment.
    """
    comments = []
    print(f'\n\n--\nObtaining data for r/{subreddit_name}...')
    subreddit = reddit.subreddit(subreddit_name)  # subreddit api
    for comment_instance in subreddit.comments(limit=limit):
        # vars() returns the instance's live __dict__. Work on a shallow copy
        # so the edits below do not alter the PRAW object itself, and so that
        # nothing stored on the instance afterwards shows up in the export.
        # NOTE(review): reading `.submission` appears to cache a Submission
        # object on the instance, which would not be JSON-serializable;
        # confirm against the PRAW version in use.
        comment_obj = dict(vars(comment_instance))
        submission_title = comment_instance.submission.title
        print('submission_title = ', submission_title)
        # stringify fields
        comment_obj = _stringify_fields(comment_instance, comment_obj)
        # delete fields
        comment_obj = _delete_fields(comment_obj)
        # update object with sha_id and created_at
        comment_obj = _update_comment(comment_obj)
        comments.append(comment_obj)
    print(' - No. of comments: ', len(comments))
    return comments

In [26]:
def print_max_col_length(comments_df):
    '''
    Print, for every column, the length of its longest value.

    Each cell is converted to text first, so the figure is the character
    count of the cell's string form. Columns of a frame without any rows
    are reported with length 0.

    Args:
        comments_df: pandas DataFrame to inspect.
    '''
    comment_vals_as_str = comments_df.values.astype(str)
    if comment_vals_as_str.size:
        # Per-cell character counts, then the maximum down each column.
        longest = np.char.str_len(comment_vals_as_str).max(axis=0)
    else:
        # max() over a zero-size array raises, so report zeros instead.
        longest = np.zeros(len(comments_df.columns), dtype=int)
    # Plain ints keep the printed repr stable across NumPy versions.
    max_col_lengths = {
        column: int(length)
        for column, length in zip(comments_df.columns, longest)
    }

    print('\nmax column lengths:')
    print("\n".join("{!r}: {!r}".format(k, v) for k, v in max_col_lengths.items()) + "\n")

In [27]:
def save_data(data, outpath):
    """Write ``data`` to ``outpath`` as JSON Lines (one JSON document per line).

    Args:
        data: iterable of JSON-serializable entries.
        outpath: destination file; it is overwritten if it exists, and its
            parent directory is created when missing.

    Raises:
        TypeError: if an entry cannot be serialized by ``json.dump``.
    """
    print(f'\nSaving data to {outpath}...')

    # A fresh checkout may not have the output directory yet.
    parent_dir = os.path.dirname(outpath)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(outpath, 'w', encoding='utf-8') as outfile:
        for entry in data:
            json.dump(entry, outfile)
            outfile.write('\n')
    

In [28]:
# Element-wise len(): np.vectorize wraps the builtin so it can be called on
# an array and applied to each item.
measurer = np.vectorize(len)

In [33]:
# Fetch, save and tabulate the comments of every subreddit, timing each stage.
# time.perf_counter() is used for the intervals: it is monotonic and has a
# finer resolution than time.time(), whose coarse ticks can make short steps
# show up as 0.0000s.
start = time.perf_counter()
for subreddit in subreddits:
    sub_start = time.perf_counter()
    subreddit_comments = get_subreddit_comments(subreddit)
    sub_end = time.perf_counter()
    print(f' - Time taken to get data from r/{subreddit}: {(sub_end-sub_start):.4f}s')

    write_start = time.perf_counter()
    save_data(subreddit_comments, f'{OUT_DIR}/{subreddit}.jsonl')
    write_end = time.perf_counter()
    print(f' - Time taken to write data for r/{subreddit}: {(write_end-write_start):.4f}s')

    # Flattened view of this subreddit's comments; after the loop the name
    # refers to the frame of the last subreddit processed.
    comments_df = pd.json_normalize(subreddit_comments)
end = time.perf_counter()

# report the overall duration
print(f'\n\nTotal run time: {(end-start):0.4f}s')





--
Obtaining data for r/networking...
submission_title =  Alternative for Cisco ACI
submission_title =  Best pass through crimper?
submission_title =  Alternative for Cisco ACI
submission_title =  Alternative for Cisco ACI
submission_title =  Best pass through crimper?
submission_title =  Alternative for Cisco ACI
submission_title =  ASN Usage for Independent Sites
submission_title =  Creating a multi simcard network that is mobile
submission_title =  Best pass through crimper?
submission_title =  Creating a multi simcard network that is mobile
 - No. of comments:  10
 - Time taken to get data from r/networking: 4.1614s
 - Time taken to write data for r/networking: 0.0000s


--
Obtaining data for r/sysadmin...
submission_title =  Babysitting users. New responsibility for a Sysadmin.
submission_title =  Windows internals 7th edition part 2 is out
submission_title =  What operating systems do you all use outside of work?
submission_title =  What operating systems do you all use outside