# Reddit Mining

## Working with timestamps

In [19]:
from datetime import datetime

In [24]:
# Current local date and time as a naive datetime (no tzinfo attached).
datetime.now()

datetime.datetime(2018, 9, 27, 10, 48, 59, 202591)

In [39]:
# '%c' renders the locale's standard date-and-time representation.
datetime.now().strftime('%c')

'Thu Sep 27 10:54:37 2018'

[Date format reference](http://strftime.org)

In [196]:
# Compact display format used throughout the notebook: day of month,
# abbreviated month name and two-digit year, e.g. "27 Sep '18".
# NOTE: `%-d` (day without zero padding) is a platform-specific strftime
# extension; it is not available on Windows, where `%#d` is the equivalent.
date_fmt = "%-d %b '%y"

In [197]:
# Render the current local time with the compact format stored in `date_fmt`.
datetime.now().strftime(date_fmt)

"27 Sep '18"

In [29]:
# Midnight on 24 Jan 2018 as a naive datetime, followed by its Unix epoch
# (seconds); naive datetimes are interpreted in the machine's local timezone.
d = datetime(2018, 1, 24)
d.timestamp()

1516780800.0

In [31]:
# Inverse direction: turn a Unix epoch (seconds) into a naive datetime
# expressed in the machine's local timezone.
datetime.fromtimestamp(1538069887)

datetime.datetime(2018, 9, 27, 10, 38, 7)

In [192]:
ts = 1537916303.0
# Read as a plain Unix epoch this value is Tue Sep 25 22:58:23 2018 UTC,
# but the moment it is meant to describe should be
#   Tue Sep 25 14:58:23 2018 UTC
#   Tue Sep 25  7:58:23 2018 GMT-7 (PDT)
# i.e. the raw number is 8 hours ahead of the expected time.
# NOTE(review): presumably this value came from a field that is not a true
# UTC epoch (Reddit exposes both `created` and `created_utc`) -- confirm.

## Raw Pushshift

In [35]:
import requests
import pandas as pd

### Comments

[Params doc](https://github.com/pushshift/api#search-parameters-for-comments)

In [88]:
# NOTE: `comments` is only created by the request cell further down, so on a
# fresh kernel that cell has to be run before this one.
comments.columns # all columns

Index(['author', 'author_flair_background_color', 'author_flair_css_class',
       'author_flair_richtext', 'author_flair_template_id',
       'author_flair_text', 'author_flair_text_color', 'author_flair_type',
       'author_fullname', 'body', 'created_utc', 'gildings', 'id', 'link_id',
       'no_follow', 'parent_id', 'permalink', 'retrieved_on', 'score',
       'send_replies', 'stickied', 'subreddit', 'subreddit_id'],
      dtype='object')

In [207]:
%%time
endpoint = 'https://api.pushshift.io/reddit/search/comment'
params = dict(
    q='trump',
    subreddit='politics',

    after='3d',  # in the past 300 days
#     before='2d', after='4d', # between 2 and 4 days ago

#     fields='created_utc,body,score',
    sort='asc',
    sort_type='score',
    size=20,  # <= 500
)

response = requests.get(endpoint, params)
content = response.json()
comments = pd.DataFrame(content['data'])

CPU times: user 21.3 ms, sys: 2.59 ms, total: 23.9 ms
Wall time: 555 ms


In [208]:
# Replace both epoch columns with short date strings (local timezone,
# formatted with `date_fmt`). The columns are overwritten in place, so this
# cell expects freshly fetched numeric timestamps.
comments['created_utc'] = comments['created_utc'].map(
    lambda epoch: datetime.fromtimestamp(epoch).strftime(date_fmt)
)
comments['retrieved_on'] = comments['retrieved_on'].map(
    lambda epoch: datetime.fromtimestamp(epoch).strftime(date_fmt)
)

In [209]:
# Preview the first ten comments: creation date, text and score.
comments[['created_utc', 'body' ,'score']].head(10)

Unnamed: 0,created_utc,body,score
0,24 Sep '18,"It's pretty simple, they investigate the named...",1
1,24 Sep '18,"&gt; The deputy attorney general, Rod J. Rosen...",1
2,24 Sep '18,"We just heard all about how he said ""we should...",1
3,24 Sep '18,See you’re proving what I said/moving the goal...,1
4,24 Sep '18,They will try. But judging from the bot/troll ...,1
5,24 Sep '18,Trump &gt; Pence\n\nOur VP is terrifying,1
6,24 Sep '18,Oh really? By whom?\n\nTrump? \n\nI realize yo...,1
7,24 Sep '18,My comment has nothing to do with who I suppor...,1
8,24 Sep '18,Why him? Because he would support Trump in th...,1
9,24 Sep '18,so was electing trump,1


### Posts

[Params doc](https://github.com/pushshift/api#search-parameters-for-submissions)

In [203]:
%%time
endpoint = 'https://api.pushshift.io/reddit/search/submission'
params = dict(
    q='trump',
    subreddit='politics',
    after='1d',
    
    sort='desc',
    sort_type='score',
    size=20,  # <= 500
)

response = requests.get(endpoint, params)
content = response.json()
posts = pd.DataFrame(content['data'])

CPU times: user 22.5 ms, sys: 2.8 ms, total: 25.3 ms
Wall time: 663 ms


In [204]:
# Replace both epoch columns with short date strings (local timezone,
# formatted with `date_fmt`). The columns are overwritten in place, so this
# cell expects freshly fetched numeric timestamps.
posts['created_utc'] = posts['created_utc'].map(
    lambda epoch: datetime.fromtimestamp(epoch).strftime(date_fmt)
)
posts['retrieved_on'] = posts['retrieved_on'].map(
    lambda epoch: datetime.fromtimestamp(epoch).strftime(date_fmt)
)

In [205]:
# List every column Pushshift returned for the submissions.
posts.columns

Index(['author', 'author_cakeday', 'author_flair_background_color',
       'author_flair_css_class', 'author_flair_richtext',
       'author_flair_template_id', 'author_flair_text',
       'author_flair_text_color', 'author_flair_type', 'author_fullname',
       'can_mod_post', 'contest_mode', 'created_utc', 'domain', 'full_link',
       'gildings', 'id', 'is_crosspostable', 'is_meta', 'is_original_content',
       'is_reddit_media_domain', 'is_self', 'is_video',
       'link_flair_background_color', 'link_flair_css_class',
       'link_flair_richtext', 'link_flair_text', 'link_flair_text_color',
       'link_flair_type', 'locked', 'media_only', 'no_follow', 'num_comments',
       'num_crossposts', 'over_18', 'parent_whitelist_status', 'permalink',
       'pinned', 'post_hint', 'preview', 'pwls', 'retrieved_on', 'score',
       'selftext', 'send_replies', 'spoiler', 'stickied', 'subreddit',
       'subreddit_id', 'subreddit_subscribers', 'subreddit_type', 'thumbnail',
       'thumbnail

In [206]:
# Preview the first ten submissions: title, creation date and score.
posts[['title', 'created_utc', 'score']].head(10)

Unnamed: 0,title,created_utc,score
0,Democratic Senator Says Trump Complicit In 'Co...,27 Sep '18,5
1,The Latest: Trump says he may delay meeting wi...,26 Sep '18,1
2,Trump to China: ‘I Own You.’ Guess Again. The ...,26 Sep '18,1
3,If Trump Fires Rosenstein... Here's the Rapid ...,26 Sep '18,1
4,Trump Supporter on MSNBC Tells Her Daughters T...,26 Sep '18,1
5,Trump defends his own past as Kavanaugh faces ...,26 Sep '18,1
6,DONALD TRUMP SAYS CHINA RESPECTS HIM BECAUSE O...,26 Sep '18,1
7,"Amid Trump-China tariff tiff, Cisco kit prices...",26 Sep '18,1
8,Trump says he rejected a meeting with Trudeau ...,26 Sep '18,1
9,KKK’s official newspaper supports Donald Trump...,26 Sep '18,1


## Pushshift Wrapper

In [116]:
from psaw import PushshiftAPI

In [116]:
# psaw client created with its defaults (no PRAW instance attached).
api = PushshiftAPI()

In [138]:
# Same query as the raw Pushshift submission search, issued through psaw.
# The keyword is `subreddit` (singular), matching the parameter used by the
# raw API calls in this notebook; psaw forwards keyword arguments to Pushshift,
# so a misspelled name would leave the search unrestricted.
results = api.search_submissions(
    after='3d',
    q='trump',
    subreddit='politics',
    fields=['title', 'created_utc', 'score'],
    limit=10,
    sort='desc',
    sort_type='score',
)

In [149]:
# Take the last field of every search result and tabulate those values.
# presumably that last field is the raw record dict of the result -- verify
# against the psaw result type.
df = pd.DataFrame(list(map(lambda hit: hit[-1], results)))
df

## Pushshift + Reddit API Wrappers

In [142]:
import praw

In [146]:
import os

# Credentials are read from the environment so that no secret is stored in the
# notebook or its version history. Set REDDIT_CLIENT_ID, REDDIT_CLIENT_SECRET,
# REDDIT_USERNAME and REDDIT_PASSWORD before starting the kernel; a missing
# variable raises KeyError here rather than failing later during authentication.
# Any credentials that were previously committed in plain text should be rotated.
reddit = praw.Reddit(
    client_id=os.environ['REDDIT_CLIENT_ID'],
    client_secret=os.environ['REDDIT_CLIENT_SECRET'],

    username=os.environ['REDDIT_USERNAME'],
    password=os.environ['REDDIT_PASSWORD'],

    user_agent='test',
)

In [147]:
# Sanity check: shows the Redditor the client is authenticated as.
reddit.user.me()

Redditor(name='usc_social_miner')

In [148]:
# Rebuild the psaw client around the authenticated PRAW instance.
# NOTE(review): presumably psaw then resolves Pushshift hits through the
# Reddit API -- confirm against the psaw documentation.
api = PushshiftAPI(reddit)

In [150]:
# Repeat the submission search, this time through the PRAW-backed psaw client.
# The keyword is `subreddit` (singular), matching the parameter used by the
# raw API calls in this notebook; psaw forwards keyword arguments to Pushshift,
# so a misspelled name would leave the search unrestricted.
results = api.search_submissions(
    after='3d',
    q='trump',
    subreddit='politics',
    fields=['title', 'created_utc', 'score'],
    limit=10,
    sort='desc',
    sort_type='score',
)

In [151]:
# Tabulate the last field of every search result.
# NOTE(review): the recorded run of this cell ended with
# "AttributeError: 'submission' object has no attribute 'id'". Possibly the
# PRAW-backed client needs the `id` of each hit while the `fields` list of the
# search call omits it -- confirm before relying on this cell.
df = pd.DataFrame([r[-1] for r in results])
df

AttributeError: 'submission' object has no attribute 'id'

## Reddit API

In [214]:
import re

In [168]:
import os

# Credentials are read from the environment so that no secret is stored in the
# notebook or its version history. Set REDDIT_CLIENT_ID, REDDIT_CLIENT_SECRET,
# REDDIT_USERNAME and REDDIT_PASSWORD before starting the kernel; a missing
# variable raises KeyError here rather than failing later during authentication.
# Any credentials that were previously committed in plain text should be rotated.
reddit = praw.Reddit(
    client_id=os.environ['REDDIT_CLIENT_ID'],
    client_secret=os.environ['REDDIT_CLIENT_SECRET'],

    username=os.environ['REDDIT_USERNAME'],
    password=os.environ['REDDIT_PASSWORD'],

    user_agent='test',
)

In [179]:
def filter_keys(d: dict, keys: list) -> dict:
    """Return a new dict containing only the entries of ``d`` named in ``keys``.

    Args:
        d: Source mapping; it is not modified.
        keys: Keys to copy, in the order they should appear in the result.

    Returns:
        A dict mapping each requested key to its value in ``d``.

    Raises:
        KeyError: If any requested key is missing from ``d``.
    """
    subset = {}
    for key in keys:
        subset[key] = d[key]
    return subset

In [226]:
%%time
results = reddit.subreddit('politics').top(limit=1000, time_filter='week')

df = pd.DataFrame([filter_keys(r.__dict__, ['title', 'score', 'created']) 
                   for r in results])

CPU times: user 288 ms, sys: 14.8 ms, total: 302 ms
Wall time: 19 s


In [211]:
# Swap the epoch values in `created` for short date strings (local timezone,
# formatted with `date_fmt`). The column is overwritten in place, so this cell
# expects a freshly built `df` with numeric timestamps.
df['created'] = df['created'].map(
    lambda epoch: datetime.fromtimestamp(epoch).strftime(date_fmt)
)

[Actual reddit link](https://old.reddit.com/r/politics/top/?t=week) for reference

In [213]:
# Preview the first rows of the weekly top-posts table.
df.head()

Unnamed: 0,created,score,title
0,25 Sep '18,55668,Watch: Donald Trump laughed at by U.N. General...
1,21 Sep '18,45065,"400,000 Americans in 900 Cities Ready to Take ..."
2,25 Sep '18,42969,Supreme Court nominee Brett Kavanaugh's Yale r...
3,25 Sep '18,40259,President Trump Is Literally the Laughingstock...
4,26 Sep '18,38488,Fox News roasted on Twitter after editing out ...


In [222]:
f'{df.title.str.contains("trump", flags=re.IGNORECASE).mean(): .1%} of top posts this week contain "Trump"'

' 28.0% of top posts this week contain "Trump"'