# Acquisition of Subreddit Posts

In [37]:
# Importing libraries
import pandas as pd
import datetime as dt
import time
import requests

In [9]:
# Pushshift submission-search endpoint, filtered to the `vegetarian` subreddit.
# Used for a single exploratory request before the full pull below.
url = "https://api.pushshift.io/reddit/search/submission?subreddit=vegetarian"

In [10]:
# Exploratory request. A timeout is set so a stalled connection raises
# instead of blocking the kernel indefinitely (requests has no default timeout).
res = requests.get(url, timeout=30)

In [11]:
# Display the HTTP status code of the exploratory response.
res.status_code

200

In [12]:
# Decode the response body from JSON into Python objects.
json_data = res.json()

In [14]:
# Number of records returned under the "data" key for this one request.
len(json_data["data"])

25

In [15]:
# Load the "data" records into a DataFrame and preview the first rows
# to see which fields are available.
results_df = pd.DataFrame(json_data["data"])
results_df.head()

Unnamed: 0,all_awardings,allow_live_comments,author,author_flair_css_class,author_flair_richtext,author_flair_text,author_flair_type,author_fullname,author_patreon_flair,author_premium,...,whitelist_status,wls,link_flair_css_class,link_flair_template_id,link_flair_text,removed_by_category,post_hint,preview,thumbnail_height,thumbnail_width
0,[],False,CarlosTheBoss,,[],,text,t2_n1v5p,False,False,...,all_ads,6,,,,,,,,
1,[],False,K-M-R-,,[],,text,t2_3wjk07ol,False,False,...,all_ads,6,rant,8b8304e2-2bca-11e5-a801-0e5ca32a3025,Rant,moderator,,,,
2,[],False,RaulTiru,,[],,text,t2_2j4tlzya,False,False,...,all_ads,6,,,,,,,,
3,[],False,sometimesgauri,,[],,text,t2_19b98cws,False,False,...,all_ads,6,,,,,,,,
4,[],False,Donovan_MM,,[],,text,t2_2xmk3t5p,False,False,...,all_ads,6,,,,,,,,


In [16]:
# List every column in the exploratory frame.
results_df.columns

Index(['all_awardings', 'allow_live_comments', 'author',
       'author_flair_css_class', 'author_flair_richtext', 'author_flair_text',
       'author_flair_type', 'author_fullname', 'author_patreon_flair',
       'author_premium', 'awarders', 'can_mod_post', 'contest_mode',
       'created_utc', 'domain', 'full_link', 'gildings', 'id',
       'is_crosspostable', 'is_meta', 'is_original_content',
       'is_reddit_media_domain', 'is_robot_indexable', 'is_self', 'is_video',
       'link_flair_background_color', 'link_flair_richtext',
       'link_flair_text_color', 'link_flair_type', 'locked', 'media_only',
       'no_follow', 'num_comments', 'num_crossposts', 'over_18',
       'parent_whitelist_status', 'permalink', 'pinned', 'pwls',
       'retrieved_on', 'score', 'selftext', 'send_replies', 'spoiler',
       'stickied', 'subreddit', 'subreddit_id', 'subreddit_subscribers',
       'subreddit_type', 'thumbnail', 'title', 'total_awards_received', 'url',
       'whitelist_status', 'wls',

In [38]:
# Thanks to Mahdi for this code for getting posts from the subreddits

def query_pushshift(subreddit, kind = 'submission', day_window = 30, n = 5):
    """Pull several pages of one subreddit from the Pushshift search API.

    Sends ``n`` GET requests. Request ``i`` (1-based) asks for up to 500
    items with ``after={day_window * i}d``. All pages are concatenated into
    one DataFrame; the per-page row index is kept as-is, so index labels
    are not unique.

    Parameters
    ----------
    subreddit : str
        Subreddit name, inserted verbatim into the query string.
    kind : str, default 'submission'
        Last path segment of the search endpoint. Only ``'submission'``
        triggers the column selection / de-duplication / ``is_self`` filter.
    day_window : int, default 30
        Step, in days, between the ``after`` values of successive requests.
    n : int, default 5
        Number of requests to send.

    Returns
    -------
    pandas.DataFrame
        For ``kind == 'submission'``: the columns in ``SUBFIELDS`` (a field
        absent from every page is filled with NaN), exact duplicate rows
        removed, restricted to rows where ``is_self`` is True. In all cases
        a ``timestamp`` column is added holding the UTC calendar date of
        ``created_utc``.

    Raises
    ------
    requests.HTTPError
        If any request returns a status code other than 200.
    """
    SUBFIELDS = ['title', 'selftext', 'subreddit', 'author','created_utc','num_comments', 'is_self']
    # establish base url and stem
    BASE_URL = f"https://api.pushshift.io/reddit/search/{kind}" # also known as the "API endpoint"
    stem = f"{BASE_URL}?subreddit={subreddit}&size=500" # always requesting 500 per page
    # one DataFrame per request, concatenated once at the end
    pages = []
    for i in range(1, n + 1):
        URL = "{}&after={}d".format(stem, day_window * i)
        print("Querying from: " + URL)
        # timeout: fail instead of hanging forever on a stalled connection
        response = requests.get(URL, timeout=30)
        # explicit raise rather than `assert`, which is removed under `python -O`
        if response.status_code != 200:
            raise requests.HTTPError(
                "Pushshift returned HTTP {} for {}".format(response.status_code, URL),
                response=response,
            )
        pages.append(pd.DataFrame(response.json()['data']))
        # be polite between requests; no need to wait after the last one
        if i < n:
            time.sleep(2)
    full = pd.concat(pages, sort=False)
    if kind == "submission":
        # Select the desired columns first so that de-duplication only
        # considers those fields. `reindex` (instead of `full[SUBFIELDS]`)
        # tolerates a field missing from the response, and chaining avoids
        # in-place mutation of a slice.
        full = full.reindex(columns=SUBFIELDS).drop_duplicates()
        # keep self (text) posts only
        full = full.loc[full['is_self'] == True]
    # `created_utc` is epoch seconds in UTC; convert in UTC so the date does
    # not depend on the local timezone of the machine running the notebook.
    # `assign` returns a new frame, so no SettingWithCopyWarning.
    full = full.assign(
        timestamp=pd.to_datetime(full["created_utc"], unit="s", utc=True).dt.date
    )
    print("Query Complete!")
    return full


In [39]:
# First subreddit: pull its submissions using the function's default arguments.
subreddit1 = 'vegetarian'

subreddit_vegetarian = query_pushshift(subreddit1)


Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=vegetarian&size=500&after=30d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=vegetarian&size=500&after=60d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=vegetarian&size=500&after=90d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=vegetarian&size=500&after=120d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=vegetarian&size=500&after=150d
Query Complete!


In [40]:
# (rows, columns) of the vegetarian pull.
subreddit_vegetarian.shape

(1071, 8)

In [41]:
# Second subreddit: pull its submissions using the function's default arguments.
subreddit2 = 'vegan'
subreddit_vegan = query_pushshift(subreddit2)

Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=vegan&size=500&after=30d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=vegan&size=500&after=60d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=vegan&size=500&after=90d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=vegan&size=500&after=120d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=vegan&size=500&after=150d
Query Complete!


In [42]:
# (rows, columns) of the vegan pull.
subreddit_vegan.shape

(1230, 8)

In [43]:
# Stack the two per-subreddit frames row-wise into one combined frame.
# The row index of each input is carried over unchanged.
both_dataframes = [subreddit_vegetarian, subreddit_vegan]
df = pd.concat(both_dataframes)

In [44]:
# (rows, columns) of the combined frame.
df.shape

(2250, 8)

In [45]:
# Save the combined frame. The row index is omitted from the file.
df.to_csv('./data/df.csv', index=False)


In [46]:
# Save each subreddit's frame separately. The row index is omitted, matching
# the combined file: it would otherwise be written as an unnamed first column.
subreddit_vegetarian.to_csv('./data/subreddit_vegetarian.csv', index=False)
subreddit_vegan.to_csv('./data/subreddit_vegan.csv', index=False)