In [15]:
import requests
import pandas as pd
import time
import random

In [16]:
url = 'https://www.reddit.com/r/boardgames.json'

In [17]:
res = requests.get(url)

In [18]:
res.status_code

429

When you open https://www.reddit.com/r/boardgames.json in a browser, Reddit sees the browser's user agent (for example, Chrome on a Mac). Python's `requests` library, however, sends its own default user agent. Because so many scripts out there are already 'hitting' Reddit's API with that default, Reddit effectively blocks requests that use it, which is why we received a 429 (Too Many Requests) status code above.

We will change our request a little bit to make it not use the default user agent. 

In [19]:
res = requests.get(url, headers={'User-agent': 'Pony Inc 1.0'})

In [20]:
res.status_code

200

In [21]:
reddit_dict = res.json()

In [22]:
reddit_dict.keys()

dict_keys(['kind', 'data'])

In [23]:
print(reddit_dict)



In [24]:
reddit_dict['kind']

'Listing'

In [25]:
reddit_dict['data']

{'modhash': '',
 'dist': 26,
 'children': [{'kind': 't3',
   'data': {'approved_at_utc': None,
    'subreddit': 'boardgames',
    'selftext': '**Welcome to /r/boardgames Daily Discussion and Game Recommendations**\n\nThis is meant to be a place where you can ask any and all questions relating to the board gaming world: general or specific game recommendations, rule clarifications, definitions of terms/acronyms, and other quick questions that might not warrant their own post. \n\nIf you are seeking game recommendations you will get better responses if you give us enough background to help you. You can use [this template](https://www.reddit.com/r/boardgames/wiki/personalized-game-recommendation-template-no-explainer) to do so. [Here](https://www.reddit.com/r/boardgames/wiki/personalized-game-recommendation-template) is a version with explanations of what we\'re looking for.  \n\nIf you reply to any comment that has a game name in **bold** with "**/u/r2d8 getparentinfo**", one of our robo

In [26]:
reddit_dict['data'].keys()

dict_keys(['modhash', 'dist', 'children', 'after', 'before'])

The most important keys are `children` and `after`.

In [27]:
reddit_dict['data']['children']

[{'kind': 't3',
  'data': {'approved_at_utc': None,
   'subreddit': 'boardgames',
   'selftext': '**Welcome to /r/boardgames Daily Discussion and Game Recommendations**\n\nThis is meant to be a place where you can ask any and all questions relating to the board gaming world: general or specific game recommendations, rule clarifications, definitions of terms/acronyms, and other quick questions that might not warrant their own post. \n\nIf you are seeking game recommendations you will get better responses if you give us enough background to help you. You can use [this template](https://www.reddit.com/r/boardgames/wiki/personalized-game-recommendation-template-no-explainer) to do so. [Here](https://www.reddit.com/r/boardgames/wiki/personalized-game-recommendation-template) is a version with explanations of what we\'re looking for.  \n\nIf you reply to any comment that has a game name in **bold** with "**/u/r2d8 getparentinfo**", one of our robots will tell you more about the game\n\nJust 

In [28]:
len(reddit_dict['data']['children'])

26

In [29]:
reddit_dict['data']['children'][0]

{'kind': 't3',
 'data': {'approved_at_utc': None,
  'subreddit': 'boardgames',
  'selftext': '**Welcome to /r/boardgames Daily Discussion and Game Recommendations**\n\nThis is meant to be a place where you can ask any and all questions relating to the board gaming world: general or specific game recommendations, rule clarifications, definitions of terms/acronyms, and other quick questions that might not warrant their own post. \n\nIf you are seeking game recommendations you will get better responses if you give us enough background to help you. You can use [this template](https://www.reddit.com/r/boardgames/wiki/personalized-game-recommendation-template-no-explainer) to do so. [Here](https://www.reddit.com/r/boardgames/wiki/personalized-game-recommendation-template) is a version with explanations of what we\'re looking for.  \n\nIf you reply to any comment that has a game name in **bold** with "**/u/r2d8 getparentinfo**", one of our robots will tell you more about the game\n\nJust reme

In [30]:
reddit_dict['data']['children'][0].keys()

dict_keys(['kind', 'data'])

In [31]:
reddit_dict['data']['children'][0]['kind']

't3'

In [32]:
reddit_dict['data']['children'][0]['data']

{'approved_at_utc': None,
 'subreddit': 'boardgames',
 'selftext': '**Welcome to /r/boardgames Daily Discussion and Game Recommendations**\n\nThis is meant to be a place where you can ask any and all questions relating to the board gaming world: general or specific game recommendations, rule clarifications, definitions of terms/acronyms, and other quick questions that might not warrant their own post. \n\nIf you are seeking game recommendations you will get better responses if you give us enough background to help you. You can use [this template](https://www.reddit.com/r/boardgames/wiki/personalized-game-recommendation-template-no-explainer) to do so. [Here](https://www.reddit.com/r/boardgames/wiki/personalized-game-recommendation-template) is a version with explanations of what we\'re looking for.  \n\nIf you reply to any comment that has a game name in **bold** with "**/u/r2d8 getparentinfo**", one of our robots will tell you more about the game\n\nJust remember that this is a commun

In [33]:
reddit_dict['data']['children'][0]['data']['subreddit']

'boardgames'

The cell directly above gives you the class label, aka your target.

In [34]:
reddit_dict['data']['children'][0]['data']['title']

'/r/boardgames Daily Discussion and Game Recommendations (June 07, 2019)'

This is the title of the first post.

In [35]:
reddit_dict['data']['children'][0]['data']['selftext']

'**Welcome to /r/boardgames Daily Discussion and Game Recommendations**\n\nThis is meant to be a place where you can ask any and all questions relating to the board gaming world: general or specific game recommendations, rule clarifications, definitions of terms/acronyms, and other quick questions that might not warrant their own post. \n\nIf you are seeking game recommendations you will get better responses if you give us enough background to help you. You can use [this template](https://www.reddit.com/r/boardgames/wiki/personalized-game-recommendation-template-no-explainer) to do so. [Here](https://www.reddit.com/r/boardgames/wiki/personalized-game-recommendation-template) is a version with explanations of what we\'re looking for.  \n\nIf you reply to any comment that has a game name in **bold** with "**/u/r2d8 getparentinfo**", one of our robots will tell you more about the game\n\nJust remember that this is a community full of awesome, helpful people, and feel free to ask your ques

We want to get all these posts into a Pandas DataFrame and thereafter we can save it to a CSV.

In [36]:
# Each child is a {'kind', 'data'} wrapper; keep only the post payload.
children = reddit_dict['data']['children']
posts = [child['data'] for child in children]

In [37]:
pd.DataFrame(posts)

Unnamed: 0,all_awardings,approved_at_utc,approved_by,archived,author,author_flair_background_color,author_flair_css_class,author_flair_richtext,author_flair_template_id,author_flair_text,...,thumbnail_width,title,total_awards_received,ups,url,user_reports,view_count,visited,whitelist_status,wls
0,[],,,False,AutoModerator,#d0fffd,,"[{'e': 'text', 't': '🤖'}]",326005ee-9cf2-11e8-8b1f-0e8f9a199476,🤖,...,,/r/boardgames Daily Discussion and Game Recomm...,0,3,https://www.reddit.com/r/boardgames/comments/b...,[],,False,all_ads,6
1,[],,,False,Chromie121,,,[],,,...,,Gloomhaven Digital - First Gameplay Reveal on ...,0,279,https://www.reddit.com/r/boardgames/comments/b...,[],,False,all_ads,6
2,[],,,False,bungeeman,,blood-bowl,"[{'e': 'text', 't': 'Blood Bowl'}]",197491da-18aa-11e7-bae6-0ef608c792b0,Blood Bowl,...,,Please be nice to the guys demonstrating games...,0,881,https://www.reddit.com/r/boardgames/comments/b...,[],,False,all_ads,6
3,[],,,False,Gregorwhat,,,[],,,...,,"[COMC] One Year into Board Gaming (Shelfie, Fa...",0,45,https://www.reddit.com/r/boardgames/comments/b...,[],,False,all_ads,6
4,[],,,False,boardgamebarrage,,custom,"[{'e': 'text', 't': 'Podcast - Red Tank/Kellen'}]",ef809fb6-18ab-11e7-9e69-0e5a4f06ecbe,Podcast - Red Tank/Kellen,...,140.0,"What board games did you expect to hate, but e...",0,52,https://boardgamegeek.com/blogpost/90614/board...,[],,False,all_ads,6
5,[],,,False,Burnscars,,,[],,,...,,What great game has a terrible rulebook?,0,69,https://www.reddit.com/r/boardgames/comments/b...,[],,False,all_ads,6
6,[],,,False,Chromie121,,,[],,,...,,"7th Continent - First time playing, what are c...",0,35,https://www.reddit.com/r/boardgames/comments/b...,[],,False,all_ads,6
7,[],,,False,mahazoo,,,[],,,...,,What’s the one game that you were so positive ...,0,97,https://www.reddit.com/r/boardgames/comments/b...,[],,False,all_ads,6
8,[],,,False,SonaMidorFeed,,keyflower,"[{'e': 'text', 't': 'Gimme dat blacksmith...'}]",,Gimme dat blacksmith...,...,,"Got tired of Skull getting ruined, so made my ...",0,110,https://www.reddit.com/r/boardgames/comments/b...,[],,False,all_ads,6
9,[],,,False,j3ddy_l33,,heroquest,"[{'e': 'text', 't': 'The Cardboard Herald'}]",e3e67852-18aa-11e7-a0b7-0ea2adda5eac,The Cardboard Herald,...,140.0,"In a surprise twist, Pandemic: Rapid Response ...",0,54,https://youtu.be/XByaK4LHoTM,[],,False,all_ads,6


In [38]:
# Save the first page of posts. index=False keeps the positional row index out
# of the file, so reading it back does not produce an extra unnamed column.
pd.DataFrame(posts).to_csv('posts.csv', index=False)

In [39]:
reddit_dict['data']['after']

't3_bxkt6m'

This is the name of the last post.

In [40]:
pd.DataFrame(posts)['name']

0     t3_bxqpl8
1     t3_bxluh6
2     t3_bxfm4v
3     t3_bxmz36
4     t3_bxm1d8
5     t3_bxjde6
6     t3_bxlsqn
7     t3_bxhh58
8     t3_bxglun
9     t3_bxiuoh
10    t3_bxmsvv
11    t3_bxlteq
12    t3_bx53sj
13    t3_bxhwux
14    t3_bxksf6
15    t3_bxgx6x
16    t3_bxkw5l
17    t3_bxhrdb
18    t3_bxmvf6
19    t3_bxkb2v
20    t3_bxrv2x
21    t3_bxhvdl
22    t3_bxgr3q
23    t3_bxirt6
24    t3_bxkfdx
25    t3_bxkt6m
Name: name, dtype: object

In [41]:
reddit_dict['data']['after']

't3_bxkt6m'

Appending `?after=` followed by that name to the original URL gives the address of the next page of (about 25) posts.

In [42]:
url + '?after=' + reddit_dict['data']['after']

'https://www.reddit.com/r/boardgames.json?after=t3_bxkt6m'

## Looping through the posts, 25 posts at a time

In [43]:
# Collect several pages of posts by following the listing's `after` cursor.
posts = []
after = None  # name of the last post fetched so far; None means "start at the first page"

for a in range(4):
    # The first request hits the base listing; later ones continue after the
    # last post already seen.
    if after is None:
        current_url = url
    else:
        current_url = url + '?after=' + after
    print(current_url)
    # The timeout keeps a stalled connection from hanging the notebook forever.
    res = requests.get(current_url, headers={'User-agent': 'Pony Inc 1.0'}, timeout=10)

    if res.status_code != 200:
        print('Status error', res.status_code)
        break

    current_dict = res.json()
    current_posts = [p['data'] for p in current_dict['data']['children']]
    posts.extend(current_posts)
    after = current_dict['data']['after']

    # If the response carries no cursor there is no next page to ask for. Stop
    # here; otherwise the next iteration would request the first page again
    # and append duplicate posts.
    if after is None:
        break

    # generate a random sleep duration to look more 'natural'
    sleep_duration = random.randint(2,6)
    print(sleep_duration)
    time.sleep(sleep_duration)

https://www.reddit.com/r/boardgames.json
4
https://www.reddit.com/r/boardgames.json?after=t3_bxkt6m
5
https://www.reddit.com/r/boardgames.json?after=t3_bxbxr2
3
https://www.reddit.com/r/boardgames.json?after=t3_bx4i0s
4


In [44]:
# Collect several pages of posts by following the listing's `after` cursor,
# checkpointing everything fetched so far to boardgames.csv after each page.
posts = []
after = None  # name of the last post fetched so far; None means "start at the first page"

for a in range(4):
    # The first request hits the base listing; later ones continue after the
    # last post already seen.
    if after is None:
        current_url = url
    else:
        current_url = url + '?after=' + after
    print(current_url)
    # The timeout keeps a stalled connection from hanging the notebook forever.
    res = requests.get(current_url, headers={'User-agent': 'Pony Inc 1.0'}, timeout=10)

    if res.status_code != 200:
        print('Status error', res.status_code)
        break

    current_dict = res.json()
    current_posts = [p['data'] for p in current_dict['data']['children']]
    posts.extend(current_posts)
    after = current_dict['data']['after']

    # Checkpoint after every page so a later failure does not lose what was
    # already fetched. Rewriting the file from the full list (rather than
    # appending) keeps a single header row that covers every column seen so far.
    pd.DataFrame(posts).to_csv('boardgames.csv', index=False)

    # If the response carries no cursor there is no next page to ask for. Stop
    # here; otherwise the next iteration would request the first page again
    # and append duplicate posts.
    if after is None:
        break

    # generate a random sleep duration to look more 'natural'
    sleep_duration = random.randint(2,6)
    print(sleep_duration)
    time.sleep(sleep_duration)

https://www.reddit.com/r/boardgames.json
3
https://www.reddit.com/r/boardgames.json?after=t3_bxkt6m
4
https://www.reddit.com/r/boardgames.json?after=t3_bxbxr2
4
https://www.reddit.com/r/boardgames.json?after=t3_bx4i0s
2


In [45]:
len(posts)

101

In [46]:
# Write every collected post to a single CSV, leaving out the positional index.
posts_df = pd.DataFrame(posts)
posts_df.to_csv('boardgames.csv', index=False)