In [4]:
# general imports
import time
import os

# make the parent directory importable so the local `src` package below resolves.
# The membership check keeps this cell idempotent: re-running it no longer
# stacks duplicate '../' entries onto sys.path.
import sys
if '../' not in sys.path:
    sys.path.append('../')

# import internal tools
from src import enrichment
from src import fetch
from src import tools

In [5]:
# search settings for the single-subreddit example
SUB = 'relationship_advice'            # subreddit the comments are scraped from
SORT, TIME = 'top', 'month'            # sort category, and the time range it covers
N_POSTS = 50                           # how many posts to scrape
# both limits are left unset for this run:
#   N_COMMENTS -> hard limit on comments per post
#   DEPTH      -> comment depth (or children) limit
N_COMMENTS = DEPTH = None

# make sure you have these! Fail fast with a readable message that lists every
# variable that is unset or empty, instead of a bare KeyError on the first one.
# (presumably these are read by the reddit client set up in src/fetch.py -- confirm there)
_missing_env = [
    name
    for name in ('REDDIT_CLIENT_ID', 'REDDIT_CLIENT_SECRET')
    if not os.environ.get(name)
]
if _missing_env:
    raise EnvironmentError(
        'missing required environment variable(s): ' + ', '.join(_missing_env)
    )

#### The example below calls the `fetch` function from `../src/fetch.py`...

In [3]:
# fetch the comments for SUB, using the search parameters defined above
comments = fetch.fetch(
    sub=SUB, time=TIME, sort=SORT,
    num_posts=N_POSTS, num_comments=N_COMMENTS, depth=DEPTH,
)

fetching comments from relationship_advice...           
sorting by: top, scraping 50 posts, None comments each (depth=None)
connecting to reddit client...
connected!
serializing data...
percent complete: 2.0%
2235
percent complete: 4.0%
5505
percent complete: 6.0%
7203
percent complete: 8.0%
9327
percent complete: 10.0%
11392
percent complete: 12.0%
12578
percent complete: 14.000000000000002%
13980
percent complete: 16.0%
15213
percent complete: 18.0%
18643
percent complete: 20.0%
20391
percent complete: 22.0%
21368
percent complete: 24.0%
24277
percent complete: 26.0%
25355
percent complete: 28.000000000000004%
27186
percent complete: 30.0%
28031
percent complete: 32.0%
28034
percent complete: 34.0%
28851
percent complete: 36.0%
29327
percent complete: 38.0%
31215
percent complete: 40.0%
32043
percent complete: 42.0%
32380
percent complete: 44.0%
34197
percent complete: 46.0%
34432
percent complete: 48.0%
35065
percent complete: 50.0%
35894
percent complete: 52.0%
36755
percent compl

#### Save the results as JSON...

In [4]:
# save to .json -- the file name is built from SUB so that changing the
# subreddit in the parameters cell cannot silently write the new data into
# another subreddit's file
tools.save_json(comments, '../data/{0}.json'.format(SUB))

## Batch fetching from multiple subreddits

In [6]:
# subreddits scraped by the batch run; candidates that are currently switched
# off stay listed as comments so they can be re-enabled by uncommenting
SUBREDDITS = [
    # -- political --
    # 'politics',
    # 'sandersforpresident',
    # 'worldnews',
    # 'news',
    # -- misc --
    # 'askreddit',
    # 'technology',
    # 'gaming',
    # 'funny',
    # 'pics',
    'conservative',
]

# search settings shared by every subreddit in the batch
SORT, TIME = 'top', 'month'   # sort category, and the time range it covers
N_POSTS = 100                 # number of posts
N_COMMENTS = None             # hard limit on comments per post
DEPTH = 5                     # comment depth (or children) limit

In [7]:
# go through the batch one subreddit at a time, writing each result to disk
# as soon as it is fetched so finished subreddits survive a later failure
# NOTE(review): the '-m100' suffix presumably mirrors TIME='month' and
# N_POSTS=100 -- it is hard-coded, so keep it in sync by hand
for subreddit in SUBREDDITS:
    print('========== calling fetch() on /r/{0} =========='.format(subreddit))
    comments = fetch.fetch(
        sub=subreddit, time=TIME, sort=SORT,
        num_posts=N_POSTS, num_comments=N_COMMENTS, depth=DEPTH,
    )
    out_path = '../data/{0}-m100.json'.format(subreddit)
    tools.save_json(comments, out_path)
    print('========== saved fetch() data for /r/{0} =========='.format(subreddit))
    print()
    

fetching comments from conservative...           
sorting by: top, scraping 100 posts, None comments each (depth=5)
connecting to reddit client...
connected!
serializing data...
1.00% complete
2.00% complete
3.00% complete
4.00% complete
5.00% complete
6.00% complete
7.00% complete
8.00% complete
9.00% complete
10.00% complete
11.00% complete
12.00% complete
13.00% complete
14.00% complete
15.00% complete
16.00% complete
17.00% complete
18.00% complete
19.00% complete
20.00% complete
21.00% complete
22.00% complete
23.00% complete
24.00% complete
25.00% complete
26.00% complete
27.00% complete
28.00% complete
29.00% complete
30.00% complete
31.00% complete
32.00% complete
33.00% complete
34.00% complete
35.00% complete
36.00% complete
37.00% complete
38.00% complete
39.00% complete
40.00% complete
41.00% complete
42.00% complete
43.00% complete
44.00% complete
45.00% complete
46.00% complete
47.00% complete
48.00% complete
49.00% complete
50.00% complete
51.00% complete
52.00% complete