### 1. Imports and Getting Set Up

In [192]:
# imports
import time
from pathlib import Path

import pandas as pd
import requests

In [193]:
# obscures inputs and does not echo them back
import getpass

In [195]:
# Prompt for each credential without echoing it, so the secrets never appear in the notebook.
# Every prompt is labelled: with the default prompt text there is no hint which value is being asked for.

client_id = getpass.getpass('Reddit client id: ') #alphanumeric string provided under "personal use script"
client_secret = getpass.getpass('Reddit client secret: ') #alphanumeric string provided as "secret"
user_agent = getpass.getpass('User agent: ') #the name of your application
username = getpass.getpass('Reddit username: ') #your reddit username
password = getpass.getpass('Reddit password: ') #your reddit password

 ······················
 ······························
 ····
 ·······
 ···········


In [196]:
# Pieces of the access-token request: the account login is sent as the form body
# for the 'password' grant, and the app's id/secret are sent as HTTP basic auth.

data = dict(grant_type='password', username=username, password=password)

auth = requests.auth.HTTPBasicAuth(client_id, client_secret)

In [197]:
# Header for application: use the user agent entered in the credentials cell
# (previously collected but ignored), falling back to the old fixed value if it was left blank.
headers = {'User-Agent': user_agent or 'dsi1113/0.0.1'}

res = requests.post(
    'https://www.reddit.com/api/v1/access_token',
    auth=auth,
    data=data,
    headers=headers,
    timeout=30)  # without a timeout a stalled connection would hang the kernel

# fail here with a clear HTTP error rather than later when the token is read
res.raise_for_status()

# ensuring it works
print(res)

<Response [200]>


In [198]:
# Inspect the token response (this raises if the body is not valid JSON), but mask the
# token itself so a saved copy of the notebook never contains a usable credential.
{key: ('<redacted>' if key == 'access_token' else value) for key, value in res.json().items()}

{'access_token': '<redacted>',
 'token_type': 'bearer',
 'expires_in': 86400,
 'scope': '*'}

In [199]:
# retrieve access token; if the response carries no token, surface whatever error
# the payload reports instead of failing with a bare KeyError
token_payload = res.json()
if 'access_token' not in token_payload:
    raise RuntimeError(
        f"No access token in response: {token_payload.get('error', token_payload)}")
token = token_payload['access_token']

In [200]:
# attach the bearer token to the shared headers used by every later request
headers.update({'Authorization': f'bearer {token}'})

# True when the authenticated identity endpoint accepts the token
me_response = requests.get('https://oauth.reddit.com/api/v1/me', headers=headers)
me_response.status_code == 200

True

### 2. Testing to see if we can start scraping

In [201]:
# Smoke test: request one page of posts to confirm we are connected and can start scraping
base_url = 'https://oauth.reddit.com/r/'
subreddit1 = 'ADHD'

# ask for 100 posts per request instead of the default 25 (MAX IS 100)
# an 'after' entry will be important later for getting the 'next' posts
params = dict(limit=100)

res1 = requests.get(f'{base_url}{subreddit1}', headers=headers, params=params)

# 'data' houses all the response info: 'after', 'before', and 'children'
listing = res1.json()['data']

# show just the first (index 0) post to avoid a long output
listing['children'][0]

{'kind': 't3',
 'data': {'approved_at_utc': None,
  'subreddit': 'ADHD',
  'selftext': "Hey y'all,\n\nOnce the rules vacation is over, we're going to be collecting feedback about the current rules vacation and the state of the sub. Specifically, we're interested in feedback only on the rules that were suspended. We aren't sure what that's going to look like just yet, but it will likely consist of a set of ratings asking how you feel about each rule and the state of the sub, plus space for written comments. The questionnaire should be posted early January.\n\nUntil then, keep the following questions in mind:\n\n* What do you like about the current state of the sub? What don't you like?\n* Are there any rules you want to see removed or modified? Are there any you think we should keep?\n* How do you feel about the volume of posts? Is it overwhelming or just right?\n* How is the balance of discussion posts to memes and other lower-effort content? Are important things getting drowned out?\n

The above request works as anticipated and we were able to pull the first post. <br><br>
Next, let's write a function that scrapes the most recent ~1000 posts from a given subreddit. We will call it once per subreddit name, concatenate the resulting dataframes, and save the combined dataframe as a CSV file.

### 3. Scraping the Subreddits and Storing Data

In [203]:
def subreddit_scrape(name):
    """Return a dataframe of posts from the '/new' listing of one subreddit.

    Requests up to 11 pages of 100 posts each, following the listing's 'after'
    cursor from page to page, and stops early once the listing reports no
    further page.

    Parameters
    ----------
    name : str
        Subreddit name as it appears in the URL (e.g. 'ADHD').

    Returns
    -------
    pandas.DataFrame
        Columns 'post' (selftext), 'title', 'subreddit', 'upvotes' and
        'downvotes', with fully identical rows dropped.

    Raises
    ------
    requests.HTTPError
        If any page request returns an error status.

    Notes
    -----
    Relies on the notebook-level ``headers`` dict that carries the User-Agent
    and bearer token.
    """
    # '/new' selects the newest posts (could be swapped for '/hot' or '/top')
    url = 'https://oauth.reddit.com/r/' + name + '/new'

    params = {
        'limit': 100,
        'after': ''}  # cursor; replaced after every page

    records = []

    for _ in range(11):  # pull data at most 11 times
        response = requests.get(url,
                                headers=headers,
                                params=params,
                                timeout=30)
        response.raise_for_status()

        # parse the body once per page instead of once per field
        listing = response.json()['data']

        for child in listing['children']:
            post = child['data']
            records.append({
                'post': post['selftext'],
                'title': post['title'],
                'subreddit': post['subreddit'],
                'upvotes': post['ups'],
                'downvotes': post['downs'],
            })

        # A missing cursor means there is no next page. Requesting again would
        # start over from the first page and re-download posts already stored.
        next_cursor = listing['after']
        if next_cursor is None:
            break
        params['after'] = next_cursor

    # explicit columns keep the frame's shape stable even when nothing was returned
    columns = ['post', 'title', 'subreddit', 'upvotes', 'downvotes']
    return pd.DataFrame(records, columns=columns).drop_duplicates()

In [212]:
# scrape each subreddit into its own dataframe
r_adhd = subreddit_scrape(name='ADHD')
r_autism = subreddit_scrape(name='autism')

In [211]:
# concatenate both dataframes and save off as csv

output_path = Path('./data/scraped_reddit_data.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)  # to_csv does not create a missing folder

# ignore_index renumbers the combined rows; otherwise each frame's own row labels
# would repeat in the index column written to the file
pd.concat([r_adhd, r_autism], ignore_index=True).to_csv(output_path)