# Data Collection from Reddit

In [1]:
import requests
import pandas as pd
import time

In [2]:
#set main url from the api pushift.io
url = 'https://api.pushshift.io/reddit/search/submission'

#### Define a function to call the api repeatively for data collection

In [3]:
#function to get multiple sets of data from Reddit
def get_data(subreddit, quantity):
    # establish the number of requests needed based on the user input, round float to single integer
    n_calls = int(quantity/500)
    # create master list of dictionaries
    whole_list =[]
    #set the initial parameter, collects the most recent 500 posts
    params = {
        'subreddit': str(subreddit),
        'size' : 500
    }
    # initial call to reddit
    response = requests.get(url,params)
    # save dictionary of json data
    results = response.json()
    
    # add the list of dictionaries under 'data' to the master list
    for item in range(len(results['data'])):
        whole_list.append(results['data'][item])

# loop through the range set by the # of requested data rows given by the user

    for i in range(1,n_calls):
        #display the functions progress as it slowly calls to the api for the requested data
        print(f'pulling subreddit submissions {i*500} of {n_calls*500}')
        #wait 5 seconds to reduce the strain on api
        time.sleep(5)
        
        #find the oldest post date for next pull and subtract 1 to increase the age
        date = results['data'][499]['created_utc'] - 1
    
        #add the date to the new param list
        params_2 = {
            'subreddit': str(subreddit),
            'size' : 500,
            'before' : date 
        } 
    
        #make ith call to reddit
        response = requests.get(url,params_2)
        results = response.json()

        # add the list of dictionaries under 'data' to the master list
        for item in range(len(results['data'])):
            whole_list.append(results['data'][item])
    #print finished
    print(f'Done, successfully pulled {quantity} submissions from r/{subreddit}')
    
    #return the master list
    return whole_list
    
    
    

### Sub Reddit 1 r/EatCheapAndHealthy

Collect 20,000 submissions

In [16]:
healthy_list = get_data('EatCheapAndHealthy',20_000)

pulling subreddit submissions 500 of 20000
pulling subreddit submissions 1000 of 20000
pulling subreddit submissions 1500 of 20000
pulling subreddit submissions 2000 of 20000
pulling subreddit submissions 2500 of 20000
pulling subreddit submissions 3000 of 20000
pulling subreddit submissions 3500 of 20000
pulling subreddit submissions 4000 of 20000
pulling subreddit submissions 4500 of 20000
pulling subreddit submissions 5000 of 20000
pulling subreddit submissions 5500 of 20000
pulling subreddit submissions 6000 of 20000
pulling subreddit submissions 6500 of 20000
pulling subreddit submissions 7000 of 20000
pulling subreddit submissions 7500 of 20000
pulling subreddit submissions 8000 of 20000
pulling subreddit submissions 8500 of 20000
pulling subreddit submissions 9000 of 20000
pulling subreddit submissions 9500 of 20000
pulling subreddit submissions 10000 of 20000
pulling subreddit submissions 10500 of 20000
pulling subreddit submissions 11000 of 20000
pulling subreddit submissions 

In [17]:
# create healthy dataframe
healthy_df = pd.DataFrame(healthy_list)

In [18]:
len(healthy_list)

20000

In [19]:
# check the time range on submissions, length of Df, and for any duplicates
print('Most recent time stamp: ',healthy_df['created_utc'].max())
print('Oldest time stamp: ',healthy_df['created_utc'].min())
print('Duplicate rows found: ', healthy_df.duplicated(subset='id').sum())
print('Total submissions contained in dataframe: ', len(healthy_df))

Most recent time stamp:  1580419449
Oldest time stamp:  1505483043
Duplicate rows found:  0
Total submissions contained in dataframe:  20000


In [20]:
#save food data to data folder
healthy_df.to_csv('./Data/healthy_food.csv', index = False)

## Collect Subreddit 2 r/DIY 

In [268]:
diy_list = get_data('DIY',20_000)

pulling subreddit submissions 500 of 20000
pulling subreddit submissions 1000 of 20000
pulling subreddit submissions 1500 of 20000
pulling subreddit submissions 2000 of 20000
pulling subreddit submissions 2500 of 20000
pulling subreddit submissions 3000 of 20000
pulling subreddit submissions 3500 of 20000
pulling subreddit submissions 4000 of 20000
pulling subreddit submissions 4500 of 20000
pulling subreddit submissions 5000 of 20000
pulling subreddit submissions 5500 of 20000
pulling subreddit submissions 6000 of 20000
pulling subreddit submissions 6500 of 20000
pulling subreddit submissions 7000 of 20000
pulling subreddit submissions 7500 of 20000
pulling subreddit submissions 8000 of 20000
pulling subreddit submissions 8500 of 20000
pulling subreddit submissions 9000 of 20000
pulling subreddit submissions 9500 of 20000
pulling subreddit submissions 10000 of 20000
pulling subreddit submissions 10500 of 20000
pulling subreddit submissions 11000 of 20000
pulling subreddit submissions 

In [288]:
# create DIY dataframe
diy_df = pd.DataFrame(diy_list)

In [290]:
# check the time range on submissions, length of Df, and for any duplicates
print('Most recent time stamp: ',diy_df['created_utc'].max())
print('Oldest time stamp: ',diy_df['created_utc'].min())
print('Duplicate rows found: ', diy_df.duplicated(subset='id').sum())
print('Total submissions contained in dataframe: ', len(diy_df))

Most recent time stamp:  1580088535
Oldest time stamp:  1565343456
Duplicate rows found:  0
Total submissions contained in dataframe:  20000


In [291]:
#save diy data to data folder
diy_df.to_csv('./Data/diy.csv', index = False)

## Collect Subreddit 3 r/datascience (control set only)

In [295]:
data_list = get_data('datascience',20_000)

pulling subreddit submissions 500 of 20000
pulling subreddit submissions 1000 of 20000
pulling subreddit submissions 1500 of 20000
pulling subreddit submissions 2000 of 20000
pulling subreddit submissions 2500 of 20000
pulling subreddit submissions 3000 of 20000
pulling subreddit submissions 3500 of 20000
pulling subreddit submissions 4000 of 20000
pulling subreddit submissions 4500 of 20000
pulling subreddit submissions 5000 of 20000
pulling subreddit submissions 5500 of 20000
pulling subreddit submissions 6000 of 20000
pulling subreddit submissions 6500 of 20000
pulling subreddit submissions 7000 of 20000
pulling subreddit submissions 7500 of 20000
pulling subreddit submissions 8000 of 20000
pulling subreddit submissions 8500 of 20000
pulling subreddit submissions 9000 of 20000
pulling subreddit submissions 9500 of 20000
pulling subreddit submissions 10000 of 20000
pulling subreddit submissions 10500 of 20000
pulling subreddit submissions 11000 of 20000
pulling subreddit submissions 

In [300]:
# create 'datascience' dataframe
data_df = pd.DataFrame(data_list)

In [301]:
# check the time range on submissions, length of Df, and for any duplicates
print('Most recent time stamp: ',data_df['created_utc'].max())
print('Oldest time stamp: ',data_df['created_utc'].min())
print('Duplicate rows found: ', data_df.duplicated(subset='id').sum())
print('Total submissions contained in dataframe: ', len(data_df))

Most recent time stamp:  1580161720
Oldest time stamp:  1526304766
Duplicate rows found:  0
Total submissions contained in dataframe:  20000


In [303]:
## Collect Subreddit 3 r/datascience (control set only)#save diy data to data folder
data_df.to_csv('./Data/data.csv', index = False)

## Collect Subreddit 4 r/cooking

In [5]:
cooking_list = get_data('Cooking',20_000)

pulling subreddit submissions 500 of 20000
pulling subreddit submissions 1000 of 20000
pulling subreddit submissions 1500 of 20000
pulling subreddit submissions 2000 of 20000
pulling subreddit submissions 2500 of 20000
pulling subreddit submissions 3000 of 20000
pulling subreddit submissions 3500 of 20000
pulling subreddit submissions 4000 of 20000
pulling subreddit submissions 4500 of 20000
pulling subreddit submissions 5000 of 20000
pulling subreddit submissions 5500 of 20000
pulling subreddit submissions 6000 of 20000
pulling subreddit submissions 6500 of 20000
pulling subreddit submissions 7000 of 20000
pulling subreddit submissions 7500 of 20000
pulling subreddit submissions 8000 of 20000
pulling subreddit submissions 8500 of 20000
pulling subreddit submissions 9000 of 20000
pulling subreddit submissions 9500 of 20000
pulling subreddit submissions 10000 of 20000
pulling subreddit submissions 10500 of 20000
pulling subreddit submissions 11000 of 20000
pulling subreddit submissions 

In [6]:
# create 'cooking' dataframe
cooking_df = pd.DataFrame(cooking_list)

In [8]:
# save cooking data to data folder
cooking_df.to_csv('./Data/cooking.csv', index = False)

## Collect Subreddit 5 r/keto

In [9]:
keto_list = get_data('keto',20_000)

pulling subreddit submissions 500 of 20000
pulling subreddit submissions 1000 of 20000
pulling subreddit submissions 1500 of 20000
pulling subreddit submissions 2000 of 20000
pulling subreddit submissions 2500 of 20000
pulling subreddit submissions 3000 of 20000
pulling subreddit submissions 3500 of 20000
pulling subreddit submissions 4000 of 20000
pulling subreddit submissions 4500 of 20000
pulling subreddit submissions 5000 of 20000
pulling subreddit submissions 5500 of 20000
pulling subreddit submissions 6000 of 20000
pulling subreddit submissions 6500 of 20000
pulling subreddit submissions 7000 of 20000
pulling subreddit submissions 7500 of 20000
pulling subreddit submissions 8000 of 20000
pulling subreddit submissions 8500 of 20000
pulling subreddit submissions 9000 of 20000
pulling subreddit submissions 9500 of 20000
pulling subreddit submissions 10000 of 20000
pulling subreddit submissions 10500 of 20000
pulling subreddit submissions 11000 of 20000
pulling subreddit submissions 

In [10]:
# create 'keto' dataframe
keto_df = pd.DataFrame(keto_list)

In [11]:
# save keto data to data folder
keto_df.to_csv('./Data/keto.csv', index = False)

## Collect Subreddit 5 r/budgetfood

In [21]:
budget_list = get_data('budgetfood',20_000)

pulling subreddit submissions 500 of 20000
pulling subreddit submissions 1000 of 20000
pulling subreddit submissions 1500 of 20000
pulling subreddit submissions 2000 of 20000
pulling subreddit submissions 2500 of 20000
pulling subreddit submissions 3000 of 20000
pulling subreddit submissions 3500 of 20000
pulling subreddit submissions 4000 of 20000
pulling subreddit submissions 4500 of 20000
pulling subreddit submissions 5000 of 20000
pulling subreddit submissions 5500 of 20000
pulling subreddit submissions 6000 of 20000
pulling subreddit submissions 6500 of 20000
pulling subreddit submissions 7000 of 20000
pulling subreddit submissions 7500 of 20000
pulling subreddit submissions 8000 of 20000
pulling subreddit submissions 8500 of 20000
pulling subreddit submissions 9000 of 20000


IndexError: list index out of range

In [13]:
# create 'food' dataframe
food_df = pd.DataFrame(food_list)

In [14]:
# save food data to data folder
food_df.to_csv('./Data/food.csv', index = False)

### Continue to next notebook 'EDA' 