In [1]:
# Import libraries
import requests
import time
import pandas as pd

---
## Reddit Post Scraping

In [2]:
# Pushshift search endpoint for Reddit submissions (posts).
posts_url = "https://api.pushshift.io/reddit/search/submission"

In [3]:
# Query parameters for a single exploratory request to the submission endpoint.
post_params = dict(
    subreddit='LifeProTips',
    size=100,             # size limit is 100
    before='2022-06-24',  # a date string can be used here instead of UTC
    num_comments=10,      # selects posts with n comments
    score='>10',          # score needs to be a string
)
# Optional filter, currently disabled: 'selftext': True (only posts that have selftext)

In [4]:
# Single test request to check the endpoint and parameters before looping.
# `timeout` keeps a stalled connection from hanging the kernel indefinitely
# (requests applies no timeout by default).
res = requests.get(posts_url, params=post_params, timeout=30)

In [5]:
# HTTP status of the test request; 200 indicates success.
res.status_code

200

In [6]:
# Decode the JSON response body into Python objects.
post_data = res.json()

In [7]:
# Records returned by the single test request. The collection loop further
# down rebinds `posts` to an empty list before gathering the full data set.
posts = post_data['data']

In [8]:
# Tabular view of the test request's records (exploratory; `df` is not
# referenced by the later cells visible in this notebook).
df = pd.DataFrame(posts)

In [9]:
# 'before' cut-off dates for the scraping loops: the 20 consecutive days
# ending 2022-06-24, newest first, formatted as 'YYYY-MM-DD' strings.
time_list = (
    pd.date_range(end='2022-06-24', periods=20)[::-1]
    .strftime('%Y-%m-%d')
    .tolist()
)

In [10]:
# Collect LifeProTips submissions: one request per 'before' date in
# `time_list`, at most 100 records per request (~2_000 posts per subreddit).
# NOTE(review): requests for adjacent dates may return overlapping records
# when fewer than 100 posts qualify per day; consider de-duplicating
# downstream (to be confirmed against the collected data).
posts = []
for timestamp in time_list:
    lpt_post_params = {
        'subreddit': 'LifeProTips',
        'size': 100,  # size limit is 100
        'before': timestamp,  # a date string can be used here instead of UTC
        # NOTE(review): a bare integer may match posts with exactly n comments;
        # use a comparison string such as '>15' if a minimum was intended.
        # Confirm against the Pushshift documentation.
        'num_comments': 15,
        'score': '>10',  # score needs to be a string
        # Optional filter, currently disabled: 'selftext': True
    }
    # The timeout stops a stalled connection from blocking the loop forever.
    res = requests.get(posts_url, params=lpt_post_params, timeout=30)
    # Fail with a clear HTTP error (e.g. rate limiting) rather than a
    # confusing JSON-decode or KeyError further down.
    res.raise_for_status()
    lpt_post_data = res.json()  # response body decoded into a dictionary
    posts.extend(lpt_post_data['data'])  # accumulate this date's records
    time.sleep(3)  # pause between hits to go easy on the API

In [11]:
# Table of the LifeProTips submissions gathered by the loop above.
lpt_df = pd.DataFrame(posts)

In [12]:
# Collect UnethicalLifeProTips submissions: one request per 'before' date in
# `time_list`, at most 100 records per request (~2_000 posts per subreddit).
# The records are merged into the shared `posts` list, which therefore ends
# up holding both subreddits.
ulpt_posts = []
for timestamp in time_list:
    ulpt_post_params = {
        'subreddit': 'UnethicalLifeProTips',
        'size': 100,  # size limit is 100
        'before': timestamp,  # a date string can be used here instead of UTC
        # NOTE(review): a bare integer may match posts with exactly n comments;
        # use a comparison string such as '>15' if a minimum was intended.
        # Confirm against the Pushshift documentation.
        'num_comments': 15,
        'score': '>10',  # score needs to be a string
        # Optional filter, currently disabled: 'selftext': True
    }
    # The timeout stops a stalled connection from blocking the loop forever.
    res = requests.get(posts_url, params=ulpt_post_params, timeout=30)
    # Fail with a clear HTTP error rather than a confusing JSON/KeyError.
    res.raise_for_status()
    ulpt_post_data = res.json()  # response body decoded into a dictionary
    ulpt_posts.extend(ulpt_post_data['data'])  # accumulate this date's records
    time.sleep(3)  # pause between hits to go easy on the API

# Rebuild `posts` instead of appending in place: UnethicalLifeProTips records
# left over from an earlier run of this cell are dropped first, so re-running
# the cell does not duplicate them. On a fresh top-to-bottom run the result is
# the same as a plain append.
# Assumes each record carries a 'subreddit' field (a later cell selects it).
posts = [
    post for post in posts
    if post.get('subreddit') != 'UnethicalLifeProTips'
] + ulpt_posts

In [13]:
# NOTE: despite its name, this frame holds BOTH subreddits. `posts` still
# contains the LifeProTips records when the UnethicalLifeProTips ones are added.
ulpt_df = pd.DataFrame(posts)

In [14]:
# Persist only the subreddit label and the post text fields, without the index.
ulpt_df.to_csv(
    'data/total_cleaned_data.csv',
    columns=['subreddit', 'selftext', 'title'],
    index=False,
)

---
## Reddit Comment Scraping

In [29]:
# Pushshift search endpoint for Reddit comments.
comm_url = "https://api.pushshift.io/reddit/search/comment"

In [30]:
# Collect LifeProTips comments: one request per 'before' date in `time_list`,
# at most 100 records per request (~2_000 comments in total).
comments = []
for timestamp in time_list:
    comm_params = {
        'subreddit': 'LifeProTips',
        'size': 100,  # size limit is 100
        'before': timestamp,  # a date string can be used here instead of UTC
        'score': '>10',  # score needs to be a string
    }
    # The timeout stops a stalled connection from blocking the loop forever.
    res = requests.get(comm_url, params=comm_params, timeout=30)
    # Fail with a clear HTTP error (e.g. rate limiting) rather than a
    # confusing JSON-decode or KeyError further down.
    res.raise_for_status()
    comm_data = res.json()  # response body decoded into a dictionary
    comments.extend(comm_data['data'])  # accumulate this date's comments
    time.sleep(3)  # pause between hits to go easy on the API

In [31]:
# One row per collected comment record.
comment_df = pd.DataFrame(comments)

In [32]:
# Inspect the collected comments (pandas truncates the displayed rows/columns).
comment_df

Unnamed: 0,all_awardings,associated_award,author,author_flair_background_color,author_flair_css_class,author_flair_richtext,author_flair_template_id,author_flair_text,author_flair_text_color,author_flair_type,...,retrieved_on,score,send_replies,stickied,subreddit,subreddit_id,top_awarded_type,total_awards_received,treatment_tags,unrepliable_reason
0,[],,l34rn3d,,,[],,,,text,...,1656376800,18,True,False,LifeProTips,t5_2s5oq,,0,[],
1,[],,TheSinningRobot,,,[],,,,text,...,1656375320,25,True,False,LifeProTips,t5_2s5oq,,0,[],
2,[],,[deleted],,,,,,dark,,...,1656374723,24,True,False,LifeProTips,t5_2s5oq,,0,[],
3,[],,Kidd5,,,[],,,,text,...,1656374671,12,True,False,LifeProTips,t5_2s5oq,,0,[],
4,[],,ResponsibilityOk9913,,,[],,,,text,...,1656373089,15,True,False,LifeProTips,t5_2s5oq,,0,[],
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,[],,MessyRoom,,,[],,,,text,...,1656180140,12,True,False,LifeProTips,t5_2s5oq,,0,[],
1996,[],,UnseenCat,,,[],,,,text,...,1656179448,39,True,False,LifeProTips,t5_2s5oq,,0,[],
1997,[],,hem_27,,,[],,,,text,...,1656178921,28,True,False,LifeProTips,t5_2s5oq,,0,[],
1998,[],,SwanLake74,,,[],,,,text,...,1656177776,99,True,False,LifeProTips,t5_2s5oq,,0,[],


In [33]:
# Narrow the view to the subreddit label and the comment text.
comment_df[['subreddit', 'body']]

Unnamed: 0,subreddit,body
0,LifeProTips,Get a sleep study done. You might be sleeping ...
1,LifeProTips,"No fuck that. If I make a time off request, yo..."
2,LifeProTips,Those people who order more pizza than what th...
3,LifeProTips,This
4,LifeProTips,This
...,...,...
1995,LifeProTips,Can’t see either pic on iPhone 😔😔
1996,LifeProTips,We adopted two absolutely inseparable brothers...
1997,LifeProTips,Thank you for this. All the “you should only h...
1998,LifeProTips,Or stick it in the air fryer… delish
