In [1]:
# Import libraries
import time
from pathlib import Path

import pandas as pd
import requests

---
## Reddit Post Scraping

In [2]:
# Endpoint queried for subreddit submissions (posts).
posts_url = "https://api.pushshift.io/reddit/search/submission"

In [3]:
# Daily 'before' cut-offs, newest first: 2022-06-24 back to 2022-06-01 (24 dates).
time_list = [f'2022-06-{day:02d}' for day in range(24, 0, -1)]

In [4]:
# Collect r/books submissions: one request per 'before' cut-off in time_list,
# up to 100 results each (at most 24 * 100 = 2_400 posts before de-duplication).
# The windows have no lower bound, so consecutive cut-offs can return the same
# post again; each post is therefore kept only once, keyed on its 'id'.
posts = []
seen_post_ids = set()
for timestamp in time_list:
    books_post_params = {
        'subreddit': 'books',
        'size': 100,          # results per request (original note: the size limit is 100)
        'before': timestamp,  # date string used as the upper bound of the window
        'score': '>10',       # score filter is passed as a string
        'selftext': True,     # intended to keep only posts with selftext -- TODO confirm against the API
    }
    # A timeout stops a stalled connection from hanging the notebook.
    res = requests.get(posts_url, params=books_post_params, timeout=30)
    # Fail loudly on an HTTP error instead of tripping over a missing 'data' key.
    res.raise_for_status()
    books_post_data = res.json()  # response body parsed into a dictionary
    for post in books_post_data['data']:
        post_id = post.get('id')
        # Records without an id cannot be compared, so they are always kept.
        if post_id is not None and post_id in seen_post_ids:
            continue
        seen_post_ids.add(post_id)
        posts.append(post)
    time.sleep(3)  # pause between requests to go easy on the API

In [5]:
# Build a frame from the scraped r/books posts and preview the text columns.
books_df = pd.DataFrame(data=posts)
books_df.loc[:, ['subreddit', 'selftext', 'title']]

Unnamed: 0,subreddit,selftext,title
0,books,"Let us imagine a world where musicians, alive ...",What song would you want to be turned into a b...
1,books,I have just finished reading Sam Kean's The Di...,The Disappearing Spoon: a delightful surprise ...
2,books,I know this book probably gets discussed fairl...,"Just finished ""For Whom the Bell Tolls"""
3,books,One of my Covid-19 projects is to discard item...,"A personal library, re-discovering my filing s..."
4,books,"I've just finished it, and I wholly believe it...",A People's History of the United States
...,...,...,...
2395,books,I've read the book and I want to share my thou...,Winston Smith is not the good guy in 1984
2396,books,I really have to thank the people on /r/books ...,In praise of The Master and Margarita
2397,books,First off: I don't read as much as I like. But...,I Just Finished East of Eden by John Steinbeck
2398,books,I bought the kindle version of the Complete Se...,Wheel of Time. Is everyone such a jerk all the...


In [6]:
# Collect r/Fantasy submissions: one request per 'before' cut-off in time_list,
# up to 100 results each (at most 24 * 100 = 2_400 posts before de-duplication).
# The windows have no lower bound, so consecutive cut-offs can return the same
# post again; each post is therefore kept only once, keyed on its 'id'.
fantasy_posts = []
seen_fantasy_ids = set()
for timestamp in time_list:
    fantasy_post_params = {
        'subreddit': 'Fantasy',
        'size': 100,          # results per request (original note: the size limit is 100)
        'before': timestamp,  # date string used as the upper bound of the window
        'score': '>10',       # score filter is passed as a string
        'selftext': True,     # intended to keep only posts with selftext -- TODO confirm against the API
    }
    # A timeout stops a stalled connection from hanging the notebook.
    res = requests.get(posts_url, params=fantasy_post_params, timeout=30)
    # Fail loudly on an HTTP error instead of tripping over a missing 'data' key.
    res.raise_for_status()
    fantasy_post_data = res.json()  # response body parsed into a dictionary
    for post in fantasy_post_data['data']:
        post_id = post.get('id')
        # Records without an id cannot be compared, so they are always kept.
        if post_id is not None and post_id in seen_fantasy_ids:
            continue
        seen_fantasy_ids.add(post_id)
        fantasy_posts.append(post)
    time.sleep(3)  # pause between requests to go easy on the API

In [7]:
# Build a frame from the scraped r/Fantasy posts and preview the text columns.
fantasy_df = pd.DataFrame(data=fantasy_posts)
fantasy_df.loc[:, ['subreddit', 'selftext', 'title']]

Unnamed: 0,subreddit,selftext,title
0,Fantasy,\n\nhttps://preview.redd.it/ophxrj07w5481.jpg...,Review: Cyber Mage by Saad Z. Hossain
1,Fantasy,Welcome to the 4th part of our [Curse of the ...,Curse of the Mistwraith Read-along Chapters 13...
2,Fantasy,Do you like fantasy? Do you like mysteries? Do...,[Book Review/Summary] The Obsidian &amp; Blood...
3,Fantasy,&amp;#x200B;\n\n[Cover Art by Deranged Doctor ...,"Bookclub: Q&amp;A with J.A. Andrews, the autho..."
4,Fantasy,Time to Vote in the [**September 2021 Book of ...,Vote for the September Goodreads Book of the M...
...,...,...,...
2395,Fantasy,I've noticed whenever someone asks for what th...,Are soft magic systems given an unfairly bad rep?
2396,Fantasy,**What is the HEA Bookclub?** [You can read ou...,HEA Book Club: Vote for our November book! (Oc...
2397,Fantasy,&gt; *It was bad luck to name a daughter after...,Para's Proper Reviews: Wicked Like a Wildfire ...
2398,Fantasy,I AM HALFWAY DONE AS OF THIS POST! \[blows par...,Climbing Mount Readmore: Reading Our Top Fanta...


In [9]:
# Stack both subreddits' records (r/books first, then r/Fantasy) into one frame
# and preview the text columns.
fantasy_books = [*posts, *fantasy_posts]
fantasy_books_df = pd.DataFrame(data=fantasy_books)
fantasy_books_df.loc[:, ['subreddit', 'selftext', 'title']]

Unnamed: 0,subreddit,selftext,title
0,books,"Let us imagine a world where musicians, alive ...",What song would you want to be turned into a b...
1,books,I have just finished reading Sam Kean's The Di...,The Disappearing Spoon: a delightful surprise ...
2,books,I know this book probably gets discussed fairl...,"Just finished ""For Whom the Bell Tolls"""
3,books,One of my Covid-19 projects is to discard item...,"A personal library, re-discovering my filing s..."
4,books,"I've just finished it, and I wholly believe it...",A People's History of the United States
...,...,...,...
4795,Fantasy,I've noticed whenever someone asks for what th...,Are soft magic systems given an unfairly bad rep?
4796,Fantasy,**What is the HEA Bookclub?** [You can read ou...,HEA Book Club: Vote for our November book! (Oc...
4797,Fantasy,&gt; *It was bad luck to name a daughter after...,Para's Proper Reviews: Wicked Like a Wildfire ...
4798,Fantasy,I AM HALFWAY DONE AS OF THIS POST! \[blows par...,Climbing Mount Readmore: Reading Our Top Fanta...


In [11]:
# Write the subreddit/selftext/title columns of each frame to CSV.
# to_csv does not create missing folders, so make sure 'data/' exists first;
# otherwise a fresh checkout fails here after the whole scrape has already run.
Path('data').mkdir(parents=True, exist_ok=True)
books_df[['subreddit', 'selftext', 'title']].to_csv('data/books_cleaned_data.csv', index=False)
fantasy_df[['subreddit', 'selftext', 'title']].to_csv('data/fantasy_cleaned_data.csv', index=False)
fantasy_books_df[['subreddit', 'selftext', 'title']].to_csv('data/total_cleaned_data.csv', index=False)

---
## Reddit Comment Scraping

In [10]:
# Endpoint queried for subreddit comments.
comm_url = "https://api.pushshift.io/reddit/search/comment"

In [11]:
# Collect r/books comments: one request per 'before' cut-off in time_list,
# up to 100 results each (at most 24 * 100 = 2_400 comments before de-duplication).
# The windows have no lower bound, so consecutive cut-offs can return the same
# comment again; each comment is therefore kept only once, keyed on its 'id'.
comments = []
seen_comment_ids = set()
for timestamp in time_list:
    comm_params = {
        'subreddit': 'books',
        'size': 100,          # results per request (original note: the size limit is 100)
        'before': timestamp,  # date string used as the upper bound of the window
        'score': '>10',       # score filter is passed as a string
    }
    # A timeout stops a stalled connection from hanging the notebook.
    res = requests.get(comm_url, params=comm_params, timeout=30)
    # Fail loudly on an HTTP error instead of tripping over a missing 'data' key.
    res.raise_for_status()
    comm_data = res.json()  # response body parsed into a dictionary
    for comment in comm_data['data']:
        comment_id = comment.get('id')
        # Records without an id cannot be compared, so they are always kept.
        if comment_id is not None and comment_id in seen_comment_ids:
            continue
        seen_comment_ids.add(comment_id)
        comments.append(comment)
    time.sleep(3)  # pause between requests to go easy on the API

In [12]:
# Build a frame from the scraped r/books comments (one row per comment record).
comment_df = pd.DataFrame(data=comments)

In [13]:
# Display the comment frame with every returned field (the rendered output truncates rows and columns).
comment_df

Unnamed: 0,all_awardings,archived,associated_award,author,author_flair_background_color,author_flair_css_class,author_flair_richtext,author_flair_template_id,author_flair_text,author_flair_text_color,...,subreddit_id,subreddit_name_prefixed,subreddit_type,top_awarded_type,total_awards_received,treatment_tags,unrepliable_reason,edited,awarders,retrieved_on
0,[],False,,Theblackjamesbrown,,,[],,,,...,t5_2qh4i,r/books,public,,0,[],,,,
1,[],False,,mjackson4672,,,[],,,,...,t5_2qh4i,r/books,public,,0,[],,,,
2,[],False,,eve_is_hopeful,,,[],,,,...,t5_2qh4i,r/books,public,,0,[],,,,
3,[],False,,THEREALCABEZAGRANDE,,,[],,,,...,t5_2qh4i,r/books,public,,0,[],,,,
4,[],False,,flisswritesbooks,,,[],,,,...,t5_2qh4i,r/books,public,,0,[],,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2395,[],,,malcontented,,science-fiction,"[{'e': 'text', 't': 'Science Fiction'}]",,Science Fiction,dark,...,t5_2qh4i,,,,0,[],,,[],1.656257e+09
2396,[],,,TheWordThief,,,[],,,,...,t5_2qh4i,,,,0,[],,,[],1.656252e+09
2397,[],,,Elegant_Habit_9269,,,[],,,,...,t5_2qh4i,,,,0,[],,,[],1.656251e+09
2398,[],,,Rururaspberry,,,[],,,,...,t5_2qh4i,,,,0,[],,,[],1.656242e+09


In [14]:
# Preview only the subreddit name and the comment text.
comment_df.loc[:, ['subreddit', 'body']]

Unnamed: 0,subreddit,body
0,books,It's...not really. It's a critique of Soviet C...
1,books,Keep reading The Fifth Season
2,books,"Yeah, hated this ending. Blech."
3,books,It's a disturbing taboo that King uses a lot t...
4,books,"Peeta pressures her into having children, like..."
...,...,...
2395,books,Lolita. Reviled because of the subject matter ...
2396,books,"Not sure it's completely unknown, but *The Wes..."
2397,books,Literally everything by David Brin is brillian...
2398,books,This is such a standout fantasy series. The a...
