# Reddit Scraping Using Pushshift
----

This notebook follows the article below and uses pmaw, rather than calling Pushshift directly, so that more than 100 results can be retrieved per query.
[Source](https://medium.com/swlh/how-to-scrape-large-amounts-of-reddit-data-using-pushshift-1d33bde9286)

[pmaw Documentation](https://pypi.org/project/pmaw/)

In [1]:
## Imports
import pandas as pd
from pmaw import PushshiftAPI
import datetime as dt

In [2]:
# Instantiate the pmaw Pushshift client; the search cells below call it
api = PushshiftAPI()

In [3]:
# Convert the date range to epoch seconds for the `before`/`after` search filters.
# The datetimes are pinned to UTC on purpose: calling .timestamp() on a naive
# datetime interprets it in the machine's local timezone, so the queried window
# would shift from one machine to another and the pull would not be reproducible.
before = int(dt.datetime(2022, 6, 24, 0, 0, tzinfo=dt.timezone.utc).timestamp())
after = int(dt.datetime(2021, 1, 1, 0, 0, tzinfo=dt.timezone.utc).timestamp())

In [25]:
# Note: pmaw applies a default rate limit of 60 requests per minute

In [4]:
# Fetch up to 10,000 comments from the formuladank subreddit,
# bounded by the `after`/`before` epoch timestamps
comments = api.search_comments(
    subreddit='formuladank',
    limit=10_000,
    before=before,
    after=after,
)

In [5]:
# Load the pulled formuladank comments into a dataframe
formuladank_df = pd.DataFrame(data=comments)

In [24]:
# Sanity check: preview the first rows to confirm the dataframe was built properly
formuladank_df.head()

Unnamed: 0,all_awardings,associated_award,author,author_flair_background_color,author_flair_css_class,author_flair_richtext,author_flair_template_id,author_flair_text,author_flair_text_color,author_flair_type,...,collapsed_reason_code,controversiality,distinguished,gilded,retrieved_utc,score_hidden,subreddit_name_prefixed,subreddit_type,unrepliable_reason,media_metadata
0,[],,Zimminar,#ea0027,,"[{'e': 'text', 't': 'Bwoah'}]",3a0e67ce-d25f-11e9-85e3-0e4b0408ab00,Bwoah,light,richtext,...,,,,,,,,,,
1,[],,Friend-Beast,#ffd635,,"[{'e': 'text', 't': 'Claire Williams is waifu ...",9d9c8d52-f2c5-11ea-87a9-0e51bf71a125,Claire Williams is waifu material,dark,richtext,...,,,,,,,,,,
2,[],,lonely_car_guy,#0dd3bb,forsure,"[{'e': 'text', 't': 'BWOAHHHHHHH '}]",8a719d14-7aa3-11e7-978f-0e07e6f3e372,BWOAHHHHHHH,light,richtext,...,,,,,,,,,,
3,[],,439115,,,[],,,,text,...,,,,,,,,,,
4,[],,[deleted],,,,,,dark,,...,,,,,,,,,,


In [6]:
# Verify that the 'body' column holds the comment text
formuladank_df['body'].head()

0    Yeah but Kimi's also the most experienced F1 d...
1    What is actually happening in the original wit...
2                                          Yeah insane
3    Hamilton did it before at the Germany slip n s...
4                                            [removed]
Name: body, dtype: object

In [7]:
# List the available column names before choosing which ones to keep
formuladank_df.columns

Index(['all_awardings', 'associated_award', 'author',
       'author_flair_background_color', 'author_flair_css_class',
       'author_flair_richtext', 'author_flair_template_id',
       'author_flair_text', 'author_flair_text_color', 'author_flair_type',
       'author_fullname', 'author_patreon_flair', 'author_premium', 'awarders',
       'body', 'collapsed_because_crowd_control', 'comment_type',
       'created_utc', 'gildings', 'id', 'is_submitter', 'link_id', 'locked',
       'no_follow', 'parent_id', 'permalink', 'retrieved_on', 'score',
       'send_replies', 'stickied', 'subreddit', 'subreddit_id',
       'top_awarded_type', 'total_awards_received', 'treatment_tags',
       'author_cakeday', 'archived', 'body_sha1', 'can_gild', 'collapsed',
       'collapsed_reason', 'collapsed_reason_code', 'controversiality',
       'distinguished', 'gilded', 'retrieved_utc', 'score_hidden',
       'subreddit_name_prefixed', 'subreddit_type', 'unrepliable_reason',
       'media_metadata'],
  

In [8]:
# Keep only the columns used downstream: subreddit, comment body and author flair text
fd = formuladank_df[
    ['subreddit', 'body', 'author_flair_text']
]

In [9]:
# Sanity check: preview the first rows of the trimmed formuladank dataframe
fd.head()

Unnamed: 0,subreddit,body,author_flair_text
0,formuladank,Yeah but Kimi's also the most experienced F1 d...,Bwoah
1,formuladank,What is actually happening in the original wit...,Claire Williams is waifu material
2,formuladank,Yeah insane,BWOAHHHHHHH
3,formuladank,Hamilton did it before at the Germany slip n s...,
4,formuladank,[removed],


In [10]:
# Fetch up to 10,000 comments from the formula1 subreddit,
# bounded by the same `after`/`before` epoch timestamps
comments_1 = api.search_comments(
    subreddit='formula1',
    limit=10_000,
    before=before,
    after=after,
)

In [11]:
# Load the pulled formula1 comments into a dataframe
formula1_df = pd.DataFrame(data=comments_1)

In [12]:
# Smaller dataframe keeping only the subreddit, comment body and author flair text
f1 = formula1_df[
    ['subreddit', 'body', 'author_flair_text']
]

In [13]:
# Sanity check: preview the first rows of the trimmed formula1 dataframe
f1.head()

Unnamed: 0,subreddit,body,author_flair_text
0,formula1,Are they going to be Williams-Alpine or Willia...,:stoffel-vandoorne: Stoffel Vandoorne
1,formula1,So basically if we know it by Albert park we w...,
2,formula1,If you no longer go full throttle for a corner...,:jacques-villeneuve: Jacques Villeneuve
3,formula1,Scotty in 5th!,Daniel Ricciardo
4,formula1,Why did he sleep in a different room from his ...,


In [14]:
# Save both trimmed comment dataframes as CSV files, leaving out the index column
fd.to_csv('../data/formuladank_comments.csv', index=False)
f1.to_csv('../data/formula1_comments.csv', index=False)

---
# Pulling posts

In [15]:
# Fetch up to 5,000 submissions (posts) from the formuladank subreddit,
# bounded by the `after`/`before` epoch timestamps
posts = api.search_submissions(
    subreddit='formuladank',
    limit=5_000,
    before=before,
    after=after,
)

In [16]:
# Load the pulled formuladank submissions into a dataframe and preview the first rows
formuladank_df_posts = pd.DataFrame(data=posts)
formuladank_df_posts.head()

Unnamed: 0,all_awardings,allow_live_comments,author,author_flair_background_color,author_flair_css_class,author_flair_richtext,author_flair_template_id,author_flair_text,author_flair_text_color,author_flair_type,...,is_created_from_ads_ui,media_metadata,author_is_blocked,gallery_data,is_gallery,poll_data,author_cakeday,banned_by,distinguished,gilded
0,[],False,Nabil1510,#0dd3bb,forsure,"[{'e': 'text', 't': 'BWOAHHHHHHH '}]",8a719d14-7aa3-11e7-978f-0e07e6f3e372,BWOAHHHHHHH,light,richtext,...,,,,,,,,,,
1,[],False,[deleted],,,,,,dark,,...,,,,,,,,,,
2,[],False,racingplayer607,,forsure,"[{'e': 'text', 't': 'BWOAHHHHHHH '}]",,BWOAHHHHHHH,dark,richtext,...,,,,,,,,,,
3,[],False,mariya234,,forsure,"[{'e': 'text', 't': 'BWOAHHHHHHH '}]",,BWOAHHHHHHH,dark,richtext,...,,,,,,,,,,
4,[],False,[deleted],,,,,,dark,,...,,,,,,,,,,


In [17]:
# Keep only the subreddit, post title and author flair text columns
fd_posts = formuladank_df_posts[
    ['subreddit', 'title', 'author_flair_text']
]

In [18]:
# Display the trimmed formuladank submissions dataframe
fd_posts

Unnamed: 0,subreddit,title,author_flair_text
0,formuladank,Honestly I need a simple answer. Please,BWOAHHHHHHH
1,formuladank,Yuki Tsunochill,
2,formuladank,Take 3,BWOAHHHHHHH
3,formuladank,formuladank Subreddit Statistics,BWOAHHHHHHH
4,formuladank,We Are just pure racist,
...,...,...,...
4995,formuladank,Daniel and Lando talking to each other on the ...,BWOAHHHHHHH
4996,formuladank,Penalty + penalty,I get my news from Sky Hamilton F1
4997,formuladank,I know it happend in rally,BWOAHHHHHHH
4998,formuladank,Khaby's take on the accident,BWOAHHHHHHH


In [19]:
# Fetch up to 5,000 submissions (posts) from the formula1 subreddit,
# bounded by the same `after`/`before` epoch timestamps
posts_2 = api.search_submissions(
    subreddit='formula1',
    limit=5_000,
    before=before,
    after=after,
)

In [20]:
# Load the pulled formula1 submissions into a dataframe and preview the first rows
formula1_df_posts = pd.DataFrame(data=posts_2)
formula1_df_posts.head()

Unnamed: 0,all_awardings,allow_live_comments,author,author_flair_background_color,author_flair_css_class,author_flair_text,author_flair_text_color,awarders,can_mod_post,contest_mode,...,discussion_type,suggested_sort,crosspost_parent,crosspost_parent_list,poll_data,author_is_blocked,author_cakeday,call_to_action,category,tournament_data
0,[],False,[deleted],,,,dark,[],False,False,...,,,,,,,,,,
1,[],False,[deleted],,,,dark,[],False,False,...,,,,,,,,,,
2,[],False,Ghost_Company,,,,,[],False,False,...,,,,,,,,,,
3,"[{'award_sub_type': 'GLOBAL', 'award_type': 'g...",False,bonyetty,,,,,[],False,False,...,,,,,,,,,,
4,[],False,Mellow200,,,,,[],False,False,...,,,,,,,,,,


In [21]:
# Keep only the subreddit, post title and author flair text columns
f1_posts = formula1_df_posts[
    ['subreddit', 'title', 'author_flair_text']
]

In [22]:
# Display the trimmed formula1 submissions dataframe
f1_posts

Unnamed: 0,subreddit,title,author_flair_text
0,formula1,33 days before the start of the season!,
1,formula1,Romain Grosjean happy at IndyCar test despite ...,
2,formula1,How good was Rosberg really?,
3,formula1,These have been posted up around the park.,
4,formula1,Red Bull Racing mechanics spotted wearing a di...,
...,...,...,...
4995,formula1,Quickly drew up a dodge f1 concept. Lmk what y...,:daniel-ricciardo: Daniel Ricciardo
4996,formula1,Will my F1 TV account work while in the UK?,
4997,formula1,Battle between Yuki Tsunoda and Lewis Hamilton...,:charles-leclerc: Charles Leclerc
4998,formula1,The last four races have had four different ra...,


In [23]:
# Save both trimmed submission dataframes as CSV files, leaving out the index column
fd_posts.to_csv('../data/formuladank_titles.csv', index=False)
f1_posts.to_csv('../data/formula1_titles.csv', index=False)