# Data Gathering

### In this notebook you will see how I gathered my data.

In [9]:
# Importing the required libraries
import time

import pandas as pd
import requests

In [4]:
base_url = 'https://api.pushshift.io/reddit/search/submission/'

In [5]:
# This is the initial test to see if I am pulling the information correctly
# Request a single page (up to 500 submissions) from each subreddit.
res_casual_conversation = requests.get(
    base_url,
    params={
        'subreddit': 'CasualConversation',
        'size': 500,
    })
# Fail fast on an HTTP error instead of hitting a confusing JSON/KeyError below
res_casual_conversation.raise_for_status()

res_serious_conversation = requests.get(
    base_url,
    params={
        'subreddit': 'SeriousConversation',
        'size': 500,
    })
res_serious_conversation.raise_for_status()

# The submissions themselves live under the 'data' key of the JSON payload
data_casual_conversation = res_casual_conversation.json()['data']
data_serious_conversation = res_serious_conversation.json()['data']

In [6]:
# Checking to see if the request worked
# Looks like both requests worked
data_casual_conversation[4]

{'all_awardings': [],
 'allow_live_comments': False,
 'author': 'Fatass_Pug',
 'author_flair_css_class': None,
 'author_flair_richtext': [],
 'author_flair_text': None,
 'author_flair_type': 'text',
 'author_fullname': 't2_66qjkhqs',
 'author_patreon_flair': False,
 'author_premium': False,
 'awarders': [],
 'can_mod_post': False,
 'contest_mode': False,
 'created_utc': 1587515006,
 'domain': 'self.CasualConversation',
 'full_link': 'https://www.reddit.com/r/CasualConversation/comments/g5r45q/black_people_what_can_i_do_as_a_white_guy_to/',
 'gildings': {},
 'id': 'g5r45q',
 'is_crosspostable': False,
 'is_meta': False,
 'is_original_content': False,
 'is_reddit_media_domain': False,
 'is_robot_indexable': False,
 'is_self': True,
 'is_video': False,
 'link_flair_background_color': '#f4900c',
 'link_flair_richtext': [{'a': ':thinking:',
   'e': 'emoji',
   'u': 'https://emoji.redditmedia.com/ja91bi9ivxh21_t5_323oy/thinking'},
  {'e': 'text', 't': ' Thoughts &amp; Ideas'}],
 'link_flair_

In [7]:
data_serious_conversation[4]

{'all_awardings': [],
 'allow_live_comments': False,
 'author': 'sochabell',
 'author_flair_css_class': None,
 'author_flair_richtext': [],
 'author_flair_text': None,
 'author_flair_type': 'text',
 'author_fullname': 't2_rjtmt0d',
 'author_patreon_flair': False,
 'author_premium': True,
 'awarders': [],
 'can_mod_post': False,
 'contest_mode': False,
 'created_utc': 1587502895,
 'domain': 'self.SeriousConversation',
 'full_link': 'https://www.reddit.com/r/SeriousConversation/comments/g5nmaa/how_do_you_befriend_a_celebrity_on_instagram/',
 'gildings': {},
 'id': 'g5nmaa',
 'is_crosspostable': True,
 'is_meta': False,
 'is_original_content': False,
 'is_reddit_media_domain': False,
 'is_robot_indexable': True,
 'is_self': True,
 'is_video': False,
 'link_flair_background_color': '#014980',
 'link_flair_richtext': [{'a': ':advice:',
   'e': 'emoji',
   'u': 'https://emoji.redditmedia.com/0c5v7xxmob121_t5_345ie/advice'},
  {'e': 'text', 't': ' Situational Advice'}],
 'link_flair_template_

In [29]:
# Using a function to pull many requests
# This code has been modified from Tim Book's Office Hours
# I updated the function with an `epoch` argument. This is more flexible because you can pass in your own epoch
# instead of having to change it inside the function
def get_submissions(subreddit, n_iter, epoch=None):
    """Collect submissions for one subreddit by paging backwards in time.

    Each iteration requests up to 500 submissions created before
    ``current_time`` and then moves ``current_time`` to the smallest
    ``created_utc`` of that page, so the next request continues from there.

    Parameters
    ----------
    subreddit : str
        Name of the subreddit to query.
    n_iter : int
        Maximum number of requests (pages) to make.
    epoch : int, optional
        Unix timestamp to start paging back from. Defaults to the current
        time when omitted.

    Returns
    -------
    pandas.DataFrame
        The pages concatenated row-wise, with the columns author, id,
        num_comments, score, created_utc, selftext, title and subreddit.
        Each page keeps its own 0-based index. An empty frame with these
        columns is returned when no submissions were fetched.

    Raises
    ------
    requests.HTTPError
        If any request returns an error status code.
    """
    columns = ['author', 'id', 'num_comments', 'score', 'created_utc', 'selftext', 'title', 'subreddit']
    if epoch is None:
        epoch = int(time.time())

    df_list = []
    current_time = epoch
    for i in range(n_iter):
        res = requests.get(
            base_url,
            params={
                'subreddit': subreddit,
                'size': 500,
                'before': current_time,
            },
        )
        # Surface HTTP errors here rather than as a confusing JSON/KeyError below
        res.raise_for_status()
        records = res.json()['data']
        if not records:
            # Nothing older is available, so further requests would be wasted
            break
        # reindex keeps only the wanted columns and tolerates a field missing from a page
        df = pd.DataFrame(records).reindex(columns=columns)
        df_list.append(df)
        current_time = int(df['created_utc'].min())
        if i < n_iter - 1:
            time.sleep(10)  # Pause between requests so as not to make too many too quickly

    if not df_list:
        return pd.DataFrame(columns=columns)
    return pd.concat(df_list, axis=0)

In [14]:
# Gathering 10,000 submissions (20 requests of 500)
# The epoch is passed explicitly so this cell matches the get_submissions signature
# 1587516880 is the date and time of the original run: 4/21/2020 7:55PM
df_casual_conversation = get_submissions('CasualConversation', 20, 1587516880)

In [15]:
# Checking to see if successful
df_casual_conversation.shape

(10000, 8)

In [16]:
# Quick glance at the DataFrame
df_casual_conversation.head()

Unnamed: 0,author,id,num_comments,score,created_utc,selftext,title,subreddit
0,nothanksbud5,g5rffm,1,2,1587516214,Wow i didn’t realize how much music is about b...,Why is almost all music seem to be about love?,CasualConversation
1,yungwhitedemon,g5re5u,1,1,1587516078,[removed],"If you’re spiritually, emotionally, mentally, ...",CasualConversation
2,Stock_Willingness,g5rcw2,0,1,1587515941,[removed],why does it feel like no one cares about you i...,CasualConversation
3,Infinitely_Chaotic,g5r85a,1,2,1587515438,[removed],"On the morning of my birthday, I woke up disap...",CasualConversation
4,dontknowwhattdo,g5r7z2,3,2,1587515419,I thought that during this time it would be ni...,pieces of advice that have stuck with you?,CasualConversation


In [17]:
# Gathering 10,000 submissions (20 requests of 500)
# The epoch is passed explicitly so this cell matches the get_submissions signature
# 1587516880 is the date and time of the original run: 4/21/2020 7:55PM
df_serious_conversation = get_submissions('SeriousConversation', 20, 1587516880)

In [18]:
df_serious_conversation.shape

(10000, 8)

In [19]:
df_serious_conversation.head()

Unnamed: 0,author,id,num_comments,score,created_utc,selftext,title,subreddit
0,yaeyersc,g5qejw,4,1,1587512372,After a week of financial fuckery on this app ...,I wish I had never downloaded that Trading App.,SeriousConversation
1,AMorrisIII,g5p9lq,0,1,1587508349,https://youtu.be/DO4NZXJJYz8\n\nAs humanity re...,Human beings are the ultimate virus. It’s us. ...,SeriousConversation
2,RoughWeekInADay,g5owdd,0,1,1587507079,"This is per my username, but I've had a rough ...",Rough Week in a Day,SeriousConversation
3,AMorrisIII,g5ouq3,2,1,1587506920,[removed],Human beings are the next fucking level. It’s ...,SeriousConversation
4,sochabell,g5nmaa,15,3,1587502895,NOTE: If you're going to come on here just to ...,How do you befriend a celebrity on Instagram?,SeriousConversation


In [20]:
# Concatenating the two dataframes into one
big_reddit_df = pd.concat([df_casual_conversation, df_serious_conversation], axis=0)

In [21]:
# Checking to see if successful
big_reddit_df.shape

(20000, 8)

In [22]:
# Saving the combined data to a csv so I can pull it down and work on it from disk
# I don't want to keep requesting from the Pushshift API
# index=False leaves the DataFrame's row index out of the file
big_reddit_df.to_csv('big_reddit_list.csv', index=False)

In [23]:
# Reading in the csv to see if the save was successful and doing a little check
df = pd.read_csv('big_reddit_list.csv')

In [24]:
df.head()

Unnamed: 0,author,id,num_comments,score,created_utc,selftext,title,subreddit
0,nothanksbud5,g5rffm,1,2,1587516214,Wow i didn’t realize how much music is about b...,Why is almost all music seem to be about love?,CasualConversation
1,yungwhitedemon,g5re5u,1,1,1587516078,[removed],"If you’re spiritually, emotionally, mentally, ...",CasualConversation
2,Stock_Willingness,g5rcw2,0,1,1587515941,[removed],why does it feel like no one cares about you i...,CasualConversation
3,Infinitely_Chaotic,g5r85a,1,2,1587515438,[removed],"On the morning of my birthday, I woke up disap...",CasualConversation
4,dontknowwhattdo,g5r7z2,3,2,1587515419,I thought that during this time it would be ni...,pieces of advice that have stuck with you?,CasualConversation


In [25]:
df.shape

(20000, 8)

In [28]:
# After doing some data cleaning I need some more CasualConversation Submissions
# Grabbing the earliest submission and going to update function below and grab more submissions
df_casual_conversation.sort_values('created_utc').head()

Unnamed: 0,author,id,num_comments,score,created_utc,selftext,title,subreddit
499,ghost-child,fs6vkl,141,1,1585631176,"A while ago, a friend asked participants to de...",Is it weird that I'm kinda enjoying the quaran...,CasualConversation
498,aerobd,fs6w5n,2,1,1585631256,I have holes in my socks. Evenly distributed o...,Sock shopping,CasualConversation
497,AGayWeebForever,fs6wb4,5,1,1585631274,"Seeing that I'm only a pre-teen, cosplayers ho...",Cosplayers!,CasualConversation
496,Regins12,fs6wnu,1,1,1585631319,Hello everyone! I am a health and fitness coac...,How many of you people wanna do serious weight...,CasualConversation
495,bbykyles,fs6yf3,4,1,1585631559,Just a bit sad that I’ve now come to associate...,Working from home woes,CasualConversation
...,...,...,...,...,...,...,...,...
4,dontknowwhattdo,g5r7z2,3,2,1587515419,I thought that during this time it would be ni...,pieces of advice that have stuck with you?,CasualConversation
3,Infinitely_Chaotic,g5r85a,1,2,1587515438,[removed],"On the morning of my birthday, I woke up disap...",CasualConversation
2,Stock_Willingness,g5rcw2,0,1,1587515941,[removed],why does it feel like no one cares about you i...,CasualConversation
1,yungwhitedemon,g5re5u,1,1,1587516078,[removed],"If you’re spiritually, emotionally, mentally, ...",CasualConversation


In [30]:
# Paging further back in time for more CasualConversation submissions, starting from
# 1585631176 -- the smallest created_utc in the sorted df_casual_conversation output above
more_casual_convo = get_submissions('CasualConversation', 20, 1585631176)

In [31]:
more_casual_convo.shape

(10000, 8)

In [32]:
# Adding the new rows (the extra CasualConversation submissions) to my dataframe
# Rebuilding from the three source frames, rather than appending to big_reddit_df itself,
# so that re-running this cell does not duplicate the extra rows
big_reddit_df = pd.concat(
    [df_casual_conversation, df_serious_conversation, more_casual_convo],
    axis=0,
)

In [33]:
# Updating the csv
big_reddit_df.to_csv('big_reddit_list.csv', index=False)