In [1]:
import pandas as pd
import requests
import csv
import time
from bs4 import BeautifulSoup

In [2]:
url = 'https://api.pushshift.io/reddit/search/submission'

In [3]:
params = {
    'subreddit': 'physics',
    'size': 500,
    'before': 1601970056  
}

In [4]:
res = requests.get(url, params)
res.status_code

200

In [5]:
data = res.json()

In [6]:
posts = data['data']
len(posts)

100

### Why am I only getting 100?
They changed their api

In [7]:
posts_df = pd.DataFrame(posts)
posts_df.columns

Index(['all_awardings', 'allow_live_comments', 'author',
       'author_flair_css_class', 'author_flair_richtext', 'author_flair_text',
       'author_flair_type', 'author_fullname', 'author_patreon_flair',
       'author_premium', 'awarders', 'can_mod_post', 'contest_mode',
       'created_utc', 'domain', 'full_link', 'gildings', 'id',
       'is_crosspostable', 'is_meta', 'is_original_content',
       'is_reddit_media_domain', 'is_robot_indexable', 'is_self', 'is_video',
       'link_flair_background_color', 'link_flair_richtext',
       'link_flair_text_color', 'link_flair_type', 'locked', 'media_only',
       'no_follow', 'num_comments', 'num_crossposts', 'over_18',
       'parent_whitelist_status', 'permalink', 'pinned', 'post_hint',
       'preview', 'pwls', 'retrieved_on', 'score', 'selftext', 'send_replies',
       'spoiler', 'stickied', 'subreddit', 'subreddit_id',
       'subreddit_subscribers', 'subreddit_type', 'thumbnail',
       'thumbnail_height', 'thumbnail_width', 'tit

In [8]:
#only keep columns of interest
posts_df = posts_df[['subreddit', 'selftext', 'title', 'created_utc']]
posts_df.head()

Unnamed: 0,subreddit,selftext,title,created_utc
0,Physics,,Acoustical and optical phonons ! Check this ar...,1601968777
1,Physics,Hey There! \n\nIm a high school physics studen...,"Optical Activity, Specific Rotation and Polari...",1601967664
2,Physics,,How can I prove current strength (i) is a Fund...,1601967618
3,Physics,**A plane is in a spot that is 880km due East...,Can I have some help with this question?,1601966781
4,Physics,Assume the penises are arranged uniformly in a...,How many erect penises would it take to fully ...,1601962026


In [9]:
posts_df.info()
#even though some of the selftext columns are blank, they don't return as null

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   subreddit    100 non-null    object
 1   selftext     100 non-null    object
 2   title        100 non-null    object
 3   created_utc  100 non-null    int64 
dtypes: int64(1), object(3)
memory usage: 3.2+ KB


### What I'm doing here:
- get 100 posts every week for the past 10 weeks
- make sure that the 'selftext' and 'title' columns are real and analyzable
    - so make sure that each one has more than 4 words
- this will make it hard to retain as many rows as expected, so I'll have to scrape for many weeks. Maybe a year?


In [25]:
#see when the earliest posts are from, and make sure to not go beyond
url = 'https://api.pushshift.io/reddit/search/submission'
#time = 1602126873
params = {
        'subreddit': 'physics',
        'sort': 'asc'
    }
res = requests.get(url, params)
data = res.json()
posts_start = data['data']
posts_start[1]['created_utc']
#this is in 2008, so it should be fine if we go back a year.

1205648214

In [39]:
def get_many_requests(subreddit, first_week_number, second_week_number):
    url = 'https://api.pushshift.io/reddit/search/submission'
    time = 1602126873-(first_week_number*604800)
    params = {
            'subreddit': subreddit,
            'size': 500,
        }
    res = requests.get(url, params)
    data = res.json()
    posts_start = data['data']
    posts_start_df = pd.DataFrame(posts_start)
    posts_start_df = posts_start_df[['subreddit', 'selftext', 'title', 'created_utc']]
    for i in range(first_week_number, second_week_number): #start at 1 because I just did zero above, which is what I'm concatenating on
        params = {
            'subreddit': subreddit,
            'size': 500,
            'before': time - i*604800 #for each week, it adds 604800 to the timestamp, so we're going back i number of weeks
        }
        res = requests.get(url, params)
        data = res.json()
        posts = data['data']
        posts_df = pd.DataFrame(posts)
        posts_start_df = pd.concat([posts_start_df, posts_df], ignore_index=True)
        posts_start_df = posts_start_df[['subreddit', 'selftext', 'title', 'created_utc']]
    return posts_start_df

### Note:
Because I can't pull more than 1000 comments at a time, I need to break up the dataframes into week chunks and then concatenate them at the end

In [133]:
def get_posts(subreddit, n_iter, epoch_right_now): # subreddit name and number of times function should run
    # store base url variable
    base_url = 'https://api.pushshift.io/reddit/search/submission/?subreddit='
    # instantiate empty list    
    df_list = []
    # save current epoch, used to iterate in reverse through time
    current_time = epoch_right_now
    # set up for loop
    while len(df_list) < 10:
        # instantiate get request
        res = requests.get(
            # requests.get takes base_url and params
            base_url,
            # parameters for get request
            params = {
                # specify subreddit
                'subreddit' : subreddit,
                # specify number of posts to pull
                'size' : 100,
                # ???
                'lang' : True,
                # pull everything from current time backward
                'before' : current_time }
        )
        # take data from most recent request, store as df
        df = pd.DataFrame(res.json()['data'])
        # pull specific columns from dataframe for analysis
        df = df.loc[:, ['subreddit', 'selftext', 'title', 'created_utc']]
        df_drop = df.loc[df['selftext'].str.split().str.len() > 4]
        df_drop = df_drop.loc[df_drop['title'].str.split().str.len() > 4]
        df_drop = df_drop.drop_duplicates()
        # append to empty dataframe list
        df_list.append(df_drop)
        # add wait time
        time.sleep(30)
        # set current time counter back to last epoch in recently grabbed df
        current_time = df_drop['created_utc'].min()
    # return one dataframe for all requests
    return pd.concat(df_list, axis=0)
# Adapated from Tim Book's Lesson Example

In [134]:
df = get_posts('physics', 4, 1602126873)

KeyboardInterrupt: 

In [18]:
df.head()

In [135]:
def get_posts(subreddit, n_iter, epoch_right_now): # subreddit name and number of times function should run
    # store base url variable
    base_url = 'https://api.pushshift.io/reddit/search/submission/?subreddit='
    # instantiate empty list    
    df_list = []
    # save current epoch, used to iterate in reverse through time
    current_time = epoch_right_now
    # set up for loop
    for post in range(n_iter):
        # instantiate get request
        res = requests.get(
            # requests.get takes base_url and params
            base_url,
            # parameters for get request
            params = {
                # specify subreddit
                'subreddit' : subreddit,
                # specify number of posts to pull
                'size' : 100,
                # ???
                'lang' : True,
                # pull everything from current time backward
                'before' : current_time }
        )
        # take data from most recent request, store as df
        df = pd.DataFrame(res.json()['data'])
        # pull specific columns from dataframe for analysis
        df = df.loc[:, ['subreddit', 'selftext', 'title', 'created_utc']]
        df_drop = df.loc[df['selftext'].str.split().str.len() > 4]
        df_drop = df_drop.loc[df_drop['title'].str.split().str.len() > 4]
        df_drop = df_drop.drop_duplicates()
        # append to empty dataframe list
        df_list.append(df_drop)
        # add wait time
        time.sleep(30)
        # set current time counter back to last epoch in recently grabbed df
        current_time = df_drop['created_utc'].min()
    # return one dataframe for all requests
    #return pd.concat(df_list, axis=0)
    return len(df_list)
# Adapated from Tim Book's Lesson Example

In [136]:
df = get_posts('physics', 4, 1602126873)

In [137]:
df

4

In [130]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 337 entries, 2 to 96
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   subreddit    337 non-null    object
 1   selftext     337 non-null    object
 2   title        337 non-null    object
 3   created_utc  337 non-null    int64 
dtypes: int64(1), object(3)
memory usage: 13.2+ KB


In [116]:
df_drop = df.loc[df['selftext'].str.split().str.len() > 4]
df_drop = df_drop.loc[df_drop['title'].str.split().str.len() > 4]
df_drop = df_drop.drop_duplicates()
df_drop.tail()

Unnamed: 0,title,created_utc,selftext,subreddit,author,media_only,permalink
90,equation I can't figure out,1601484418,Hello so i was doing some work and found a phy...,Physics,Kal-Skirata69,False,/r/Physics/comments/j2psm6/equation_i_cant_fig...
91,Simulation of Young's double slit experiment,1601482914,Python Code to produce simulation: \n\nPart 1:...,Physics,Instantinopaul,False,/r/Physics/comments/j2pazo/simulation_of_young...
92,"[Question] Why, in optics, do we measure the a...",1601482695,Title. \nI've been wondering why the measure...,Physics,Paradox343,False,/r/Physics/comments/j2p8is/question_why_in_opt...
97,How can I study Physics?,1601475631,Hi all decent Physics people :D I just joined ...,Physics,GalynMusic,False,/r/Physics/comments/j2n0ek/how_can_i_study_phy...
98,Quantum Physics and consciousness question for...,1601472767,"So, I am doing some research about an app call...",Physics,NyaCrea,False,/r/Physics/comments/j2m6ca/quantum_physics_and...


In [104]:
def get_weekly_requests(subreddit, max_weeks):
    #specify a list that's every 10, up till max weeks, starting from 10 (10, 20, 30, 40)
    list_a = list(range(10, max_weeks+1, 10))
    #specify a list that's every 10, starting from 1 (1, 11, 21, 31)
    list_b = list(range(1, max_weeks+1, 10))
    #get requests from the days between each of the values in the list
    df_init = pd.DataFrame()
    new_lis = []
    for x in range(len(list_a)):
        new_lis.append(list_a[x]+1)
    return new_lis
    
get_weekly_requests('subreddit', 30)

[11, 21, 31]

In [138]:
#try to automate:
def get_weekly_requests(subreddit, max_weeks):
    #specify a list that's every 10, up till max weeks, starting from 10 (10, 20, 30, 40)
    list_a = list(range(10, max_weeks+1, 10))
    list_a_minus_one = list(range(9, max_weeks+1, 10))
    #specify a list that's every 10, starting from 1 (1, 11, 21, 31)
    list_b = list(range(1, max_weeks+1, 10))
    list_b_plus_one = list(range(2, max_weeks+1, 10))
    #get requests from the days between each of the values in the list
    df_init = pd.DataFrame()
    for x in range(len(list_a)):
        try:
            df = get_many_requests(subreddit=subreddit, first_week_number=list_b[x], second_week_number=list_a[x])

            df_drop = df.loc[df['selftext'].str.split().str.len() > 4]
            df_drop = df_drop.loc[df_drop['title'].str.split().str.len() > 4]
            df_drop = df_drop.drop_duplicates()

            df_init = pd.concat([df_init, df_drop], ignore_index=True)
        except:
            pass
        try:
            df = get_many_requests(subreddit=subreddit, first_week_number=list_b_plus_one[x], second_week_number=list_a_minus_one[x])

            df_drop = df.loc[df['selftext'].str.split().str.len() > 4]
            df_drop = df_drop.loc[df_drop['title'].str.split().str.len() > 4]
            df_drop = df_drop.drop_duplicates()

            df_init = pd.concat([df_init, df_drop], ignore_index=True)
        except:
            pass
        
        
    return df_init

In [139]:
df_phys = get_weekly_requests('physics', 60)

In [140]:
df_phys.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 355 entries, 0 to 354
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   subreddit    355 non-null    object
 1   selftext     355 non-null    object
 2   title        355 non-null    object
 3   created_utc  355 non-null    int64 
dtypes: int64(1), object(3)
memory usage: 11.2+ KB


In [53]:
phys_df_0_10 = get_many_requests('physics', 1, 10)
phys_df_drop_0_10 = phys_df_0_10.loc[phys_df_0_10['selftext'].str.split().str.len() > 4]
phys_df_drop_0_10 = phys_df_drop_0_10.loc[phys_df_drop_0_10['title'].str.split().str.len() > 4]
phys_df_drop_0_10 = phys_df_drop_0_10.drop_duplicates()
phys_df_drop_0_10.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 351 entries, 1 to 999
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   subreddit    351 non-null    object
 1   selftext     351 non-null    object
 2   title        351 non-null    object
 3   created_utc  351 non-null    int64 
dtypes: int64(1), object(3)
memory usage: 13.7+ KB


In [55]:
#keep only the rows that have more than 4 words in each column
phys_df_10_19 = get_many_requests('physics', 11, 19)
phys_df_drop_10_19 = phys_df_10_19.loc[phys_df_10_19['selftext'].str.split().str.len() > 4]
phys_df_drop_10_19 = phys_df_drop_10_19.loc[phys_df_drop_10_19['title'].str.split().str.len() > 4]
phys_df_drop_10_19 = phys_df_drop_10_19.drop_duplicates()
phys_df_drop_10_19.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 234 entries, 1 to 897
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   subreddit    234 non-null    object
 1   selftext     234 non-null    object
 2   title        234 non-null    object
 3   created_utc  234 non-null    int64 
dtypes: int64(1), object(3)
memory usage: 9.1+ KB


In [110]:
phys_df_20_29 = get_many_requests('physics', 21, 29)
phys_df_drop_20_29 = phys_df_20_29.loc[phys_df_20_29['selftext'].str.split().str.len() > 4]
phys_df_drop_20_29 = phys_df_drop_20_29.loc[phys_df_drop_20_29['title'].str.split().str.len() > 4]
phys_df_drop_20_29 = phys_df_drop_20_29.drop_duplicates()
phys_df_drop_20_29.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 318 entries, 1 to 898
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   subreddit    318 non-null    object
 1   selftext     318 non-null    object
 2   title        318 non-null    object
 3   created_utc  318 non-null    int64 
dtypes: int64(1), object(3)
memory usage: 12.4+ KB


In [58]:
phys_df_30_39 = get_many_requests('physics', 31, 39)
phys_df_drop_30_39 = phys_df_30_39.loc[phys_df_30_39['selftext'].str.split().str.len() > 4]
phys_df_drop_30_39 = phys_df_drop_30_39.loc[phys_df_drop_30_39['title'].str.split().str.len() > 4]
phys_df_drop_30_39 = phys_df_drop_30_39.drop_duplicates()
phys_df_drop_30_39.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 274 entries, 0 to 899
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   subreddit    274 non-null    object
 1   selftext     274 non-null    object
 2   title        274 non-null    object
 3   created_utc  274 non-null    int64 
dtypes: int64(1), object(3)
memory usage: 10.7+ KB


In [59]:
phys_df_40_49 = get_many_requests('physics', 40, 49)
phys_df_drop_40_49 = phys_df_40_49.loc[phys_df_40_49['selftext'].str.split().str.len() > 4]
phys_df_drop_40_49 = phys_df_drop_40_49.loc[phys_df_drop_40_49['title'].str.split().str.len() > 4]
phys_df_drop_40_49 = phys_df_drop_40_49.drop_duplicates()
phys_df_drop_40_49.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 301 entries, 0 to 999
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   subreddit    301 non-null    object
 1   selftext     301 non-null    object
 2   title        301 non-null    object
 3   created_utc  301 non-null    int64 
dtypes: int64(1), object(3)
memory usage: 11.8+ KB


In [60]:
phys_df_50_59 = get_many_requests('physics', 50, 59)
phys_df_drop_50_59 = phys_df_50_59.loc[phys_df_50_59['selftext'].str.split().str.len() > 4]
phys_df_drop_50_59 = phys_df_drop_50_59.loc[phys_df_drop_50_59['title'].str.split().str.len() > 4]
phys_df_drop_50_59 = phys_df_drop_50_59.drop_duplicates()
phys_df_drop_50_59.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 329 entries, 0 to 997
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   subreddit    329 non-null    object
 1   selftext     329 non-null    object
 2   title        329 non-null    object
 3   created_utc  329 non-null    int64 
dtypes: int64(1), object(3)
memory usage: 12.9+ KB


In [63]:
#now concatenate all of these to get one final dataframe
phys_df = pd.concat([phys_df_drop_0_10, 
                     phys_df_drop_10_19, 
                     phys_df_drop_20_29, 
                     phys_df_drop_30_39,
                     phys_df_drop_40_49,
                     phys_df_drop_50_59
                    ], ignore_index=True)

In [64]:
phys_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1833 entries, 0 to 1832
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   subreddit    1833 non-null   object
 1   selftext     1833 non-null   object
 2   title        1833 non-null   object
 3   created_utc  1833 non-null   int64 
dtypes: int64(1), object(3)
memory usage: 57.4+ KB


In [73]:
chem_df_0_10 = get_many_requests('chemistry', 1, 10)
chem_df_drop_0_10 = chem_df_0_10.loc[chem_df_0_10['selftext'].str.split().str.len() > 4]
chem_df_drop_0_10 = chem_df_drop_0_10.loc[chem_df_drop_0_10['title'].str.split().str.len() > 4]
chem_df_drop_0_10 = chem_df_drop_0_10.drop_duplicates()
chem_df_drop_0_10.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 303 entries, 0 to 996
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   subreddit    303 non-null    object
 1   selftext     303 non-null    object
 2   title        303 non-null    object
 3   created_utc  303 non-null    int64 
dtypes: int64(1), object(3)
memory usage: 11.8+ KB


In [74]:
chem_df_10_19 = get_many_requests('chemistry', 11, 19)
chem_df_drop_10_19 = chem_df_10_19.loc[chem_df_10_19['selftext'].str.split().str.len() > 4]
chem_df_drop_10_19 = chem_df_drop_10_19.loc[chem_df_drop_10_19['title'].str.split().str.len() > 4]
chem_df_drop_10_19 = chem_df_drop_10_19.drop_duplicates()
chem_df_drop_10_19.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 254 entries, 0 to 898
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   subreddit    254 non-null    object
 1   selftext     254 non-null    object
 2   title        254 non-null    object
 3   created_utc  254 non-null    int64 
dtypes: int64(1), object(3)
memory usage: 9.9+ KB


In [75]:
chem_df_20_29 = get_many_requests('chemistry', 20, 29)
chem_df_drop_20_29 = chem_df_20_29.loc[chem_df_20_29['selftext'].str.split().str.len() > 4]
chem_df_drop_20_29 = chem_df_drop_20_29.loc[chem_df_drop_20_29['title'].str.split().str.len() > 4]
chem_df_drop_20_29 = chem_df_drop_20_29.drop_duplicates()
chem_df_drop_20_29.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 318 entries, 0 to 999
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   subreddit    318 non-null    object
 1   selftext     318 non-null    object
 2   title        318 non-null    object
 3   created_utc  318 non-null    int64 
dtypes: int64(1), object(3)
memory usage: 12.4+ KB


In [76]:
chem_df_30_39 = get_many_requests('chemistry', 31, 39)
chem_df_drop_30_39 = chem_df_30_39.loc[chem_df_30_39['selftext'].str.split().str.len() > 4]
chem_df_drop_30_39 = chem_df_drop_30_39.loc[chem_df_drop_30_39['title'].str.split().str.len() > 4]
chem_df_drop_30_39 = chem_df_drop_30_39.drop_duplicates()
chem_df_drop_30_39.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 342 entries, 0 to 898
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   subreddit    342 non-null    object
 1   selftext     342 non-null    object
 2   title        342 non-null    object
 3   created_utc  342 non-null    int64 
dtypes: int64(1), object(3)
memory usage: 13.4+ KB


In [77]:
chem_df_40_49 = get_many_requests('chemistry', 40, 49)
chem_df_drop_40_49 = chem_df_40_49.loc[chem_df_40_49['selftext'].str.split().str.len() > 4]
chem_df_drop_40_49 = chem_df_drop_40_49.loc[chem_df_drop_40_49['title'].str.split().str.len() > 4]
chem_df_drop_40_49 = chem_df_drop_40_49.drop_duplicates()
chem_df_drop_40_49.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 381 entries, 0 to 998
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   subreddit    381 non-null    object
 1   selftext     381 non-null    object
 2   title        381 non-null    object
 3   created_utc  381 non-null    int64 
dtypes: int64(1), object(3)
memory usage: 14.9+ KB


In [79]:
chem_df_50_59 = get_many_requests('chemistry', 51, 59)
chem_df_drop_50_59 = chem_df_50_59.loc[chem_df_50_59['selftext'].str.split().str.len() > 4]
chem_df_drop_50_59 = chem_df_drop_50_59.loc[chem_df_drop_50_59['title'].str.split().str.len() > 4]
chem_df_drop_50_59 = chem_df_drop_50_59.drop_duplicates()
chem_df_drop_50_59.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 382 entries, 0 to 898
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   subreddit    382 non-null    object
 1   selftext     382 non-null    object
 2   title        382 non-null    object
 3   created_utc  382 non-null    int64 
dtypes: int64(1), object(3)
memory usage: 14.9+ KB


In [80]:
chem_df = pd.concat([chem_df_drop_0_10, 
                     chem_df_drop_10_19, 
                     chem_df_drop_20_29, 
                     chem_df_drop_30_39,
                     chem_df_drop_40_49,
                     chem_df_drop_50_59
                    ], ignore_index=True)

In [81]:
chem_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1980 entries, 0 to 1979
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   subreddit    1980 non-null   object
 1   selftext     1980 non-null   object
 2   title        1980 non-null   object
 3   created_utc  1980 non-null   int64 
dtypes: int64(1), object(3)
memory usage: 62.0+ KB


In [82]:
df_clean = pd.concat([phys_df, chem_df], ignore_index=True)

In [83]:
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3813 entries, 0 to 3812
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   subreddit    3813 non-null   object
 1   selftext     3813 non-null   object
 2   title        3813 non-null   object
 3   created_utc  3813 non-null   int64 
dtypes: int64(1), object(3)
memory usage: 119.3+ KB


In [52]:
phys_df_drop.info()
#lost a lot of rows but that's ok.

<class 'pandas.core.frame.DataFrame'>
Int64Index: 274 entries, 0 to 999
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   subreddit    274 non-null    object
 1   selftext     274 non-null    object
 2   title        274 non-null    object
 3   created_utc  274 non-null    int64 
dtypes: int64(1), object(3)
memory usage: 10.7+ KB


In [None]:
chem_df = get_many_requests('chemistry')

In [None]:
#keep only the rows that have more than 4 words in each column
chem_df_drop = chem_df.loc[chem_df['selftext'].str.split().str.len() > 4]
chem_df_drop = chem_df_drop.loc[chem_df_drop['title'].str.split().str.len() > 4]
chem_df_drop = chem_df_drop.drop_duplicates()
chem_df_drop.head()

In [19]:
chem_df_drop.info()
#lost a lot of rows but that's ok.

<class 'pandas.core.frame.DataFrame'>
Int64Index: 301 entries, 0 to 998
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   subreddit    301 non-null    object
 1   selftext     301 non-null    object
 2   title        301 non-null    object
 3   created_utc  301 non-null    int64 
dtypes: int64(1), object(3)
memory usage: 11.8+ KB


In [None]:
#save dataframe as a csv


## Using Pushshift
Video notes:
- 

In [24]:
url = 'https://api.pushshift.io/reddit/search/submission'
#this api is to get ALL of the posts of just reddit in general
# want to narrow it down to just the subreddit that we're interested in

In [38]:
params = {
    'subreddit': 'physics',
    'size': 500,
    #'after': '4d' #from posts that were created 4 days ago 
    #'before': '2d' #until posts that were created 2 days ago 
}

In [39]:
res = requests.get(url, params)

In [40]:
res.status_code

200

In [41]:
data = res.json()

In [42]:
posts = data['data'] #list of the 25 most recent posts
len(posts)
#by default get 25 posts back from pushshift api

#WHY AM I ONLY GETTING 100

100

- if you want more than 25 posts, set the parameter 'size' differently. 
- can go up to 500
- Goal:
    - take in texts from two different subreddits
    - out of the two subreddits we choose, we want to build out a model that predicts the subreddit that a post is coming from.
- Need a lot of data to build a robust model

In [43]:
posts[-1]

{'all_awardings': [],
 'allow_live_comments': False,
 'author': 'here_for_frenchmemes',
 'author_flair_css_class': None,
 'author_flair_richtext': [],
 'author_flair_text': None,
 'author_flair_type': 'text',
 'author_fullname': 't2_5fwkfpyv',
 'author_patreon_flair': False,
 'author_premium': False,
 'awarders': [],
 'can_mod_post': False,
 'contest_mode': False,
 'created_utc': 1601825709,
 'domain': 'self.Physics',
 'full_link': 'https://www.reddit.com/r/Physics/comments/j50ukz/conservation_of_angular_rotation/',
 'gildings': {},
 'id': 'j50ukz',
 'is_crosspostable': True,
 'is_meta': False,
 'is_original_content': False,
 'is_reddit_media_domain': False,
 'is_robot_indexable': True,
 'is_self': True,
 'is_video': False,
 'link_flair_background_color': '',
 'link_flair_richtext': [],
 'link_flair_text_color': 'dark',
 'link_flair_type': 'text',
 'locked': False,
 'media_metadata': {'rnc0wkvzj3r51': {'e': 'Image',
   'id': 'rnc0wkvzj3r51',
   'm': 'image/png',
   'p': [{'u': 'https:/

In [21]:
df = pd.DataFrame(posts)

In [22]:
df.head()

Unnamed: 0,all_awardings,allow_live_comments,author,author_flair_css_class,author_flair_richtext,author_flair_text,author_flair_type,author_fullname,author_patreon_flair,author_premium,...,preview,thumbnail_height,thumbnail_width,url_overridden_by_dest,removed_by_category,media_metadata,media,media_embed,secure_media,secure_media_embed
0,[],False,AsenseIsGay,,[],,text,t2_63ayn475,False,False,...,,,,,,,,,,
1,[],False,scienceisfun112358,,[],,text,t2_2x6wb893,False,False,...,"{'enabled': False, 'images': [{'id': '-hKcdVcU...",73.0,140.0,https://www.inverse.com/mind-body/how-the-brai...,,,,,,
2,[],False,Zenblendman,,[],,text,t2_ub4z8,False,False,...,"{'enabled': False, 'images': [{'id': 'pmPKbVU9...",140.0,140.0,https://v.redd.it/ev8oeln7l0r51,,,,,,
3,[],False,k10forgotten,,[],,text,t2_60bj0,False,False,...,"{'enabled': False, 'images': [{'id': 'kEku63cQ...",56.0,140.0,https://www.sciencealert.com/physicists-push-t...,moderator,,,,,
4,[],False,EclipseThing,,[],,text,t2_2b4mrsos,False,False,...,,,,,,,,,,


Columns of interest: 
- What subreddit did it come from?
- description of each post
- title

These are the three columns we'll start with for our NLP model to predict the 'subreddit' column

In [23]:
df[['subreddit', 'selftext', 'title']].head()

Unnamed: 0,subreddit,selftext,title
0,Physics,is there work done when youre fishing?,"hey guys, question"
1,Physics,,How the brain creates the experience of time —...
2,Physics,,Have you ever seen behind someone while lookin...
3,Physics,,Physicists Have Successfully Connected Two Lar...
4,Physics,I don't know how to accurately ask this questi...,A physics/math question


Notes:
- start with 1000 posts from two subreddits
- go back and get another 500
    - problem: if we rerun the code, we'll get the same 500 posts
- Use the 'before' and 'after' parameters
    - give it an integer that represents time

In [None]:
#if you want 500 posts be


In [None]:
#example

In [4]:
#The url given below calls for the most recent 1000 comments from threads on r/AskMen.
url = "https://api.pushshift.io/reddit/search/comment/?subreddit=askmen&sort=des&size=1000"

In [5]:
headers = {'User-agent': 'eamonious'}
res = requests.get(url, headers=headers)
res.status_code

200

In [6]:
json = res.json()
comments = pd.DataFrame(json['data'])

In [7]:
comments.columns

Index(['all_awardings', 'associated_award', 'author',
       'author_flair_background_color', 'author_flair_css_class',
       'author_flair_richtext', 'author_flair_template_id',
       'author_flair_text', 'author_flair_text_color', 'author_flair_type',
       'author_fullname', 'author_patreon_flair', 'author_premium', 'awarders',
       'body', 'collapsed_because_crowd_control', 'comment_type',
       'created_utc', 'gildings', 'id', 'is_submitter', 'link_id', 'locked',
       'no_follow', 'parent_id', 'permalink', 'retrieved_on', 'score',
       'send_replies', 'stickied', 'subreddit', 'subreddit_id',
       'top_awarded_type', 'total_awards_received', 'treatment_tags', 'edited',
       'distinguished'],
      dtype='object')

In [8]:
posts = json['data'] #list of the 25 most recent posts
len(posts)

100

In [9]:
comments.head()

Unnamed: 0,all_awardings,associated_award,author,author_flair_background_color,author_flair_css_class,author_flair_richtext,author_flair_template_id,author_flair_text,author_flair_text_color,author_flair_type,...,score,send_replies,stickied,subreddit,subreddit_id,top_awarded_type,total_awards_received,treatment_tags,edited,distinguished
0,[],,timothy53,,,[],,,,text,...,1,True,False,AskMen,t5_2s30g,,0,[],,
1,[],,TheIntrepid1,,,[],,,,text,...,2,True,False,AskMen,t5_2s30g,,0,[],,
2,[],,weirdhairkid,#eb3305,female,"[{'e': 'text', 't': 'Female'}]",f17b2dd0-cd16-11e1-b564-12313d051e91,Female,light,richtext,...,1,True,False,AskMen,t5_2s30g,,0,[],1602116000.0,
3,[],,IwantmyMTZ,,,[],,,,text,...,1,True,False,AskMen,t5_2s30g,,0,[],,
4,[],,the-wheel-deal,,,[],,,,text,...,1,True,False,AskMen,t5_2s30g,,0,[],,


In [10]:
#Removes everything but the features we are interested in.
comments = comments[['body','created_utc','id','parent_id','score','subreddit']]

In [11]:
comments.head()

Unnamed: 0,body,created_utc,id,parent_id,score,subreddit
0,"A few years ahead of you. When I turned 18, 9/...",1602115942,g824n32,t1_g80gfh1,1,AskMen
1,Because that’s how it was when they were our a...,1602115935,g824mmt,t1_g81fqxa,2,AskMen
2,Yes yes♥️ I'm just doing what I'd like to see ...,1602115934,g824mli,t1_g81mpbl,1,AskMen
3,What is missing is decorum. Very badly.,1602115912,g824l9r,t1_g823nal,1,AskMen
4,They were but when we coloreds started getting...,1602115910,g824l4r,t1_g81tspx,1,AskMen


# another test

In [12]:
#Creates the initial dataframe 
#1000 most recent comments at present time (1545243580), filtered to first-tier only
url = "https://api.pushshift.io/reddit/search/comment/?subreddit=askmen&before=1545243580&sort=des&size=1000"
headers = {'User-agent': 'eamonious'}
res = requests.get(url, headers=headers)
json = res.json()
commentsm = pd.DataFrame(json['data'])
commentsm = commentsm[['body','created_utc','id','parent_id','score','subreddit']]
#Filters for first-tier comments
commentsm['parent_id'] = commentsm['parent_id'].map(lambda x: x if 't3_' in x else 0)
commentsm = commentsm[commentsm['parent_id']!=0]
#Gets rid of mod-removed comments
commentsm = commentsm[commentsm['body']!='[removed]']

#Loops backward over 12 day intervals, adding the 1000 most recent comments prior to each timepoint,
#filtered to first-tier only
for i in range(1,80):
    url = "https://api.pushshift.io/reddit/search/comment/?subreddit=askmen&before={}&sort=des&size=1000".format(1545243580 - i*1036800)
    headers = {'User-agent': 'eamonious'}
    res = requests.get(url, headers=headers)
    json = res.json()
    commentbloc = pd.DataFrame(json['data'])
    commentbloc = commentbloc[['body','created_utc','id','parent_id','score','subreddit']]
    commentbloc['parent_id'] = commentbloc['parent_id'].map(lambda x: x if 't3_' in x else 0)
    commentbloc = commentbloc[commentbloc['parent_id']!=0]
    commentbloc = commentbloc[commentbloc['body']!='[removed]']
    commentsm = pd.concat([commentsm, commentbloc], ignore_index=True)

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [None]:
len(commentsm)