## Imports

In [2]:
import pandas as pd
import numpy as np
import datetime as dt
import time
import requests
import json

## API- Data import

In [3]:
def query_pushshift(subreddit, kind='submission', skip=30, times=6, 
                    subfield = ['title', 'selftext', 'subreddit', 'created_utc', 'author', 'num_comments', 'score', 'is_self'],
                    comfields = ['body', 'score', 'created_utc']):

    stem = "https://api.pushshift.io/reddit/search/{}/?subreddit={}&size=500".format(kind, subreddit)
    mylist = []
    
    for x in range(1, times):
        
        URL = "{}&after={}d".format(stem, skip * x)
        print(URL)
        response = requests.get(URL)
        assert response.status_code == 200
        mine = response.json()['data']
        df = pd.DataFrame.from_dict(mine)
        mylist.append(df)
        time.sleep(2)
        
    full = pd.concat(mylist, sort=False)
    
    if kind == "submission":
        
        full = full[subfield]
        
        full = full.drop_duplicates()
        
        full = full.loc[full['is_self'] == True]
        
    def get_date(created):
        return dt.date.fromtimestamp(created)
    
    _timestamp = full["created_utc"].apply(get_date)
    
    full['timestamp'] = _timestamp

    print(full.shape)
    
    return full

## Exploring the Data 

In [4]:
df1=query_pushshift('love')

https://api.pushshift.io/reddit/search/submission/?subreddit=love&size=500&after=30d
https://api.pushshift.io/reddit/search/submission/?subreddit=love&size=500&after=60d
https://api.pushshift.io/reddit/search/submission/?subreddit=love&size=500&after=90d
https://api.pushshift.io/reddit/search/submission/?subreddit=love&size=500&after=120d
https://api.pushshift.io/reddit/search/submission/?subreddit=love&size=500&after=150d
(2476, 9)


In [5]:
df2=query_pushshift('AskReddit')

https://api.pushshift.io/reddit/search/submission/?subreddit=AskReddit&size=500&after=30d
https://api.pushshift.io/reddit/search/submission/?subreddit=AskReddit&size=500&after=60d
https://api.pushshift.io/reddit/search/submission/?subreddit=AskReddit&size=500&after=90d
https://api.pushshift.io/reddit/search/submission/?subreddit=AskReddit&size=500&after=120d
https://api.pushshift.io/reddit/search/submission/?subreddit=AskReddit&size=500&after=150d
(2499, 9)


In [6]:
df1.head()

Unnamed: 0,title,selftext,subreddit,created_utc,author,num_comments,score,is_self,timestamp
0,Does my boss like him in more of a romantic way?,I’m a girl and he is a guy. He’s not “technica...,love,1551837629,kristiesmerrill,1,1,True,2019-03-05
1,Love?,Sometimes they say that being single is more f...,love,1551844579,Kookies_Pizza,0,3,True,2019-03-05
2,LABOUR-INVESTIGATION,\n\nWe are one of the best and recognized lab...,love,1551849679,ravisharma396,0,1,True,2019-03-06
3,PRE MATRIMONIAL INVESTIGATIONS,\n\nMatrimonial dispute develop due to suspic...,love,1551850777,ravisharma396,0,1,True,2019-03-06
4,What is love?,,love,1551860392,Fathomlesssoul,11,5,True,2019-03-06


In [7]:
df2.head()

Unnamed: 0,title,selftext,subreddit,created_utc,author,num_comments,score,is_self,timestamp
0,What's the worst experience you have had with ...,,AskReddit,1551833827,broken-fingers-37,2,2,True,2019-03-05
1,What pets are low maintenance but still provid...,,AskReddit,1551833827,CraftyExpression,22,3,True,2019-03-05
2,What’s something you can comment to get instan...,,AskReddit,1551833829,JAWinks,10,0,True,2019-03-05
3,When is it ok to curse in front of children?,,AskReddit,1551833830,OnlyInMyDreams73,9,1,True,2019-03-05
4,What is the most badass fight scenario you can...,,AskReddit,1551833830,Cheesegratersuicide,5,4,True,2019-03-05


In [109]:
df=pd.concat([df1,df2])

In [110]:
df.head()

Unnamed: 0,title,selftext,subreddit,created_utc,author,num_comments,score,is_self,timestamp
0,Missed opportunities...,I want to take a moment to talk about missed o...,love,1551705499,Djjustinjames,4,13,True,2019-03-04
1,You ever wish you can marry you can marry your...,[removed],love,1551707553,expressnollytv,0,1,True,2019-03-04
2,You ever fall inlove with your own cousin and ...,[removed],love,1551708161,expressnollytv,0,1,True,2019-03-04
3,My Love is My Own,My love is my own.\n\nIt is most powerful and ...,love,1551712603,nursexoxo,1,4,True,2019-03-04
4,The day my stalker became the love of my life....,Things can change in an instant. In one blink ...,love,1551717644,bburgener,2,0,True,2019-03-04


In [111]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4975 entries, 0 to 499
Data columns (total 9 columns):
title           4975 non-null object
selftext        4975 non-null object
subreddit       4975 non-null object
created_utc     4975 non-null int64
author          4975 non-null object
num_comments    4975 non-null int64
score           4975 non-null int64
is_self         4975 non-null bool
timestamp       4975 non-null object
dtypes: bool(1), int64(3), object(5)
memory usage: 354.7+ KB


In [112]:
(df.selftext=='[removed]').sum()

985

In [113]:
(df1.selftext=='[removed]').sum()

446

In [114]:
(df2.selftext=='[removed]').sum()

539

## Saving Data to CSV

In [115]:
df.to_csv('./raw_df.csv')