# Data Preparation
## Scraping with PushShift (Date period from Jan 2022 to Feb 15, 2022)

This is a very concise notebook that extracts posts from a list of subreddits and saves them to CSV.

In [3]:
import pandas as pd
import requests
import json
from datetime import datetime

In [2]:
# Subreddits to scrape. Each name is also embedded in the filename
# of the CSV that the resulting dataframe is saved to.
g_subreddit_list = [
    "keto",
    "nutrition",
]

In [3]:
def get_reddit_pushshift(from_, to_, subreddit_, list_, timeout_=30):
    '''
    Retrieve one page of submissions for a subreddit from the Pushshift API
    and append a trimmed-down dict for each post to list_.

    parameter from_ (start of the window in unix seconds, sent as `after`),
              to_ (end of the window in unix seconds, sent as `before`),
              subreddit_ (subreddit name),
              list_ (master list; posts are appended to it in place),
              timeout_ (seconds to wait for the HTTP response, default 30)

    return (list_, 1) when the page contained posts,
           (list_, 0) when the query returned no posts (caller should stop pulling)

    raises whatever requests raises for a failed request (e.g. HTTPError on a
           non-success status, a timeout error when timeout_ is exceeded),
           KeyError if the payload has no 'data' key or a post lacks one of
           the required fields
    '''

    # Construct the URL; size=1000 is the page size we ask for.
    # NOTE(review): an earlier comment said the default size is 100 - confirm
    # how many posts the API really returns per page.
    subreddit_url = (
        'https://api.pushshift.io/reddit/search/submission/?subreddit='
        + str(subreddit_)
        + '&size=1000&after=' + str(from_)
        + '&before=' + str(to_)
    )

    print("Extracting :", subreddit_url)

    # Request/extract. The timeout keeps a stalled connection from hanging the
    # notebook, and raise_for_status turns an HTTP error page into a clear
    # exception instead of a confusing JSON decode failure further down.
    req = requests.get(subreddit_url, timeout=timeout_)
    req.raise_for_status()

    # Convert to json
    posted_data = json.loads(req.text)
    posts = posted_data['data']

    if not posts:  # no more results
        return (list_, 0)

    for post in posts:
        # Keep only the fields we need. 'selftext' is the only optional one:
        # posts without it get the '<notext>' placeholder.
        record = {'created_utc': post['created_utc'],
                  'title': post['title'],
                  'selftext': post.get('selftext', '<notext>'),
                  'is_self': post['is_self'],
                  'score': post['score'],
                  'upvote_ratio': post['upvote_ratio'],
                  'num_comments': post['num_comments'],
                  'author': post['author'],
                  'is_original_content': post['is_original_content'],
                  'media_only': post['media_only'],
                  'subreddit': post['subreddit']
                  }

        list_.append(record)

    return (list_, 1)

In [4]:
# Go through our subreddit list, pull posts page by page, and save each
# subreddit's posts as a CSV.

for subredx in g_subreddit_list:

    post_list = []

    ####**** Change the dates here! ****####
    post_marker = 1640995200      # Jan 1, 2022 00:00:00 UTC
    end_created_utc = 1645259073  # Feb 19, 2022 08:24:33 UTC

    has_result = 1

    # Keep pulling pages until we hold at least 2000 posts or the API runs dry.
    # The last page can overshoot the threshold.
    # NOTE(review): an earlier note mentioned 2200 posts (target 2000 plus an
    # estimated 10% for data cleanup) - confirm which threshold is intended.
    while (len(post_list) < 2000 and has_result):

        (post_list, has_result) = get_reddit_pushshift(post_marker, end_created_utc, subredx, post_list)

        # An empty page means there is nothing newer to fetch. Stop before
        # touching post_list[-1], which would raise IndexError if the very
        # first page came back empty.
        if not has_result:
            break

        # Next page starts after the last post we received.
        post_marker = post_list[-1]['created_utc']

    df_subr = pd.DataFrame(post_list)

    # Nothing was returned for this subreddit: there is no 'created_utc'
    # column to summarise and no point writing an empty CSV.
    if df_subr.empty:
        print("No posts returned for subreddit:", subredx)
        continue

    df_subr.to_csv(f'../datasets/subr_{subredx}.csv', index=False)

    print("Shape:", df_subr.shape)
    # datetime.fromtimestamp renders these in the machine's local timezone.
    print("Start Date :", datetime.fromtimestamp(df_subr['created_utc'].min()))
    print("End Date   :", datetime.fromtimestamp(df_subr['created_utc'].max()))

Extracting : https://api.pushshift.io/reddit/search/submission/?subreddit=keto&size=1000&after=1640995200&before=1645259073
Extracting : https://api.pushshift.io/reddit/search/submission/?subreddit=keto&size=1000&after=1641148245&before=1645259073
Extracting : https://api.pushshift.io/reddit/search/submission/?subreddit=keto&size=1000&after=1641263551&before=1645259073
Extracting : https://api.pushshift.io/reddit/search/submission/?subreddit=keto&size=1000&after=1641391005&before=1645259073
Extracting : https://api.pushshift.io/reddit/search/submission/?subreddit=keto&size=1000&after=1641498254&before=1645259073
Extracting : https://api.pushshift.io/reddit/search/submission/?subreddit=keto&size=1000&after=1641661631&before=1645259073
Extracting : https://api.pushshift.io/reddit/search/submission/?subreddit=keto&size=1000&after=1641771032&before=1645259073
Extracting : https://api.pushshift.io/reddit/search/submission/?subreddit=keto&size=1000&after=1641890253&before=1645259073
Extracti

***end of code***

In [5]:
# Show the dtype pandas inferred for each column of df_subr, which at this
# point holds the posts of the last subreddit processed by the loop.
df_subr.dtypes

created_utc              int64
title                   object
selftext                object
is_self                   bool
score                    int64
upvote_ratio           float64
num_comments             int64
author                  object
is_original_content       bool
media_only                bool
subreddit               object
dtype: object