# # Scraping comment data from r/wallstreetbets using Pushshift
### modified code from https://github.com/shergreen/wallstreetbets_sentiment_analysis

### Check the comments dataset scraped using this python file here: https://www.kaggle.com/ssliao/wallstreetbets-comments

# In [2]:
import requests
import json
import re
import time
import math
from datetime import datetime
import pandas as pd
from os import path
import string
import datetime as dt

# slightly edited version of code presented here:
# https://www.osrsbox.com/blog/2019/03/18/watercooler-scraping-an-entire-subreddit-2007scape/

# Base URL of the Pushshift Reddit API; fetchObjects appends "/search/<type>" to it.
PUSHSHIFT_REDDIT_URL = "https://api.pushshift.io/reddit"

def get_date(created):
    """Turn a Unix timestamp into a naive ``datetime`` in local time.

    Args:
        created: Seconds since the epoch, or a falsy value (``None``, ``0``).

    Returns:
        The corresponding ``datetime``, or ``None`` when ``created`` is falsy.
    """
    if not created:
        return None
    return dt.datetime.fromtimestamp(created)

# Translation table that deletes every character in string.punctuation.
# Built once at import time so each call is a single C-level pass.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def remove_punctuations(text):
    """Return ``text`` as a string with all ASCII punctuation removed.

    Args:
        text: Any value; it is converted with ``str()`` first, so non-string
            input (numbers, ``None``, NaN) yields its string form minus
            punctuation.

    Returns:
        The stripped string.
    """
    return str(text).translate(_PUNCTUATION_TABLE)

def fetchObjects(**kwargs):
    """Query the Pushshift search endpoint and return one page of results.

    Every keyword argument is forwarded as a query parameter on top of the
    defaults (ascending sort on ``created_utc``, page size 100). The ``type``
    argument additionally selects the endpoint: ``"submission"``
    (case-insensitive) queries ``/search/submission``; anything else, or no
    ``type`` at all, queries ``/search/comment``.

    Returns:
        The entries of the response's ``data`` list, sorted ascending by
        their base-36 ``id``. An empty list is returned when the server
        answers with a status other than 200, so callers can always iterate
        over the result.

    Raises:
        requests.exceptions.RequestException: on connection problems or when
            the server does not answer within the timeout.
    """
    # Default parameters for the API query; caller-supplied values win
    params = {
        "sort_type": "created_utc",
        "sort": "asc",
        "size": 100,
    }
    params.update(kwargs)

    # Print API query parameters
    print(params)

    # Pick the endpoint: "comment" unless "submission" was requested.
    # (Named `endpoint` rather than `type` to avoid shadowing the builtin.)
    endpoint = "comment"
    if str(kwargs.get("type", "")).lower() == "submission":
        endpoint = "submission"

    # Perform the API request; without a timeout a stalled connection would
    # block the scraper forever.
    r = requests.get(
        PUSHSHIFT_REDDIT_URL + "/search/" + endpoint,
        params=params,
        timeout=30,
    )
    print(r)

    # Anything but 200 yields no data; return an empty list instead of None
    # so that the caller's loop does not fail with a TypeError.
    if r.status_code != 200:
        print("Request failed with status", r.status_code, "- returning no objects")
        return []

    response = json.loads(r.text)
    data = response.get("data", [])
    # Ids are base-36 strings; sort them numerically
    return sorted(data, key=lambda entry: int(entry["id"], 36))

def extract_reddit_data(**kwargs):
    """Page through Pushshift results and append them to a CSV file.

    All keyword arguments are forwarded to ``fetchObjects``. ``subreddit``
    and ``type`` are required because they also form the output file name
    ``<subreddit>_<type>.csv``.

    If the file already exists and holds rows, scraping resumes from its
    newest ``created_utc``; otherwise the file is created with a header row
    and scraping starts at 2020-02-01 (local time). Each page is fetched with
    ``after=max_created_utc``; objects whose id is not greater than the
    largest id seen so far are skipped, and objects whose author and body are
    both ``'[deleted]'`` are not written. Punctuation is stripped from
    ``body`` before each page is appended to the file. The loop ends when a
    page contains no unseen object.

    Raises:
        KeyError: if ``subreddit`` or ``type`` is missing from ``kwargs``.
    """
    # Start timestamp: 2020-02-01 00:00 (naive datetime, i.e. local time)
    max_created_utc = math.floor(datetime(2020, 2, 1).timestamp())
    max_id = 0
    # Features kept in the output file
    columns = ['author', 'author_created_utc', 'author_flair_css_class',
       'author_flair_text', 'author_fullname', 'body', 'can_gild',
       'controversiality', 'created_utc', 'distinguished', 'gilded', 'id',
       'link_id', 'nest_level', 'parent_id', 'reply_delay', 'retrieved_on',
       'score', 'stickied', 'subreddit', 'subreddit_id', 'mod_removed',
       'edited', 'user_removed']

    # Output CSV file, named after the subreddit and the object type
    fn = kwargs['subreddit'] + "_" + kwargs['type'] + ".csv"
    if path.exists(fn):
        existing_data = pd.read_csv(fn)[columns]
        # A file holding only the header row has nothing to resume from
        # (and idxmax() raises on an empty column).
        if not existing_data.empty:
            most_recent = existing_data['created_utc'].idxmax()
            newest_value = existing_data['created_utc'][most_recent]
            recent_created_utc = int(newest_value)
            if recent_created_utc > max_created_utc:
                # Several saved rows may share the newest timestamp and
                # idxmax() only points at the first of them; resume from the
                # largest id among them so none is written a second time.
                same_second = existing_data['created_utc'] == newest_value
                max_id = max(int(saved_id, 36)
                             for saved_id in existing_data.loc[same_second, 'id'])
                max_created_utc = recent_created_utc - 1
    else:
        # Create the file with just the header row
        pd.DataFrame(columns=columns).to_csv(fn)

    while True:
#     while max_created_utc <= math.floor(datetime(2020,6,1).timestamp()):
        nothing_processed = True
        # Fetch the next page; treat a missing result (None) as an empty page
        objects = fetchObjects(**kwargs, after=max_created_utc) or []
        rows = []
        # Loop over the returned data (fetchObjects sorts it by id)
        for item in objects:
            item_id = int(item['id'], 36)
            if item_id <= max_id:
                # Already handled in an earlier page or an earlier run
                continue
            nothing_processed = False
            max_id = item_id
            max_created_utc = max(max_created_utc, item['created_utc'])
            # Every unseen object is recorded, whatever its timestamp:
            # objects sharing a second with an earlier one in the same page
            # would otherwise be lost, since max_id has already moved past
            # them.
            if item.get('author') == '[deleted]' and item.get('body') == '[deleted]':
                # remove deleted comments
                continue
            rows.append(item)

        # Build the page's frame in one go (DataFrame.append no longer exists
        # in pandas 2.x); dtype=object keeps the values as they were received
        # so the CSV text is not altered by numeric dtype inference.
        new_df = pd.DataFrame(rows, columns=columns, dtype=object)

        print('The newest comments are at: ', get_date(max_created_utc))
        new_df['body'] = new_df['body'].apply(remove_punctuations)
        print('Extracted ', new_df.shape[0], 'comments with ', new_df.shape[1], ' features.')
        # append the new dataframe to the csv file
        new_df.to_csv(fn, mode='a', header=False)

        # Exit if nothing happened
        if nothing_processed:
            return
        # Step back one second so the next page includes the newest second
        # seen so far; objects already written are filtered out by id.
        max_created_utc -= 1

        # Sleep a little before the next request
        time.sleep(1)

# Start program by calling function with:
# 1) Subreddit specified
# 2) The type of data required (comment or submission)
# ex: extract_reddit_data(subreddit="wallstreetbets",type="comment")

# Useful code for removing abnormal rows when most_recent = existing_data['created_utc'].idxmax() fails:
# mask = pd.to_numeric(df['created_utc'], errors='coerce').isna()
# print(mask.sum())
# df.drop(df[mask == True].index, inplace = True)
# Only start scraping when this file is executed directly (script or notebook
# cell), not when it is imported for its helper functions.
if __name__ == "__main__":
    extract_reddit_data(subreddit="wallstreetbets", type="comment")