In [6]:
import time
from datetime import datetime as dt

import numpy as np
import pandas as pd
import requests

In [7]:
# never truncate columns when a wide dataframe is displayed
pd.options.display.max_columns = None
# pd.options.display.max_rows = None

This notebook pulls data from two subreddits to feed into the classification model and does preliminary data cleaning. 

The two subreddits are:

* https://www.reddit.com/r/ModernWarfareII/
* https://www.reddit.com/r/overwatch2/

The data pull relies on the Pushshift API, which is documented here:

* https://github.com/pushshift/api

In [8]:
def get_reddit_submissions(subreddits: list, max_num: int, max_retries: int = 5,
                           retry_wait: float = 1.0, timeout: float = 30.0):
    """
    uses pushshift api to gather subreddit posts

    Each subreddit is paged through from the most recent post backwards by
    setting the `before` parameter to the oldest `created_utc` of the page
    that was just retrieved.

    Parameters
    ----------
    subreddits : list
        subreddit names to pull from
    max_num : int
        paging of a subreddit stops once at least this many posts have been
        gathered; whole pages are kept, so the result can overshoot it
    max_retries : int
        number of consecutive failed requests (non-200 status or a
        requests.RequestException) that are retried before giving up on the
        current subreddit
    retry_wait : float
        base pause in seconds before a retry; it is multiplied by the number
        of consecutive failures so repeated errors back off progressively
    timeout : float
        seconds a single request may take before it counts as failed

    Returns
    -------
    pd.DataFrame
        every retrieved page concatenated (each page keeps its own row
        index); an empty dataframe when nothing could be retrieved
    """
    base_url = "https://api.pushshift.io/reddit/search/submission"
    all_posts = []

    for subreddit in subreddits:
        params = {
            'subreddit': subreddit,
            'size': 1000 #doesn't appear to be working
        }

        count = 0 #keep track of posts/subreddit
        failures = 0 #consecutive failed requests for this subreddit

        while count < max_num:
            status = None
            try:
                res = requests.get(base_url, params=params, timeout=timeout)
                status = res.status_code
            except requests.RequestException as err:
                # keep what was already gathered instead of aborting the whole pull
                print(f'request failed: {err}')

            if status != 200:
                if status is not None:
                    print(f'status: {status}')
                failures += 1
                if failures > max_retries:
                    # a persistent outage must not spin this loop forever
                    print(f'giving up on {subreddit} after {failures} consecutive failed requests')
                    break
                time.sleep(retry_wait * failures)
                continue

            failures = 0
            posts = pd.DataFrame(res.json()['data'])

            if len(posts) == 0:
                break #break loop if request successful but nothing retrieved

            count += len(posts)
            all_posts.append(posts)

            #get sequential posts from most recent to least
            params['before'] = posts['created_utc'].min()
        print(f'scraped from {subreddit}: {count}')

    # pd.concat raises on an empty list, so return an empty frame instead
    if not all_posts:
        return pd.DataFrame()
    return pd.concat(all_posts)

# source: Devin Day

#### Overwatch 2 Data

In [10]:
# get overwatch posts (paging stops once at least 3,000 have been gathered)
# NOTE(review): the intro lists r/overwatch2 but this pulls r/overwatch - confirm which is intended
ow = get_reddit_submissions(['overwatch'], 3_000)

status: 504
scraped from overwatch: 3999


In [11]:
# look at minimum and maximum dates (parse the timestamp strings once, then reuse)
ow_timestamps = pd.to_datetime(ow['utc_datetime_str'])
(ow_timestamps.min(), ow_timestamps.max())

(Timestamp('2022-12-13 22:22:12'), Timestamp('2022-12-21 02:01:20'))

At the time of this project, the Pushshift API was undergoing maintenance, causing all the data to be between 11/15/2022 and 12/15/2022. (Note that the Overwatch pull shown above returned posts dated 12/13/2022 through 12/21/2022.)

In [None]:
# count posts whose body is neither the '[removed]' placeholder nor an empty string
has_body = ~ow['selftext'].isin(['[removed]', ''])
ow.loc[has_body, 'selftext'].count()

In [None]:
# create date column (month/day string taken from the post timestamp)
ow_created = pd.to_datetime(ow['utc_datetime_str'])
ow['date'] = ow_created.dt.strftime('%m/%d')

In [None]:
# save overwatch2 data
ow_csv_path = 'data/overwatch2_data_pull.csv'
ow.to_csv(ow_csv_path)

#### COD MW2 Data

In [None]:
# pull cod mw2 data (paging stops once at least 100,000 posts have been gathered)
cod = get_reddit_submissions(['ModernWarfareII'], 100_000)

In [None]:
# look at min and max dates (parse the timestamp strings once, then reuse)
cod_timestamps = pd.to_datetime(cod['utc_datetime_str'])
(cod_timestamps.min(), cod_timestamps.max())  # 11/15/2022 to 12/15/2022

In [None]:
# create date column (month/day string taken from the post timestamp)
cod_created = pd.to_datetime(cod['utc_datetime_str'])
cod['date'] = cod_created.dt.strftime('%m/%d')

In [None]:
# save cod data
cod_csv_path = 'data/cod_data_pull.csv'
cod.to_csv(cod_csv_path)