# Data Collection and Cleaning

## Import libraries

In [204]:
import requests
import numpy as np
import pandas as pd
import time

## Data Collection

Make a dataframe for each of the subreddits, using text extracted from 50 posts, and add a 'comments' column with comments corresponding to each post.

In [2]:
# Ask the pushshift.io submission-search endpoint for 50 non-video posts from the
# "stroke" subreddit and keep them, one row per post, in the `stroke` dataframe.
base_url = 'https://api.pushshift.io/reddit/submission/search/?'
params = {"subreddit": "stroke", "size": 50, "is_video": "false"}
res = requests.get(base_url, params)

if res.status_code == 200:
    stroke = pd.DataFrame(res.json()['data'])
    # Throw away posts whose body text is shorter than two characters.
    too_short = stroke['selftext'].str.len() < 2
    stroke.drop(stroke[too_short].index, axis=0, inplace=True)
else:
    print(f'Error Code: {res.status_code}')

In [3]:
# Ask the pushshift.io submission-search endpoint for 50 non-video posts from the
# "migraine" subreddit and keep them, one row per post, in the `migraine` dataframe.
base_url = 'https://api.pushshift.io/reddit/submission/search/?'
params = {"subreddit": "migraine", "size": 50, "is_video": "false"}
res = requests.get(base_url, params)

if res.status_code == 200:
    migraine = pd.DataFrame(res.json()['data'])
    # Throw away posts whose body text is shorter than two characters.
    too_short = migraine['selftext'].str.len() < 2
    migraine.drop(migraine[too_short].index, axis=0, inplace=True)
else:
    print(f'Error Code: {res.status_code}')

In [20]:
# Request "comment" data from subreddit "stroke" using the pushshift.io API, and append each
# comment body to the 'comments' column of the submission (row of the 'stroke' dataframe)
# whose id equals the comment's parent_id without its first three characters.
params = {
            "subreddit" : "stroke",
            "size" : 50
            }
base_url = 'https://api.pushshift.io/reddit/comment/search/?'
res = requests.get(base_url,params)

if res.status_code != 200:
    print(f'Error Code: {res.status_code}')
else:
    # Decode the payload once instead of re-parsing the JSON on every access.
    comments = res.json()['data']

    stroke['comments'] = ''
    for comment in comments:
        # Bodies of two characters or fewer are skipped.
        if len(comment['body']) > 2:
            # parent_id[3:] drops a three-character prefix (presumably the "t3_"-style
            # type tag - verify); ids that match no submission row are simply ignored.
            stroke.loc[stroke['id'] == comment['parent_id'][3:], 'comments'] += comment['body']

    # Oldest timestamp on this page, used later as the paging cursor. It is only set after
    # a successful request, and it no longer assumes that exactly 50 comments came back.
    if comments:
        utc_stroke_com = min(comment['created_utc'] for comment in comments)

In [21]:
# Request "comment" data from subreddit "migraine" using the pushshift.io API, and append each
# comment body to the 'comments' column of the submission (row of the 'migraine' dataframe)
# whose id equals the comment's parent_id without its first three characters.
params = {
            "subreddit" : "migraine",
            "size" : 50
            }
base_url = 'https://api.pushshift.io/reddit/comment/search/?'
res = requests.get(base_url,params)

if res.status_code != 200:
    print(f'Error Code: {res.status_code}')
else:
    # Decode the payload once instead of re-parsing the JSON on every access.
    comments = res.json()['data']

    migraine['comments'] = ''
    for comment in comments:
        # Bodies of two characters or fewer are skipped.
        if len(comment['body']) > 2:
            # parent_id[3:] drops a three-character prefix (presumably the "t3_"-style
            # type tag - verify); ids that match no submission row are simply ignored.
            migraine.loc[migraine['id'] == comment['parent_id'][3:], 'comments'] += comment['body']

    # Oldest timestamp on this page, used later as the paging cursor. It is only set after
    # a successful request, and it no longer assumes that exactly 50 comments came back.
    if comments:
        utc_migraine_com = min(comment['created_utc'] for comment in comments)

Function to get up to 5,000 posts (100 requests of 50 posts each) in reverse chronological order from a subreddit, with a 30s delay after every request.

In [78]:
def get_submissions(subreddit, df, pages=100, page_size=50, delay=30):
    """Page backwards through a subreddit's submissions and append them to ``df``.

    Starting from the smallest ``created_utc`` already present in ``df``, up to
    ``pages`` requests are sent to the pushshift.io submission-search endpoint,
    each asking for ``page_size`` non-video posts created before the current
    cursor. Posts whose ``selftext`` is shorter than two characters are dropped.

    Parameters
    ----------
    subreddit : str
        Name of the subreddit to query.
    df : pandas.DataFrame
        Submissions collected so far; must contain a ``created_utc`` column.
        It is not modified.
    pages : int, optional
        Maximum number of requests to send (default 100).
    page_size : int, optional
        Value of the ``size`` request parameter (default 50).
    delay : float, optional
        Seconds to sleep after every request (default 30).

    Returns
    -------
    pandas.DataFrame
        ``df`` followed by the newly fetched rows, concatenated with
        ``sort=True`` and re-indexed from 0.

    Notes
    -----
    A non-200 response is reported with ``print`` and the same cursor is
    retried on the next iteration. Paging stops early once the API returns an
    empty page.
    """
    base_url = 'https://api.pushshift.io/reddit/submission/search/?'

    # Cursor: only posts created before this timestamp are requested.
    date = df['created_utc'].min()

    # Pages are collected here and concatenated once at the end, instead of
    # re-copying the ever-growing frame on every iteration.
    frames = [df]

    for _ in range(pages):
        params = {
            "subreddit": subreddit,
            "size": page_size,
            "before": date,
            "is_video": "false",
        }
        res = requests.get(base_url, params)

        if res.status_code != 200:
            print(f'Error Code: {res.status_code}')
        else:
            page = pd.DataFrame(res.json()['data'])
            if page.empty:
                # Nothing older is left; an empty frame has no 'selftext'
                # column, so filtering it would raise KeyError.
                break

            # Advance the cursor from the *unfiltered* page. Taking it from the
            # kept rows only would leave it stuck whenever every post on a page
            # is filtered out, so the same page would be requested again.
            date = min(date, page['created_utc'].min())

            if 'selftext' in page.columns:
                page = page.drop(page[page['selftext'].str.len() < 2].index, axis=0)
            if not page.empty:
                frames.append(page)

        time.sleep(delay)

    return pd.concat(frames, sort=True).reset_index(drop=True)
    

The calls below are commented out so that the data is not collected again every time the notebook is run.

In [16]:
#stroke = get_submissions('stroke', stroke)

In [7]:
#migraine = get_submissions('migraine', migraine)

Function to get up to 5,000 comments (100 requests of 50 comments each) in reverse chronological order from a subreddit, with a 30s delay after every request.

In [96]:
def get_comments(subreddit, df, date, pages=100, page_size=50, delay=30):
    """Page backwards through a subreddit's comments and attach them to ``df``.

    Up to ``pages`` requests are sent to the pushshift.io comment-search
    endpoint, each asking for ``page_size`` comments created before the current
    cursor. Every comment body longer than two characters is appended (with no
    separator) to the ``comments`` cell of the row whose ``id`` equals the
    comment's ``parent_id`` without its first three characters; comments that
    match no row are ignored.

    Parameters
    ----------
    subreddit : str
        Name of the subreddit to query.
    df : pandas.DataFrame
        Submissions with an ``id`` column. Modified in place: a ``comments``
        column is created if missing, and missing values in it become ``''``.
    date : int
        Starting cursor; only comments created before it are requested. The
        argument is always honoured (it is no longer replaced by the notebook
        globals ``utc_stroke_com`` / ``utc_migraine_com``).
    pages : int, optional
        Maximum number of requests to send (default 100).
    page_size : int, optional
        Value of the ``size`` request parameter (default 50).
    delay : float, optional
        Seconds to sleep after every request (default 30).

    Returns
    -------
    tuple
        ``(df, date)`` where ``df`` is the same object that was passed in and
        ``date`` is the oldest ``created_utc`` seen, i.e. the cursor with which
        a later call can resume.

    Notes
    -----
    A non-200 response is reported with ``print`` and the same cursor is
    retried on the next iteration. Paging stops early once the API returns an
    empty page.
    """
    base_url = 'https://api.pushshift.io/reddit/comment/search/?'

    # Rows added after the 'comments' column was first created hold NaN there,
    # and NaN + str stays NaN, so such rows could never receive any text.
    if 'comments' in df.columns:
        df['comments'] = df['comments'].fillna('')
    else:
        df['comments'] = ''

    for _ in range(pages):
        params = {
            "subreddit": subreddit,
            "size": page_size,
            "before": date,
        }
        res = requests.get(base_url, params)

        if res.status_code != 200:
            print(f'Error Code: {res.status_code}')
        else:
            # Decode once; the cursor is only moved after a successful request.
            data = res.json()['data']
            if not data:
                break

            for comment in data:
                body = comment['body']
                if len(body) > 2:
                    # parent_id[3:] drops a three-character prefix (presumably
                    # the "t3_"-style type tag - verify against the API).
                    df.loc[df['id'] == comment['parent_id'][3:], 'comments'] += body

            # Oldest timestamp on the page, without assuming the page holds
            # exactly 50 entries.
            date = min(comment['created_utc'] for comment in data)

        time.sleep(delay)

    return df, date
            

The calls below are commented out so that the data is not collected again every time the notebook is run.

In [95]:
#stroke, utc_stroke_com = get_comments('stroke', stroke, utc_stroke_com)

In [None]:
#migraine, utc_migraine_com = get_comments('migraine', migraine, utc_migraine_com)

## Data Cleaning

Removing all the posts which have been deleted or removed from each subreddit.

In [166]:
# Drop every post whose body is exactly the '[deleted]' or '[removed]' placeholder,
# then renumber the rows of each dataframe from 0.
placeholders = ['[deleted]', '[removed]']

for posts in (stroke, migraine):
    gone = posts['selftext'].isin(placeholders)
    posts.drop(posts[gone].index, axis=0, inplace=True)
    posts.reset_index(drop=True, inplace=True)

Creating a dataframe for each subreddit that holds only the extracted text, adding a label that marks which subreddit each row came from, then concatenating both dataframes and saving the result to a CSV file.

In [167]:
# Keep only the text columns. The explicit .copy() makes each result an independent
# frame, so adding the 'label' column below does not raise SettingWithCopyWarning.
df_stroke = stroke[['title', 'selftext', 'comments']].copy()
df_migraine = migraine[['title', 'selftext', 'comments']].copy()

# Class label marking the source subreddit: 1 = stroke, 0 = migraine.
df_stroke['label'] = 1
df_migraine['label'] = 0

In [168]:
# Stack the labelled stroke rows on top of the labelled migraine rows.
df = pd.concat(objs=[df_stroke, df_migraine], axis=0)

In [230]:
# Blank out the '[deleted]' / '[removed]' placeholders wherever they occur in the text.
# The 'comments' cells are concatenations of several comment bodies, so a placeholder
# usually sits *inside* a longer string; an exact whole-cell match would never touch it.
# The brackets are escaped because the pattern is a regular expression.
df.replace(to_replace=r'\[(?:deleted|removed)\]', value=' ', regex=True, inplace=True)

In [231]:
# Renumber the combined rows 0..n-1, discarding the per-subreddit index values.
df = df.reset_index(drop=True)

In [232]:
# Write the combined, labelled text data to disk without the row index.
df.to_csv(path_or_buf='../data/data.csv', index=False)

Statistics of the length (in characters) of the post titles, post bodies, and comments from each subreddit.

In [209]:
# Empty summary table: one column per subreddit, one row per length statistic.
stat_rows = ['title_mean', 'title_median',
             'text_mean', 'text_median',
             'comment_mean', 'comment_median']
stats = pd.DataFrame(index=stat_rows, columns=['stroke', 'migraine'])

In [210]:
# For each subreddit, fill its column with the mean and median character length of the
# title, the post body and the concatenated comments, in the row order of `stats`.
text_columns = ['title', 'selftext', 'comments']

for subreddit_name, frame in (('stroke', df_stroke), ('migraine', df_migraine)):
    column_values = []
    for text_column in text_columns:
        lengths = frame[text_column].str.len()
        column_values.append(lengths.mean())
        column_values.append(lengths.median())
    stats[subreddit_name] = column_values

In [211]:
# Display the summary table of title, post-body and comment lengths per subreddit.
stats

Unnamed: 0,stroke,migraine
title_mean,44.068367,40.625619
title_median,36.0,33.0
text_mean,1068.691914,688.842887
text_median,731.0,504.0
comment_mean,2648.077551,1514.763388
comment_median,1370.0,782.0
