# Data Collection

In [1]:
# Import libraries
import requests
import pandas as pd
import time
import random

# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Define a function that scrapes roughly n posts from a subreddit via Pushshift (stepping the `after` window back 30 days per request by default) and returns them as a df

def scrap(subreddit, n, days = 30):
    """Collect submissions from a subreddit through the Pushshift search API.

    Sends at most ``n`` requests. Request ``i`` (1-based) asks for up to 100
    submissions using ``after=<days * i>d``. The loop stops as soon as the
    number of raw submissions gathered exceeds ``n``. A random 1-9 second
    pause is inserted between consecutive requests.

    Failed requests (network error, timeout, non-200 status, body that is not
    JSON or lacks a ``data`` entry) are skipped. ``KeyboardInterrupt`` is not
    swallowed, so a running scrape can still be interrupted.

    Parameters
    ----------
    subreddit : str
        Name of the subreddit to query.
    n : int
        Target number of submissions; it also caps the number of requests.
    days : int, default 30
        Step, in days, by which the ``after`` value grows on every request.

    Returns
    -------
    pandas.DataFrame
        De-duplicated frame with the columns ``subreddit``, ``title`` and
        ``selftext`` and a fresh 0..k-1 index. It can hold more than ``n``
        rows (the last batch is kept whole) or fewer (duplicates removed,
        requests failed). When nothing was retrieved, an empty frame with
        those three columns is returned. A column absent from every payload
        is filled with NaN instead of raising ``KeyError``.
    """
    base_url = 'https://api.pushshift.io/reddit/search/submission'
    features_of_interest = ['subreddit', 'title', 'selftext']

    # one DataFrame per successful, non-empty request
    posts = []
    # running count of raw (not yet de-duplicated) submissions
    total_scraped = 0

    for i in range(1, n + 1):
        # Pause between requests (random duration to imitate a human user).
        # Done at the top of the loop so that failed requests are throttled
        # too, instead of immediately hitting the API again.
        if i > 1:
            time.sleep(random.randint(1, 9))

        params = {'subreddit': subreddit, 'size': 100, 'after': f'{days * i}d'}

        # A single request per iteration; any request-level failure skips
        # this window rather than aborting the whole scrape.
        try:
            res = requests.get(base_url, params=params, timeout=30)
            if res.status_code != 200:
                continue
            extracted = res.json()['data']
        except (requests.RequestException, ValueError, KeyError, TypeError):
            continue

        batch = pd.DataFrame(extracted)
        if batch.empty:
            continue

        posts.append(batch)
        total_scraped += len(batch)

        # stop once more than n raw submissions have been collected
        if total_scraped > n:
            break

    # pd.concat raises on an empty list, so handle "nothing retrieved" here
    if not posts:
        return pd.DataFrame(columns=features_of_interest)

    return (
        pd.concat(posts, sort=False)
        # keep only the columns of interest; tolerate columns missing upstream
        .reindex(columns=features_of_interest)
        .drop_duplicates()
        .reset_index(drop=True)
    )

In [3]:
# Scrape both subreddits, aiming for 3000 submissions each
anxiety_df = scrap(subreddit='anxiety', n=3000)
depression_df = scrap(subreddit='depression', n=3000)

# Report how many submissions each scrape actually returned
print(f'Retrieved {len(anxiety_df)} submissions on \'Anxiety\' from Pushshift')
print(f'Retrieved {len(depression_df)} submissions on \'Depression\' from Pushshift')

Retrieved 3079 submissions on 'Anxiety' from Pushshift
Retrieved 3073 submissions on 'Depression' from Pushshift


In [8]:
# Persist each scraped frame as a CSV under ./Dataset, without the index column
anxiety_df.to_csv(path_or_buf='./Dataset/anxiety.csv', index=False)
depression_df.to_csv(path_or_buf='./Dataset/depression.csv', index=False)

In [9]:
# Preview the first 5 depression submissions

depression_df.head(5)

Unnamed: 0,subreddit,title,selftext
0,depression,Does therapy help,Just wondering if therapy even does anything b...
1,depression,How do I stop the cycle?,I have been super depressed for about 4 years ...
2,depression,Confused and lost,Is it just me or does anyone else feel more an...
3,depression,I'm giving up. last therapy appointment today,"Its 330 am. Its been 2 years of ""therapy"". 3 y..."
4,depression,negative butterflys,[removed]


In [10]:
# Preview the first 5 anxiety submissions

anxiety_df.head(5)

Unnamed: 0,subreddit,title,selftext
0,Anxiety,My ex friend said me having anxiety is abusive...,Um hi i’m new to the sub and hoped someone cou...
1,Anxiety,Just wanted to share,"Hey everyone , so my anxiety has been up to\na..."
2,Anxiety,Anxious about polygraph at work,"\n\nI managed to get out of my old, toxic jo..."
3,Anxiety,DAE get waves of vertigo even when sitting or ...,Vertigo has been one of my main symptoms of an...
4,Anxiety,Is there someone that I can talk to right now?,Just really need to talk to someone
