# Gather Data from APIs

Import the necessary libraries.

In [1]:
import pandas as pd
import numpy as np
import requests
import time

Subreddit choices are Home Improvement and Real Estate Investing. I assumed that these would have some overlap and are also somewhat tied to each other. 

In [2]:
# Set both pandas display limits (columns and rows) to 999 for notebook output.
for option_name in ("display.max_columns", "display.max_rows"):
    pd.set_option(option_name, 999)

In [3]:
# Collect submissions for each subreddit from the Pushshift search endpoint.
# Each request asks for up to 100 posts created before the `before` timestamp;
# the cursor is then moved to the last row of the page so the next request
# returns the following page. Up to 20 requests are made per subreddit.
url = 'https://api.pushshift.io/reddit/search/submission'
dfs = []
subreddits = ['homeimprovement', 'realestateinvesting']
for subreddit in subreddits:
    # Every subreddit starts paging from the current time.
    before = int(time.time())
    for i in range(20):
        params = {
            'subreddit': subreddit,
            'size': 100,
            'before': before
        }
        # A timeout keeps one stalled connection from hanging the whole cell.
        res = requests.get(url, params=params, timeout=30)
        if res.status_code == 200:
            data = res.json()
            if not data['data']:
                # An empty page means there is nothing further to fetch for
                # this subreddit; selecting columns on it would raise KeyError.
                break
            posts = pd.DataFrame(data['data'])[['author','title', 'selftext', 'subreddit', 'created_utc']]
            dfs.append(posts)
            # Advance the cursor only after a successful page. On a failed
            # request `before` is left untouched, so the next attempt retries
            # the same window instead of reading a stale (or, on the very
            # first request, undefined) `posts` frame.
            before = int(posts['created_utc'].values[-1])
        # Pause between requests to stay polite to the API.
        time.sleep(3)

# Stack all pages into one frame; each page keeps its own per-page index here.
df = pd.concat(dfs)

In [4]:
# Preview the first five collected posts.
df.head(n=5)

Unnamed: 0,author,title,selftext,subreddit,created_utc
0,RealDaveCorey,"Trying to straighten my leaning garage, was th...","Hi, I'm looking at torturing my leaning garage...",HomeImprovement,1645985955
1,ZeroCool1,Thoughts from two years into purchasing a fixe...,A few random thoughts after I've wrestled with...,HomeImprovement,1645985873
2,gwenstellamade,HELP! Fireplace contractors screwed up the job,"So, I’m in a bit of a pickle with my current f...",HomeImprovement,1645985865
3,happybaconbit,Kitchen cabinet track is broken. How do I fix it?,Photo: https://imgur.com/a/XgNq0iG \n\nIt’s be...,HomeImprovement,1645985826
4,stonetime10,OTR microwave with stud In the way,"Hello everyone,\nI am almost finished my basem...",HomeImprovement,1645985457


In [5]:
# Count how many collected posts belong to each subreddit.
posts_per_subreddit = df['subreddit'].value_counts()
posts_per_subreddit

realestateinvesting    2000
HomeImprovement        1998
Name: subreddit, dtype: int64

In [6]:
# Display the concatenated frame; the index still repeats per page at this point.
df

Unnamed: 0,author,title,selftext,subreddit,created_utc
0,RealDaveCorey,"Trying to straighten my leaning garage, was th...","Hi, I'm looking at torturing my leaning garage...",HomeImprovement,1645985955
1,ZeroCool1,Thoughts from two years into purchasing a fixe...,A few random thoughts after I've wrestled with...,HomeImprovement,1645985873
2,gwenstellamade,HELP! Fireplace contractors screwed up the job,"So, I’m in a bit of a pickle with my current f...",HomeImprovement,1645985865
3,happybaconbit,Kitchen cabinet track is broken. How do I fix it?,Photo: https://imgur.com/a/XgNq0iG \n\nIt’s be...,HomeImprovement,1645985826
4,stonetime10,OTR microwave with stud In the way,"Hello everyone,\nI am almost finished my basem...",HomeImprovement,1645985457
...,...,...,...,...,...
95,hossmanTK,Who do you use to do your taxes?,I have a duplex that I live in and rent. This ...,realestateinvesting,1643058056
96,Due_Yogurtcloset3390,Southern California Fixers Available,[removed],realestateinvesting,1643057426
97,theVirginAmberRose,what are some questions I should ask a constru...,tell me some stories if you got.\n\nwhat shoul...,realestateinvesting,1643054332
98,Suzyswan,What to do if no real estate agent will contac...,Literally have cash in hand to buy 200k in lan...,realestateinvesting,1643054205


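Each page of posts comes back with its own index starting at 0 (0-99 for a full page), so the concatenated dataframe has repeated index values. Resetting the index below gives one continuous index from 0 to 3997 (3,998 rows in total).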

In [7]:
# Replace the repeated per-page index with one continuous index;
# drop=True discards the old index instead of keeping it as a column.
df = df.reset_index(drop=True)
df

Unnamed: 0,author,title,selftext,subreddit,created_utc
0,RealDaveCorey,"Trying to straighten my leaning garage, was th...","Hi, I'm looking at torturing my leaning garage...",HomeImprovement,1645985955
1,ZeroCool1,Thoughts from two years into purchasing a fixe...,A few random thoughts after I've wrestled with...,HomeImprovement,1645985873
2,gwenstellamade,HELP! Fireplace contractors screwed up the job,"So, I’m in a bit of a pickle with my current f...",HomeImprovement,1645985865
3,happybaconbit,Kitchen cabinet track is broken. How do I fix it?,Photo: https://imgur.com/a/XgNq0iG \n\nIt’s be...,HomeImprovement,1645985826
4,stonetime10,OTR microwave with stud In the way,"Hello everyone,\nI am almost finished my basem...",HomeImprovement,1645985457
...,...,...,...,...,...
3993,hossmanTK,Who do you use to do your taxes?,I have a duplex that I live in and rent. This ...,realestateinvesting,1643058056
3994,Due_Yogurtcloset3390,Southern California Fixers Available,[removed],realestateinvesting,1643057426
3995,theVirginAmberRose,what are some questions I should ask a constru...,tell me some stories if you got.\n\nwhat shoul...,realestateinvesting,1643054332
3996,Suzyswan,What to do if no real estate agent will contac...,Literally have cash in hand to buy 200k in lan...,realestateinvesting,1643054205


Save the dataframe to a CSV file with a timestamp in its name (Source: Ben Roberts).

In [8]:
# Build a run-specific output path so repeated pulls never overwrite each other.
timestamp = time.strftime("%Y%m%d-%H%M%S")
file_path = f'../data/reddit_data_{timestamp}.csv'

In [9]:
# Write the collected posts to the timestamped CSV path built in the previous cell.
df.to_csv(file_path)