In [None]:
# library imports

import os
from datetime import datetime, timezone

import pandas as pd
import praw
import requests
from dotenv import load_dotenv

## This notebook collects data from Reddit by querying the Reddit API with the PRAW library

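You will need a Reddit account that is set up for API access, and the associated keys must be available in your local environment (for example in a `.env` file) under the names `CLIENT_ID`, `SECRET_KEY`, `PASSWORD` and `USER`.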

In [15]:
# Read the Reddit API credentials from the local environment.
# load_dotenv() is called first so the lookups below can see its values;
# each lookup yields None when the variable is not set.

load_dotenv()

SECRET_KEY = os.environ.get("SECRET_KEY")
CLIENT_ID = os.environ.get("CLIENT_ID")
PASS = os.environ.get("PASSWORD") # not referenced by any other visible cell
# NOTE(review): USER is later passed to praw as the user_agent. Many shells
# already export a USER variable (the login name) -- confirm which value wins.
USER = os.environ.get("USER")

### Define Functions

In [5]:
############################################################################################################################################

def search_subreddit(subreddit, query:str, limit:int):
    """query a specific subreddit.

    subreddit: subreddit instance exposing ``search(query=..., limit=...)``,
        e.g. the object returned by ``reddit.subreddit(...)``
    query: string query for praw api
    limit: int number of threads to collect
    returns: dataframe of threads matching query, indexed by ``submission_id``
        with columns ``title``, ``text``, ``year`` and ``month``.
        ``year`` / ``month`` are zero-padded strings taken from the thread's
        creation time in UTC. A search without hits returns an empty dataframe
        with the same index name and columns.
    """

    # Fixed column layout so that an empty search still produces a frame that
    # has a "submission_id" column to index on.
    columns = ["submission_id", "title", "text", "year", "month"]

    thread_list = [] # one dict per thread

    for submission in subreddit.search(query=query, limit=limit): # iterate over all threads returned from search

        # created_utc is a UTC epoch; convert it explicitly in UTC so the
        # year/month do not depend on the timezone of the machine running this
        created = datetime.fromtimestamp(submission.created_utc, tz=timezone.utc)

        thread_list.append({
            "submission_id": submission.id,
            "title": submission.title,
            "text": submission.selftext,
            "year": created.strftime("%Y"),
            "month": created.strftime("%m"),
        })

    df = pd.DataFrame(thread_list, columns=columns) # turn list into dataframe

    return df.set_index("submission_id") # index by thread id

############################################################################################################################################

def get_comments(id_list:list, reddit, replace_more_limit=0):
    """get the comments of one or more threads

    id_list: list of submission ids to get comment data from
    reddit: reddit instance
    replace_more_limit: forwarded to ``submission.comments.replace_more(limit=...)``
        before the comment tree is flattened. The default ``0`` drops the
        "load more comments" placeholders without expanding them; see the PRAW
        documentation for other values (e.g. ``None``).
    returns: dataframe of comment data with columns ``submission_id``,
        ``author``, ``body``, ``score``, ``year`` and ``month``.
        ``author`` is None when the comment has no author; ``year`` / ``month``
        are zero-padded strings taken from the comment's creation time in UTC.
        When no comments are found the dataframe is empty but keeps the columns.
    """

    columns = ["submission_id", "author", "body", "score", "year", "month"]

    comments_list = []

    for submission_id in id_list: # avoid shadowing the builtin ``id``

        submission = reddit.submission(id=submission_id)

        # The flattened tree can otherwise contain MoreComments placeholders,
        # which have no body / created_utc and would break the loop below.
        submission.comments.replace_more(limit=replace_more_limit)

        for comment in submission.comments.list():

            # created_utc is a UTC epoch; convert it explicitly in UTC so the
            # result does not depend on the local timezone
            created = datetime.fromtimestamp(comment.created_utc, tz=timezone.utc)

            comments_list.append({
                "submission_id": submission_id,
                "author": comment.author.name if comment.author else None,
                "body": comment.body,
                "score": comment.score,
                "year": created.strftime("%Y"),
                "month": created.strftime("%m"),
            })

    return pd.DataFrame(comments_list, columns=columns)

############################################################################################################################################

### Get Reddit & Subreddit Instance

In [4]:
# Build the Reddit client from the credentials loaded above.
# Only client id, client secret and user agent are supplied (no username/password).
reddit = praw.Reddit(client_id=CLIENT_ID, client_secret=SECRET_KEY, user_agent=USER)

reddit.read_only # shown as the cell output to check the instance is read only

True

In [5]:
# Subreddit handle for "washingtondc"; passed to search_subreddit under "Search Threads".
washingtondc = reddit.subreddit("washingtondc") # get subreddit instance

# Bare expression: shows the handle as the cell output.
washingtondc

Subreddit(display_name='washingtondc')

In [19]:
# Subreddit handle for "DCforRent"; passed to search_subreddit under "Search Threads".
DCforRent = reddit.subreddit("DCforRent") # get subreddit instance

# Bare expression: shows the handle as the cell output.
DCforRent

Subreddit(display_name='DCforRent')

In [23]:
# Subreddit handle for "washdc"; passed to search_subreddit under "Search Threads".
washdc = reddit.subreddit("washdc") # get subreddit instance

# Bare expression: shows the handle as the cell output.
washdc

Subreddit(display_name='washdc')

### Search Threads

In [11]:
# NOTE(review): `query` was not defined in any visible cell, so this cell raised
# NameError on a fresh kernel. It is assumed to be the same title search that is
# used for the other subreddits in this section -- confirm.
query = 'title:"safe" title:"neighborhood" title:"crime"'

# Search the "washingtondc" subreddit, collecting at most 50 threads.
threads_df = search_subreddit(washingtondc, query, 50)

threads_df.head()

Unnamed: 0_level_0,title,text,year,month
submission_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
vmsxc7,How safe is this neighborhood?,"Hey you, yeah you. Stop fucking asking this. Y...",2022,6
16hpztv,Where would you buy in DC to feel most safe ab...,Would the northern part of Georgetown/Glover P...,2023,9
3j8vju,Van Ness residents say their neighborhood isn'...,,2015,9
lsoay5,Recommendation of a safe neighborhood in which...,"Obviously not looking for luxury, would like a...",2021,2
ih2u1o,How safe is the Old City/Near-Northeast neighb...,I'm thinking about signing a lease on a place ...,2020,8


In [41]:
# Title search of the "washdc" subreddit, collecting at most 150 threads.
threads_washdc_df = search_subreddit(
    washdc,
    query='title:"safe" title:"neighborhood" title:"crime"',
    limit=150,
)

threads_washdc_df.head()

Unnamed: 0_level_0,title,text,year,month
submission_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1ax8woa,"After feds dismantle street gang, officials sa...",,2024,2
1bqfvdz,Ritzy DC neighborhood sees yet more crime,,2024,3
1eg7a93,Is this DC neighborhood safe?,My friends and I are considering moving into a...,2024,7
1azpaob,Not even our donuts are safe? Crime in this to...,,2024,2
17t7gfq,"In the 1940s, DC youth crime was out of contro...",‚ÄúThe Junior Police and Citizen Corps ended up ...,2023,11


In [51]:
# Title search of the "DCforRent" subreddit, collecting at most 50 threads.
threads_DCforRent_df = search_subreddit(
    DCforRent,
    query='title:"safe" title:"neighborhood" title:"crime"',
    limit=50,
)

threads_DCforRent_df.head()

Unnamed: 0_level_0,title,text,year,month
submission_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
20t3z6,Find out how safe the neighborhood you want to...,,2014,3
y57cbw,"Best quiet, safe but lively neighborhood in DC",I am in my late 30s and haven't lived in DC si...,2022,10


### Grab Comments

In [7]:
# Collect the comments of every thread found in the "washingtondc" search.
comments_df = get_comments(id_list=threads_df.index.tolist(), reddit=reddit)

comments_df.head()

Unnamed: 0,submission_id,author,body,score,year,month
0,vmsxc7,downvoteyous,cant comment just got murdered in the face due...,613,2022,6
1,vmsxc7,pizzajona,I‚Äôm thinking of moving to Capitol Hill on Janu...,54,2022,6
2,vmsxc7,zero_derivation,Is Columbia Heights a bad neighborhood for rat...,147,2022,6
3,vmsxc7,,I'm living inside the fenced-in perimeter of t...,196,2022,6
4,vmsxc7,BringMeCoffeeOrTea_,I thought everyone who lived in D.C. was murde...,248,2022,6


In [37]:
# Collect the comments of every thread found in the "washdc" search.
comments_washdc_df = get_comments(id_list=threads_washdc_df.index.tolist(), reddit=reddit)

comments_washdc_df.head()

Unnamed: 0,submission_id,author,body,score,year,month
0,1eg7a93,Personal-Wasabi4189,I‚Äôd pick another neighborhood if you have options,25,2024,7
1,1eg7a93,GraceGod6,21st and benning road‚Ä¶.lmaooo good luck sis ü§∑üèΩ...,22,2024,7
2,1eg7a93,Zoroasker,I live in the general vicinity. It‚Äôs relativel...,6,2024,7
3,1eg7a93,,Go walk it at day and then at night,5,2024,7
4,1eg7a93,PigeonParadiso,Please look at DC crime maps and go based on t...,12,2024,7


In [53]:
# Collect the comments of every thread found in the "DCforRent" search.
comments_DCforRent_df = get_comments(id_list=threads_DCforRent_df.index.tolist(), reddit=reddit)

comments_DCforRent_df.head()

Unnamed: 0,submission_id,author,body,score,year,month
0,20t3z6,,Unfortunately MPD is notorious for underreport...,2,2014,3
1,20t3z6,AUBlazin,I completely understand and believe what you a...,2,2014,3
2,y57cbw,eeek0711,Or Cleveland Park in DC,4,2022,10
3,y57cbw,eeek0711,Takoma Park MD,3,2022,10
4,y57cbw,nonmimeticform,Baltimore,3,2022,10


### Save Data

In [29]:
file_name = "test"  # EDIT ME - prefix used for every csv file written in the save cell

In [55]:
# run this block to save to repo data folder
output_dir = "./data/"
os.makedirs(output_dir, exist_ok=True) # to_csv does not create a missing folder

# file-name suffix -> dataframe; each file is written as
# ./data/<file_name>_<suffix>.csv
frames_to_save = {
    "threads_washdc_df": threads_washdc_df,
    "comments_washdc_df": comments_washdc_df,
    "threads_DCforRent_df": threads_DCforRent_df,
    "comments_DCforRent_df": comments_DCforRent_df,
}

for suffix, frame in frames_to_save.items():
    frame.to_csv(output_dir + file_name + "_" + suffix + ".csv")

# the "washingtondc" results are currently not saved; uncomment to write them too
#threads_df.to_csv("./data/"+file_name+'_threads.csv')
#comments_df.to_csv("./data/"+file_name+'_comments.csv', index=False)
