.

#### Subreddit Data Extraction

<b>Notebook Introduction:</b> This notebook contains the functions used to extract data from the subreddits of interest using the PRAW library. For each subreddit, up to 1000 all-time top posts are extracted, along with up to 25 top comments per post and up to 10 replies per comment.

* For the first iteration of this project (version 1), data was extracted from the subreddits of three Tier 1 cities in India: Delhi, Mumbai and Bangalore.
* For the second iteration of this project (version 2), in order to work with more English-language data, data was extracted from the subreddits of three Tier 1 cities in the USA: NYC, Chicago and Boston.

The output dataframes from this EDA notebook are used as input for text processing (text_processing.ipynb)

In [1]:
##standard libraries
import pandas as pd
import os
import time 

##specifically for the reddit api
# !pip install praw
import praw 
# `cred` is a local module that is not shown in this notebook. reddit_keys() is
# expected to return a mapping holding the APP_ID, APP_SECRET, APP_NAME,
# REDDIT_USERNAME and REDDIT_PASSWORD entries that are read when the Reddit
# client is constructed in the next cell.
from cred import reddit_keys
credentials = reddit_keys()

In [2]:
# Gather the account/app settings first so the mapping from credential keys to
# praw.Reddit keyword arguments is visible in one place.
reddit_auth = {
    "client_id": credentials["APP_ID"],
    "client_secret": credentials["APP_SECRET"],
    "user_agent": credentials["APP_NAME"],
    "username": credentials["REDDIT_USERNAME"],
    "password": credentials["REDDIT_PASSWORD"],
}

# Single Reddit client shared by every cell below.
reddit = praw.Reddit(**reddit_auth, check_for_async=False)

In [3]:
def create_df(top_posts, max_comments=25, max_replies=10, include_reply_upvotes=False):
    """Flatten a listing of submissions into post, comment and reply dataframes.

    Posts whose author compares equal to "AutoModerator" are skipped and counted;
    the count is printed once at the end. For every other post, the post itself,
    up to ``max_comments`` of its top-sorted top-level comments and up to
    ``max_replies`` direct replies to each of those comments are recorded.

    Authors are stored via ``str(...)``, so an author of ``None`` (which appears to
    be how deleted accounts surface -- confirm against the PRAW docs) is stored as
    the string ``"None"``.

    Parameters
    ----------
    top_posts : iterable
        Submission-like objects, e.g. the listing returned by ``subreddit.top(...)``.
        Each must expose ``id``, ``title``, ``selftext``, ``score``, ``created_utc``,
        ``author``, ``num_comments`` and ``comments``.
        https://praw.readthedocs.io/en/stable/code_overview/models/submission.html
    max_comments : int, default 25
        Maximum number of top-level comments kept per post.
    max_replies : int, default 10
        Maximum number of direct replies kept per comment.
    include_reply_upvotes : bool, default False
        When True, the replies dataframe gets an extra trailing ``upvotes`` column
        (``reply.score``). Off by default so the reply schema is unchanged for
        existing callers.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame, pandas.DataFrame)
        ``top_posts_df`` with columns id, text, date, author, upvotes;
        ``comments_df`` with columns id, text, date, post_id, author, upvotes;
        ``comment_replies_df`` with columns id, text, reply_comment_id, date, author
        (plus upvotes when ``include_reply_upvotes`` is True).

    Raises
    ------
    ValueError
        If ``max_comments`` or ``max_replies`` is negative.
    """
    if max_comments < 0 or max_replies < 0:
        raise ValueError("max_comments and max_replies must be non-negative")

    def as_utc(epoch_seconds):
        # created_utc is an epoch value in seconds; convert to a tz-aware timestamp.
        return pd.to_datetime(epoch_seconds, utc=True, unit='s')

    # One dict of column -> values per output frame. Insertion order is the
    # column order of the resulting dataframe.
    post_cols = {'id': [], 'text': [], 'date': [], 'author': [], 'upvotes': []}
    ## https://praw.readthedocs.io/en/stable/code_overview/models/comment.html
    comment_cols = {'id': [], 'text': [], 'date': [], 'post_id': [], 'author': [], 'upvotes': []}
    reply_cols = {'id': [], 'text': [], 'reply_comment_id': [], 'date': [], 'author': []}
    if include_reply_upvotes:
        reply_cols['upvotes'] = []

    automodposts = 0
    for post in top_posts:
        if post.author == "AutoModerator":
            automodposts += 1
            continue

        post_cols['id'].append(str(post.id))
        post_cols['text'].append(str(post.title + " " + post.selftext))
        post_cols['date'].append(as_utc(post.created_utc))
        post_cols['author'].append(str(post.author))
        post_cols['upvotes'].append(int(post.score))

        # Only touch the comment tree when the post reports comments, so that
        # comment-less posts do not cost a replace_more() call.
        if post.num_comments > 0:
            post.comment_sort = "top"
            # limit=0 drops the "load more comments" placeholders rather than expanding them.
            post.comments.replace_more(limit=0)

            # Slicing already copes with posts that have fewer comments than the cap.
            for comment in post.comments[:max_comments]:
                comment_cols['id'].append(str(comment.id))
                comment_cols['text'].append(str(comment.body))
                comment_cols['date'].append(as_utc(comment.created_utc))
                comment_cols['post_id'].append(str(post.id))  # post the comment belongs to
                comment_cols['author'].append(str(comment.author))
                comment_cols['upvotes'].append(int(comment.score))

                for reply in comment.replies[:max_replies]:
                    reply_cols['id'].append(str(reply.id))
                    reply_cols['text'].append(str(reply.body))
                    reply_cols['reply_comment_id'].append(str(comment.id))  # parent comment
                    reply_cols['date'].append(as_utc(reply.created_utc))
                    reply_cols['author'].append(str(reply.author))
                    if include_reply_upvotes:
                        reply_cols['upvotes'].append(int(reply.score))

    print(f"Number of mod posts excluded: {automodposts}")
    top_posts_df = pd.DataFrame(post_cols)
    comments_df = pd.DataFrame(comment_cols)
    comment_replies_df = pd.DataFrame(reply_cols)
    return top_posts_df, comments_df, comment_replies_df

#### Dataset for Version 1
Subreddits considered: r/delhi, r/mumbai and r/bangalore. Each has a comparable user base (~700K subscribers).

##### Delhi

In [3]:
# Subreddit model reference:
# https://praw.readthedocs.io/en/stable/code_overview/models/subreddit.html
r_delhi = reddit.subreddit('delhi')

# The author's note recorded 788236 subscribers for this subreddit.
print(f"r/{r_delhi.display_name} has {r_delhi.subscribers} subscribers")

In [36]:
# Pull the all-time top posts of r/delhi (at most 1000) and flatten them into
# post / comment / reply dataframes, timing the whole extraction.
start_time = time.time()
r_del_subs = r_delhi.top(time_filter="all", limit=1000)
top_posts_del, comments_del, replies_del = create_df(r_del_subs)
end_time = time.time()
print(f"Run time: {round((end_time-start_time)/60,2)}")  # elapsed wall-clock time in minutes

# The second figure is comment rows plus reply rows, so the label says so.
print(f"Extracted {top_posts_del.shape[0]} top posts, {comments_del.shape[0] + replies_del.shape[0]} top comments/replies from these posts.")

# Persistence is currently disabled; uncomment to write the pickles under data/.
# top_posts_del.to_pickle('data/top_posts_del.pkl')
# comments_del.to_pickle('data/comments_del.pkl')
# replies_del.to_pickle('data/replies_del.pkl')
top_posts_del.sample(5)

Number of mod posts excluded: 0
Run time: 23.03
Extracted 986 top posts, 49731 top comments from these posts.


Unnamed: 0,id,text,date,author,upvotes
347,1b6v93l,Dilliwale these days,2024-03-05 03:31:42+00:00,pyaracetamol-143mg,1354
273,t0bpqp,War is not an option,2022-02-24 14:00:56+00:00,sisenor99,1560
335,1chkyq9,"The moment I hit 60, the first thing I am gonn...",2024-05-01 12:30:12+00:00,in_the_stars_iCU,1382
603,wo6kbz,Tim Hortons khulte hi Dilliwale line mein!,2022-08-14 13:35:33+00:00,nevigskcufon,1013
227,v4t1ys,The suburban life,2022-06-04 17:00:04+00:00,AgreeableReality5451,1691


In [37]:
# Spot-check three randomly drawn comment rows (no random_state, so rows vary per run).
comments_del.sample(n=3)

Unnamed: 0,id,text,date,post_id,author,upvotes
2219,juvzhi9,Bkl cfbr cfbr karna band karo isse kuch nahi h...,2023-08-05 12:11:34+00:00,15ir688,FitSignificance2100,46
16497,j4p5f0s,Wow! Where do you work?,2023-01-17 07:27:04+00:00,10e618y,,6
22315,i70bwg4,God bless you dost!,2022-05-02 09:36:59+00:00,ugkxzj,,1


In [38]:
# Spot-check three randomly drawn reply rows (no random_state, so rows vary per run).
replies_del.sample(n=3)

Unnamed: 0,id,text,reply_comment_id,date,author
20124,hx4qjft,It's worth visiting at least once.,hx0bflb,2022-02-16 04:08:17+00:00,Tarun24_12
14225,jhy4ipo,![gif](giphy|CAYVZA5NRb529kKQUc|downsized)\n\n...,jhy1209,2023-04-27 17:56:10+00:00,sarcasmka14
14935,kaih128,That's what she said.,kagbdxj,2023-11-24 01:37:18+00:00,online_karate_expert


In [5]:
# r/mumbai: look up the subreddit, then extract its all-time top posts.
r_mumbai = reddit.subreddit('mumbai')
print(f"r/{r_mumbai.display_name} has {r_mumbai.subscribers} subscribers") #686004 subscribers
# vars(r_mumbai)

# The timer has to be started in this cell: without it the duration below would
# be measured from whatever start_time an earlier cell left behind (or fail with
# a NameError on a fresh kernel).
start_time = time.time()
r_mum_subs = r_mumbai.top(time_filter="all", limit = 1000) 
top_posts_mum, comments_mum, replies_mum = create_df(r_mum_subs)
end_time = time.time()
print(round((end_time-start_time)/60,2))  # elapsed wall-clock time in minutes
print(f"Extracted {top_posts_mum.shape[0]} top posts, {comments_mum.shape[0] + replies_mum.shape[0]} top comments/replies from these posts.")

# Persistence is currently disabled; uncomment to write the pickles under data/.
# top_posts_mum.to_pickle('data/top_posts_mum.pkl')
# comments_mum.to_pickle('data/comments_mum.pkl')
# replies_mum.to_pickle('data/replies_mum.pkl')
top_posts_mum.sample(5)

Number of mod posts excluded: 0
19.91
Extracted 999 top posts, 48334 top comments/replies from these posts.


Unnamed: 0,id,text,date,author,upvotes
651,145ck3h,Shot this view of Bandra-Worli Sea Link from 📍...,2023-06-09 18:15:54+00:00,nameeribrahim,1058
348,12af9lj,How even Designed these ads?!,2023-04-03 09:47:50+00:00,Ready-Artichoke1515,1504
359,1dzljq0,Mumbai is full of cool history,2024-07-10 03:14:06+00:00,Dry-Neat-2818,1483
647,m2kost,"Walking through the by lanes of Mumbai, I came...",2021-03-11 07:32:57+00:00,kimeysia,1057
233,11jp934,Local being local,2023-03-06 05:13:00+00:00,akhandbharatvarshi,1827


In [6]:
# Spot-check three randomly drawn comment rows (no random_state, so rows vary per run).
comments_mum.sample(n=3)

Unnamed: 0,id,text,date,post_id,author,upvotes
13945,j4vvryw,Baju se jaate waqt paad dene ka inke aisle me,2023-01-18 17:01:09+00:00,10f7s1p,Logicaldump,206
12121,hwwu9uw,Do you work there?,2022-02-14 14:32:55+00:00,ss4yfv,,2
13406,jnvd2wz,"Nope, it does not at all look like foreign",2023-06-12 06:09:56+00:00,147b1bc,,3


In [7]:
# Spot-check three randomly drawn reply rows (no random_state, so rows vary per run).
replies_mum.sample(n=3)

Unnamed: 0,id,text,reply_comment_id,date,author
2243,jva09ez,During peak hour yes ghatkopar is a small stat...,jv9wxlx,2023-08-08 09:19:20+00:00,saviour_sam
13918,l06m8h9,Pune sub Wale ikde kadhi ale 😛,l06k1jm,2024-04-18 18:43:40+00:00,archieshahh
12693,kv3di61,Same,kv33oly,2024-03-16 03:30:21+00:00,Dark_Knight_108


In [8]:
# r/bangalore: look up the subreddit, then extract its all-time top posts.
r_bangalore = reddit.subreddit('bangalore')
print(f"r/{r_bangalore.display_name} has {r_bangalore.subscribers} subscribers") #697351 subscribers

start_time = time.time()
r_ban_subs = r_bangalore.top(time_filter="all", limit = 1000)
top_posts_ban, comments_ban, replies_ban = create_df(r_ban_subs)
end_time = time.time()
print(round((end_time-start_time)/60,2))  # elapsed wall-clock time in minutes

# The second figure is comment rows plus reply rows, so the label says so.
print(f"Extracted {top_posts_ban.shape[0]} top posts, {comments_ban.shape[0] + replies_ban.shape[0]} top comments/replies from these posts.")

# Persistence is currently disabled; uncomment to write the pickles under data/.
# top_posts_ban.to_pickle('data/top_posts_ban.pkl')
# comments_ban.to_pickle('data/comments_ban.pkl')
# replies_ban.to_pickle('data/replies_ban.pkl')
top_posts_ban.sample(5)

Number of mod posts excluded: 0
18.93
Extracted 986 top posts, 47853 top comments from these posts.


Unnamed: 0,id,text,date,author,upvotes
390,185nv3u,WFO is an absolute joke. I know the title seem...,2023-11-28 04:46:42+00:00,infinitypolarbear,949
808,18rtkc7,Anyone planning on visiting or travelling to P...,2023-12-27 06:04:49+00:00,Rainman515,649
867,1ag64l3,Help! A cat is stuck above a grilled fencing f...,2024-02-01 08:16:11+00:00,DataAnalyst1994,627
370,17v7lkq,Am I the only one? Whenever I treat swiggy guy...,2023-11-14 17:40:19+00:00,Full_Order_2061,971
784,w0k7jj,How do deal with the ambulance mafia ? Disclai...,2022-07-16 16:16:00+00:00,time_rare_eternal,666


In [9]:
# Spot-check three randomly drawn comment rows (no random_state, so rows vary per run).
comments_ban.sample(n=3)

Unnamed: 0,id,text,date,post_id,author,upvotes
9336,j1mrsaz,This looks like a COVID hotspot.!,2022-12-25 18:08:38+00:00,zv2h0w,Choice-Anybody6388,7
6826,l0lhcff,Good for Nandini. We see Amul products here in...,2024-04-21 14:03:45+00:00,1c9cqyr,denommonkey,6
14351,ic3s8mh,LOL what an idiot. \n\nAll the more reason to...,2022-06-12 16:52:35+00:00,valv3k,rowschank,0


In [10]:
# Spot-check three randomly drawn reply rows (no random_state, so rows vary per run).
replies_ban.sample(n=3)

Unnamed: 0,id,text,reply_comment_id,date,author
20476,ful5bas,*Laughs in Chennai*,ful0oip,2020-06-12 11:19:56+00:00,
14858,j3fykfm,"Doesn't matter, what matters is that we are th...",j3cia6g,2023-01-08 07:33:27+00:00,um3shg
23895,l49whff,Do you live in East Bangalore?\n\nWhitefield -...,l49mw5y,2024-05-16 07:12:52+00:00,elslyknight


#### Dataset for Version 2

In [4]:
# Handles for the three US city subreddits used in version 2.
r_nyc, r_boston, r_chicago = (
    reddit.subreddit(city) for city in ('nyc', 'boston', 'chicago')
)

# Report the subscriber count of each one, in the same order.
for subreddit in (r_nyc, r_boston, r_chicago):
    print(f"r/{subreddit.display_name} has {subreddit.subscribers} subscribers")

r/nyc has 905706 subscribers
r/boston has 634827 subscribers
r/chicago has 564939 subscribers


In [18]:
# Pull the all-time top posts of r/nyc (at most 1000) and flatten them into
# post / comment / reply dataframes, timing the whole extraction.
start_time = time.time()
r_nyc_subs = r_nyc.top(time_filter="all", limit = 1000)
top_posts_nyc, comments_nyc, replies_nyc = create_df(r_nyc_subs)
end_time = time.time()
print(round((end_time-start_time)/60,2))  # elapsed wall-clock time in minutes

# The second figure is comment rows plus reply rows, so the label says so.
print(f"Extracted {top_posts_nyc.shape[0]} top posts, {comments_nyc.shape[0] + replies_nyc.shape[0]} top comments/replies from these posts.")

# Persistence is currently disabled; uncomment to write the pickles under data/.
# top_posts_nyc.to_pickle('data/top_posts_nyc.pkl')
# comments_nyc.to_pickle('data/comments_nyc.pkl')
# replies_nyc.to_pickle('data/replies_nyc.pkl')
top_posts_nyc.sample(5)

Number of mod posts excluded: 0
31.41
Extracted 998 top posts, 58101 top comments from these posts.


Unnamed: 0,id,text,date,author,upvotes
57,4ntdg4,1 WTC,2016-06-13 01:30:35+00:00,solateor,4022
316,dryldl,The Colorful Coat Closet of a New Yorker,2019-11-05 12:41:55+00:00,b-rad62,2584
259,1qvg2p,The most beautiful NYC missed connection I've ...,2013-11-18 04:16:13+00:00,,2749
95,ray90b,I painted the laundromat outside my apartment ...,2021-12-07 12:35:55+00:00,onewordpoet,3594
647,dbsvsf,Spotted on the G train this evening,2019-10-01 12:47:25+00:00,psychothumbs,2061


In [5]:
# Pull the all-time top posts of r/boston (at most 1000) and flatten them into
# post / comment / reply dataframes, timing the whole extraction.
start_time = time.time()
r_boston_subs = r_boston.top(time_filter="all", limit = 1000)
top_posts_bos, comments_bos, replies_bos = create_df(r_boston_subs)
end_time = time.time()
print(round((end_time-start_time)/60,2))  # elapsed wall-clock time in minutes

# The second figure is comment rows plus reply rows, so the label says so.
print(f"Extracted {top_posts_bos.shape[0]} top posts, {comments_bos.shape[0] + replies_bos.shape[0]} top comments/replies from these posts.")

# Persistence is currently disabled; uncomment to write the pickles under data/.
# top_posts_bos.to_pickle('data/top_posts_bos.pkl')
# comments_bos.to_pickle('data/comments_bos.pkl')
# replies_bos.to_pickle('data/replies_bos.pkl')
top_posts_bos.sample(5)

In [6]:
# Pull the all-time top posts of r/chicago (at most 1000) and flatten them into
# post / comment / reply dataframes, timing the whole extraction.
start_time = time.time()
r_chicago_subs = r_chicago.top(time_filter="all", limit = 1000)
top_posts_chi, comments_chi, replies_chi = create_df(r_chicago_subs)
end_time = time.time()
print(round((end_time-start_time)/60,2))  # elapsed wall-clock time in minutes

# The second figure is comment rows plus reply rows, so the label says so.
print(f"Extracted {top_posts_chi.shape[0]} top posts, {comments_chi.shape[0] + replies_chi.shape[0]} top comments/replies from these posts.")

# Persistence is currently disabled; uncomment to write the pickles under data/.
# top_posts_chi.to_pickle('data/top_posts_chi.pkl')
# comments_chi.to_pickle('data/comments_chi.pkl')
# replies_chi.to_pickle('data/replies_chi.pkl')
top_posts_chi.sample(5)

Number of mod posts excluded: 0
31.06
Extracted 996 top posts, 52297 top comments from these posts.


Unnamed: 0,id,text,date,author,upvotes
285,fzjo6z,Inside Jokes in Old Town,2020-04-11 21:37:45+00:00,WhoopieKush,2340
50,gygkwx,Searching for beauty in mayhem,2020-06-07 17:34:10+00:00,sinful_one,3500
747,ozxf0l,Chicago Water Lilies. LOL Found this on intern...,2021-08-07 17:23:28+00:00,doughsa,1740
452,cirx03,"Caesar, an employee at the McDonald’s on Weste...",2019-07-28 05:06:24+00:00,juanyworldwide,2056
531,317j3g,"PSA: If you witness a kidnapping, or any other...",2015-04-02 16:45:54+00:00,allkindsofmamba,1943
