# Description
Get the data about the conversations of Uber drivers on their subreddit __https://www.reddit.com/r/uberdrivers/__ using the Python Reddit API Wrapper (**PRAW**).
### Classes
There are three important classes for this project:
* `post`: equivalent to `thread` of UberPeople
* `comment`: equivalent to `message`
* `author`: equivalent to `user`

### Limitations
Currently, there seem to be a few important limitations of this API:
1. Most data requests return a listing generator with a limit of 1000 objects. This greatly reduces the size of the dataset, although quality may be acceptable.
2. Cannot get followers/friends of redditors of this subreddit. This is very crucial as we cannot perform any direct social network analysis.

### Process
As a result of these limitations, I'm currently employing workarounds, especially for the first point. Thankfully, this API lets us search the top 1000 posts (equivalent of threads in UberPeople) by various measures - top, hottest, newest, most relevant - as defined by Reddit. It also allows querying the subreddit, which is particularly useful for our analysis because one can use keywords relevant to the topic (in this case - strikes & organizing behavior of the drivers). I combine these two methods to gather the relevant posts and pickle the obtained objects. I then extract the comment and author data from these post objects and store the three types of objects as pandas dataframes.

# Setup

In [23]:
import praw             # main PRAW package for requesting data
import prawcore         # for a specific exception in requesting data (not really necessary)
import pickle           # for storing objects in binary code
import pandas as pd     # for storing object details as tables
from tqdm import tqdm   # for visual progress bar
import datetime as dt
from time import time

In [21]:
# create the reddit object
# needs username and password for elevated privilege
#
# SECURITY: credentials must not be hard-coded in a notebook that is shared or
# committed. They are read from environment variables instead; any secret or
# password that was previously embedded in this cell should be treated as
# leaked and rotated.
import os

reddit = praw.Reddit(client_id=os.environ['REDDIT_CLIENT_ID'],
                    client_secret=os.environ['REDDIT_CLIENT_SECRET'],
                    user_agent='uber_drivers_reddit',
                    username=os.environ['REDDIT_USERNAME'],
                    password=os.environ['REDDIT_PASSWORD'])

In [22]:
# get the subreddit object for r/uberdrivers; every listing and search
# request in this notebook goes through it
sub = reddit.subreddit('uberdrivers')

# Get post objects

## Top 1000 posts
Reddit API only allows getting the first 1000 objects in any request through a listing generator. This is a limitation of the API.

In [None]:
# top posts: those that have the highest difference of upvotes and downvotes
# (materialised into a list; at most 1000 objects are requested)
toppest = list(sub.top(limit=1000))

In [None]:
# hottest posts, with hotness = Log(abs(Upvotes-Downvotes)) + (age/45000)
# src: https://www.reddit.com/r/explainlikeimfive/comments/1u0q4s/
# (materialised into a list; at most 1000 objects are requested)
hottest = list(sub.hot(limit=1000))

In [None]:
# posts from the subreddit's `new` listing (at most 1000 objects are requested)
newest = list(sub.new(limit=1000))

In [None]:
# merge the three listings into one set, so a post that appears in more than
# one listing is kept only once
all_time_popular = set(toppest).union(hottest, newest)

In [None]:
# also get the top 1000 posts of this year
# Only the `top` listing is restricted to the current year. The `hot` and
# `new` requests were made with exactly the same arguments as the ones that
# produced `hottest` and `newest`, so those lists are reused instead of
# downloading the same listings a second time.
cur_year_popular = set(list(sub.top(limit=1000, time_filter='year')) +
                       hottest + newest)

In [None]:
# every post found by either the all-time or the current-year collection
total_popular = all_time_popular | cur_year_popular

In [None]:
# sizes of the three raw listings, followed by the sizes of the merged sets
for collection in (toppest, hottest, newest,
                   all_time_popular, cur_year_popular, total_popular):
    print(len(collection))

In [None]:
# pickle the data
# pickle.dump(total_popular, open('data/all_top_posts.pickle', 'wb'))

## Search by keyword

In [None]:
def get_posts_by_keyword(subreddit, keywords, limit=1000,
                         methods=('relevance', 'top', 'hot', 'new')):
    '''Get post objects containing given list of keywords.

    Every keyword is searched once per sort method. When a search returns
    exactly `limit` results, the same search is repeated restricted to the
    current year.

    Parameters:
        subreddit: object exposing `search(query, sort=..., limit=...,
            time_filter=...)` and returning an iterable of hashable posts
        keywords: iterable of search words
        limit: maximum number of results requested per search
        methods: iterable of sort orders passed as `sort=` to the search

    Returns:
        dict mapping each post found to the set of keywords that matched it
    '''
    def add_post(map_, generator, word):
        '''Add posts from a generator to a map/dict along with its search word'''
        count = 0
        for post in generator:
            count += 1
            # create the keyword set on first sight of the post, then record
            # the word; only `map_` is consulted, never an outer variable
            map_.setdefault(post, set()).add(word)
        # return the no. of posts in the generator, just for reference
        return count

    posts = {}
    for word in keywords:
        for method in methods:
            # top all-time posts for current method
            all_time = subreddit.search(word, sort=method, limit=limit)
            num_posts = add_post(posts, all_time, word)

            # if no. of posts in the all-time generator is equal to the limit,
            # then there's a chance that some top posts of current year that
            # could not make it to the all-time top list can be extracted
            if num_posts == limit:
                # for the current year only
                cur_year = subreddit.search(word, sort=method,
                                            limit=limit, time_filter='year')
                add_post(posts, cur_year, word)

    return posts

In [None]:
# search words used to query the subreddit; each one is passed as a separate
# search query by get_posts_by_keyword
keywords = ['strike','protest','organize','discrimination','union','frustrated',
            'rights','labor','wage','protection','unemployment','covid','corona',
            'treat','policy','surge','pricing','support']

In [None]:
# search posts by keyword

# ---------------- CAUTION: SLOW ---------------------

# kw_data = get_posts_by_keyword(sub, keywords)

In [None]:
def get_keyword_count(posts):
    '''Count, for every keyword, how many posts it is attached to.

    `posts` maps a post to the collection of keywords that matched it; the
    result maps each keyword to the number of posts whose collection
    contains it.
    '''
    tally = {}
    for matched_words in posts.values():
        for keyword in matched_words:
            tally[keyword] = tally.get(keyword, 0) + 1
    return tally

In [None]:
# show how many collected posts matched each search keyword
keyword_counts = get_keyword_count(kw_data)
for keyword in keyword_counts:
    print(keyword, keyword_counts[keyword])

In [None]:
# pickle the data
# pickle.dump(kw_data, open('data/keywords_top1000.pickle', 'wb'))

## Combine the top & keyword  posts

In [None]:
# start with every popular post mapped to its own, initially empty, keyword set
all_posts = {p: set() for p in total_popular}

In [None]:
# add the keyword-search posts; a post that is already present has its
# (empty) keyword set replaced by the set of keywords that matched it
all_posts.update(kw_data)

In [None]:
# number of distinct posts after combining both collections
len(all_posts)

In [None]:
# save the posts data to pickle
# The path matches the one the "Comment & author objects" section loads from
# (previously this cell wrote to a differently named file that nothing read).
# The `with` block guarantees the file is flushed and closed.
with open('data/posts_top_with_keywords.pickle', 'wb') as f:
    pickle.dump(all_posts, f)

# Comment & author objects
Get the comment and author objects from the stored post objects.

In [9]:
# read the posts map from disk (the file handle is closed as soon as the
# objects are loaded)
with open('data/posts_top_with_keywords.pickle', 'rb') as f:
    all_posts = pickle.load(f)

# convert to list, removing the info of search keywords
all_posts_list = list(all_posts)

In [11]:
def get_comments_authors(posts):
    '''Get the comment and author objects from a list of posts.

    For every post, its author and every object yielded by iterating
    `post.comments` are collected, together with each such object's author.

    Parameters:
        posts: iterable of post objects exposing `.author` and `.comments`

    Returns:
        (comments, authors): a list of the collected comment objects and a
        set of the author objects seen (with `None` removed)
    '''
    comments = []
    authors = set()

    # show progress bar for the posts
    for post in tqdm(posts):
        try:
            authors.add(post.author)
            for com in post.comments:
                comments.append(com)
                # An entry without an `author` attribute (PRAW yields
                # `MoreComments` placeholders in comment trees) must not
                # abort the remaining comments of this post, so the failure
                # is logged per entry and the loop carries on.
                try:
                    authors.add(com.author)
                except AttributeError as e:
                    log_error('{}: {}'.format(post, e))
        except AttributeError as e:
            # failure on the post itself (its author or its comment tree)
            log_error('{}: {}'.format(post, e))
    authors.discard(None)

    return comments, authors

In [10]:
# get the comment and author objects from all the posts
# ---------------- CAUTION: SLOW ---------------------
# (the recorded run below took about 1h44m for 6371 posts)
coms, auths = get_comments_authors(all_posts_list)

100%|████████████████████████████████████████████████████████████████████████████| 6371/6371 [1:43:56<00:00,  1.01s/it]


In [15]:
# save these to pickle; the `with` block makes sure the file is flushed and
# closed once the dump is complete
with open('data/obj_of_top_posts.pickle', 'wb') as f:
    pickle.dump({'posts': all_posts, 'comments': coms, 'authors': auths}, f)

# Objects to tables
Convert post, comment, and author objects into tables by selecting a subset of their fields as columns.

## Read the objects

In [18]:
# read the collected objects (the file handle is closed right after loading)
with open('data/obj_of_top_posts.pickle', 'rb') as f:
    obj = pickle.load(f)

In [19]:
# number of stored objects of each kind
for kind in ('posts', 'comments', 'authors'):
    print(len(obj[kind]))

6371
60814
12055


## Convert objects to tables

In [28]:
def log_error(msg, file='data/data_read_errors.log'):
    '''Log the data read errors
    (not using `logging` module coz of some error).

    Appends one line of the form `<ISO timestamp>: <msg>` to `file`.

    Parameters:
        msg: text (or any object formattable with `%s`) to record
        file: path of the log file; created if it does not exist
    '''
    stamp = dt.datetime.now().isoformat()
    # explicit UTF-8 so that non-ASCII text in a message is written the same
    # way on every platform instead of depending on the locale's encoding
    with open(file, 'a', encoding='utf-8') as f:
        f.write('%s: %s\n' % (stamp, msg))

In [29]:
def obj_to_table(objs, fields):
    '''Extract given fields of a list of PRAW objects into a Pandas
    dataframe, timing how long each field takes to read.

    An object for which reading any field raises `AttributeError` or
    `prawcore.exceptions.NotFound` is logged through `log_error` and left
    out of the table.

    Returns:
        (df, profile): the dataframe with one row per successfully read
        object, and a dict mapping each field name to the accumulated
        number of seconds spent reading it
    '''
    rows = []
    profile = dict.fromkeys(fields, 0)
    for obj in tqdm(objs):
        row = {}
        try:
            for name in fields:
                started = time()
                row[name] = getattr(obj, name)
                # only successful reads are added to the field's total
                profile[name] += time() - started
        except (AttributeError, prawcore.exceptions.NotFound) as err:
            log_error('{}: {}'.format(obj, err))
        else:
            # keep the row only when every field was read
            rows.append(row)
    # convert to pandas dataframe
    return pd.DataFrame(rows), profile

### Posts

In [62]:
# fields to extract from every post object
post_fields = ['id','created','author_fullname','num_comments','permalink',
               'score','title','upvote_ratio','selftext']

# obj_to_table returns a (dataframe, profile) pair, so `post_df` is a tuple
# and the table itself is its first element
post_df = obj_to_table(list(obj['posts']), post_fields)

post_table = post_df[0]
post_table.to_csv('data/posts.csv', index=False)

100%|███████████████████████████████████████████████████████████████████████████| 6371/6371 [00:00<00:00, 27765.82it/s]


### Comments

In [63]:
# fields to extract from every comment object
com_fields = ['id','link_id','created_utc','author_fullname','is_submitter',
              'parent_id','score','body']

# obj_to_table returns a (dataframe, profile) pair, so `com_df` is a tuple
# and the table itself is its first element
com_df = obj_to_table(obj['comments'], com_fields)

com_table = com_df[0]
com_table.to_csv('data/comments.csv', index=False)

100%|██████████████████████████████████████████████████████████████████████████| 60814/60814 [00:08<00:00, 7228.04it/s]


### Authors

In [30]:
# NOTE: 'id' IS read here, and it is by far the slowest field: in the printed
# `profile` it accounts for practically all of the run time, while every other
# field takes a few milliseconds in total. The cause is not established here
# (presumably the first attribute access on an author triggers a network
# fetch - TODO confirm); drop 'id' from the list for a fast run.
auth_df, profile = obj_to_table(list(obj['authors']),
    ['id','name','comment_karma','created_utc','fullname','is_mod','is_gold','link_karma'])

auth_df.to_csv('data/authors.csv', index=False)

print(profile)

100%|██████████████████████████████████████████████████████████████████████████| 12055/12055 [3:19:23<00:00,  1.00it/s]


{'id': 11774.861673355103, 'name': 0.005012989044189453, 'comment_karma': 0.002001047134399414, 'created_utc': 0.0010306835174560547, 'fullname': 0.031147241592407227, 'is_mod': 0.0050811767578125, 'is_gold': 0.0030248165130615234, 'link_karma': 0.0009953975677490234}


## (Old) Direct read to disk
Read thread (`thd`), message (`msg`), and author (`auth`) data directly from the subreddit into dictionaries and write them straight to disk.

In [None]:
def read_write_submissions(subreddit, top):
    '''Read top submissions and save them to disk
    (without storing in memory).

    Iterates over `subreddit.top(limit=top)` and writes one CSV row per post,
    one per comment of the post, and one per author encountered (an author
    row is written every time the author is seen, so duplicates are possible).

    Relies on names that are not defined inside this function: `files` and
    `fields` (both indexed by 'thd', 'msg' and 'auth'), `get_obj_info` and
    `log`.

    Parameters:
        subreddit: object exposing `top(limit=...)`
        top: number of top submissions to request
    '''
    # function-scope import so that the csv writers below do not depend on a
    # module-level `import csv`
    import csv

    # the `with` block closes all three files even if an error escapes the
    # per-post handling below (e.g. while writing headers or fetching posts)
    with open(files['thd'], 'w', newline='', encoding='utf-8') as f_thd, \
            open(files['msg'], 'w', newline='', encoding='utf-8') as f_msg, \
            open(files['auth'], 'w', newline='', encoding='utf-8') as f_auth:
        # create the writer objects & write the header row
        w_thd = csv.writer(f_thd)
        w_msg = csv.writer(f_msg)
        w_auth = csv.writer(f_auth)
        w_thd.writerow(fields['thd'])
        w_msg.writerow(fields['msg'])
        w_auth.writerow(fields['auth'])

        # iterate over the submissions, numbering them from 1
        for post_num, post in enumerate(subreddit.top(limit=top), start=1):
            print('#%d:' % post_num, post)
            try:
                # get the thread details
                thd = get_obj_info(post, fields['thd'])
                if post.author is not None:
                    thd['auth_id'] = post.author.id
                    auth = get_obj_info(post.author, fields['auth'])
                    w_auth.writerow(auth.values())
                else:
                    thd['auth_id'] = ''
                w_thd.writerow(thd.values())

                # get the messages/comments
                for com in post.comments:
                    try:
                        msg = get_obj_info(com, fields['msg'])
                        # get the author details
                        if com.author is not None:
                            msg['auth_id'] = com.author.id
                            auth = get_obj_info(com.author, fields['auth'])
                            w_auth.writerow(auth.values())
                        else:
                            msg['auth_id'] = ''
                        w_msg.writerow(msg.values())
                    except (TypeError, AttributeError) as e:
                        # a single unreadable comment is logged and skipped
                        log('msg[{}]:{}'.format(com.id, e))
            except Exception as e:
                # best effort: any other failure skips the rest of this post
                log('post[{}]:unexpected error:{}'.format(post_num, e))