# All of the data for this project was collected in this notebook.

# Jump to:

## [Getting posts for 2A](#Love-and-Law)

## [Getting posts and comments for 2B](#Assholery)

# Imports

In [1]:
import pandas as pd
import json
import requests
from time import sleep

pd.set_option('max_rows', 99)
pd.set_option('max_columns', 99)

# Library

In [2]:
def get_posts(subreddit, scrape_size=100, score_threshold=">150", num_rows_wanted=2500):
    """
    Takes in the name of the subreddit that you want to pull posts for.
    By default, it assumes you want to pull only 100 posts a day with scores over 150 and 2500 posts total.
    Will almost always overshoot number of rows; I'm not cutting the df down to perfect size here, because
    having a few extra rows is better for wiggle room. Paring down will happen before classification.
    Prints messages throughout to give updates on progress.
    Returns nothing. Saves a .csv file.
    """
    base_url = 'https://api.pushshift.io/reddit/search/submission/?'

    keep_cols = ['author', 'author_fullname', 'created_utc', 'domain', 'full_link', 'id', 'num_comments',\
                'permalink', 'score', 'selftext', 'subreddit', 'subreddit_id', 'title', 'url']

    num_rows_scraped = 0

    num_days_backtracked = 1
    
    print(f"Pulling r/{subreddit}'s posts.")
    
    # loops through until enough rows have been found, each pass increments the number of days 'ago'
    # writes to csv on every pass through after reading in existing csv and appending this pass's rows to it
    while(num_rows_scraped < num_rows_wanted):

        params = {
            'subreddit':subreddit,
            'size':scrape_size,
            'score':score_threshold,
            'after': str(num_days_backtracked) + 'd'
        }

        csv_path = "./datasets/r_" + params['subreddit'] + '.csv'

        if num_days_backtracked == 1:
            df = pd.DataFrame()
        else:
            try:
                df = pd.read_csv(csv_path)
            except:
                df = pd.DataFrame()

        res = requests.get(base_url, params)
        curr = pd.DataFrame(res.json()['data'])
        df = pd.concat([df, curr])
        num_rows_scraped = df.shape[0]
        if num_rows_scraped > 0:
            df = df[keep_cols]
            df = df.drop_duplicates('id')
            df = df.dropna(subset=['selftext'])
            num_rows_scraped = df.shape[0]
        df.to_csv(csv_path, index=False)
        print('\tdone with', num_rows_scraped, 'rows,', 'backtracked for', num_days_backtracked, 'day(s)')
        num_days_backtracked += 1
        sleep(5)
    print(f"Done getting r/{subreddit}'s posts.")
    print(f"Backtracked for {num_days_backtracked} and pulled {df.shape[0]} rows.")

In [3]:
def get_posts_with_comments(subreddit, rows_wanted=2500, max_comments=500):
    """
    Takes in the name of the subreddit you want comments of.
    If you've already pulled the posts for that subreddit, it reads in that file and uses those posts.
    If not (i.e., if that csv is not present), it calls get_posts() for that subreddit before proceeding.
    For each post, it collects the links of up to a specified number (default is 500) of comments.
    It then collects the actual text of those comments, concatenates the results, and then appends that as a column.
    Prints messages throughout to give updates on progress.
    Returns nothing. Saves a .csv file (abridged version of full subreddit file, with comments appended).
    """
    # checks to see if posts have already been pulled for this subreddit
    try:
        print(f"Reading in r/{subreddit}'s posts.")
        df = pd.read_csv('./datasets/r_' + subreddit + '.csv')
        print()
    except:
        print(f"It looks like r/{subreddit}'s posts haven't been pulled yet. Pulling now...")
        get_posts(subreddit, num_rows_wanted=rows_wanted)
        df = pd.read_csv('./datasets/r_' + subreddit + '.csv')
        print()
    
    # creates abridged df
    posts_with_comments = df[['id', 'title', 'selftext']].copy()
    posts_with_comments['comments'] = ""
    
    # saving this version of the df
    csv_path = './datasets/with_comments/r_' + subreddit + '_with_comments.csv'
    posts_with_comments.to_csv(csv_path, index=False)

    # getting comments
    print(f"Getting comments for {posts_with_comments.shape[0]} posts...")
    
    comment_ids_base_url = "https://api.pushshift.io/reddit/submission/comment_ids/"
    comment_text_base_url = "https://api.pushshift.io/reddit/comment/search?ids="
    
    # for each post, it:
        # reads the whole abridged file in;
        # collects the comment ids for the current post;
        # limits the number of comments as per specifications;
        # collects the actual comments and appends to the abridged file;
        # and writes the file back in
    all_posts = posts_with_comments['id'].values
    for p in range(len(all_posts)):
        post_id = all_posts[p]
        
        # reading in df which should update every time
        df = pd.read_csv(csv_path)
                
        post_comments_url = comment_text_base_url + ",".join(requests.get(comment_ids_base_url + post_id).json()['data'])
        
        # length of base url + length of all n comment ids themselves + length of commas (n - 1 commas)
        maxlength = len(comment_text_base_url) + (7 * max_comments) + (max_comments - 1)
        
        # limiting to n comments at most
        if len(post_comments_url) > maxlength:
            post_comments_url = post_comments_url[:maxlength]
        
        comments = requests.get(post_comments_url).json()['data']
        sleep(5)
        
        comment_bodies = ""
        # if r/AmITheAsshole, comments[1:] to lose the automod comment
        if subreddit.lower() == 'amitheasshole':
            for comm in comments[1:]:
                comment_bodies += comm['body'] + '\n'
        else:
            for comm in comments:
                comment_bodies += comm['body'] + '\n'
        
        df.loc[df['id'] == post_id, 'comments'] = comment_bodies
        
        # saving df which should happen every time
        df.to_csv(csv_path, index=False)
        if p % 100 == 0:
            print(f"\tdone pulling comments for {p} of {len(all_posts)} posts")
        
    print(f"Done getting r/{subreddit}'s posts with comments.")


In [4]:
def get_info(df):
    """
    Prints out some EDA-type information about the given dataframe.
    """
    print('Number of empty posts:', df['selftext'].isnull().sum())
    print('Number of unique authors:', df['author'].nunique())
    print('Most popular posts:')
    display(df.sort_values('score', ascending=False).head(3))
    print('Most discussed posts:')
    display(df.sort_values('num_comments', ascending=False).head(3))

## Love and Law

[return to top](#Jump-to:)

### r/legaladvice (2500 posts with a score of at least 150)

In [6]:
%%time

get_posts('legaladvice')

done with 0 rows, backtracked for 1 day(s)
done with 0 rows, backtracked for 2 day(s)
done with 1 rows, backtracked for 3 day(s)


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




done with 8 rows, backtracked for 4 day(s)
done with 17 rows, backtracked for 5 day(s)
done with 27 rows, backtracked for 6 day(s)
done with 37 rows, backtracked for 7 day(s)
done with 47 rows, backtracked for 8 day(s)
done with 56 rows, backtracked for 9 day(s)
done with 64 rows, backtracked for 10 day(s)
done with 72 rows, backtracked for 11 day(s)
done with 85 rows, backtracked for 12 day(s)
done with 94 rows, backtracked for 13 day(s)
done with 103 rows, backtracked for 14 day(s)
done with 120 rows, backtracked for 15 day(s)
done with 126 rows, backtracked for 16 day(s)
done with 136 rows, backtracked for 17 day(s)
done with 146 rows, backtracked for 18 day(s)
done with 156 rows, backtracked for 19 day(s)
done with 163 rows, backtracked for 20 day(s)
done with 174 rows, backtracked for 21 day(s)
done with 183 rows, backtracked for 22 day(s)
done with 193 rows, backtracked for 23 day(s)
done with 202 rows, backtracked for 24 day(s)
done with 209 rows, backtracked for 25 day(s)
done 

done with 1778 rows, backtracked for 180 day(s)
done with 1788 rows, backtracked for 181 day(s)
done with 1801 rows, backtracked for 182 day(s)
done with 1810 rows, backtracked for 183 day(s)
done with 1822 rows, backtracked for 184 day(s)
done with 1832 rows, backtracked for 185 day(s)
done with 1839 rows, backtracked for 186 day(s)
done with 1853 rows, backtracked for 187 day(s)
done with 1867 rows, backtracked for 188 day(s)
done with 1878 rows, backtracked for 189 day(s)
done with 1893 rows, backtracked for 190 day(s)
done with 1901 rows, backtracked for 191 day(s)
done with 1910 rows, backtracked for 192 day(s)
done with 1926 rows, backtracked for 193 day(s)
done with 1939 rows, backtracked for 194 day(s)
done with 1951 rows, backtracked for 195 day(s)
done with 1968 rows, backtracked for 196 day(s)
done with 1981 rows, backtracked for 197 day(s)
done with 1993 rows, backtracked for 198 day(s)
done with 2001 rows, backtracked for 199 day(s)
done with 2014 rows, backtracked for 200

In [None]:
# 2503 rows, 238 days, 21.8 minutes.

In [8]:
path = './datasets/r_legaladvice.csv'
legaladvice = pd.read_csv(path)
legaladvice.shape

(2503, 14)

In [13]:
get_info(legaladvice)

Number of empty posts: 32
Number of unique authors: 2395
Most popular posts:


Unnamed: 0,author,author_fullname,created_utc,domain,full_link,id,num_comments,permalink,score,selftext,subreddit,subreddit_id,title,url
1702,poksoooosoo28,t2_3jaicttf,1556209785,self.legaladvice,https://www.reddit.com/r/legaladvice/comments/...,bhaof3,4,/r/legaladvice/comments/bhaof3/update_neighbor...,55763,Original post: https://www.reddit.com/r/legala...,legaladvice,t5_2rawz,(Update) Neighbors child has disabilities and ...,https://www.reddit.com/r/legaladvice/comments/...
1653,throwaway_18701,t2_3dma3xwj,1556477313,self.legaladvice,https://www.reddit.com/r/legaladvice/comments/...,bieir5,3,/r/legaladvice/comments/bieir5/update_pa_i_fol...,38858,This is an update to my [original post](https:...,legaladvice,t5_2rawz,[UPDATE] [PA] I followed and reported a drunk ...,https://www.reddit.com/r/legaladvice/comments/...
1760,throwmeinalake123,t2_3gba89ml,1555643214,self.legaladvice,https://www.reddit.com/r/legaladvice/comments/...,beuf9y,4,/r/legaladvice/comments/beuf9y/update_my_fathe...,35337,[Here's my original post](https://www.reddit.c...,legaladvice,t5_2rawz,"UPDATE: My father and step mother died, leavin...",https://www.reddit.com/r/legaladvice/comments/...


Most discussed posts:


Unnamed: 0,author,author_fullname,created_utc,domain,full_link,id,num_comments,permalink,score,selftext,subreddit,subreddit_id,title,url
2314,Zanctmao,t2_nszex,1551735326,self.legaladvice,https://www.reddit.com/r/legaladvice/comments/...,axcpzw,599,/r/legaladvice/comments/axcpzw/megathread_it_w...,905,This was initially discussed in [this megathre...,legaladvice,t5_2rawz,[Megathread] It will become a federal crime to...,https://www.reddit.com/r/legaladvice/comments/...
1328,GoonDaFirst,t2_zmh7d,1558893052,self.legaladvice,https://www.reddit.com/r/legaladvice/comments/...,btauw7,390,/r/legaladvice/comments/btauw7/mi_usa_consider...,9511,"I have a big family, with around 20 cousins, ...",legaladvice,t5_2rawz,"[MI, USA] Considering suing my uncle after he ...",https://www.reddit.com/r/legaladvice/comments/...
1927,poksoooosoo28,t2_3jaicttf,1554323940,self.legaladvice,https://www.reddit.com/r/legaladvice/comments/...,b93dx3,353,/r/legaladvice/comments/b93dx3/neighbors_child...,18239,For the past 7 months after moving in to my ne...,legaladvice,t5_2rawz,Neighbors child has disabilities and won't sto...,https://www.reddit.com/r/legaladvice/comments/...


### r/relationships (2500 posts with a score of at least 150)

In [7]:
%%time

get_posts('relationships')

done with 0 rows, backtracked for 1 day(s)
done with 8 rows, backtracked for 2 day(s)


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




done with 21 rows, backtracked for 3 day(s)
done with 32 rows, backtracked for 4 day(s)
done with 46 rows, backtracked for 5 day(s)
done with 58 rows, backtracked for 6 day(s)
done with 65 rows, backtracked for 7 day(s)
done with 76 rows, backtracked for 8 day(s)
done with 86 rows, backtracked for 9 day(s)
done with 95 rows, backtracked for 10 day(s)
done with 113 rows, backtracked for 11 day(s)
done with 125 rows, backtracked for 12 day(s)
done with 134 rows, backtracked for 13 day(s)
done with 147 rows, backtracked for 14 day(s)
done with 165 rows, backtracked for 15 day(s)
done with 177 rows, backtracked for 16 day(s)
done with 189 rows, backtracked for 17 day(s)
done with 197 rows, backtracked for 18 day(s)
done with 213 rows, backtracked for 19 day(s)
done with 224 rows, backtracked for 20 day(s)
done with 234 rows, backtracked for 21 day(s)
done with 243 rows, backtracked for 22 day(s)
done with 251 rows, backtracked for 23 day(s)
done with 261 rows, backtracked for 24 day(s)
don

done with 2254 rows, backtracked for 178 day(s)
done with 2273 rows, backtracked for 179 day(s)
done with 2296 rows, backtracked for 180 day(s)
done with 2320 rows, backtracked for 181 day(s)
done with 2337 rows, backtracked for 182 day(s)
done with 2350 rows, backtracked for 183 day(s)
done with 2364 rows, backtracked for 184 day(s)
done with 2376 rows, backtracked for 185 day(s)
done with 2395 rows, backtracked for 186 day(s)
done with 2417 rows, backtracked for 187 day(s)
done with 2435 rows, backtracked for 188 day(s)
done with 2448 rows, backtracked for 189 day(s)
done with 2463 rows, backtracked for 190 day(s)
done with 2483 rows, backtracked for 191 day(s)
done with 2495 rows, backtracked for 192 day(s)
done with 2508 rows, backtracked for 193 day(s)
CPU times: user 30.1 s, sys: 2.55 s, total: 32.7 s
Wall time: 18min 9s


In [None]:
# 2508 rows, 193 days, 18.2 minutes.

In [14]:
path = './datasets/r_relationships.csv'
relationships = pd.read_csv(path)
relationships.shape

(2508, 14)

In [16]:
get_info(relationships)

Number of empty posts: 0
Number of unique authors: 2453
Most popular posts:


Unnamed: 0,author,author_fullname,created_utc,domain,full_link,id,num_comments,permalink,score,selftext,subreddit,subreddit_id,title,url
2234,LadyGrey90,t2_10tayt,1555589790,self.relationships,https://www.reddit.com/r/relationships/comment...,beku6e,760,/r/relationships/comments/beku6e/update_my_28f...,9726,[Original post](https://www.reddit.com/r/relat...,relationships,t5_2qjvn,[UPDATE] My (28F) colleague (30F) didn't invit...,https://www.reddit.com/r/relationships/comment...
480,Gerrrrtty,t2_4bd4qle8,1567170329,self.relationships,https://www.reddit.com/r/relationships/comment...,cxgnob,185,/r/relationships/comments/cxgnob/my_30f_husban...,8758,Update to https://www.reddit.com/r/relationshi...,relationships,t5_2qjvn,My (30F) husband (32M) impulse bought a dog. [...,https://www.reddit.com/r/relationships/comment...
726,big_platypus_,t2_39rxxrpc,1565219527,self.relationships,https://www.reddit.com/r/relationships/comment...,cnd80k,538,/r/relationships/comments/cnd80k/update_my_24_...,8107,Link to original post [https://www.reddit.com/...,relationships,t5_2qjvn,UPDATE: My [24 F] boyfriend [26 M] of 6 months...,https://www.reddit.com/r/relationships/comment...


Most discussed posts:


Unnamed: 0,author,author_fullname,created_utc,domain,full_link,id,num_comments,permalink,score,selftext,subreddit,subreddit_id,title,url
1934,Bee5431,t2_3h02g4j0,1557250985,self.relationships,https://www.reddit.com/r/relationships/comment...,bltp4x,1166,/r/relationships/comments/bltp4x/im_ready_to_l...,5409,I (31F) am so overwhelmed and literally about ...,relationships,t5_2qjvn,I'm ready to leave my (31F) husband (30M) over...,https://www.reddit.com/r/relationships/comment...
1419,unbeliavable211,t2_3y6jvse9,1560597460,self.relationships,https://www.reddit.com/r/relationships/comment...,c0w90i,1092,/r/relationships/comments/c0w90i/partner29m_sa...,3884,We've been together for 5 years and in the las...,relationships,t5_2qjvn,Partner[29M] says he will divorce me [26F] if ...,https://www.reddit.com/r/relationships/comment...
1554,SweetButteryStacks,t2_3tux27b2,1559743398,self.relationships,https://www.reddit.com/r/relationships/comment...,bx2x4a,1077,/r/relationships/comments/bx2x4a/i_29f_think_i...,3574,My bf and I have been together for 8 years. Du...,relationships,t5_2qjvn,I [29/F] think I need to leave my bf[31/M] of ...,https://www.reddit.com/r/relationships/comment...


## Assholery

[return to top](#Jump-to:)

### r/AmITheAsshole (2000 posts with a score of at least 150; with up to 500 comments each)

In [8]:
%%time

get_posts("AmITheAsshole", num_rows_wanted=2000)

Pulling r/AmITheAsshole's posts.
	done with 0 rows, backtracked for 1 day(s)
	done with 0 rows, backtracked for 2 day(s)
	done with 0 rows, backtracked for 3 day(s)
	done with 8 rows, backtracked for 4 day(s)


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




	done with 62 rows, backtracked for 5 day(s)
	done with 117 rows, backtracked for 6 day(s)
	done with 173 rows, backtracked for 7 day(s)
	done with 228 rows, backtracked for 8 day(s)
	done with 283 rows, backtracked for 9 day(s)
	done with 334 rows, backtracked for 10 day(s)
	done with 381 rows, backtracked for 11 day(s)
	done with 437 rows, backtracked for 12 day(s)
	done with 492 rows, backtracked for 13 day(s)
	done with 544 rows, backtracked for 14 day(s)
	done with 600 rows, backtracked for 15 day(s)
	done with 658 rows, backtracked for 16 day(s)
	done with 708 rows, backtracked for 17 day(s)
	done with 754 rows, backtracked for 18 day(s)
	done with 804 rows, backtracked for 19 day(s)
	done with 852 rows, backtracked for 20 day(s)
	done with 910 rows, backtracked for 21 day(s)
	done with 968 rows, backtracked for 22 day(s)
	done with 1019 rows, backtracked for 23 day(s)
	done with 1072 rows, backtracked for 24 day(s)
	done with 1124 rows, backtracked for 25 day(s)
	done with 1188 

In [None]:
# 2002 rows, 41 days, 3.75 minutes

In [22]:
path = './datasets/r_AmITheAsshole.csv'
aita = pd.read_csv(path)
aita.shape

(2534, 14)

In [23]:
aita.columns

Index(['author', 'author_fullname', 'created_utc', 'domain', 'full_link', 'id',
       'num_comments', 'permalink', 'score', 'selftext', 'subreddit',
       'subreddit_id', 'title', 'url'],
      dtype='object')

In [24]:
get_info(aita)

Number of empty posts: 0
Number of unique authors: 2501
Most popular posts:


Unnamed: 0,author,author_fullname,created_utc,domain,full_link,id,num_comments,permalink,score,selftext,subreddit,subreddit_id,title,url
1111,DarthCharizard,t2_7a646,1568998300,self.AmItheAsshole,https://www.reddit.com/r/AmItheAsshole/comment...,d6xoro,6748,/r/AmItheAsshole/comments/d6xoro/meta_this_sub...,71339,[removed],AmItheAsshole,t5_2xhvq,META: This sub is moving towards a value syste...,https://www.reddit.com/r/AmItheAsshole/comment...
909,aitabikini,t2_4n9mqxl5,1569197811,self.AmItheAsshole,https://www.reddit.com/r/AmItheAsshole/comment...,d7yuot,4417,/r/AmItheAsshole/comments/d7yuot/aita_for_wear...,45595,So it was my birthday couple months ago. Had a...,AmItheAsshole,t5_2xhvq,Aita for wearing the “joke” bikini my friend g...,https://www.reddit.com/r/AmItheAsshole/comment...
676,inappropriatedress77,t2_46j0iwtd,1569680504,self.AmItheAsshole,https://www.reddit.com/r/AmItheAsshole/comment...,daglhs,4253,/r/AmItheAsshole/comments/daglhs/aita_for_tell...,43456,So my son had a long-distance gf recently for ...,AmItheAsshole,t5_2xhvq,AITA for telling my son he deserved his gf bre...,https://www.reddit.com/r/AmItheAsshole/comment...


Most discussed posts:


Unnamed: 0,author,author_fullname,created_utc,domain,full_link,id,num_comments,permalink,score,selftext,subreddit,subreddit_id,title,url
560,farqueenhell,t2_4p4es35y,1569838334,self.AmItheAsshole,https://www.reddit.com/r/AmItheAsshole/comment...,db9dtn,12491,/r/AmItheAsshole/comments/db9dtn/aita_for_refu...,25919,For the past month my 24 year old niece has be...,AmItheAsshole,t5_2xhvq,AITA for refusing to give my niece our dog's r...,https://www.reddit.com/r/AmItheAsshole/comment...
2448,isaoas,t2_4glrq467,1566910725,self.AmItheAsshole,https://www.reddit.com/r/AmItheAsshole/comment...,cw43oc,9427,/r/AmItheAsshole/comments/cw43oc/aita_for_refu...,33899,"Hi all. \n\nI gave birth four weeks ago, and w...",AmItheAsshole,t5_2xhvq,AITA for refusing to pay back my cousin after ...,https://www.reddit.com/r/AmItheAsshole/comment...
2042,touchofspice84,t2_4ifbh8uz,1567522198,self.AmItheAsshole,https://www.reddit.com/r/AmItheAsshole/comment...,cz5vk2,8625,/r/AmItheAsshole/comments/cz5vk2/aita_for_refu...,28875,"When I was 25, we found out that my father had...",AmItheAsshole,t5_2xhvq,AITA for refusing to use the money I inherited...,https://www.reddit.com/r/AmItheAsshole/comment...


In [38]:
%%time

get_posts_with_comments("AmITheAsshole")

Reading in r/AmITheAsshole's posts.

Getting comments for 2002 posts...
	done pulling comments for 0 of 2002 posts
	done pulling comments for 100 of 2002 posts
	done pulling comments for 200 of 2002 posts
	done pulling comments for 300 of 2002 posts
	done pulling comments for 400 of 2002 posts
	done pulling comments for 500 of 2002 posts
	done pulling comments for 600 of 2002 posts
	done pulling comments for 700 of 2002 posts
	done pulling comments for 800 of 2002 posts
	done pulling comments for 900 of 2002 posts
	done pulling comments for 1000 of 2002 posts
	done pulling comments for 1100 of 2002 posts
	done pulling comments for 1200 of 2002 posts
	done pulling comments for 1300 of 2002 posts
	done pulling comments for 1400 of 2002 posts
	done pulling comments for 1500 of 2002 posts
	done pulling comments for 1600 of 2002 posts
	done pulling comments for 1700 of 2002 posts
	done pulling comments for 1800 of 2002 posts
	done pulling comments for 1900 of 2002 posts
	done pulling commen

In [None]:
# 5 hours, 39 minutes

In [39]:
aita_with_comments = pd.read_csv('./datasets/with_comments/r_AmITheAsshole_with_comments.csv')
print(aita_with_comments.shape)
print(aita_with_comments.isnull().sum())

(2002, 4)
id          0
title       0
selftext    0
comments    0
dtype: int64


In [40]:
aita_with_comments.head()

Unnamed: 0,id,title,selftext,comments
0,dgjarh,AITA for not wanting my dads new gf to sing at...,So I (28F) am getting married next year. \n\nI...,\nIf you want your comment to count toward jud...
1,dgjgap,AITA for “looking poor”?,Soo my family used to be really poor but my pa...,\nIf you want your comment to count toward jud...
2,dgjvfw,AITA for calling my family out on their eating...,"My brother is 15, turning 16 soon. He gets no...",\nIf you want your comment to count toward jud...
3,dgk74h,AITA for using the street parking in front of ...,I strongly dislike my neighbors and that is ma...,\nIf you want your comment to count toward jud...
4,dgkgu6,AITA for refusing to cook my boyfriends steak ...,I’ve been with my boyfriend for a couple years...,\nIf you want your comment to count toward jud...


In [42]:
# one of the automoderator's comments at the top
aita_with_comments['comments'][0]

'\nIf you want your comment to count toward judgment, include *only ONE* of the following abbreviations in your comment. If you don\'t include a judgement abbreviation, the bot will ignore you when it looks for the top voted comment.\n\n\nJudgment | Abbreviation\n-- | :--:\n**You\'re the Asshole** (&amp; the other party is not) |**YTA**|\nYou\'re **Not the A-hole** (&amp; the other party is) |**NTA**\n**Everyone Sucks Here**|**ESH**\n**No A-holes here**|**NAH**\n**Not Enough Info**|**INFO**\n\n\n#[Click Here For Our Full Rulebook](https://www.reddit.com/r/AmItheAsshole/wiki/index)\n\n#[Click Here For Our FAQ](https://www.reddit.com/r/AmItheAsshole/wiki/faq)\n\n\n*I am a bot, and this action was performed automatically. Please [contact the moderators of this subreddit](/message/compose/?to=/r/AmItheAsshole) if you have any questions or concerns.*\nNTA\nNTA\nNTA. It\'s your wedding and she\'s a stranger. Not to mention how awkward that\'s going to be that she has a pivotal part in the ce

## Other (tests + examples)

### r/nosleep (2500 posts with a score of at least 150)

In [24]:
%%time

get_posts('nosleep')

done with 0 rows, backtracked for 1 day(s)
done with 11 rows, backtracked for 2 day(s)


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




done with 27 rows, backtracked for 3 day(s)
done with 43 rows, backtracked for 4 day(s)
done with 52 rows, backtracked for 5 day(s)
done with 70 rows, backtracked for 6 day(s)
done with 81 rows, backtracked for 7 day(s)
done with 95 rows, backtracked for 8 day(s)
done with 108 rows, backtracked for 9 day(s)
done with 123 rows, backtracked for 10 day(s)
done with 136 rows, backtracked for 11 day(s)
done with 151 rows, backtracked for 12 day(s)
done with 160 rows, backtracked for 13 day(s)
done with 170 rows, backtracked for 14 day(s)
done with 180 rows, backtracked for 15 day(s)
done with 191 rows, backtracked for 16 day(s)
done with 202 rows, backtracked for 17 day(s)
done with 215 rows, backtracked for 18 day(s)
done with 226 rows, backtracked for 19 day(s)
done with 240 rows, backtracked for 20 day(s)
done with 249 rows, backtracked for 21 day(s)
done with 260 rows, backtracked for 22 day(s)
done with 273 rows, backtracked for 23 day(s)
done with 283 rows, backtracked for 24 day(s)
d

done with 2160 rows, backtracked for 178 day(s)
done with 2175 rows, backtracked for 179 day(s)
done with 2188 rows, backtracked for 180 day(s)
done with 2201 rows, backtracked for 181 day(s)
done with 2214 rows, backtracked for 182 day(s)
done with 2225 rows, backtracked for 183 day(s)
done with 2236 rows, backtracked for 184 day(s)
done with 2248 rows, backtracked for 185 day(s)
done with 2259 rows, backtracked for 186 day(s)
done with 2271 rows, backtracked for 187 day(s)
done with 2282 rows, backtracked for 188 day(s)
done with 2291 rows, backtracked for 189 day(s)
done with 2303 rows, backtracked for 190 day(s)
done with 2315 rows, backtracked for 191 day(s)
done with 2329 rows, backtracked for 192 day(s)
done with 2344 rows, backtracked for 193 day(s)
done with 2357 rows, backtracked for 194 day(s)
done with 2372 rows, backtracked for 195 day(s)
done with 2383 rows, backtracked for 196 day(s)
done with 2398 rows, backtracked for 197 day(s)
done with 2422 rows, backtracked for 198

In [None]:
# 2510 rows, 204 days, 20.5 minutes.

In [17]:
path = './datasets/r_nosleep.csv'
nosleep = pd.read_csv(path)
nosleep.shape

(2510, 14)

In [19]:
get_info(nosleep)

Number of empty posts: 0
Number of unique authors: 973
Most popular posts:


Unnamed: 0,author,author_fullname,created_utc,domain,full_link,id,num_comments,permalink,score,selftext,subreddit,subreddit_id,title,url
1745,Verastahl,t2_11d38ddb,1558456902,self.nosleep,https://www.reddit.com/r/nosleep/comments/brco...,brco33,434,/r/nosleep/comments/brco33/my_job_is_watching_...,13648,\nThree years ago I was looking at the local j...,nosleep,t5_2rm4d,My job is watching a woman trapped in a room.,https://www.reddit.com/r/nosleep/comments/brco...
1767,deathbyproxy,t2_hw0t0,1558270858,self.nosleep,https://www.reddit.com/r/nosleep/comments/bqgo...,bqgosx,351,/r/nosleep/comments/bqgosx/my_sister_discovere...,12503,My sister is a genius. When she was about thir...,nosleep,t5_2rm4d,"My sister discovered a universal language, but...",https://www.reddit.com/r/nosleep/comments/bqgo...
2478,flard,t2_98k0a,1553431567,self.nosleep,https://www.reddit.com/r/nosleep/comments/b4vy...,b4vyxc,553,/r/nosleep/comments/b4vyxc/she_sold_happiness_...,12392,"The poster read, “Happiness! Sold in Glass Jar...",nosleep,t5_2rm4d,She Sold Happiness in Glass Jars,https://www.reddit.com/r/nosleep/comments/b4vy...


Most discussed posts:


Unnamed: 0,author,author_fullname,created_utc,domain,full_link,id,num_comments,permalink,score,selftext,subreddit,subreddit_id,title,url
2126,queen_of_the_moths,t2_v63grsa,1555788832,self.nosleep,https://www.reddit.com/r/nosleep/comments/bfg4...,bfg42c,1365,/r/nosleep/comments/bfg42c/how_do_i_get_my_gir...,10257,So I've been dating my girlfriend for almost a...,nosleep,t5_2rm4d,How do I get my girlfriend to knock off this a...,https://www.reddit.com/r/nosleep/comments/bfg4...
1849,Aleksandrovitch,t2_bir83,1557614075,self.nosleep,https://www.reddit.com/r/nosleep/comments/bnhy...,bnhy1m,1107,/r/nosleep/comments/bnhy1m/you_all_need_to_sto...,7119,You all need to stop posting. Right now.\n\nIt...,nosleep,t5_2rm4d,You all need to stop posting. Right now.,https://www.reddit.com/r/nosleep/comments/bnhy...
1691,RisingMac,t2_2i83a7hy,1558767178,self.nosleep,https://www.reddit.com/r/nosleep/comments/bsrm...,bsrmy5,1031,/r/nosleep/comments/bsrmy5/my_sons_camera_moni...,12304,I'm a nurse and I currently work nights. It's ...,nosleep,t5_2rm4d,My son's camera monitor alerted in the middle ...,https://www.reddit.com/r/nosleep/comments/bsrm...


### r/AskReddit (2500 posts with a score of at least 150)

In [49]:
%%time

get_posts("askreddit")

done with 0 rows, backtracked for 1 day(s)
done with 36 rows, backtracked for 2 day(s)


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




done with 73 rows, backtracked for 3 day(s)
done with 110 rows, backtracked for 4 day(s)
done with 144 rows, backtracked for 5 day(s)
done with 187 rows, backtracked for 6 day(s)
done with 229 rows, backtracked for 7 day(s)
done with 259 rows, backtracked for 8 day(s)
done with 300 rows, backtracked for 9 day(s)
done with 334 rows, backtracked for 10 day(s)
done with 375 rows, backtracked for 11 day(s)
done with 416 rows, backtracked for 12 day(s)
done with 451 rows, backtracked for 13 day(s)
done with 490 rows, backtracked for 14 day(s)
done with 532 rows, backtracked for 15 day(s)
done with 561 rows, backtracked for 16 day(s)
done with 596 rows, backtracked for 17 day(s)
done with 636 rows, backtracked for 18 day(s)
done with 677 rows, backtracked for 19 day(s)
done with 725 rows, backtracked for 20 day(s)
done with 773 rows, backtracked for 21 day(s)
done with 814 rows, backtracked for 22 day(s)
done with 850 rows, backtracked for 23 day(s)
done with 893 rows, backtracked for 24 day

In [None]:
# 2502 rows, 64 days, 7.5 minutes.

In [25]:
path = './datasets/r_askreddit.csv'
ask = pd.read_csv(path)
ask.shape

(2502, 14)

In [27]:
get_info(ask)

Number of empty posts: 2494
Number of unique authors: 2307
Most popular posts:


Unnamed: 0,author,author_fullname,created_utc,domain,full_link,id,num_comments,permalink,score,selftext,subreddit,subreddit_id,title,url
266,Slade_Sez,t2_16cl1r,1570140030,self.AskReddit,https://www.reddit.com/r/AskReddit/comments/dc...,dcxylq,11837,/r/AskReddit/comments/dcxylq/anthony_bourdain_...,134861,,AskReddit,t5_2qh1i,"Anthony Bourdain once said ""There's a guy in m...",https://www.reddit.com/r/AskReddit/comments/dc...
1431,Marambal17,t2_4ejtplno,1567789988,self.AskReddit,https://www.reddit.com/r/AskReddit/comments/d0...,d0jjc2,24843,/r/AskReddit/comments/d0jjc2/the_2010s_decade_...,100653,,AskReddit,t5_2qh1i,The 2010's decade will be over in 4 months. Wh...,https://www.reddit.com/r/AskReddit/comments/d0...
110,pterv2112,t2_2b4pt7dm,1570482947,self.AskReddit,https://www.reddit.com/r/AskReddit/comments/de...,depwkk,17384,/r/AskReddit/comments/depwkk/what_looks_easy_p...,90579,,AskReddit,t5_2qh1i,What looks easy peasy lemon squeezy but is act...,https://www.reddit.com/r/AskReddit/comments/de...


Most discussed posts:


Unnamed: 0,author,author_fullname,created_utc,domain,full_link,id,num_comments,permalink,score,selftext,subreddit,subreddit_id,title,url
413,seven_wings,t2_6nc7y,1569954472,self.AskReddit,https://www.reddit.com/r/AskReddit/comments/db...,dbxf3n,75900,/r/AskReddit/comments/dbxf3n/if_your_reddit_us...,54965,,AskReddit,t5_2qh1i,If your reddit username would predict the way ...,https://www.reddit.com/r/AskReddit/comments/db...
709,JTDriver13,t2_3e9rx8oe,1569243350,self.AskReddit,https://www.reddit.com/r/AskReddit/comments/d8...,d865nf,63084,/r/AskReddit/comments/d865nf/the_last_text_you...,47496,,AskReddit,t5_2qh1i,The last text you sent is what you scream when...,https://www.reddit.com/r/AskReddit/comments/d8...
2316,Mewse_,t2_69haa,1565732291,self.AskReddit,https://www.reddit.com/r/AskReddit/comments/cp...,cpzvbu,60980,/r/AskReddit/comments/cpzvbu/what_is_your_stro...,52180,,AskReddit,t5_2qh1i,What is your strongest held opinion?,https://www.reddit.com/r/AskReddit/comments/cp...
