In [1]:
# Import useful mathematical libraries
import numpy as np
import pandas as pd
import glob

# Import useful Machine learning libraries
import gensim

# Import utility files
from utils import read_df, remove_links, clean_sentence, save_object, load_object

In [2]:
# set the model name
model_name = "example"

In [3]:
# set the folder for saved objects
import os
directories = ['objects', 'objects/subreddit_post_analysis']
for dirname in directories:
    if not os.path.exists(dirname):
        os.makedirs(dirname)

In [4]:
# Get the data from the csv, assumed to be in a directory 'data'
#indexed by name of the author MAKE SURE author is in column index 2 (position 3)
# This version skips over deleted authors to speed up analysis
dirname = 'data'
extension = "/*.csv"

df = pd.DataFrame()
df_list =[]
fnames = glob.glob(dirname + extension)
for fname in fnames:
    df_chunks = pd.read_csv(fname, header=0, index_col = 2, iterator=True, chunksize=1000)
    df = pd.concat([chunk[chunk.index != '[deleted]'] for chunk in df_chunks])
    df_list.append(df)
df = pd.concat(df_list)

In [5]:
# save the data frame of posts
save_object(df, 'objects/', model_name + "-subreddit_user_analysis_Posts_dataframe")

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 292 entries, dediobst to zach84
Data columns (total 14 columns):
title           292 non-null object
created_utc     292 non-null int64
ups             292 non-null int64
downs           292 non-null int64
num_comments    292 non-null int64
name            292 non-null object
id              292 non-null object
from            0 non-null float64
from_id         0 non-null float64
selftext        120 non-null object
subreddit       292 non-null object
score           292 non-null int64
url             292 non-null object
permalink       292 non-null object
dtypes: float64(2), int64(5), object(7)
memory usage: 34.2+ KB


In [7]:
df.head()

Unnamed: 0_level_0,title,created_utc,ups,downs,num_comments,name,id,from,from_id,selftext,subreddit,score,url,permalink
author,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
dediobst,Side effects after break from medication?,1325521921,2,0,7,t3_nzvpw,nzvpw,,,"When I first began taking Vyvanse (40 mg), I w...",ADHD,2,http://www.reddit.com/r/ADHD/comments/nzvpw/si...,/r/ADHD/comments/nzvpw/side_effects_after_brea...
choraule,"Notes from ADHD in Women: A Hidden Disorder, A...",1325486071,24,2,12,t3_nzi53,nzi53,,,"Dr. Quinn, and expert and speaker on ADHD, cam...",ADHD,22,http://www.reddit.com/r/ADHD/comments/nzi53/no...,/r/ADHD/comments/nzi53/notes_from_adhd_in_wome...
hajojebi,Resources?,1325479085,3,0,1,t3_nzdy6,nzdy6,,,"I have ADHD. I however, do not have a full tim...",ADHD,3,http://www.reddit.com/r/ADHD/comments/nzdy6/re...,/r/ADHD/comments/nzdy6/resources/
LiliBlume,ADHD drugs do not increase risk of heart disea...,1325584742,13,1,3,t3_o0y6c,o0y6c,,,,ADHD,12,http://www.washingtonpost.com/national/health-...,/r/ADHD/comments/o0y6c/adhd_drugs_do_not_incre...
stupidestgirl,What to do when medication seems ineffective?,1325560701,4,1,8,t3_o0m51,o0m51,,,"Hi, so I'm currently taking Adderall XR 30mg w...",ADHD,3,http://www.reddit.com/r/ADHD/comments/o0m51/wh...,/r/ADHD/comments/o0m51/what_to_do_when_medicat...


In [8]:
# create a list of all authors
authors = []
author_frequency = []
for author in df.index:
    if not (author in authors):
        authors.append(author)
        author_frequency.append(1)
    else:
        author_frequency[authors.index(author)] += 1

In [10]:
# print the most frequent author
authors[author_frequency.index(max(author_frequency))]

'[deleted]'

In [11]:
# move deleted authors into separate variables for faster run DONE with loading DFs
#authors_deleted = authors.index('[deleted]')
#authors_frequency_deleted = author_frequency[authors.index('[deleted]')]
#del author_frequency[authors.index('[deleted]')]
#del authors[authors.index('[deleted]')]

In [15]:
# number of posts before deletion
print('Before Deletion: ' + str(len(df)))
# number of posts after deletion
print('After Deletion: ' + str(len(df)-len(df.loc[['[deleted]']])))

Before Deletion: 551900
After Deletion: 292


In [9]:
len(authors)

84

In [12]:
#for i in range(len(df.loc[[authors[2]]])):
#    print(df.loc[[authors[2]]].iloc[[i][0]].subreddit)

In [10]:
# create a list of author subreddit counts
author_subreddit_counts = []
total_subreddit_count = []
# iterate through the authors
for author in authors:
    subreddits = []
    subreddit_count = []
    sub = []
    # find posts from that author in dataframe
    for i in range(len(df.loc[[author]])):
        # if this is the author's first post in the subreddit, add the subreddit name to subreddits list, and add one to the occcurces
        if not (df.loc[[author]].iloc[[i][0]].subreddit in subreddits):
            subreddits.append(df.loc[[author]].iloc[[i][0]].subreddit)
            subreddit_count.append(1)
        # else, add one to the subreddit's occurences at the subreddits index within subreddit count
        else:
            subreddit_count[subreddits.index(df.loc[[author]].iloc[[i][0]].subreddit)] += 1
    # after going through all the data, create a list of lists, which contain a subreddit and its occurence
    for i in range(len(subreddits)):
        sub.append([subreddits[i],subreddit_count[i]])
    # append this list to the author_subreddits list
    author_subreddit_counts.append(sub)

In [11]:
# save array of author counts
save_object(author_subreddit_counts, 'objects/', model_name + "-subreddit_analysis_author_subreddit_counts")

In [12]:
# create lists for subreddits, subreddit totals
subreddits = []
subreddit_post_totals = []
# iterate through the list of lists of lists to find all the occurences of a subreddit
for i in range(len(author_subreddit_counts)):
    # if a new subreddit is found, append it to all_subreddits, and add its occurences to the correct position in total_posts
    # if it has already been found, add its occurences from that user tothe correct position in total_posts
    for j in range(len(author_subreddit_counts[i])):
        if not(author_subreddit_counts[i][j][0] in subreddits):
            subreddits.append(author_subreddit_counts[i][j][0])
            subreddit_post_totals.append(author_subreddit_counts[i][j][1])
        else:
            subreddit_post_totals[subreddits.index(author_subreddit_counts[i][j][0])] += author_subreddit_counts[i][j][1]
            
#sort the subreddits and their post totals
subreddit_post_totals, subreddits = (list(t) for t in zip(*sorted(zip(subreddit_post_totals, subreddits))))

In [13]:
# print the percentages of posts in each subreddit present
sum_posts = 0
for posts in subreddit_post_totals:
    sum_posts += posts
    
for subreddit in subreddits:
    print(str(subreddit), end=": ")
    print(subreddit_post_totals[subreddits.index(subreddit)]*100/sum_posts)

ADHD: 34.93150684931507
Agronomy: 0.3424657534246575
Anthropology: 0.684931506849315
Archaeology: 1.7123287671232876
AskReddit: 2.0547945205479454
Atlanta: 0.3424657534246575
Bayes: 1.0273972602739727
BipolarReddit: 0.3424657534246575
BodyAcceptance: 3.767123287671233
Christianity: 0.3424657534246575
Coffee: 0.3424657534246575
Communications: 1.0273972602739727
CrappyDesign: 0.3424657534246575
DesktopDetective: 0.3424657534246575
DoesAnybodyElse: 0.3424657534246575
Drugs: 0.684931506849315
EarthPorn: 0.3424657534246575
Fitness: 0.3424657534246575
Games: 0.3424657534246575
GradSchool: 1.7123287671232876
IDAP: 0.3424657534246575
LadiesofScience: 0.684931506849315
LadyBoners: 0.3424657534246575
LongDistance: 0.3424657534246575
Medicaid: 0.3424657534246575
Military: 0.3424657534246575
Minecraft: 1.0273972602739727
Music: 0.3424657534246575
Names: 1.36986301369863
Naruto: 0.3424657534246575
Ohio: 0.3424657534246575
Paleontology: 0.3424657534246575
RedditDayOf: 0.3424657534246575
SFGiants: 0

In [14]:
# find percentage of authors who post in each subreddit
# create list that holds the number of authors that post in each subreddit, ordered by subreddit
num_authors_in_subreddits = []
for subreddit in subreddits:
    num_authors_in_subreddits.append(0)
# update the list with occurences
for i in range(len(author_subreddit_counts)):
    for j in range(len(author_subreddit_counts[i])):
        num_authors_in_subreddits[subreddits.index(author_subreddit_counts[i][j][0])] += 1

In [15]:
# print percentages of users in each subreddit present
for subreddit in subreddits:
    print(str(subreddit), end=": ")
    print(num_authors_in_subreddits[subreddits.index(subreddit)]/len(authors))

ADHD: 1.0
Agronomy: 0.011904761904761904
Anthropology: 0.011904761904761904
Archaeology: 0.011904761904761904
AskReddit: 0.05952380952380952
Atlanta: 0.011904761904761904
Bayes: 0.011904761904761904
BipolarReddit: 0.011904761904761904
BodyAcceptance: 0.011904761904761904
Christianity: 0.011904761904761904
Coffee: 0.011904761904761904
Communications: 0.011904761904761904
CrappyDesign: 0.011904761904761904
DesktopDetective: 0.011904761904761904
DoesAnybodyElse: 0.011904761904761904
Drugs: 0.011904761904761904
EarthPorn: 0.011904761904761904
Fitness: 0.011904761904761904
Games: 0.011904761904761904
GradSchool: 0.011904761904761904
IDAP: 0.011904761904761904
LadiesofScience: 0.011904761904761904
LadyBoners: 0.011904761904761904
LongDistance: 0.011904761904761904
Medicaid: 0.011904761904761904
Military: 0.011904761904761904
Minecraft: 0.011904761904761904
Music: 0.011904761904761904
Names: 0.011904761904761904
Naruto: 0.011904761904761904
Ohio: 0.011904761904761904
Paleontology: 0.011904761