In [5]:
import pickle 
import multiprocessing
import pprint
import gzip
import numpy as np

In [2]:
# Load the dict storing submissions (all votes/comments received per user) for the target subreddits.
# NOTE: pickle can execute arbitrary code while loading; only open files produced by this project.
# The `with` blocks make sure the gzip handles are closed even if unpickling fails.
with gzip.open("../../Results/submissions.pickle.gz","rb") as submissions_file:
    submissions = pickle.load(submissions_file)
# Load the collection of moderators and bots to exclude from the analysis.
with gzip.open("../../Results/ignore.pickle.gz","rb") as ignore_file:
    ignore = pickle.load(ignore_file)

In [6]:
# Percentile levels reported for every distribution in the analysis dict.
_PERCENTILE_LEVELS = (1, 5, 25, 50, 75, 90, 95, 99)


def _distribution_stats(values):
    """Return median, mean/std (rounded to 2 decimals) and a percentile table for ``values``."""
    return {
        'median': np.median(values),
        'mean': round(np.mean(values), 2),
        'std': round(np.std(values), 2),
        'percentile': {level: np.percentile(values, level) for level in _PERCENTILE_LEVELS},
    }


def _totals_stats(values):
    """Return sum, median and mean/std (rounded to 2 decimals) for ``values``."""
    return {
        'sum': sum(values),
        'median': np.median(values),
        'mean': round(np.mean(values), 2),
        'std': round(np.std(values), 2),
    }


def prepare_dict(submissions,submissions_analysis,key,comments,votes,num_authors,sum_comments,sum_votes,num_posts):
    """Build the summary statistics for one subreddit and store them under ``key``.

    Parameters
    ----------
    submissions : dict
        ``submissions[key]['submissions']['[deleted]']`` must provide the
        ``'votes'`` and ``'comments'`` sequences of the deleted posts.
    submissions_analysis : dict
        Output mapping; ``submissions_analysis[key]`` is (over)written.
    key : hashable
        Subreddit identifier used to index both mappings.
    comments, votes : sequence of numbers
        One entry per post (used for the ``per_post`` statistics).
    num_authors : int
        Stored verbatim as ``counts['authors']``.
    sum_comments, sum_votes : sequence of numbers
        One entry per author (used for the ``per_user`` statistics).
    num_posts : sequence of numbers
        Number of posts per author.

    Returns
    -------
    dict
        The same object that was stored in ``submissions_analysis[key]``.
    """
    # Look the deleted-post records up once instead of on every use.
    deleted = submissions[key]['submissions']['[deleted]']
    deleted_votes = deleted['votes']
    deleted_comments = deleted['comments']

    submissions_analysis[key] = {
        # Raw inputs are kept (by reference) so later cells can re-aggregate them.
        'values': {
            'sum_votes': sum_votes,
            'sum_comments': sum_comments,
            'votes': votes,
            'comments': comments,
            'deleted_posts_votes': deleted_votes,
            'deleted_posts_comments': deleted_comments,
        },
        'deleted_posts': {
            'votes': _totals_stats(deleted_votes),
            'comments': _totals_stats(deleted_comments),
        },
        'counts': {
            'authors': num_authors,
            'comments': sum(sum_comments),
            'posts': len(votes),
            'votes': sum(sum_votes),
            'deleted_posts': len(deleted_votes),
        },
        'votes_received': {
            'per_user': _distribution_stats(sum_votes),
            'per_post': _distribution_stats(votes),
        },
        'comments_received': {
            'per_user': _distribution_stats(sum_comments),
            'per_post': _distribution_stats(comments),
        },
        'posts': {
            'per_user': _distribution_stats(num_posts),
        },
    }
    return submissions_analysis[key]

In [7]:
# Aggregate per-author and per-post vote/comment figures for every subreddit,
# summarise them with prepare_dict and persist the result.
submissions_analysis = {}
subreddits = ['politics','SandersForPresident','The_Donald','Conservative','news','NeutralPolitics',
              'democrats','hillaryclinton','Republican','ukpolitics','worldnews','TrueReddit','progressive']
for subreddit in subreddits:
    print(subreddit)
    num_submissioners = 0
    num_posts = []      # posts per author
    sum_votes = []      # total votes received per author
    votes = []          # votes received per post (all kept authors)
    sum_comments = []   # total comments received per author
    comments = []       # comments received per post (all kept authors)

    for submissioner in submissions[subreddit]['submissioners']:
        # Skip accounts listed in `ignore` (loaded above as moderators and bots).
        if submissioner in ignore:
            continue
        author_posts = submissions[subreddit]['submissions'][submissioner]
        num_submissioners += 1
        num_posts.append(len(author_posts['votes']))
        sum_votes.append(sum(author_posts['votes']))
        sum_comments.append(sum(author_posts['comments']))
        votes.extend(author_posts['votes'])
        comments.extend(author_posts['comments'])

    submissions_analysis[subreddit] = prepare_dict(
        submissions, submissions_analysis, subreddit,
        comments, votes, num_submissioners, sum_comments, sum_votes, num_posts)

# `with` guarantees the output file is flushed and closed even if pickling fails.
with open("../../Results/submissions_analysis.pickle","wb") as pickle_out:
    pickle.dump(submissions_analysis, pickle_out)

politics
SandersForPresident
The_Donald
Conservative
news
NeutralPolitics
democrats
hillaryclinton
Republican
ukpolitics
worldnews
TrueReddit
progressive


In [8]:
def percentage_required(values):
    """Return the share of items needed to reach fixed shares of the total.

    ``values`` is scanned in the order given, so callers that want "the top
    contributors" must pass it sorted in descending order. For each target
    share (25, 50, 80, 95 and 99 percent of ``sum(values)``) the function
    reports which percentage of the items, taken from the front, is needed
    for the running sum to reach that share.

    Parameters
    ----------
    values : sequence of numbers
        Per-item contributions (e.g. votes per user or per post).

    Returns
    -------
    dict
        ``{25: p25, 50: p50, 80: p80, 95: p95, 99: p99}`` where each value is
        a percentage of ``len(values)`` rounded to two decimals. For an empty
        input every entry is ``0.0``.
    """
    targets = [25,50,80,95,99]
    len_values = len(values)
    if len_values == 0:
        # Nothing to count; avoid dividing by zero below.
        return {percentage: 0.0 for percentage in targets}

    sum_values = sum(values)
    percentages = {}
    current_sum = 0
    count = 0  # number of leading items consumed so far (absolute position in `values`)
    for percentage in targets:
        threshold = sum_values * (percentage/100)
        # Continue from where the previous target stopped. Only consume more
        # items while the target is not met yet, so no value is counted twice
        # and `count` can never exceed len(values).
        while count < len_values and current_sum < threshold:
            current_sum += values[count]
            count += 1
        percentages[percentage] = round((count / len_values) * 100,2)
    return percentages

In [10]:
# For every subreddit, compute what share of users/posts accounts for
# 25/50/80/95/99 % of the votes and comments received, then persist the result.
percentages = {}

for subreddit in subreddits:

    raw_values = submissions_analysis[subreddit]['values']
    # Largest contributors first: percentage_required scans from the front.
    values_array = {
        '%_of_users_required_for_%_of_votes_received': sorted(raw_values['sum_votes'],reverse=True),
        '%_of_posts_required_for_%_of_votes_received': sorted(raw_values['votes'],reverse=True),
        '%_of_users_required_for_%_of_comments_received': sorted(raw_values['sum_comments'],reverse=True),
        '%_of_posts_required_for_%_of_comments_received': sorted(raw_values['comments'],reverse=True)
    }

    percentages[subreddit] = {}

    for metric, metric_values in values_array.items():
        percentages[subreddit][metric] = percentage_required(metric_values)

# `with` guarantees the output file is flushed and closed even if pickling fails.
with open("../../Results/sub_perc_req.pickle","wb") as pickle_out:
    pickle.dump(percentages,pickle_out)