In [1]:
import pickle 
import multiprocessing
import pprint
import gzip
import numpy as np

In [2]:
# Load the per-subreddit comment data. Later cells read it as
# comments[subreddit]['commentators'] and
# comments[subreddit]['comments'][user]['votes'].
# NOTE(review): pickle.load executes arbitrary code from the file; only load
# pickles produced by this project's own pipeline.
with gzip.open("../../Results/comments.pickle.gz","rb") as comments_file:
    comments = pickle.load(comments_file)
# Load the collection of accounts to exclude from the statistics
# (moderators and bots, per the original author's note).
with gzip.open("../../Results/ignore.pickle.gz","rb") as ignore_file:
    ignore = pickle.load(ignore_file)

In [3]:
def _percentile_table(values):
    """Return the 1/5/25/50/75/90/95/99th percentiles of ``values``, keyed by rank."""
    return {rank: np.percentile(values, rank) for rank in (1, 5, 25, 50, 75, 90, 95, 99)}


def _distribution(values):
    """Return median, mean and std (both rounded to 2 places) plus the percentile table."""
    return {
        'median': np.median(values),
        'mean': round(np.mean(values), 2),
        'std': round(np.std(values), 2),
        'percentile': _percentile_table(values),
    }


def prepare_dict(comments,comments_analysis,key,votes,num_users,sum_votes,num_comments):
    """Build the summary statistics for one subreddit and store them in ``comments_analysis``.

    Parameters
    ----------
    comments : dict
        Raw data; ``comments[key]['comments']['[deleted]']['votes']`` must exist
        (a ``KeyError`` is raised otherwise).
    comments_analysis : dict
        Output mapping; the entry ``comments_analysis[key]`` is (over)written.
    key : hashable
        Subreddit name used to index both dictionaries.
    votes : sequence of numbers
        Votes of every individual comment (one entry per comment).
    num_users : int
        Number of users that contributed to ``votes``.
    sum_votes : sequence of numbers
        Total votes received per user (one entry per user).
    num_comments : sequence of numbers
        Number of comments per user (one entry per user).

    Returns
    -------
    dict
        The same object that was stored under ``comments_analysis[key]``.
    """
    deleted_votes = comments[key]['comments']['[deleted]']['votes']

    comments_analysis[key] = {
        # Raw inputs are kept so later steps can re-sort and re-aggregate them.
        'values': {
            'sum_votes': sum_votes,
            'votes': votes,
            'deleted_comments_votes': deleted_votes,
        },
        'deleted_comments': {
            'votes': {
                'sum': sum(deleted_votes),
                'median': np.median(deleted_votes),
                'mean': round(np.mean(deleted_votes), 2),
                'std': round(np.std(deleted_votes), 2),
            },
        },
        'counts': {
            'users': num_users,
            'comments': len(votes),
            'deleted_comments': len(deleted_votes),
            'votes': sum(sum_votes),
        },
        # Every percentile table is produced by one helper so the key and the
        # requested rank can no longer drift apart (previously key 5 held the
        # 1st percentile, and keys 5/95 were missing from the comments table).
        'votes_received': {
            'per_user': _distribution(sum_votes),
            'per_comment': _distribution(votes),
        },
        'comments': {
            'per_user': _distribution(num_comments),
        },
    }
    return comments_analysis[key]

In [5]:
# Subreddits covered by the analysis, in processing order.
subreddits = [
    'politics',
    'SandersForPresident',
    'The_Donald',
    'Conservative',
    'news',
    'NeutralPolitics',
    'democrats',
    'hillaryclinton',
    'Republican',
    'ukpolitics',
    'worldnews',
    'TrueReddit',
    'progressive',
]

In [6]:
# Aggregate, per subreddit, the votes and comment counts of every commentator
# that is not in the ignore collection, then summarise them with prepare_dict.
comments_analysis = {}

for subreddit in subreddits:

    num_commentators = 0
    num_comments = []   # number of comments per user
    sum_votes = []      # total votes received per user
    votes = []          # votes of every single comment

    for commentator in comments[subreddit]['commentators']:
        if commentator in ignore:
            continue
        # Look the user's vote list up once instead of three times.
        user_votes = comments[subreddit]['comments'][commentator]['votes']
        num_commentators += 1
        num_comments.append(len(user_votes))
        sum_votes.append(sum(user_votes))
        votes.extend(user_votes)

    comments_analysis[subreddit] = prepare_dict(
        comments, comments_analysis, subreddit,
        votes, num_commentators, sum_votes, num_comments,
    )

# The context manager closes the file even if pickling fails part-way.
with open("../../Results/comments_analysis.pickle","wb") as pickle_out:
    pickle.dump(comments_analysis, pickle_out)

In [7]:
def percentage_required(values):
    """Return what share of the leading items is needed to reach each share of the total.

    ``values`` is walked front to back in a single pass (callers pass it sorted in
    descending order). For each target in 25, 50, 80, 95 and 99 percent, the
    result holds the percentage of items (rounded to 2 places) that had to be
    consumed until their running sum reached that fraction of ``sum(values)``.

    The running position carries over between targets, so each item is counted
    exactly once. The previous implementation restarted from the beginning of
    the list for every target and could report more than 100 %.

    Parameters
    ----------
    values : sequence of numbers
        Indexable sequence of contributions.

    Returns
    -------
    dict
        ``{25: pct, 50: pct, 80: pct, 95: pct, 99: pct}``; every entry is ``0.0``
        when ``values`` is empty.
    """
    targets = [25,50,80,95,99]
    len_values = len(values)
    if len_values == 0:
        # Nothing to distribute; avoid dividing by zero below.
        return {percentage: 0.0 for percentage in targets}

    sum_values = sum(values)
    current_sum,count = 0,0
    percentages = {}
    for percentage in targets:
        threshold = sum_values * (percentage/100)
        # Always consume at least one item, then keep going until the running
        # sum reaches the threshold or the items are exhausted.
        while count < len_values and (count == 0 or current_sum < threshold):
            current_sum += values[count]
            count += 1
        percentages[percentage] = round((count / len_values) * 100,2)
    return percentages

In [9]:
# For every subreddit, compute which share of users / comments accounts for
# 25/50/80/95/99 % of the votes received, then persist the result.
percentages = {}

for subreddit in subreddits:

    # Both series are sorted largest-first so the biggest contributors are
    # consumed first by percentage_required.
    values_array = {
        '%_of_users_required_for_%_of_votes_received': sorted(comments_analysis[subreddit]['values']['sum_votes'],reverse=True),
        '%_of_comments_required_for_%_of_votes_received': sorted(comments_analysis[subreddit]['values']['votes'],reverse=True),
    }

    percentages[subreddit] = {
        metric: percentage_required(metric_values)
        for metric, metric_values in values_array.items()
    }

# The context manager closes the file even if pickling fails part-way.
with open("../../Results/com_perc_req.pickle","wb") as pickle_out:
    pickle.dump(percentages,pickle_out)