### Import required Libraries

In [None]:
import lzma
import bz2
import pickle 
import json
import os
import multiprocessing

### Define function for extracting comment information for a specific month

In [None]:
def extract_comments_information(file_path):
    """Aggregate one compressed comment dump and pickle the result.

    The file is read line by line; every line is parsed as one JSON object.
    Objects without a 'subreddit' key are skipped. For the remaining ones the
    'created_utc' and 'score' values are collected per author, both globally
    and per subreddit.

    Two pickles are written to ``../../Results/`` (relative to the current
    working directory, which must already contain that folder):

    * ``comments_<tag>.pickle``: ``{subreddit: {'comments': {author:
      {'timestamps': [...], 'votes': [...]}}, 'commentators': set(authors)}}``
    * ``commentators_<tag>.pickle``: ``{author: {'timestamps': [...],
      'votes': [...]}}``

    Args:
        file_path: Path of a ``.bz2`` dump; any other extension is opened
            with ``lzma``.

    Returns:
        None. The results are only available through the written pickles.

    Raises:
        KeyError: If a comment with a 'subreddit' key lacks 'author',
            'created_utc' or 'score'.
    """
    # NOTE(review): the tag is a fixed slice of the path, so its content
    # depends on the extension length (e.g. "RC_2016-01.bz2" -> "6-01.",
    # "RC_2018-01.xz" -> "18-01"). It is kept unchanged because other code
    # locates the pickles by this exact name.
    month_tag = file_path[-8:-3]
    print(month_tag)

    subreddits = {}
    commentators = {}

    # Decide by the real extension; a substring test would also match a
    # directory name that happens to contain ".bz2".
    if file_path.endswith('.bz2'):
        _open = bz2.open
    else:
        _open = lzma.open

    # JSON text is UTF-8; do not depend on the locale's default encoding.
    with _open(file_path, 'rt', encoding='utf-8') as comments_data:
        for i, line in enumerate(comments_data):
            comment = json.loads(line)
            if 'subreddit' in comment:
                author = comment['author']
                timestamp = comment['created_utc']
                score = comment['score']

                # Per-subreddit statistics.
                subreddit_name = comment['subreddit']
                subreddit = subreddits.get(subreddit_name)
                if subreddit is None:
                    subreddit = subreddits[subreddit_name] = {
                        'comments': {},
                        'commentators': set()
                    }

                activity = subreddit['comments'].get(author)
                if activity is None:
                    activity = subreddit['comments'][author] = {
                        'timestamps': [],
                        'votes': []
                    }
                activity['timestamps'].append(timestamp)
                activity['votes'].append(score)
                subreddit['commentators'].add(author)

                # Global per-author statistics.
                overall = commentators.get(author)
                if overall is None:
                    overall = commentators[author] = {
                        'timestamps': [],
                        'votes': []
                    }
                overall['timestamps'].append(timestamp)
                overall['votes'].append(score)

            # Progress report every million lines (including line 0).
            if i % 1000000 == 0:
                print(file_path, i)

    # The input is closed at this point; the context managers below make sure
    # the output handles are closed even if pickling fails.

    # save the subreddit dict in a file
    with open("../../Results/comments_" + month_tag + ".pickle", "wb") as pickle_out:
        pickle.dump(subreddits, pickle_out)

    # save the commentators dict in a file
    with open("../../Results/commentators_" + month_tag + ".pickle", "wb") as pickle_out:
        pickle.dump(commentators, pickle_out)

### Identify all files containing information about comments 

In [None]:
# path storing the data regarding the comments
folder = '/home/data/reddit/comments/'

# os.listdir returns entries in arbitrary order; sorting makes the order in
# which the dumps are processed (and later merged) reproducible.
files = sorted(os.listdir(folder))

# Keep only compressed dumps from 2016 onwards. Characters 3-6 of the file
# name are compared as the year (presumably names look like
# "RC_YYYY-MM.<ext>" - verify against the data folder). The extension is
# matched as a real suffix so that stray files such as "*.xz.part" are not
# picked up.
required_files = [
    os.path.join(folder, file)
    for file in files
    if file[3:7] >= "2016" and file.endswith(('.bz2', '.xz'))
]

### Run the calculations parallelised over the number of cores available

In [None]:
# Process the dumps in parallel, one task per file. The pool is capped at the
# number of CPU cores instead of starting one process per file, and nothing
# is started when there are no files (Pool(0) raises ValueError).
if required_files:
    worker_count = min(len(required_files), multiprocessing.cpu_count())
    # Leaving the with-block shuts the pool down, also when a worker fails.
    with multiprocessing.Pool(worker_count) as p:
        # extract_comments_information returns None and writes its results
        # to disk, so the list returned by map is not printed.
        p.map(extract_comments_information, required_files)

In [None]:
# Merge the per-file pickles written by extract_comments_information into one
# dictionary of commentators and one dictionary of subreddits.
subreddits = {}
commentators = {}

for file in required_files:
    # The extraction step stores plain (uncompressed) pickles in
    # ../../Results/ and names them after file_path[-8:-3]; the same tag and
    # location have to be used to find them again.
    month_tag = file[-8:-3]
    with open("../../Results/commentators_" + month_tag + ".pickle", "rb") as pickle_in:
        commentators_temp = pickle.load(pickle_in)
    with open("../../Results/comments_" + month_tag + ".pickle", "rb") as pickle_in:
        subreddits_temp = pickle.load(pickle_in)

    # Append this file's per-author activity to the global per-author lists.
    for commentator, activity in commentators_temp.items():
        merged_activity = commentators.get(commentator)
        if merged_activity is None:
            merged_activity = commentators[commentator] = {
                'timestamps': [],
                'votes': []
            }
        merged_activity['timestamps'].extend(activity['timestamps'])
        merged_activity['votes'].extend(activity['votes'])

    # Same merge, one level deeper: per subreddit, then per author.
    for subreddit, monthly in subreddits_temp.items():
        merged_subreddit = subreddits.get(subreddit)
        if merged_subreddit is None:
            merged_subreddit = subreddits[subreddit] = {
                'comments': {},
                'commentators': set()
            }

        merged_subreddit['commentators'].update(monthly['commentators'])

        merged_comments = merged_subreddit['comments']
        for commentator, activity in monthly['comments'].items():
            merged_activity = merged_comments.get(commentator)
            if merged_activity is None:
                merged_activity = merged_comments[commentator] = {
                    'timestamps': [],
                    'votes': []
                }
            merged_activity['timestamps'].extend(activity['timestamps'])
            merged_activity['votes'].extend(activity['votes'])

    # Progress output after each file. 'politics' may not have appeared yet,
    # so report 0 instead of failing with a KeyError.
    print(len(commentators))
    politics = subreddits.get('politics')
    print(len(politics['commentators']) if politics is not None else 0)

# Store the merged dictionaries next to the per-file results.
with open("../../Results/commentators.pickle", "wb") as pickle_out:
    pickle.dump(commentators, pickle_out)

with open("../../Results/comments.pickle", "wb") as pickle_out:
    pickle.dump(subreddits, pickle_out)