### Import required Libraries

In [None]:
import lzma
import bz2
import pickle 
import json
import os
import multiprocessing

### Define function for extracting submission information for a specific month

In [None]:
def _empty_submission_stats():
    """Return a fresh record holding the three per-author statistic lists."""
    return {
        'timestamps': [],
        'votes': [],
        'comments': []
    }


def _record_submission(stats, submission):
    """Append one submission's creation time, score and comment count to *stats*."""
    stats['timestamps'].append(submission['created_utc'])
    stats['votes'].append(submission['score'])
    stats['comments'].append(submission['num_comments'])


def extract_submissions_information(file_path):
    """Aggregate submission statistics from one compressed dump and pickle them.

    The file is read line by line; every line is parsed as one JSON object.
    Objects without a 'subreddit' key are skipped. For the remaining ones the
    'created_utc', 'score' and 'num_comments' values are collected twice:
    per subreddit and author, and per author across all subreddits.

    Two pickle files are written, relative to the current working directory:
    "../../Results/submissions_<label>.pickle" (the per-subreddit dict) and
    "../../Results/submissioners_<label>.pickle" (the per-author dict), where
    <label> is ``file_path[-8:-3]``.

    NOTE: the label slice is positional. For a name ending in ".xz" it yields
    the five characters before the extension (e.g. "16-01" for
    "RS_2016-01.xz"); for a name ending in ".bz2" it is shifted by one
    character (e.g. "6-01." for "RS_2016-01.bz2"). The naming is left
    unchanged so that existing result files keep their names.

    Args:
        file_path: Path of the dump. Opened with bz2 if the path contains
            '.bz2', otherwise with lzma.

    Returns:
        None. The results are only written to disk.

    Raises:
        KeyError: If a submission that has a 'subreddit' key lacks 'author',
            'created_utc', 'score' or 'num_comments'.
    """
    label = file_path[-8:-3]
    print(label)

    subreddits = {}
    submissioners = {}

    _open = bz2.open if '.bz2' in file_path else lzma.open

    with _open(file_path, 'rt') as submissions_data:
        for i, line in enumerate(submissions_data):
            submission = json.loads(line)
            if 'subreddit' in submission:
                subreddit = submission['subreddit']
                if subreddit not in subreddits:
                    subreddits[subreddit] = {
                        'submissions': {},
                        'submissioners': set()
                    }
                subreddit_entry = subreddits[subreddit]

                author = submission['author']
                if author not in subreddit_entry['submissions']:
                    subreddit_entry['submissions'][author] = _empty_submission_stats()
                _record_submission(subreddit_entry['submissions'][author], submission)
                subreddit_entry['submissioners'].add(author)

                if author not in submissioners:
                    submissioners[author] = _empty_submission_stats()
                _record_submission(submissioners[author], submission)

            # progress output every million lines (including line 0)
            if i % 1000000 == 0:
                print(file_path, i)

    # The input is closed at this point; the context managers below guarantee
    # that the output handles are closed even if pickling fails.

    # save the subreddit dict in a file
    with open("../../Results/submissions_" + label + ".pickle", "wb") as pickle_out:
        pickle.dump(subreddits, pickle_out)

    # save the submissioner (author) dict in a file
    with open("../../Results/submissioners_" + label + ".pickle", "wb") as pickle_out:
        pickle.dump(submissioners, pickle_out)

### Identify all files containing information about submissions 

In [None]:
# Directory that holds the submission dumps.
folder = '/home/data/reddit/submissions/'
files = os.listdir(folder)

# Keep every entry whose characters 3-6 compare (as strings) at or after
# "2016" and whose name contains 'bz2' or 'xz'; store it as a full path.
# Presumably the names look like "RS_YYYY-MM.<ext>" - verify against the folder.
required_files = [
    folder + name
    for name in files
    if name[3:7] >= "2016" and ('bz2' in name or 'xz' in name)
]

### Run the calculations parallelised over the number of cores available

In [None]:
# Run the extraction for all selected files in parallel.
# Use one worker per file, capped at the number of CPU cores. At least one
# process is requested because multiprocessing.Pool rejects a size of 0
# (which would happen if no file was selected).
n_workers = max(1, min(len(required_files), multiprocessing.cpu_count()))
with multiprocessing.Pool(n_workers) as p:
    # extract_submissions_information writes its results to disk and returns
    # None, so this prints a list with one None per processed file.
    # map() blocks until every file is done, so leaving the with-block
    # afterwards does not cut any work short.
    print(p.map(extract_submissions_information, required_files))

In [None]:
# Merge the per-file results into one dict per kind.
subreddits = {}
submissioners = {}

# Names of the statistic lists stored for every author.
stat_keys = ('timestamps', 'votes', 'comments')

for file in required_files:
    # The per-file results are written by extract_submissions_information as
    # "../../Results/<kind>_<file_path[-8:-3]>.pickle"; rebuild exactly that
    # name here (entries of required_files are full input paths).
    label = file[-8:-3]
    with open("../../Results/submissioners_" + label + ".pickle", "rb") as pickle_in:
        submissioners_temp = pickle.load(pickle_in)
    with open("../../Results/submissions_" + label + ".pickle", "rb") as pickle_in:
        subreddits_temp = pickle.load(pickle_in)

    # per-author statistics across all subreddits
    for submissioner, stats in submissioners_temp.items():
        if submissioner not in submissioners:
            submissioners[submissioner] = {key: [] for key in stat_keys}
        merged = submissioners[submissioner]
        for key in stat_keys:
            merged[key].extend(stats[key])

    # per-subreddit statistics
    for subreddit, subreddit_temp in subreddits_temp.items():
        if subreddit not in subreddits:
            subreddits[subreddit] = {
                'submissions': {},
                'submissioners': set()
            }
        subreddit_entry = subreddits[subreddit]

        subreddit_entry['submissioners'].update(subreddit_temp['submissioners'])

        for submissioner, stats in subreddit_temp['submissions'].items():
            if submissioner not in subreddit_entry['submissions']:
                subreddit_entry['submissions'][submissioner] = {key: [] for key in stat_keys}
            merged = subreddit_entry['submissions'][submissioner]
            for key in stat_keys:
                merged[key].extend(stats[key])

    # progress output after each file; .get() keeps this from failing when
    # 'politics' has not appeared in any file merged so far
    print(len(submissioners))
    print(len(subreddits.get('politics', {}).get('submissioners', ())))


# save the merged author dict
with open("../../Results/submissioners.pickle", "wb") as pickle_out:
    pickle.dump(submissioners, pickle_out)

# save the merged subreddit dict
with open("../../Results/submissions.pickle", "wb") as pickle_out:
    pickle.dump(subreddits, pickle_out)