### Import all required libraries

In [None]:
import json
import os
import multiprocessing
import pickle
import bz2
import lzma
from IPython.display import display, clear_output

### Define target boards

In [None]:
# Subreddits ("boards") whose comments are collected by this notebook.
target_boards = [
    'politics',
    'NeutralPolitics',
    'SandersForPresident',
    'The_Donald',
    'hillaryclinton',
    'Republican',
    'democrats',
    'The_Farage',
    'Le_Pen',
    'altright',
    'progressive',
    'Conservative',
    'ukpolitics',
    'LateStageCapitalism',
    'Libertarian',
]

### Function for extracting the messages of the comments for a specific month

In [None]:
def extract_comments(file_path):
    """Collect the comment bodies of every target board from one monthly dump.

    Reads the compressed file at ``file_path`` line by line, parsing each line
    as one JSON comment object, and keeps the ``body`` of every comment whose
    ``subreddit`` is one of ``target_boards``. The result, a dictionary mapping
    each target board to the list of comment bodies found in that file, is
    pickled to ``../../Results/comments_corpus_<tag>.pickle``.

    Args:
        file_path: path of a file containing all comments posted on Reddit
            for one month. It is opened with bz2 when the path ends in
            ``.bz2`` and with lzma otherwise.

    Returns:
        None; the dictionary is written to disk, not returned.
    """
    # Tag used in the output file name: characters 5-9 of the file's base name.
    # For files located in '/home/data/reddit/comments/' this is the same
    # substring as file_path[32:37], but it no longer depends on the length
    # of the folder path.
    month_tag = os.path.basename(file_path)[5:10]
    print(month_tag)

    # one (initially empty) list of comment bodies per target board
    comments_corpus = {board: [] for board in target_boards}

    # choose the decompressor from the file extension
    _open = bz2.open if file_path.endswith('.bz2') else lzma.open

    # iterate over all comments posted on Reddit in the selected month
    with _open(file_path, 'rt') as comments_data:
        for i, line in enumerate(comments_data):
            comment = json.loads(line)

            # Comments without a 'subreddit' field are skipped. Membership is
            # tested against the dictionary (same keys as target_boards) so
            # the check does not scan the whole list for every comment.
            board = comment.get('subreddit')
            if board in comments_corpus:
                comments_corpus[board].append(comment['body'])

            # display progress
            if i % 1000000 == 0:
                clear_output(wait=True)
                print(file_path, i)

    # Serialize the result for this month. The input file is already closed
    # here, and the context manager closes the output even if dump() fails.
    out_path = "../../Results/comments_corpus_" + month_tag + ".pickle"
    with open(out_path, "wb") as pickle_out:
        pickle.dump(comments_corpus, pickle_out)

### Collect the paths of all files containing the comments posted on Reddit. Each file contains the data for a single month

In [None]:
# path storing the data regarding the comments
folder = '/home/data/reddit/comments/'
files = os.listdir(folder)

# Keep only the bz2/xz-compressed files from 2016 onwards.
# Characters 3-6 of the file name are compared as the year, so they must be
# digits: a plain string comparison would also accept any non-numeric name
# that happens to sort after "2016". The extension is matched as a suffix so
# that names merely containing 'bz2' or 'xz' (e.g. '*.bz2.md5') are ignored.
required_files = [
    folder + file
    for file in files
    if file[3:7].isdigit()
    and file[3:7] >= "2016"
    and file.endswith(('.bz2', '.xz'))
]

### Run the extraction in parallel across worker processes, one task per monthly file

In [None]:
# run the calculation: one extract_comments task per selected file
print(len(required_files))
if required_files:
    # Pool(0) raises ValueError, so workers are only started when there is
    # something to process. The pool is capped at the number of CPUs; any
    # remaining files are queued and picked up as workers become free.
    n_workers = min(len(required_files), multiprocessing.cpu_count())
    # the context manager shuts the pool down once map() has returned
    with multiprocessing.Pool(n_workers) as p:
        print(p.map(extract_comments, required_files))

In [None]:
# Merge the per-month pickles into a single dictionary mapping each
# subreddit to the list of all its comment bodies, in sorted file order.
comments_corpus = {}

for file in sorted(required_files):
    # The per-month pickles are named after characters 5-9 of the input
    # file's base name (the same substring as file[32:37] for files in
    # '/home/data/reddit/comments/'), not after the full input path.
    month_tag = os.path.basename(file)[5:10]
    month_path = "../../Results/comments_corpus_" + month_tag + ".pickle"
    with open(month_path, "rb") as pickle_in:
        comments_corpus_temp = pickle.load(pickle_in)

    for subreddit, comments in comments_corpus_temp.items():
        comments_corpus.setdefault(subreddit, []).extend(comments)
        # running total of comments collected so far for this subreddit
        print(subreddit, len(comments_corpus[subreddit]))

# serialize the merged corpus; the file is closed even if dump() fails
with open("../../Results/comments_corpus.pickle", "wb") as pickle_out:
    pickle.dump(comments_corpus, pickle_out)