## Preprocess raw reddit data

In [1]:
import os
from preprocess import *
data_dir = "../data/"
category = ['Alt', 'Center', 'Left', 'Right']

### stop words

In [2]:
with open(os.path.join(data_dir, "StopWords")) as f:
    stop_words = set(f.read().splitlines())

### Function definition

In [3]:
### fetch a batch of strings from raw data
import json
import html
def fetchBatch(source, batch_size=None):
    """
    return Array of string
    """
    with open(source) as f:
        data = json.load(f)
    if not batch_size:
        size = len(data)
    else:
        size = batch_size
    output = [html.unescape(json.loads(data[i])['body']) for i in range(size)]
    return output
    
        

In [4]:
### function to extract features and return set for tag, lemma, entity
### return features in a way of dict
### using a spacy pipeline with multi-threading
import time
import pickle
from collections import Counter
def extract(nlp, data, batch_size, n_threads, category, stopwords=[], store=True, report_per_doc=10):
    start = time.time()
    tag = set()
    lemma = set()
    entity = set()
    feats = []
    num = 0
    total = len(data)
    for doc in nlp.pipe(data, batch_size=batch_size, n_threads=n_threads):
        num += 1
        if num % report_per_doc == 0:
            print(f"{num}/{total}")
        t = Counter()
        l = Counter()
        e = Counter()
        length = 0
        for i in doc:
            if not i.is_punct and i.tag_!='' and i.tag_ not in".:," and i.lemma_.isalpha() and i.lemma_ not in stopwords:
                length += 1
                tag.add(i.tag_)
                t.update([i.tag_])
                lemma.add(i.lemma_)
                l.update([i.lemma_])
        entity.update([ent.label_ for ent in doc.ents])
        e.update([ent.label_ for ent in doc.ents])
        f = {}
        f.update(dict(t))
        f.update(dict(l))
        f.update(dict(e))
        f.update({"LENGTH":length})
        feats.append(f)
    print(time.time() - start)
    if store:
        with open(category +"tag.pk", "wb+") as f:
            pickle.dump(tag, f)
        with open(category + "lemma.pk", "wb+") as f:
            pickle.dump(lemma, f)
        with open(category + "entity.pk", "wb+") as f:
            pickle.dump(entity, f)
        with open(category +"feats.pk", "wb+") as f:
            pickle.dump(feats, f)
    return tag, lemma, entity, feats


### Using spacy 'en_core_web_sm' model to tag, lemmatize, and figure out entities

In [8]:
import spacy
nlp = spacy.load('en', disable=['parser'])

In [10]:
for cat in category[]:
    data = fetchBatch(os.path.join(data_dir, cat))
    extract(nlp, data, 1000, 4, cat, stopwords=stop_words, store=True, report_per_doc=10000)

10000/200272
20000/200272
30000/200272
40000/200272
50000/200272
60000/200272
70000/200272
80000/200272
90000/200272
100000/200272
110000/200272
120000/200272
130000/200272
140000/200272
150000/200272
160000/200272
170000/200272
180000/200272
190000/200272
200000/200272
1135.243642091751
