## Preprocess Newsgroup Posts

Data for this example comes from the [20 Newsgroups Dataset](http://qwone.com/~jason/20Newsgroups/). This consists of approximately 18,800 newsgroup posts from 20 different newsgroups.

The data is provided as a tar.gz file. Unzip and untar the file into a folder `data/20-newsgroups`. This will create a `20news-bydate-test` and `20news-bydate-train` folder below this directory. Under each of these two folders, additional folders corresponding to each of the 20 newsgroups contain the posting files, one file per post.

This notebook will parse out the text from each post, regardless of folder, and write the text body for each post into a single line in the `texts.tsv` output file. We also capture the labels (name of newsgroup) into a `labels.tsv` file.

In [1]:
import os
import spacy
import string

from spacy.lang.en.stop_words import STOP_WORDS

In [2]:
def remove_headers(text_lines):
    """ remove email headers -- email headers are separated from body
        by a single empty line

        text_lines is the list of lines of one post, each still carrying
        its trailing newline (as returned by readlines()). Returns the
        lines that follow the first empty line. If there is no empty line
        (including an empty input) the post has no body, so an empty list
        is returned instead of failing on an unassigned start index. """
    for i, line in enumerate(text_lines):
        # a line holding only the newline character marks the end of headers
        if line == "\n":
            return text_lines[i + 1:]
    # no separator found: everything is header, nothing is body
    return []


def preprocess_line(line):
    """ line level preprocessing -- trim surrounding whitespace, turn tabs
        into spaces, then delete every ASCII punctuation character """
    cleaned = line.strip().replace("\t", " ")
    # mapping a code point to None tells str.translate to drop that character
    drop_punctuation = dict.fromkeys(map(ord, string.punctuation))
    return cleaned.translate(drop_punctuation)


def preprocess_tokens(tokens, stopwords):
    """ token level preprocessing -- for each token, in this order:
        strip one pair of enclosing single quotes, then one pair of
        enclosing double quotes; drop the token if it is all digits or
        has 2 characters or fewer; lowercase it; drop it if it is in
        stopwords. Returns the surviving tokens as a list. """

    def unquote(token, quote):
        # peel one layer of the given quote character off both ends
        if token.startswith(quote) and token.endswith(quote):
            return token[1:-1]
        return token

    kept = []
    for raw in tokens:
        word = unquote(unquote(raw, "'"), "\"")
        # numeric and short-word checks run on the unquoted, original-case token
        if word.isdigit() or len(word) <= 2:
            continue
        word = word.lower()
        if word in stopwords:
            continue
        kept.append(word)
    return kept


def read_file_contents(file_path):
    """ read one post from file_path, drop its headers, clean every body
        line and return all surviving tokens joined by single spaces """
    # undecodable bytes are replaced rather than aborting the read
    with open(file_path, "r", errors="replace") as handle:
        raw_lines = handle.readlines()
    body_lines = remove_headers(raw_lines)
    words = [
        word
        for body_line in body_lines
        # split on single spaces only, so consecutive spaces yield empty
        # tokens that the token-level length filter then discards
        for word in preprocess_tokens(
            preprocess_line(body_line).split(" "), STOP_WORDS)
    ]
    return " ".join(words)

In [3]:
# Driver cell: walk <input_dir>/<split>/<newsgroup>/<post file>, preprocess
# every post and write one line per non-empty post to texts.tsv (id, text)
# and labels.tsv (id, newsgroup name) under DATA_DIR.
DATA_DIR = "../data"

input_dir = os.path.join(DATA_DIR, "20-newsgroups")
output_dir = os.path.join(DATA_DIR, "docs")

# os.listdir returns entries in arbitrary order; sorting keeps the numeric
# ids embedded in doc_id (and the row order of the output) reproducible
# from run to run and machine to machine.
subdirs = sorted(os.listdir(input_dir))
subdir_lookup = {x: i for i, x in enumerate(subdirs)}
print(subdir_lookup)
# label ids are derived from the first split folder only
# NOTE(review): assumes every split contains the same newsgroup folders;
# a label missing from the first split would raise KeyError below -- confirm
subdir = subdirs[0]
label_lookup = {
    x: i
    for i, x in enumerate(sorted(os.listdir(os.path.join(input_dir, subdir))))
}
print(label_lookup)

# kept for parity with the original cell; not referenced in this cell
stopwords = spacy.lang.en.stop_words.STOP_WORDS
num_written = 0
# the with-statement closes (and flushes) both outputs even if a post
# fails to parse part-way through the run
with open(os.path.join(DATA_DIR, "texts.tsv"), "w") as ftexts, \
        open(os.path.join(DATA_DIR, "labels.tsv"), "w") as flabels:
    for subdir in subdirs:
        for label in sorted(os.listdir(os.path.join(input_dir, subdir))):
            label_dir = os.path.join(input_dir, subdir, label)
            for filename in sorted(os.listdir(label_dir)):
                if num_written % 1000 == 0:
                    print("{:d} files processed".format(num_written))
                text = read_file_contents(os.path.join(label_dir, filename))
                # posts with nothing left after preprocessing are skipped
                if len(text.strip()) == 0:
                    continue
                # doc_id = <split index>-<label index>-<original file name>
                doc_id = "{:d}-{:d}-{:s}".format(
                    subdir_lookup[subdir], label_lookup[label], filename)
                ftexts.write("\t".join([doc_id, text]) + "\n")
                flabels.write("\t".join([doc_id, label]) + "\n")
                num_written += 1

print("{:d} files processed, COMPLETE".format(num_written))

{'20news-bydate-test': 0, '20news-bydate-train': 1}
{'alt.atheism': 0, 'comp.graphics': 1, 'comp.os.ms-windows.misc': 2, 'comp.sys.ibm.pc.hardware': 3, 'comp.sys.mac.hardware': 4, 'comp.windows.x': 5, 'misc.forsale': 6, 'rec.autos': 7, 'rec.motorcycles': 8, 'rec.sport.baseball': 9, 'rec.sport.hockey': 10, 'sci.crypt': 11, 'sci.electronics': 12, 'sci.med': 13, 'sci.space': 14, 'soc.religion.christian': 15, 'talk.politics.guns': 16, 'talk.politics.mideast': 17, 'talk.politics.misc': 18, 'talk.religion.misc': 19}
0 files processed
1000 files processed
2000 files processed
3000 files processed
4000 files processed
5000 files processed
6000 files processed
7000 files processed
8000 files processed
9000 files processed
10000 files processed
11000 files processed
12000 files processed
13000 files processed
14000 files processed
15000 files processed
16000 files processed
17000 files processed
18000 files processed
18810 files processed, COMPLETE
