In [1]:
import pickle

import numpy as np

from lda2vec import preprocess, Corpus
from sklearn.datasets import fetch_20newsgroups

In [2]:
# Fetch data
remove = ('headers', 'footers', 'quotes')
texts = fetch_20newsgroups(subset = 'train', remove = remove).data

In [3]:
# Remove tokens with these substrings
bad = set(["ax>", '`@("', '---', '===', '^^^'])

def clean(line):
    return ' '.join(w for w in line.split() if not any(t in w for t in bad))  

In [4]:
# Preprocess data
max_length = 10000
texts = [str(clean(d)) for d in texts if len(str(clean(d))) > 0]

In [5]:
texts[0]

'I was wondering if anyone out there could enlighten me on this car I saw the other day. It was a 2-door sports car, looked to be from the late 60s/ early 70s. It was called a Bricklin. The doors were really small. In addition, the front bumper was separate from the rest of the body. This is all I know. If anyone can tellme a model name, engine specs, years of production, where this car is made, history, or whatever info you have on this funky looking car, please e-mail.'

In [6]:
tokens, vocab = preprocess.tokenize(texts, max_length, merge = False,
                                    n_threads = 4)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [8]:
vocab

{0: 'i',
 1: 'was',
 2: 'wondering',
 3: 'if',
 4: 'anyone',
 5: 'out',
 6: 'there',
 7: 'could',
 8: 'enlighten',
 9: 'me',
 10: 'on',
 11: 'this',
 12: 'car',
 13: 'saw',
 14: 'the',
 15: 'other',
 16: 'day',
 17: '.',
 18: 'it',
 19: 'a',
 20: '2-door',
 21: 'sports',
 22: ',',
 23: 'looked',
 24: 'to',
 25: 'be',
 26: 'from',
 27: 'late',
 28: '60s/',
 29: 'early',
 30: '70s',
 31: 'called',
 32: 'bricklin',
 33: 'doors',
 34: 'were',
 35: 'really',
 36: 'small',
 37: 'in',
 38: 'addition',
 39: 'front',
 40: 'bumper',
 41: 'separate',
 42: 'rest',
 43: 'of',
 44: 'body',
 45: 'is',
 46: 'all',
 47: 'know',
 48: 'can',
 49: 'tellme',
 50: 'model',
 51: 'name',
 52: 'engine',
 53: 'specs',
 54: 'years',
 55: 'production',
 56: 'where',
 57: 'made',
 58: 'history',
 59: 'or',
 60: 'whatever',
 61: 'info',
 62: 'you',
 63: 'have',
 64: 'funky',
 65: 'looking',
 66: 'please',
 67: 'e',
 68: '-',
 69: 'mail',
 70: 'fair',
 71: 'number',
 72: 'brave',
 73: 'souls',
 74: 'who',
 75: 'upgr

In [9]:
corpus = Corpus()

# Make a ranked list of rare vs frequent words
corpus.update_word_count(tokens)
corpus.finalize()

# The tokenization uses spaCy indices, and so may have gaps
# between indices for words that aren't present in our dataset.
# This builds a new compact index
compact = corpus.to_compact(tokens)

# Remove extremely rare words
pruned = corpus.filter_count(compact, min_count = 30)

# Convert the compactified arrays into bag of words arrays
bow = corpus.compact_to_bow(pruned)

# Words tend to have power law frequency, so selectively
# downsample the most prevalent words
clean = corpus.subsample_frequent(pruned)

# Now flatten a 2D array of document per row and word position
# per column to a 1D array of words. This will also remove skips
# and OoV words
doc_ids = np.arange(pruned.shape[0])
flattened, (doc_ids,) = corpus.compact_to_flat(pruned, doc_ids)

In [10]:
assert flattened.min() >= 0

In [11]:
# Download WordVectors
# wget -c "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz"
# unzip GoogleNews-vectors-negative300.bin.gz
# mv GoogleNews-vectors-negative300.bin ../models/embeddings/

In [12]:
# Fill in the pretrained word vectors
n_dim = 300
fn_wordvc = '../models/embeddings/GoogleNews-vectors-negative300.bin'
vectors, s, f = corpus.compact_word_vectors(vocab, filename = fn_wordvc)

In [14]:
# Save all of the preprocessed files
pickle.dump(vocab, open('../data/vocab.pkl', 'wb'))
pickle.dump(corpus, open('../data/corpus.pkl', 'wb'))
np.save("../data/flattened", flattened)
np.save("../data/doc_ids", doc_ids)
np.save("../data/pruned", pruned)
np.save("../data/bow", bow)
np.save("../data/vectors", vectors)