In [1]:
import itertools
import os
import pickle
import random
import re
import string

import numpy as np
from scipy import sparse
from scipy.io import savemat, loadmat
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer

# 1 Load Corpus

In [2]:
# Read data
print('reading data...')
train_data = fetch_20newsgroups(subset='train')
test_data = fetch_20newsgroups(subset='test')

# Tokenise every document.
# A token is either a run of word characters / apostrophes (\w covers letters,
# digits and underscore) or one single punctuation mark. Whitespace and newlines
# match neither alternative, so re.findall silently drops them.
# The hyphen inside the character class is escaped on purpose: written as `;-~`
# it is a *range* from ';' to '~' (which swallows '[', '\', ']', '^', '|' and never
# matches '-' itself) instead of the three literal characters that were intended.
token_pattern = re.compile(r'''[\w']+|[.,!?;\-~{}`´_<=>:/@*()&'$%#"]''')
init_docs_tr = [token_pattern.findall(text) for text in train_data.data]
init_docs_ts = [token_pattern.findall(text) for text in test_data.data]

def contains_punctuation(w):
    """Return True when at least one character of `w` is in string.punctuation."""
    return not set(w).isdisjoint(string.punctuation)

def contains_numeric(w):
    """Return True when at least one character of `w` is a digit (str.isdigit)."""
    for char in w:
        if char.isdigit():
            return True
    return False

# pre-processing
def _clean_tokens(tokens):
    """Filter one tokenised document and return it as a space-joined string.

    A token is kept only if it
      * contains no punctuation (checked on the raw token),
      * contains no digit (checked after lower-casing), and
      * is longer than one character after lower-casing (drops "a", "b", ...).
    """
    kept = []
    for raw in tokens:
        if contains_punctuation(raw):
            continue
        token = raw.lower()
        if contains_numeric(token):
            continue
        if len(token) > 1:
            kept.append(token)
    return " ".join(kept)

# training documents first, test documents after them;
# the result has the format ["tokens of document 1", "tokens of document 2", ...]
init_docs = [_clean_tokens(tokens) for tokens in init_docs_tr + init_docs_ts] # len(init_docs) = 18846

reading data...


In [3]:
# Show one document before and after preprocessing.
# init_docs begins with the training documents in their original order, so the
# same index addresses the raw text and its cleaned counterpart.
example_idx = 1
print(train_data.data[example_idx])
print("== After processing ==\n")
print(init_docs[example_idx])


From: guykuo@carson.u.washington.edu (Guy Kuo)
Subject: SI Clock Poll - Final Call
Summary: Final call for SI clock reports
Keywords: SI,acceleration,clock,upgrade
Article-I.D.: shelley.1qvfo9INNc3s
Organization: University of Washington
Lines: 11
NNTP-Posting-Host: carson.u.washington.edu

A fair number of brave souls who upgraded their SI clock oscillator have
shared their experiences for this poll. Please send a brief message detailing
your experiences with the procedure. Top speed attained, CPU rated speed,
add on cards and adapters, heat sinks, hour of usage per day, floppy disk
functionality with 800 and 1.4 m floppies are especially requested.

I will be summarizing in the next two days, so please add to the network
knowledge base if you have done the clock upgrade and haven't answered this
poll. Thanks.

Guy Kuo <guykuo@u.washington.edu>

== After processing ==

from guykuo carson washington edu guy kuo subject si clock poll final call summary final call for si clock reports ke

# 2 Build Vocabulary Mapping (id2word, word2id)

In [4]:
# Document-frequency bounds handed to the count vectorizer
max_df = 0.7  # upper bound on document frequency
min_df = 10   # lower bound on document frequency; choose the desired value

print('counting document frequency of words...')
cvectorizer = CountVectorizer(stop_words=None, max_df=max_df, min_df=min_df)
# fit_transform returns the document-term count matrix
# (18846 documents x 19148 terms in the recorded run)
cvz = cvectorizer.fit_transform(init_docs)
# sign() turns every positive count into 1, leaving a binary
# "term occurs in document" indicator matrix
cvz = cvz.sign()

counting document frequency of words...


In [5]:
# Get vocabulary
print('building the vocabulary...')
# cvz holds 0/1 indicators, so a column sum is the number of documents that
# contain the term. The sum comes back as a 1 x V matrix; flatten it to a
# plain integer vector in one step instead of copying element by element.
sum_counts_np = np.asarray(cvz.sum(axis=0)).ravel().astype(int)
v_size = sum_counts_np.shape[0]
# id2word maps a column index of cvz to its term
# (the reverse mapping is rebuilt below, once the vocabulary is final)
id2word = {idx: w for w, idx in cvectorizer.vocabulary_.items()}
del cvectorizer
print('  initial vocabulary size: {}'.format(v_size)) # initial vocabulary size: 19148

# order the terms by document frequency; np.argsort sorts ascending,
# so the rarest terms come first
idx_sort = np.argsort(sum_counts_np)
vocab_aux = [id2word[idx] for idx in idx_sort]

# filter out stopwords (if any)
# read stopwords, one per line; a set makes each membership test O(1)
with open('stops.txt', 'r') as f:
    stops = set(f.read().split('\n'))

vocab_aux = [w for w in vocab_aux if w not in stops]
print('  vocabulary size after removing stopwords from list: {}'.format(len(vocab_aux)))

# create dictionary and inverse dictionary
vocab = vocab_aux
del vocab_aux
word2id = {w: j for j, w in enumerate(vocab)}
id2word = {j: w for j, w in enumerate(vocab)}

building the vocabulary...
  initial vocabulary size: 19148
  vocabulary size after removing stopwords from list: 18677


# 3 Split Corpus into Training/Validation/Testing

In [6]:
# Split in train/test/valid
print('tokenizing documents and splitting into train/test/valid...')
num_docs_tr = len(init_docs_tr)
vaSize = 100 # validation size, held out from the training subset
trSize = num_docs_tr - vaSize # training size
tsSize = len(init_docs_ts) # testing size
# idx_permute is a random ordering of the training-subset document indices;
# it is used below to pick the training and validation documents.
# A dedicated, seeded generator keeps the split identical on every run
# without touching NumPy's global random state.
split_seed = 0
idx_permute = np.random.RandomState(split_seed).permutation(num_docs_tr).astype(int)
""" example
>>> np.random.permutation(10)
array([1, 7, 4, 3, 0, 9, 2, 5, 8, 6]) # random
"""


tokenizing documents and splitting into train/test/valid...


' example\n>>> np.random.permutation(10)\narray([1, 7, 4, 3, 0, 9, 2, 5, 8, 6]) # random\n'

In [7]:
# remove words not in train_data
# the first trSize entries of idx_permute select the training documents,
# the following vaSize entries select the validation documents
train_doc_ids = idx_permute[:trSize]
valid_doc_ids = idx_permute[trSize:trSize + vaSize]
# sorted() makes the word -> id assignment stable: iterating a set of strings
# follows their hash order, which can change from one interpreter run to the
# next, so ids taken straight from the set would not be reproducible
vocab = sorted({w for d in train_doc_ids for w in init_docs[d].split() if w in word2id})
word2id = {w: j for j, w in enumerate(vocab)}
id2word = {j: w for j, w in enumerate(vocab)}
print('  vocabulary after removing words not in train: {}'.format(len(vocab)))

# split in train/test/valid
def _encode_doc(text):
    """Map a space-separated document to the ids of its in-vocabulary words."""
    return [word2id[w] for w in text.split() if w in word2id]

docs_tr = [_encode_doc(init_docs[d]) for d in train_doc_ids]
docs_va = [_encode_doc(init_docs[d]) for d in valid_doc_ids]
# test documents follow the training documents in init_docs; no permutation on the test data
docs_ts = [_encode_doc(text) for text in init_docs[num_docs_tr:num_docs_tr + tsSize]]

print('  number of documents (train): {} [this should be equal to {}]'.format(len(docs_tr), trSize))
print('  number of documents (test): {} [this should be equal to {}]'.format(len(docs_ts), tsSize))
print('  number of documents (valid): {} [this should be equal to {}]'.format(len(docs_va), vaSize))

# drop documents that have no tokens left
print('removing empty documents...')

def remove_empty(in_docs):
    """Return a new list holding every document of `in_docs` that is not `[]`."""
    non_empty = []
    for doc in in_docs:
        if doc != []:
            non_empty.append(doc)
    return non_empty

docs_tr, docs_ts, docs_va = (remove_empty(split) for split in (docs_tr, docs_ts, docs_va))

# a test document needs at least two tokens, otherwise its first half would be empty
docs_ts = [doc for doc in docs_ts if len(doc) >= 2]

# split test set in 2 halves
# this is required input for the document completion task
# the first half gets len(doc) // 2 tokens, so for an odd length the extra
# token goes to the second half
print('splitting test documents in 2 halves...')
docs_ts_h1 = [doc[:len(doc) // 2] for doc in docs_ts]
docs_ts_h2 = [doc[len(doc) // 2:] for doc in docs_ts]

"""
vocabulary after removing words not in train: 18625
  number of documents (train): 11214 [this should be equal to 11214]
  number of documents (test): 7532 [this should be equal to 7532]
  number of documents (valid): 100 [this should be equal to 100]

The format:
    docs_tr: [[token1_id, token2_id, ...], ...]
"""

  vocabulary after removing words not in train: 18626
  number of documents (train): 11214 [this should be equal to 11214]
  number of documents (test): 7532 [this should be equal to 7532]
  number of documents (valid): 100 [this should be equal to 100]
removing empty documents...
splitting test documents in 2 halves...


'\nvocabulary after removing words not in train: 18625\n  number of documents (train): 11214 [this should be equal to 11214]\n  number of documents (test): 7532 [this should be equal to 7532]\n  number of documents (valid): 100 [this should be equal to 100]\n\nThe format:\n    docs_tr: [[token1_id, token2_id, ...], ...]\n'

# 4 Create Bag-of-Words Representation using scipy.sparse.coo_matrix

In [8]:
# flatten each split into one long list of word ids
print('creating lists of words...')

def create_list_words(in_docs):
    """Concatenate all documents of `in_docs` into a single flat list, in order."""
    return list(itertools.chain.from_iterable(in_docs))

# one flat word-id list per split, produced in the order tr, ts, ts_h1, ts_h2, va
words_tr, words_ts, words_ts_h1, words_ts_h2, words_va = map(
    create_list_words, (docs_tr, docs_ts, docs_ts_h1, docs_ts_h2, docs_va)
)

# total number of tokens per split
print('  len(words_tr): ', len(words_tr))
print('  len(words_ts): ', len(words_ts))
print('  len(words_ts_h1): ', len(words_ts_h1))
print('  len(words_ts_h2): ', len(words_ts_h2))
print('  len(words_va): ', len(words_va))

# build, for every token, the index of the document it belongs to
print('getting doc indices...')

def create_doc_indices(in_docs):
    """Return a flat list in which document index j appears len(in_docs[j]) times.

    The result lines up element by element with the flattened word list of the
    same documents; empty documents contribute no entries.
    """
    flat_indices = []
    for doc_idx, doc in enumerate(in_docs):
        flat_indices.extend([doc_idx] * len(doc))
    return flat_indices

# per-token document indices for each split, in the order tr, ts, ts_h1, ts_h2, va
doc_indices_tr, doc_indices_ts, doc_indices_ts_h1, doc_indices_ts_h2, doc_indices_va = map(
    create_doc_indices, (docs_tr, docs_ts, docs_ts_h1, docs_ts_h2, docs_va)
)

# Number of documents in each set
n_docs_tr = len(docs_tr)
n_docs_ts = len(docs_ts)
n_docs_ts_h1 = len(docs_ts_h1)
n_docs_ts_h2 = len(docs_ts_h2)
n_docs_va = len(docs_va)

# sanity check: every document should show up at least once in its index list
print('  len(np.unique(doc_indices_tr)): {} [this should be {}]'.format(len(np.unique(doc_indices_tr)), n_docs_tr))
print('  len(np.unique(doc_indices_ts)): {} [this should be {}]'.format(len(np.unique(doc_indices_ts)), n_docs_ts))
print('  len(np.unique(doc_indices_ts_h1)): {} [this should be {}]'.format(len(np.unique(doc_indices_ts_h1)), n_docs_ts_h1))
print('  len(np.unique(doc_indices_ts_h2)): {} [this should be {}]'.format(len(np.unique(doc_indices_ts_h2)), n_docs_ts_h2))
print('  len(np.unique(doc_indices_va)): {} [this should be {}]'.format(len(np.unique(doc_indices_va)), n_docs_va))

# the nested document lists are no longer needed; release them
del docs_tr, docs_ts, docs_ts_h1, docs_ts_h2, docs_va

creating lists of words...
  len(words_tr):  1339507
  len(words_ts):  860828
  len(words_ts_h1):  428566
  len(words_ts_h2):  432262
  len(words_va):  9599
getting doc indices...
  len(np.unique(doc_indices_tr)): 11214 [this should be 11214]
  len(np.unique(doc_indices_ts)): 7532 [this should be 7532]
  len(np.unique(doc_indices_ts_h1)): 7532 [this should be 7532]
  len(np.unique(doc_indices_ts_h2)): 7532 [this should be 7532]
  len(np.unique(doc_indices_va)): 100 [this should be 100]


In [9]:
# Sanity check: doc_indices_ts holds one entry per token of the test split,
# so this value should equal len(words_ts) printed above.
len(doc_indices_ts)

860828

In [10]:
# Build the bag-of-words matrices
print('creating bow representation...')

def create_bow(doc_indices, words, n_docs, vocab_size):
    """Assemble a document-term count matrix and return it in CSR format.

    doc_indices[k] is the row (document) and words[k] the column (word id) of
    the k-th token; every token contributes a 1 at that position. The matrix is
    first built as scipy.sparse.coo_matrix((data, (i, j)), shape=(M, N)), which
    represents A[i[k], j[k]] = data[k] and is a fast format for construction.
    Converting to CSR sums duplicate (i, j) entries, so a word that occurs
    several times in a document ends up with its occurrence count.

    API Reference: https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.coo_matrix.html

    Returns a sparse matrix of shape (n_docs, vocab_size).
    """
    ones = [1] * len(doc_indices)
    coordinates = (doc_indices, words)
    bow = sparse.coo_matrix((ones, coordinates), shape=(n_docs, vocab_size))
    return bow.tocsr()

# one document-term matrix per split, all sharing the same vocabulary width
vocab_size = len(vocab)
bow_tr = create_bow(doc_indices_tr, words_tr, n_docs_tr, vocab_size)
bow_ts = create_bow(doc_indices_ts, words_ts, n_docs_ts, vocab_size)
bow_ts_h1 = create_bow(doc_indices_ts_h1, words_ts_h1, n_docs_ts_h1, vocab_size)
bow_ts_h2 = create_bow(doc_indices_ts_h2, words_ts_h2, n_docs_ts_h2, vocab_size)
bow_va = create_bow(doc_indices_va, words_va, n_docs_va, vocab_size)

# the flat token / document-index lists are now encoded in the matrices
del words_tr, words_ts, words_ts_h1, words_ts_h2, words_va
del doc_indices_tr, doc_indices_ts, doc_indices_ts_h1, doc_indices_ts_h2, doc_indices_va

# Write the vocabulary to a file
path_save = './min_df_' + str(min_df) + '/'
# os.makedirs creates the directory (and any missing parents) without spawning
# a shell; exist_ok=True makes re-running the cell harmless
os.makedirs(path_save, exist_ok=True)

with open(path_save + 'vocab.pkl', 'wb') as f:
    pickle.dump(vocab, f)
del vocab

# Split bow into token/value pairs
print('splitting bow intro token/value pairs and saving to disk...')

def split_bow(bow_in, n_docs):
    """Split a bag-of-words matrix into per-document word ids and counts.

    For each of the first `n_docs` rows, the stored column indices and the
    stored values of that row are collected.

    Returns (indices, counts): two lists with one inner list per document.
    """
    indices, counts = [], []
    for doc in range(n_docs):
        row = bow_in[doc, :]
        indices.append(list(row.indices))
        counts.append(list(row.data))
    return indices, counts

def _save_bow_split(bow, n_docs, tag):
    """Split one bag-of-words matrix and write its two files under path_save.

    Writes 'bow_<tag>_tokens' (key 'tokens') and 'bow_<tag>_counts' (key 'counts').
    The intermediate token/count lists are local to this function, so they are
    released as soon as it returns.
    """
    tokens, counts = split_bow(bow, n_docs)
    savemat(path_save + 'bow_' + tag + '_tokens', {'tokens': tokens}, do_compression=True)
    savemat(path_save + 'bow_' + tag + '_counts', {'counts': counts}, do_compression=True)

# save each split in turn and drop its matrix right away to keep peak memory low
_save_bow_split(bow_tr, n_docs_tr, 'tr')
del bow_tr

_save_bow_split(bow_ts, n_docs_ts, 'ts')
del bow_ts

_save_bow_split(bow_ts_h1, n_docs_ts_h1, 'ts_h1')
del bow_ts_h1

_save_bow_split(bow_ts_h2, n_docs_ts_h2, 'ts_h2')
del bow_ts_h2

_save_bow_split(bow_va, n_docs_va, 'va')
del bow_va

creating bow representation...
splitting bow intro token/value pairs and saving to disk...
