In [None]:
%load_ext autoreload
%autoreload 2

In [1]:
from sklearn.datasets import fetch_20newsgroups
import numpy as np
import pandas as pd
from tqdm import tqdm
import spacy
from gensim import corpora, models
import os
os.chdir("Documents/GitHub/Topic-Mapping")
print(os.getcwd())
from src.utils import preprocess, get_windows

import requests
requests.packages.urllib3.disable_warnings()
import ssl

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    # Legacy Python that doesn't verify HTTPS certificates by default
    pass
else:
    # Handle target environment that doesn't support HTTPS verification
    ssl._create_default_https_context = _create_unverified_https_context

/Users/carterward/Documents/GitHub/Topic-Mapping


In [2]:
MIN_COUNTS = 20
MAX_COUNTS = 1800
# words with count < MIN_COUNTS
# and count > MAX_COUNTS
# will be removed

MIN_LENGTH = 15
# minimum document length 
# (number of words)
# after preprocessing

# half the size of the context around a word
HALF_WINDOW_SIZE = 5
# it must be that 2*HALF_WINDOW_SIZE < MIN_LENGTH

# Load NLP model

In [3]:
nlp = spacy.load('en')

# Load dataset

In [28]:
# dataset = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))
# docs = dataset['data']
dataset = pd.read_csv("data/large_article_sample.csv")
docs = dataset["tokenized_content"].values
docs = [doc for doc in docs if not pd.isna(doc)]

In [29]:
type(docs[0])

str

In [30]:
# number of documents
len(docs)

9743

In [31]:
# store an index with a document
docs = [(i, doc) for i, doc in enumerate(docs)]

# Preprocess dataset and create windows

In [32]:
docs

raw march @wnyt learn asha burwell withdraw @ualbany attend school disciplinary hearing schedule tomorrow jessica layton @jessicalayton13 march woman refuse deal case grand jury follow breitbart news investigative reporter citizen journalism school founder lee stranahan twitter @stranahan ~'),
 (991,
  'deputy inside lackland afb clearing building bexar county sheriff @bexarcosheriff san antonio texas sheriff spokesman people dead apparent air force base san antonio bexar county sheriff spokesman james keith body find friday morning inside room building joint base san keith believe shooter dead sheriff deputy law enforcement officer continue search building nearby facility lackland key training installation air force fort hood mile lackland air force base site people kill injure'),
 (992,
  'donald trump right january speak reporter hearing grill secretary rex tillerson view human right rubio got remember rubio senate scrutiny cone secretary state tip require scrutiny deference subsequ

In [33]:
encoded_docs, decoder, word_counts = preprocess(
    docs, nlp, MIN_LENGTH, MIN_COUNTS, MAX_COUNTS
)

100%|██████████| 9743/9743 [00:00<00:00, 22743.93it/s]
number of removed short documents: 37
total number of tokens: 3516825
number of tokens to be removed: 1500277
number of additionally removed short documents: 58
total number of tokens: 2015895

minimum word count number: 19
this number can be less than MIN_COUNTS because of document removal


In [34]:
# new ids will be created for the documents.
# create a way of restoring initial ids:
doc_decoder = {i: doc_id for i, (doc_id, doc) in enumerate(encoded_docs)}

In [35]:
data = []
# new ids are created here
for index, (_, doc) in tqdm(enumerate(encoded_docs)):
    windows = get_windows(doc, HALF_WINDOW_SIZE)
    # index represents id of a document, 
    # windows is a list of (word, window around this word),
    # where word is in the document
    data += [[index, w[0]] + w[1] for w in windows]

data = np.array(data, dtype='int64')

9648it [00:05, 1899.33it/s]


In [36]:
# a row in 'data' contains:
# id of a document, id of a word in this document, a window around this word
# 1 + 1 + 10
data.shape[1]

12

In [37]:
# number of windows (equals to the total number of tokens)
data.shape[0]

2015895

# Get unigram distribution

In [38]:
word_counts = np.array(word_counts)
unigram_distribution = word_counts/sum(word_counts)

# Prepare word vectors

In [40]:
%%time
vocab_size = len(decoder)
embedding_dim = 50

# train a skip-gram word2vec model
texts = [[str(j) for j in doc] for i, doc in encoded_docs]
model = models.Word2Vec(texts, size=embedding_dim, window=5, workers=4, sg=1, negative=15, iter=70)
model.init_sims(replace=True) 

word_vectors = np.zeros((vocab_size, embedding_dim)).astype('float32')
for i in decoder:
    word_vectors[i] = model.wv[str(i)]

CPU times: user 42min 4s, sys: 17.6 s, total: 42min 22s
Wall time: 11min


In [41]:
# number of unique words
vocab_size

12402

# Prepare initialization for document weights

In [42]:
texts = [[decoder[j] for j in doc] for i, doc in encoded_docs]
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

In [44]:
%%time
n_topics = 25
lda = models.LdaModel(corpus, alpha=0.9, id2word=dictionary, num_topics=n_topics)
corpus_lda = lda[corpus]

CPU times: user 37.2 s, sys: 5.83 s, total: 43 s
Wall time: 11.3 s


In [45]:
for i, topics in lda.show_topics(n_topics, formatted=False):
    print('topic', i, ':', ' '.join([t for t, _ in topics]))

topic 0 : isis syria islamic iraq iran terrorist iraqi syrian civilian turkey
topic 1 : abortion county drug parenthood planned funding budget medicaid portland texas
topic 2 : castro flight airline mets passenger incident breitbart roberts lewandowski airlines
topic 3 : ryan paul speaker climate tweet gorsuch scientist majority planet answer
topic 4 : oil caucus afghanistan taliban energy afghan robot gas ryan hogan
topic 5 : ms. parent college pipeline protest kid mother education neighborhood project
topic 6 : korea insurance obamacare coverage player korean prince season nfl missile
topic 7 : music song food album remember kid building father room gold
topic 8 : mrs. nuclear flynn gorsuch supreme lie rich politic attorney private
topic 9 : gun fox shooting shoot violence kelly carry ailes cop brain
topic 10 : apple internet technology vehicle google app tesla user model content
topic 11 : comey fbi intelligence de tillerson putin attorney james times leak
topic 12 : transgend gende

In [46]:
doc_weights_init = np.zeros((len(corpus_lda), n_topics))
for i in tqdm(range(len(corpus_lda))):
    topics = corpus_lda[i]
    for j, prob in topics:
        doc_weights_init[i, j] = prob

100%|██████████| 9648/9648 [00:08<00:00, 1120.19it/s]


# Save data

In [47]:
np.save('models/articles/data.npy', data)
np.save('models/articles/word_vectors.npy', word_vectors)
np.save('models/articles/unigram_distribution.npy', unigram_distribution)
np.save('models/articles/decoder.npy', decoder)
np.save('models/articles/doc_decoder.npy', doc_decoder)
np.save('models/articles/doc_weights_init.npy', doc_weights_init)