In [2]:
import re
import nltk
import spacy
import gensim
import warnings
import pyLDAvis
import pyLDAvis.gensim

import numpy as np
import tensorflow_datasets as tfds

from tqdm import tqdm

In [3]:
warnings.filterwarnings('ignore')

In [4]:
pyLDAvis.enable_notebook()

In [5]:
# Get the CNN/DailyMail dataset
ds,info = tfds.load("cnn_dailymail", split = "test", with_info = True, shuffle_files = True)

In [6]:
dataset = []
for batch in tfds.as_numpy(ds.batch(1)):
    dataset.append(re.sub(r"[^a-z]+", ' ', batch['highlights'][0].decode("utf-8").lower()))

In [7]:
len(dataset)

11490

In [8]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc = True)) 

data_words = list(sent_to_words(dataset))

data_words[:1][0][:30]

['nottingham',
 'forest',
 'are',
 'close',
 'to',
 'extending',
 'dougie',
 'freedman',
 'contract',
 'the',
 'forest',
 'boss',
 'took',
 'over',
 'from',
 'former',
 'manager',
 'stuart',
 'pearce',
 'in',
 'february',
 'freedman',
 'has',
 'since',
 'lead',
 'the',
 'club',
 'to',
 'ninth',
 'in']

In [9]:
bigram = gensim.models.Phrases(data_words, min_count = 5, threshold = 100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold = 100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

In [10]:
stop_words = nltk.corpus.stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

In [11]:
def remove_stopwords(texts):
    return [[word for word in gensim.utils.simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags = ['NOUN', 'ADJ', 'VERB', 'ADV']):
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [12]:
data_words_nostops = remove_stopwords(data_words)

data_words_bigrams = make_bigrams(data_words_nostops)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
nlp = spacy.load("en_core_web_md", disable = ['parser', 'ner'])

data_lemmatized = lemmatization(data_words_bigrams)

data_lemmatized[:1][0][:30]

['close',
 'extend',
 'boss',
 'take',
 'former',
 'manager',
 'ninth',
 'championship']

In [13]:
id2word = gensim.corpora.Dictionary(data_lemmatized)

texts = data_lemmatized

corpus = [id2word.doc2bow(text) for text in texts]

In [14]:
lda = gensim.models.LdaMulticore(corpus = corpus,
                                   id2word = id2word,
                                   num_topics = 10, 
                                   random_state = 100,
                                   chunksize = 100,
                                   passes = 20,
                                   per_word_topics = True,
                                   alpha = 0.01,
                                   eta = 0.9)

In [15]:
tfidf = gensim.models.TfidfModel(corpus)

In [16]:
lda_tfidf = gensim.models.LdaMulticore(corpus = tfidf[corpus],
                                       id2word = id2word,
                                       num_topics = 10, 
                                       random_state = 100,
                                       chunksize = 100,
                                       passes = 20,
                                       per_word_topics = True,
                                       alpha = 0.01,
                                       eta = 0.9)

In [17]:
w2v = gensim.models.Word2Vec(texts)

In [18]:
def compute_ones(vector):
    vector_nums = [i[1] for i in vector]
    left = 1 - sum(vector_nums)
    count = 0
    for i in vector_nums:
        if i == 0.0:
            count += 1
    if count == 0:
        return vector
    else:
        left = left/count
        for i in range(len(vector)):
            if vector[i][1] == 0.0:
                vector[i] = (vector[i][0], left)
        return vector

In [19]:
corpus_w2v = []
for text in tqdm(corpus):
    corpus_sent = []
    for couple in text:
        if(id2word[couple[0]] in w2v.wv.vocab.keys()):
            corpus_sent.append((couple[0], sum(w2v.wv[id2word[couple[0]]].tolist())/len(w2v.wv[id2word[couple[0]]].tolist())))
        else:
            corpus_sent.append((couple[0], 0.0))
    corpus_w2v.append(compute_ones(corpus_sent))

100%|██████████| 11490/11490 [00:01<00:00, 6697.97it/s]


In [20]:
lda_w2v = gensim.models.LdaMulticore(corpus = corpus_w2v,
                                 id2word = id2word,
                                 num_topics = 10, 
                                   random_state = 100,
                                   chunksize = 100,
                                   passes = 20,
                                   per_word_topics = True,
                                   alpha = 0.01,
                                   eta = 0.9)

In [21]:
ft = gensim.models.fasttext.FastText(size = 100)

ft.build_vocab(sentences = texts)

ft.train(
    sentences = texts, epochs = ft.epochs,
    total_examples = len(texts)
)

In [22]:
corpus_ft = []
for text in tqdm(corpus):
    corpus_sent = []
    for couple in text:
        corpus_sent.append((couple[0], sum(ft.wv[id2word[couple[0]]].tolist())/len(ft.wv[id2word[couple[0]]].tolist())))
    corpus_ft.append(corpus_sent)

100%|██████████| 11490/11490 [00:03<00:00, 3517.73it/s]


In [23]:
lda_ft = gensim.models.LdaMulticore(corpus = corpus_ft,
                                 id2word = id2word,
                                 num_topics = 10, 
                                   random_state = 100,
                                   chunksize = 100,
                                   passes = 20,
                                   per_word_topics = True,
                                   alpha = 0.01,
                                   eta = 0.9)

In [24]:
def compute_coherence(model):
    coherence_model_lda = gensim.models.CoherenceModel(model = model, texts = data_lemmatized, dictionary = id2word, coherence = 'c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    print('Coherence Score: ', coherence_lda)

In [25]:
models = [lda, lda_tfidf, lda_w2v, lda_ft]

In [26]:
for model in models:
    compute_coherence(model)

Coherence Score:  0.44382140039495066
Coherence Score:  0.6041055006988955
Coherence Score:  0.7612216016138255
Coherence Score:  0.7612216016138255


In [27]:
#pyLDAvis.gensim.prepare(lda_tfidf, corpus, id2word)

TypeError: Object of type complex is not JSON serializable

PreparedData(topic_coordinates=                        x                           y  topics  cluster  \
topic                                                                    
2     -0.136132+0.000000j  5.529429e-07+0.000000e+00j       1        1   
7      0.015088+0.000000j -9.376932e-05+0.000000e+00j       2        1   
6      0.015235+0.000000j  6.780211e-04+0.000000e+00j       3        1   
9      0.015111+0.000000j -8.241395e-05+0.000000e+00j       4        1   
5      0.015118+0.000000j -8.342310e-05+0.000000e+00j       5        1   
8      0.015118+0.000000j -8.342310e-05+0.000000e+00j       6        1   
4      0.015118+0.000000j -8.343691e-05+0.000000e+00j       7        1   
3      0.015110+0.000000j -8.526169e-05+0.000000e+00j       8        1   
1      0.015118+0.000000j -8.342293e-05+0.000000e+00j       9        1   
0      0.015118+0.000000j -8.342305e-05+0.000000e+00j      10        1   

            Freq  
topic             
2      99.543541  
7       0.056903  
6   

In [36]:
for idx, topic in lda_tfidf.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.000*"wireless" + 0.000*"observatorie" + 0.000*"mhz" + 0.000*"methanol" + 0.000*"lawnmower" + 0.000*"infinitesimal" + 0.000*"band" + 0.000*"guide" + 0.000*"wave" + 0.000*"say"
Topic: 1 
Words: 0.000*"margherita" + 0.000*"buttery" + 0.000*"shiraz" + 0.000*"scotch" + 0.000*"zesty" + 0.000*"sauvignon" + 0.000*"riesling" + 0.000*"merlot" + 0.000*"pair" + 0.000*"sausage"
Topic: 2 
Words: 0.004*"say" + 0.002*"year" + 0.002*"last" + 0.002*"make" + 0.002*"take" + 0.002*"police" + 0.002*"man" + 0.002*"also" + 0.002*"find" + 0.002*"week"
Topic: 3 
Words: 0.000*"garter" + 0.000*"skillfully" + 0.000*"stockings" + 0.000*"sensuality" + 0.000*"intimacy" + 0.000*"ranch" + 0.000*"wellness" + 0.000*"offering" + 0.000*"glove" + 0.000*"discuss"
Topic: 4 
Words: 0.000*"spiced" + 0.000*"crumbly" + 0.000*"delicately" + 0.000*"latte" + 0.000*"pastry" + 0.000*"espresso" + 0.000*"pairing" + 0.000*"cheese" + 0.000*"flavour" + 0.000*"irish"
Topic: 5 
Words: 0.000*"glimpse" + 0.000*"red" + 0.000*

In [37]:
lda_tfidf.get_topics()

array([[6.65336047e-05, 6.65336047e-05, 6.65336047e-05, ...,
        6.65336047e-05, 6.65336047e-05, 6.65336047e-05],
       [6.65336047e-05, 6.65336047e-05, 6.65336047e-05, ...,
        6.65336047e-05, 6.65336047e-05, 6.65336047e-05],
       [1.00349379e-03, 4.36801871e-04, 6.62492530e-04, ...,
        1.00680845e-04, 8.49404023e-05, 8.86333728e-05],
       ...,
       [6.64998443e-05, 6.64998443e-05, 6.64998443e-05, ...,
        6.64998443e-05, 6.64998443e-05, 6.64998443e-05],
       [6.65336047e-05, 6.65336047e-05, 6.65336047e-05, ...,
        6.65336047e-05, 6.65336047e-05, 6.65336047e-05],
       [6.65213447e-05, 6.65213447e-05, 6.65213447e-05, ...,
        6.65213447e-05, 6.65213447e-05, 6.65213447e-05]], dtype=float32)