In [2]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from gensim import corpora, models
import re
import spacy
from collections import defaultdict
import pyLDAvis
import pyLDAvis.gensim
# Switch pyLDAvis into notebook mode so prepared visualisations display inside output cells.
pyLDAvis.enable_notebook()

# Load the spaCy pipeline registered under the shortcut name 'en'; `nlp` is used below to tokenise articles.
# NOTE(review): presumably the English model; shortcut names like 'en' depend on the installed
# spaCy version -- confirm against the environment this notebook is run in.
nlp = spacy.load('en')

In [23]:
def deHTML(doc):
    """Strip HTML markup from ``doc`` and return normalised, lower-cased text.

    Steps: parse with BeautifulSoup (lxml parser) and keep only the text,
    replace runs of newlines/tabs with a single space, collapse runs of two
    or more whitespace characters into one space, then strip and lower-case.

    Parameters
    ----------
    doc : str or None or float('nan')
        Raw HTML of one article. ``None`` / NaN (e.g. an empty CSV cell read
        by pandas) is treated as an empty document.

    Returns
    -------
    str
        The cleaned text, or ``''`` for a missing document.
    """
    # Missing values would otherwise be handed to the HTML parser as a
    # non-string; `doc != doc` is the dependency-free NaN check.
    if doc is None or (isinstance(doc, float) and doc != doc):
        return ''
    bs = BeautifulSoup(doc, 'lxml').text
    # Raw strings: '\s' in a normal string literal is an invalid escape
    # sequence and triggers a warning on modern Python versions.
    newlinetabs_removed = re.sub(r'[\n\t]+', ' ', bs)
    extraws_removed = re.sub(r'\s\s+', ' ', newlinetabs_removed)
    return extraws_removed.strip().lower()

In [24]:
# Read the raw articles CSV and clean the HTML of every article body.
# NOTE(review): the path is an absolute, machine-specific location; anyone re-running
# this notebook elsewhere has to edit it.
articles_f = '/Users/vasundhara/Downloads/SOCC/raw/gnm_articles.csv'
articles_df = pd.read_csv(articles_f, low_memory=False)

# One cleaned, lower-cased string per article, as a NumPy array.
article_html = articles_df['article_text']
flat_list = article_html.apply(deHTML).values

In [25]:
# Tokens consisting solely of characters outside a-z (digits, punctuation, ...) are
# dropped; `match` anchors at the start and `$` at the end, so the whole token must
# be non-alphabetic for the pattern to hit.
pattern = re.compile('[^a-z]+$')

# One list of kept token strings per parsed article.
tokens = []

for doc in nlp.pipe(flat_list, batch_size=50, n_threads=80):
    # Documents that did not receive a parse are skipped entirely.
    if not doc.is_parsed:
        continue
    kept = []
    for tok in doc:
        # Skip non-alphabetic tokens and stop words.
        if pattern.match(tok.text) or tok.is_stop:
            continue
        kept.append(tok.text)
    tokens.append(kept)

In [27]:
# Inspect the token lists of the first five articles.
tokens[:5]

[['elections',
  'choices',
  'imperfect',
  'alternatives',
  'parties',
  'gaps',
  'deficiencies',
  'failings',
  'choose',
  'voters',
  'must.the',
  'election',
  'powered',
  'founded',
  'desire',
  'change',
  'election',
  'opposition',
  'recognized',
  'electorate',
  "'s",
  'desire',
  'stability',
  'continuity',
  'things',
  'economic',
  "'s",
  'liberals',
  'new',
  'democrats',
  'running',
  'rhetoric',
  'change',
  'forward',
  'economic',
  'platforms',
  'built',
  'largely',
  'acceptance',
  'conservative',
  'status',
  'quo.the',
  'key',
  'issue',
  'election',
  'economy',
  'financial',
  'health',
  'canadians',
  'score',
  'conservative',
  'party',
  'solid',
  'record',
  'hardly',
  'perfect',
  'relatively',
  'speaking',
  'better',
  'election',
  'turned',
  'contest',
  'referendum',
  'government',
  "'s",
  'meanness',
  'secretiveness',
  'centralization',
  'power',
  'centralized',
  'prime',
  'minister',
  "'s",
  'office',
  'histor

In [30]:
# Corpus-wide occurrence count for every token; `texts` starts out as an alias of `tokens`.
texts = tokens
frequency = defaultdict(int)
for token in (word for text in texts for word in text):
    frequency[token] += 1

In [31]:
# Keep only tokens that occur more than once across the whole corpus (per `frequency`);
# document order and within-document token order are preserved.
texts = [[token for token in text if frequency[token] > 1] for text in texts]

In [1]:
# Preview the first five filtered token lists. This needs `texts` to already exist in
# the running kernel; executed on a fresh kernel it fails with NameError.
texts[:5]

NameError: name 'texts' is not defined

In [32]:
# Build the gensim token <-> integer-id mapping from the filtered token lists.
dictionary = corpora.Dictionary(texts)

In [33]:
# Persist the dictionary to 'articles.dict' in the working directory so it can be reloaded later.
dictionary.save('articles.dict')

In [36]:
# Convert every document into its bag-of-words vector: a list of (token_id, count) pairs.
corpus = [dictionary.doc2bow(text) for text in texts]

In [37]:
# Write the bag-of-words corpus to disk in four gensim corpus formats
# (Matrix Market, SVMlight, Blei's LDA-C and GibbsLDA++ "low"), in that order.
for serializer, target in (
    (corpora.MmCorpus, 'articles.mm'),
    (corpora.SvmLightCorpus, 'articles.svmlight'),
    (corpora.BleiCorpus, 'articles.lda-c'),
    (corpora.LowCorpus, 'articles.low'),
):
    serializer.serialize(target, corpus)

In [39]:
# Print the bag-of-words vectors of the first five documents.
print(corpus[:5])

[[(0, 14), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 2), (13, 1), (14, 1), (15, 1), (16, 2), (17, 1), (18, 2), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 2), (32, 1), (33, 2), (34, 1), (35, 1), (36, 1), (37, 6), (38, 1), (39, 1), (40, 2), (41, 1), (42, 1), (43, 1), (44, 10), (45, 6), (46, 1), (47, 1), (48, 1), (49, 3), (50, 1), (51, 1), (52, 1), (53, 1), (54, 1), (55, 1), (56, 3), (57, 1), (58, 1), (59, 1), (60, 2), (61, 1), (62, 1), (63, 1), (64, 1), (65, 1), (66, 2), (67, 1), (68, 1), (69, 1), (70, 1), (71, 1), (72, 1), (73, 1), (74, 1), (75, 2), (76, 8), (77, 1), (78, 3), (79, 1), (80, 2), (81, 9), (82, 1), (83, 1), (84, 1), (85, 1), (86, 1), (87, 1), (88, 1), (89, 1), (90, 1), (91, 1), (92, 1), (93, 1), (94, 1), (95, 1), (96, 1), (97, 1), (98, 2), (99, 1), (100, 1), (101, 1), (102, 1), (103, 1), (104, 1), (105, 1), (106, 1), (107, 1), (108, 11), (109, 1), (110,

In [41]:
# Fit a TF-IDF weighting model on the bag-of-words corpus.
tfidf = models.TfidfModel(corpus)

In [42]:
# Apply the fitted TF-IDF transformation to the corpus.
corpus_tfidf = tfidf[corpus]

In [45]:
# Train a Hierarchical Dirichlet Process topic model on the TF-IDF-weighted corpus.
# `random_state` pins the model's random initialisation so that re-running the
# notebook reproduces the same topics.
# NOTE(review): the model is fed TF-IDF weights rather than raw term counts --
# confirm this is intentional.
model = models.HdpModel(corpus_tfidf, id2word=dictionary, random_state=42)

In [46]:
# Print the model's string representation.
print(model)

<gensim.models.hdpmodel.HdpModel object at 0x11cbedef0>


## Second attempt

In [5]:
# Open the corpus stored in Matrix Market format in 'articles.mm' and print its summary.
corpus = corpora.MmCorpus('articles.mm')
print(corpus)

MmCorpus(10339 documents, 57810 features, 2437266 non-zero entries)


In [6]:
# Load the dictionary stored in 'articles.dict' and print its summary.
dictionary = corpora.Dictionary.load('articles.dict')
print(dictionary)

Dictionary(57810 unique tokens: ['fib', 'invigorating', 'mores', 'swab', 'course.if']...)


In [7]:
# Fit a TF-IDF model on the reloaded corpus, then apply it to that same corpus.
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

In [18]:
# Train a Hierarchical Dirichlet Process topic model on the TF-IDF-weighted corpus.
# `random_state` pins the model's random initialisation so that re-running the
# notebook reproduces the same topics.
# NOTE(review): the model is fed TF-IDF weights rather than raw term counts --
# confirm this is intentional.
hdp = models.HdpModel(corpus_tfidf, id2word=dictionary, random_state=42)

In [19]:
# Print the HDP model's string representation.
print(hdp)

<gensim.models.hdpmodel.HdpModel object at 0x1a1a8eb780>


In [20]:
# Number of rows in the HDP model's topic-term matrix, i.e. how many topics it holds.
len(hdp.get_topics())

150

In [21]:
# Show the top weighted terms of the HDP topics for a quick sanity check.
hdp.print_topics()

[(0,
  '0.001*mr + 0.001*party + 0.001*canada + 0.001*quebec + 0.001*government + 0.001*health + 0.001*cent + 0.001*women + 0.001*harper + 0.001*canadian'),
 (1,
  '0.000*prime + 0.000*strength.the + 0.000*harper + 0.000*conservatives + 0.000*mr + 0.000*party + 0.000*assault + 0.000*trump + 0.000*ontario + 0.000*repudiate'),
 (2,
  "0.000*trudeau + 0.000*mein + 0.000*french + 0.000*threat + 0.000*chimney + 0.000*brexit + 0.000*you.'it + 0.000*canadian + 0.000*people.this + 0.000*it.no"),
 (3,
  '0.000*trump + 0.000*helmer + 0.000*good + 0.000*islamic + 0.000*harris + 0.000*trim + 0.000*law + 0.000*black + 0.000*roundtables + 0.000*ontario'),
 (4,
  '0.000*prescribed + 0.000*commentary + 0.000*hitchhiking + 0.000*borrow + 0.000*relinquishing + 0.000*willing + 0.000*singularity + 0.000*reside + 0.000*pakhtun + 0.000*cantor'),
 (5,
  '0.000*martinez + 0.000*failed + 0.000*frugality + 0.000*pigeonholed + 0.000*symptoms.the + 0.000*svyatogorsk + 0.000*lloydsmith + 0.000*arrest.the + 0.000*p

In [8]:
# Train a 20-topic LDA model on the TF-IDF-weighted corpus.
# `random_state` seeds the model's random initialisation so that re-running the
# notebook reproduces the same topics.
# NOTE(review): the model is fed TF-IDF weights rather than raw term counts --
# confirm this is intentional.
lda = models.ldamodel.LdaModel(corpus_tfidf, id2word=dictionary, num_topics=20, random_state=42)

  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt

  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt

  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt

  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt

  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt

In [9]:
# Number of rows in the LDA topic-term matrix; should equal the requested num_topics.
len(lda.get_topics())

20

In [10]:
# Show the top weighted terms of the LDA topics.
lda.print_topics()

[(0,
  '0.005*"redford" + 0.003*"alberta" + 0.003*"legault" + 0.003*"pipeline" + 0.003*"oil" + 0.002*"wildrose" + 0.002*"caq" + 0.002*"alison" + 0.002*"kitimat" + 0.001*"energy"'),
 (1,
  '0.005*"pq" + 0.003*"marois" + 0.003*"bc" + 0.002*"cummins" + 0.002*"peladeau" + 0.001*"apps" + 0.001*"dix" + 0.001*"quebec" + 0.001*"diagnostic" + 0.001*"k-12"'),
 (2,
  '0.002*"anesthesiologists" + 0.002*"bcma" + 0.001*"toulouse" + 0.001*"graduating" + 0.001*"chewing" + 0.001*"stewards" + 0.001*"rack" + 0.001*"sinclair" + 0.001*"dusty" + 0.001*"deterrence"'),
 (3,
  '0.008*"romney" + 0.003*"patients" + 0.002*"doctors" + 0.002*"toews" + 0.002*"patient" + 0.002*"court" + 0.002*"health" + 0.002*"rick" + 0.001*"pain" + 0.001*"birth"'),
 (4,
  '0.009*"gingrich" + 0.003*"wto" + 0.003*"newt" + 0.002*"hong" + 0.002*"kong" + 0.002*"liquor" + 0.002*"ornge" + 0.002*"beijing" + 0.002*"daisey" + 0.002*"wikileaks"'),
 (5,
  '0.004*"santorum" + 0.003*"police" + 0.002*"gay" + 0.002*"gun" + 0.002*"caterpillar" + 0.0

In [18]:
# Build the pyLDAvis visualisation data from the LDA model, the corpus it was trained on and the dictionary.
# NOTE(review): the corpus passed here holds TF-IDF weights rather than raw counts; presumably any
# term-frequency figures derived from it reflect those weights -- confirm this is the intended input.
vis = pyLDAvis.gensim.prepare(lda, corpus_tfidf, dictionary)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [19]:
# Render the prepared visualisation in the notebook output.
pyLDAvis.display(vis)