In [1]:
import time
import math
import re
from textblob import TextBlob
import pandas as pd

import nltk as nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

import string

import warnings
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')

import gensim
from gensim import corpora, models
from gensim.models.ldamulticore import LdaMulticore
import pyLDAvis.gensim

In [2]:
directory = '~/Datasets/32018/'
#file = 'jeep.txt'
file = 'webhose_cat.pkl'
path = directory + file

In [3]:
df = pd.read_pickle(path)

In [4]:
df.head()

Unnamed: 0,crawled,language,text,title,url
0,2018-01-30T18:28:45.012+02:00,english,Avery Dennison's (AVY) Q4 results are likely t...,IRobot downgraded to neutral from buy at Sidot...,http://omgili.com/ri/.wHSUbtEfZQRfU.5KUm1RkeXy...
1,2018-01-30T18:29:07.001+02:00,french,"1m95, c’est trop grand. Et sa stature, Bertran...","""Bertrand Zibi Abeghe, encore prisonnier, et t...",http://omgili.com/ri/.wHSUbtEfZTpzFtnXyQJIwJ.j...
2,2018-01-30T18:29:40.000+02:00,english,Tuggers and Topper Industrial Carts Help Trans...,Tuggers and Topper Industrial Carts Help Trans...,http://omgili.com/ri/jHIAmI4hxg.zDiulpymXqU_n4...
3,2018-01-30T18:30:05.007+02:00,english,Currently adding the following games:\n100 (by...,,http://omgili.com/ri/.0rSU5LtMgyggHgoOVy9TMDWT...
4,2018-01-30T18:30:05.013+02:00,english,Quote: : » Currently adding the following game...,,http://omgili.com/ri/.0rSU5LtMgyggHgoOVy9TMDWT...


In [5]:
df = df[df.language=='english']

In [6]:
df['text_clean'] = df['text'].map(lambda x: re.sub('[^a-zA-Z0-9 @ . , : - _]', '', str(x)))

In [7]:
df[['text', 'text_clean']].head(5)

Unnamed: 0,text,text_clean
0,Avery Dennison's (AVY) Q4 results are likely t...,Avery Dennisons AVY Q4 results are likely to g...
2,Tuggers and Topper Industrial Carts Help Trans...,Tuggers and Topper Industrial Carts Help Trans...
3,Currently adding the following games:\n100 (by...,Currently adding the following games:100 by ev...
4,Quote: : » Currently adding the following game...,Quote: : Currently adding the following games...
5,Quote: : » Currently adding the following game...,Quote: : Currently adding the following games...


In [8]:
stop = set(stopwords.words('english'))
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()
def clean(doc):
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    return normalized

In [9]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/sshepa74/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [10]:
news_clean = [clean(doc).split() for doc in df.text_clean.to_list()]

In [11]:
dictionary = corpora.Dictionary(news_clean)

In [12]:
%time doc_term_matrix = [dictionary.doc2bow(doc) for doc in news_clean]

CPU times: user 36 ms, sys: 3.54 ms, total: 39.5 ms
Wall time: 38.6 ms


In [13]:
%time ldamodel3 = LdaMulticore(doc_term_matrix, num_topics=3, id2word = dictionary, passes=50)
%time ldamodel5 = LdaMulticore(doc_term_matrix, num_topics=5, id2word = dictionary, passes=50)
%time ldamodel7 = LdaMulticore(doc_term_matrix, num_topics=7, id2word = dictionary, passes=50)
%time ldamodel9 = LdaMulticore(doc_term_matrix, num_topics=9, id2word = dictionary, passes=50)

CPU times: user 11.4 s, sys: 621 ms, total: 12.1 s
Wall time: 13.3 s
CPU times: user 15.9 s, sys: 1.21 s, total: 17.2 s
Wall time: 14.6 s
CPU times: user 17.5 s, sys: 1.47 s, total: 19 s
Wall time: 13.1 s
CPU times: user 17.5 s, sys: 1.46 s, total: 19 s
Wall time: 12.8 s


In [14]:
print(*ldamodel3.print_topics(num_topics=3, num_words=5), sep='\n')
print()
print(*ldamodel5.print_topics(num_topics=5, num_words=5), sep='\n')
print()
print(*ldamodel7.print_topics(num_topics=7, num_words=5), sep='\n')
print()
print(*ldamodel9.print_topics(num_topics=9, num_words=5), sep='\n')

(0, '0.012*"market" + 0.009*"tax" + 0.007*"u" + 0.005*"city" + 0.005*"state"')
(1, '0.009*"amazon" + 0.008*"sphere" + 0.008*"company" + 0.008*"seattle" + 0.005*"2018"')
(2, '0.008*"plant" + 0.008*"caterpillar" + 0.007*"share" + 0.007*"inc" + 0.006*"jan"')

(0, '0.015*"market" + 0.009*"company" + 0.006*"new" + 0.005*"iot" + 0.005*"2018"')
(1, '0.011*"sphere" + 0.011*"amazon" + 0.010*"seattle" + 0.007*"space" + 0.006*"2018"')
(2, '0.015*"tax" + 0.011*"u" + 0.006*"china" + 0.006*"year" + 0.006*"repatriation"')
(3, '0.017*"market" + 0.012*"inc" + 0.007*"jan" + 0.006*"industry" + 0.005*"report"')
(4, '0.011*"plant" + 0.007*"case" + 0.007*"city" + 0.007*"caterpillar" + 0.007*"estimate"')

(0, '0.019*"plant" + 0.012*"caterpillar" + 0.012*"case" + 0.010*"share" + 0.009*"company"')
(1, '0.009*"city" + 0.009*"median" + 0.008*"estimate" + 0.008*"university" + 0.007*"2017"')
(2, '0.023*"tax" + 0.016*"u" + 0.009*"china" + 0.009*"repatriation" + 0.007*"would"')
(3, '0.015*"inc" + 0.009*"jan" + 0.006

In [15]:
lda_display = pyLDAvis.gensim.prepare(ldamodel3, doc_term_matrix, dictionary, sort_topics=False, mds='mmds')
pyLDAvis.display(lda_display)

KeyboardInterrupt: 