## Advanced text mining with Python - Topic Modeling

In [12]:
#!pip install pyLDAvis

In [13]:
import sys
print(sys.version)

3.7.2 (default, Feb 12 2019, 08:15:36) 
[Clang 10.0.0 (clang-1000.11.45.5)]


In [14]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [15]:
import time
import math
import re
from textblob import TextBlob
import pandas as pd

import nltk as nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

import string

import warnings
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')

import gensim
from gensim import corpora, models
from gensim.models.ldamulticore import LdaMulticore
import pyLDAvis.gensim



In [16]:
directory = '~/Datasets/32018/'
#file = 'jeep.txt'
file = 'jeep_new.txt'
path = directory + file

In [17]:
tweets = pd.read_csv(path,sep='\t', names = ['id', 'lang', 'created_at', 'screen_name', \
                                                       'name', 'location', 'retweet_count', 'text'])

In [18]:
tweets.count()

id               68921
lang             68921
created_at       68921
screen_name      68921
name             68920
location         46879
retweet_count    68921
text             68921
dtype: int64

In [19]:
tweets.head(10)

Unnamed: 0,id,lang,created_at,screen_name,name,location,retweet_count,text
0,9.222399e+17,en,Sun Oct 22 23:15:03 +0000 2017,alyssa_rose4,Princess Alyssa♛,"Chicago, IL",0.0,@Rachel_31297 where’s the Jeep Wrangler option
1,9.222399e+17,en,Sun Oct 22 23:15:09 +0000 2017,negocialoya_us,NegocialoYa USA,Estados Unidos,0.0,Check this out: 2016 JEEP PATRIOT LATITUDE 4X4...
2,9.2224e+17,tl,Sun Oct 22 23:15:14 +0000 2017,Jasmne_abr,Jas,blueberry,0.0,Kadugay sa jeep
3,9.2224e+17,en,Sun Oct 22 23:15:15 +0000 2017,JFCO38,JUAN FCO,,0.0,RT @Jeep: The Grand Cherokee Trackhawk is offi...
4,9.2224e+17,tl,Sun Oct 22 23:15:15 +0000 2017,MaestreRaymond,Jin Rae Min,Northern Mindanao,0.0,Kadugay mularga sa jeep nga kadali ko😬😭
5,9.2224e+17,tl,Sun Oct 22 23:15:21 +0000 2017,troyxaquino,Troy,,0.0,Bang luluwag ng mga jeep hahaha
6,9.2224e+17,en,Sun Oct 22 23:15:27 +0000 2017,LifeForTrucker,TruckerForLife™,United States,0.0,RT @Jeep: Power stance. https://t.co/kl0oC8Xvof
7,9.2224e+17,en,Sun Oct 22 23:15:30 +0000 2017,rpx53,Rod O|||||||O,Where the blacktop ends,0.0,"@THEJeepMafia @Jeep Thanks, 28° but Chaos and ..."
8,9.2224e+17,tl,Sun Oct 22 23:15:34 +0000 2017,ronneldash,🏳️‍🌈ROX🏳️‍🌈,St. Paul,0.0,tangina mula 6:00 nasa sakayan ako tas 7:30 na...
9,9.222401e+17,tl,Sun Oct 22 23:15:49 +0000 2017,FayeUsi,aria,"Las Pinas City, National Capit",0.0,@Dncyngs ikaw yung nakakasabay ko lagi sa jeep...


In [20]:
# Filter non-English tweets
tweets_eng = tweets[tweets['lang']=='en'].reset_index(drop=True)

In [21]:
# Remove special characters to avoid problems with analysis
tweets_eng['text_clean'] = tweets_eng['text'].map(lambda x: re.sub('[^a-zA-Z0-9 @ . , : - _]', '', str(x)))

In [22]:
pd.set_option('display.max_colwidth', 100)
tweets_eng[['text', 'text_clean']].head(5)

Unnamed: 0,text,text_clean
0,@Rachel_31297 where’s the Jeep Wrangler option,@Rachel_31297 wheres the Jeep Wrangler option
1,Check this out: 2016 JEEP PATRIOT LATITUDE 4X4 - $8500 (MIAMI-DADE) 8500.00 USD https://t.co/akP...,Check this out: 2016 JEEP PATRIOT LATITUDE 4X4 8500 MIAMIDADE 8500.00 USD https:t.coakPkANPpEn ...
2,RT @Jeep: The Grand Cherokee Trackhawk is officially in production. Keep your eyes peeled for on...,RT @Jeep: The Grand Cherokee Trackhawk is officially in production. Keep your eyes peeled for on...
3,RT @Jeep: Power stance. https://t.co/kl0oC8Xvof,RT @Jeep: Power stance. https:t.cokl0oC8Xvof
4,"@THEJeepMafia @Jeep Thanks, 28° but Chaos and Bear were awesome feet warmers! 🐾🐾 https://t.co/nN...","@THEJeepMafia @Jeep Thanks, 28 but Chaos and Bear were awesome feet warmers https:t.conNIoaRVXlW"


## Topic Modeling
#### Topics can be defined as “a repeating pattern of co-occurring terms in a corpus”

### TF-IDF (term frequency–inverse document frequency)

#### Using TextBlob functionality to create TF-IDF function for our select Tweets

In [23]:
# http://stevenloria.com/finding-important-words-in-a-document-using-tf-idf/

def tf(word, blob):
    return blob.words.count(word) / len(blob.words)
# tf(word, blob) computes "term frequency" which is the number of times a word appears in a document blob, 
# normalized by dividing by the total number of words in blob. We use TextBlob for breaking up the text into words 
# and getting the word counts.


def n_containing(word, bloblist):
    return sum(1 for blob in bloblist if word in blob.words)
# n_containing(word, bloblist) returns the number of documents containing word. 
# A generator expression is passed to the sum() function.


def idf(word, bloblist):
    return math.log(len(bloblist) / (1 + n_containing(word, bloblist)))
# idf(word, bloblist) computes "inverse document frequency" which measures how common a word is 
# among all documents in bloblist. The more common a word is, the lower its idf. 
# We take the ratio of the total number of documents to the number of documents containing word, 
# then take the log of that. Add 1 to the divisor to prevent division by zero


def tfidf(word, blob, bloblist):
    return tf(word, blob) * idf(word, bloblist)
# tfidf(word, blob, bloblist) computes the TF-IDF score. It is simply the product of tf and idf.

In [24]:
bloblist = []
del bloblist[:]

for i  in range(0,len(tweets_eng)):
    bloblist.append(TextBlob(tweets_eng['text_clean'].iloc[i]))
    
len(bloblist)  

40704

In [25]:
for i, blob in enumerate(bloblist):
# Print top 5 values
    if i == 5:
        break
    print("Top words in tweet {}".format(i + 1))
    scores = {word: tfidf(word, blob, bloblist) for word in blob.words}
    sorted_words = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    for word, score in sorted_words[:5]:
        print("\tWord: {}, TF-IDF: {}".format(word, round(score, 5)))

Top words in tweet 1
	Word: Rachel_31297, TF-IDF: 1.65349
	Word: wheres, TF-IDF: 1.50077
	Word: option, TF-IDF: 1.226
	Word: Wrangler, TF-IDF: 0.38572
	Word: the, TF-IDF: 0.28659
Top words in tweet 2
	Word: t.coakPkANPpEn, TF-IDF: 0.58358
	Word: t.coL2WaZBsUTf, TF-IDF: 0.58358
	Word: MIAMIDADE, TF-IDF: 0.55973
	Word: 8500.00, TF-IDF: 0.55973
	Word: 8500, TF-IDF: 0.51896
Top words in tweet 3
	Word: t.cobbiPfPXXax, TF-IDF: 0.34509
	Word: peeled, TF-IDF: 0.34191
	Word: Keep, TF-IDF: 0.31915
	Word: wild, TF-IDF: 0.31915
	Word: officially, TF-IDF: 0.31637
Top words in tweet 4
	Word: t.cokl0oC8Xvof, TF-IDF: 0.89363
	Word: stance, TF-IDF: 0.89104
	Word: Power, TF-IDF: 0.85214
	Word: RT, TF-IDF: 0.18822
	Word: Jeep, TF-IDF: 0.06069
Top words in tweet 5
	Word: t.conNIoaRVXlW, TF-IDF: 0.70864
	Word: warmers, TF-IDF: 0.65913
	Word: Chaos, TF-IDF: 0.63017
	Word: Bear, TF-IDF: 0.60962
	Word: feet, TF-IDF: 0.56011


## LDA (latent dirichlet allocation)
#### LDA is a matrix factorization technique, which assumes documents are produced from a mixture of topics. Those topics then generate words based on their probability distribution.

In [15]:
#https://www.analyticsvidhya.com/blog/2016/08/beginners-guide-to-topic-modeling-in-python/

In [16]:
doc1 = "BMW upbeat sustained sales growth"
doc2 = "Ad wars When BMW Audi Mercedes Benz Jaguar prove prowess through advertisements"
doc3 = "BMW Protonic Frozen Yellow Edition Looks So Cool"
doc4 = "Judge Shuts Door On SoftClose Defect Suit Against BMW Law"
doc5 = "Just Listed BMW Alpina B Turbo Automobile Magazine"
doc6 = "How take part BMW Ultimate Driving Experience"
doc7 = "Long Beach BMW Motorcycles Becomes First BMW Dealer Offer Virtual Reality Experience Virtual Reality Reporter"
doc8 = "NYC Auto Show BMW M Performance Video Overview"
doc9 = "BMW F X Spy video shows SUV stress test"
doc10 = "Driver taken hospital BMW smashes tree Stourbridge Express Star"

# compile documents
doc_complete = [doc1, doc2, doc3, doc4, doc5, doc6, doc7, doc8, doc9, doc10]

In [17]:
stop = set(stopwords.words('english'))
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()
def clean(doc):
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    return normalized

doc_clean = [clean(doc).split() for doc in doc_complete]     

In [19]:
doc_clean[:3]

[['bmw', 'upbeat', 'sustained', 'sale', 'growth'],
 ['ad',
  'war',
  'bmw',
  'audi',
  'mercedes',
  'benz',
  'jaguar',
  'prove',
  'prowess',
  'advertisement'],
 ['bmw', 'protonic', 'frozen', 'yellow', 'edition', 'look', 'cool']]

In [18]:
# Creating the term dictionary of our courpus, where every unique term is assigned an index. 
dictionary = corpora.Dictionary(doc_clean)

# Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]

https://towardsdatascience.com/topic-modelling-in-python-with-nltk-and-gensim-4ef03213cd21

In [21]:
# Creating the object for LDA model using gensim library
Lda = gensim.models.ldamodel.LdaModel

### Three-topic Model

In [24]:
# Running and Trainign LDA model on the document term matrix.
%time ldamodel = Lda(doc_term_matrix, num_topics=3, id2word = dictionary, passes=50) #3 topics
print(*ldamodel.print_topics(num_topics=3, num_words=3), sep='\n')

Wall time: 501 ms
(0, '0.048*"sustained" + 0.048*"sale" + 0.048*"upbeat"')
(1, '0.083*"bmw" + 0.025*"stourbridge" + 0.025*"smash"')
(2, '0.090*"bmw" + 0.033*"reality" + 0.033*"virtual"')


#### For larger datasets LdaMulticore should provide significant speed improvements

In [28]:
%time ldamodel = LdaMulticore(doc_term_matrix, num_topics=3, id2word = dictionary, passes=50) #3 topics
print(*ldamodel.print_topics(num_topics=3, num_words=3), sep='\n')

Wall time: 9.68 s
(0, '0.076*"bmw" + 0.031*"express" + 0.031*"star"')
(1, '0.092*"bmw" + 0.034*"virtual" + 0.034*"reality"')
(2, '0.060*"bmw" + 0.035*"video" + 0.035*"show"')


In [61]:
#topics = ldamodel.print_topics(num_words=3)
#for topic in topics:
#    print(topic)

In [62]:
lda_display = pyLDAvis.gensim.prepare(ldamodel, doc_term_matrix, dictionary, sort_topics=False, mds='mmds')
pyLDAvis.display(lda_display)

### Five-topic Model

In [69]:
%time ldamodel = Lda(doc_term_matrix, num_topics=5, id2word = dictionary, passes=50) #5 topics
print(*ldamodel.print_topics(num_topics=5, num_words=5), sep='\n')

Wall time: 500 ms
(0, '0.077*"bmw" + 0.077*"virtual" + 0.077*"reality" + 0.042*"experience" + 0.042*"becomes"')
(1, '0.058*"turbo" + 0.058*"automobile" + 0.058*"b" + 0.058*"listed" + 0.058*"magazine"')
(2, '0.095*"bmw" + 0.022*"audi" + 0.022*"ad" + 0.022*"mercedes" + 0.022*"jaguar"')
(3, '0.065*"growth" + 0.065*"sale" + 0.065*"sustained" + 0.065*"upbeat" + 0.064*"bmw"')
(4, '0.077*"bmw" + 0.042*"video" + 0.042*"show" + 0.042*"shuts" + 0.042*"softclose"')


In [64]:
lda_display = pyLDAvis.gensim.prepare(ldamodel, doc_term_matrix, dictionary, sort_topics=False, mds='mmds')
pyLDAvis.display(lda_display)

### Ten-topic Model

In [71]:
%time ldamodel = Lda(doc_term_matrix, num_topics=10, id2word = dictionary, passes=50)
print(*ldamodel.print_topics(num_topics=10, num_words=5), sep='\n')

Wall time: 484 ms
(0, '0.015*"virtual" + 0.015*"auto" + 0.015*"reporter" + 0.015*"offer" + 0.015*"nyc"')
(1, '0.015*"virtual" + 0.015*"auto" + 0.015*"reporter" + 0.015*"offer" + 0.015*"nyc"')
(2, '0.096*"bmw" + 0.096*"reality" + 0.096*"virtual" + 0.050*"experience" + 0.050*"reporter"')
(3, '0.015*"virtual" + 0.015*"auto" + 0.015*"reporter" + 0.015*"offer" + 0.015*"nyc"')
(4, '0.070*"video" + 0.070*"show" + 0.070*"f" + 0.070*"stress" + 0.070*"spy"')
(5, '0.093*"upbeat" + 0.093*"sustained" + 0.093*"sale" + 0.093*"growth" + 0.093*"bmw"')
(6, '0.080*"edition" + 0.080*"frozen" + 0.080*"cool" + 0.080*"look" + 0.080*"yellow"')
(7, '0.070*"smash" + 0.070*"stourbridge" + 0.070*"hospital" + 0.070*"driver" + 0.070*"star"')
(8, '0.114*"bmw" + 0.025*"door" + 0.025*"magazine" + 0.025*"law" + 0.025*"defect"')
(9, '0.015*"virtual" + 0.015*"auto" + 0.015*"reporter" + 0.015*"offer" + 0.015*"nyc"')


In [72]:
lda_display = pyLDAvis.gensim.prepare(ldamodel, doc_term_matrix, dictionary, sort_topics=False, mds='mmds')
pyLDAvis.display(lda_display)

### Applying LDA to tweets

In [121]:
tweets_list = tweets_eng['text_clean'].tolist()
tweets_list[:5]

['@Rachel_31297 wheres the Jeep Wrangler option',
 'Check this out: 2016 JEEP PATRIOT LATITUDE 4X4  8500 MIAMIDADE 8500.00 USD https:t.coakPkANPpEn ads https:t.coL2WaZBsUTf',
 'RT @Jeep: The Grand Cherokee Trackhawk is officially in production. Keep your eyes peeled for one in the wild. https:t.cobbiPfPXXax',
 'RT @Jeep: Power stance. https:t.cokl0oC8Xvof',
 '@THEJeepMafia @Jeep Thanks, 28 but Chaos and Bear were awesome feet warmers  https:t.conNIoaRVXlW']

In [122]:
stop = set(stopwords.words('english'))
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()
def clean(doc):
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    return normalized

tweet_clean = [clean(doc).split() for doc in tweets_list]

In [123]:
print(*tweet_clean[:3], sep='\n\n')

['rachel31297', 'wheres', 'jeep', 'wrangler', 'option']

['check', 'out', '2016', 'jeep', 'patriot', 'latitude', '4x4', '8500', 'miamidade', '850000', 'usd', 'httpstcoakpkanppen', 'ad', 'httpstcol2wazbsutf']

['rt', 'jeep', 'grand', 'cherokee', 'trackhawk', 'officially', 'production', 'keep', 'eye', 'peeled', 'one', 'wild', 'httpstcobbipfpxxax']


In [124]:
# Creating the term dictionary of our corpus, where every unique term is assigned an index. 

dictionary = corpora.Dictionary(tweet_clean)

# Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.

%time doc_term_matrix = [dictionary.doc2bow(doc) for doc in tweet_clean]

Wall time: 527 ms


In [125]:
#Using traditional LDA
%time ldamodel = Lda(doc_term_matrix, num_topics=10, id2word = dictionary, passes=50)

Wall time: 13min 36s


In [126]:
#Using multicore LDA
%time ldamodel = LdaMulticore(doc_term_matrix, num_topics=10, id2word = dictionary, passes=50)

Wall time: 8min 9s


In [127]:
print(*ldamodel.print_topics(num_topics=10, num_words=3), sep='\n')

(0, '0.111*"jeep" + 0.073*"wrangler" + 0.022*"unlimited"')
(1, '0.121*"jeep" + 0.106*"rt" + 0.016*"luxbucketlist"')
(2, '0.067*"jeep" + 0.038*"rt" + 0.017*"compass"')
(3, '0.098*"jeep" + 0.012*"im" + 0.011*"rt"')
(4, '0.085*"jeep" + 0.032*"used" + 0.026*"rt"')
(5, '0.063*"jeep" + 0.020*"via" + 0.019*"check"')
(6, '0.107*"giveaway" + 0.059*"jeep" + 0.059*"girl"')
(7, '0.086*"jeep" + 0.044*"rt" + 0.022*"jeepporn"')
(8, '0.101*"jeep" + 0.062*"cherokee" + 0.046*"grand"')
(9, '0.078*"jeep" + 0.027*"rt" + 0.013*"wrangler"')


In [128]:
print(*ldamodel.print_topics(num_topics=10, num_words=3), sep='\n')

(0, '0.111*"jeep" + 0.073*"wrangler" + 0.022*"unlimited"')
(1, '0.121*"jeep" + 0.106*"rt" + 0.016*"luxbucketlist"')
(2, '0.067*"jeep" + 0.038*"rt" + 0.017*"compass"')
(3, '0.098*"jeep" + 0.012*"im" + 0.011*"rt"')
(4, '0.085*"jeep" + 0.032*"used" + 0.026*"rt"')
(5, '0.063*"jeep" + 0.020*"via" + 0.019*"check"')
(6, '0.107*"giveaway" + 0.059*"jeep" + 0.059*"girl"')
(7, '0.086*"jeep" + 0.044*"rt" + 0.022*"jeepporn"')
(8, '0.101*"jeep" + 0.062*"cherokee" + 0.046*"grand"')
(9, '0.078*"jeep" + 0.027*"rt" + 0.013*"wrangler"')


In [129]:
print(*ldamodel.print_topics(num_topics=10, num_words=5), sep='\n\n')

(0, '0.111*"jeep" + 0.073*"wrangler" + 0.022*"unlimited" + 0.022*"sport" + 0.018*"rt"')

(1, '0.121*"jeep" + 0.106*"rt" + 0.016*"luxbucketlist" + 0.015*"jeeplife" + 0.013*"need"')

(2, '0.067*"jeep" + 0.038*"rt" + 0.017*"compass" + 0.013*"amp" + 0.010*"world"')

(3, '0.098*"jeep" + 0.012*"im" + 0.011*"rt" + 0.011*"like" + 0.010*"get"')

(4, '0.085*"jeep" + 0.032*"used" + 0.026*"rt" + 0.026*"photo" + 0.025*"spotted"')

(5, '0.063*"jeep" + 0.020*"via" + 0.019*"check" + 0.019*"ebay" + 0.017*"wrangler"')

(6, '0.107*"giveaway" + 0.059*"jeep" + 0.059*"girl" + 0.057*"win" + 0.055*"small"')

(7, '0.086*"jeep" + 0.044*"rt" + 0.022*"jeepporn" + 0.016*"beep" + 0.012*"make"')

(8, '0.101*"jeep" + 0.062*"cherokee" + 0.046*"grand" + 0.018*"2017" + 0.013*"wrangler"')

(9, '0.078*"jeep" + 0.027*"rt" + 0.013*"wrangler" + 0.011*"mile" + 0.010*"stuck"')


In [130]:
print(*ldamodel.print_topics(num_topics=10, num_words=7), sep='\n\n')

(0, '0.111*"jeep" + 0.073*"wrangler" + 0.022*"unlimited" + 0.022*"sport" + 0.018*"rt" + 0.013*"ebay" + 0.013*"rubicon"')

(1, '0.121*"jeep" + 0.106*"rt" + 0.016*"luxbucketlist" + 0.015*"jeeplife" + 0.013*"need" + 0.009*"matte" + 0.009*"httpstcoyipkreck1n"')

(2, '0.067*"jeep" + 0.038*"rt" + 0.017*"compass" + 0.013*"amp" + 0.010*"world" + 0.008*"new" + 0.008*"around"')

(3, '0.098*"jeep" + 0.012*"im" + 0.011*"rt" + 0.011*"like" + 0.010*"get" + 0.008*"want" + 0.008*"car"')

(4, '0.085*"jeep" + 0.032*"used" + 0.026*"rt" + 0.026*"photo" + 0.025*"spotted" + 0.025*"stick" + 0.024*"bamboo"')

(5, '0.063*"jeep" + 0.020*"via" + 0.019*"check" + 0.019*"ebay" + 0.017*"wrangler" + 0.015*"decal" + 0.013*"oracal"')

(6, '0.107*"giveaway" + 0.059*"jeep" + 0.059*"girl" + 0.057*"win" + 0.055*"small" + 0.055*"body" + 0.054*"chance"')

(7, '0.086*"jeep" + 0.044*"rt" + 0.022*"jeepporn" + 0.016*"beep" + 0.012*"make" + 0.011*"looking" + 0.010*"mean"')

(8, '0.101*"jeep" + 0.062*"cherokee" + 0.046*"grand" + 0

In [131]:
print(*ldamodel.print_topics(num_topics=10, num_words=10), sep='\n\n')

(0, '0.111*"jeep" + 0.073*"wrangler" + 0.022*"unlimited" + 0.022*"sport" + 0.018*"rt" + 0.013*"ebay" + 0.013*"rubicon" + 0.010*"2016" + 0.007*"4x4" + 0.007*"mile"')

(1, '0.121*"jeep" + 0.106*"rt" + 0.016*"luxbucketlist" + 0.015*"jeeplife" + 0.013*"need" + 0.009*"matte" + 0.009*"httpstcoyipkreck1n" + 0.009*"better" + 0.008*"life" + 0.008*"black"')

(2, '0.067*"jeep" + 0.038*"rt" + 0.017*"compass" + 0.013*"amp" + 0.010*"world" + 0.008*"new" + 0.008*"around" + 0.008*"makeinindia" + 0.006*"internationally" + 0.006*"madeinindia"')

(3, '0.098*"jeep" + 0.012*"im" + 0.011*"rt" + 0.011*"like" + 0.010*"get" + 0.008*"want" + 0.008*"car" + 0.007*"got" + 0.007*"love" + 0.007*"back"')

(4, '0.085*"jeep" + 0.032*"used" + 0.026*"rt" + 0.026*"photo" + 0.025*"spotted" + 0.025*"stick" + 0.024*"bamboo" + 0.024*"nigeria" + 0.024*"hummer" + 0.024*"convey"')

(5, '0.063*"jeep" + 0.020*"via" + 0.019*"check" + 0.019*"ebay" + 0.017*"wrangler" + 0.015*"decal" + 0.013*"oracal" + 0.012*"graphic" + 0.012*"side" +

In [132]:
lda_display = pyLDAvis.gensim.prepare(ldamodel, doc_term_matrix, dictionary, sort_topics=False, mds='mmds')
pyLDAvis.display(lda_display)

## TF-IDF on news articles

In [61]:
directory = 'C://Users//IBM_ADMIN//Documents//Teaching//Data Projects//Text//Webhose//'
news_articles = 'news_toyota.pkl'
path = directory+news_articles

In [62]:
news_df = pd.read_pickle(directory+news_articles)

In [63]:
news_df.head(5)

Unnamed: 0,crawled,language,text,title
0,2018-02-02T04:24:51.072+02:00,english,"QR Code Link to This Post All maintenance receipts available, one owner truck. Cash sale. No tra...",Dependable truck 03 Toyota Tacoma Double Cab $1500
1,2018-02-02T04:27:15.000+02:00,english,"0 \nNEW YORK: Automakers reported mixed US car sales in January, with strong demand for SUVs and...",US car sales mixed in January; trucks stay strong
2,2018-02-02T04:34:00.008+02:00,english,transmission: automatic 2005 Toyota Camry LE 4 door 4 cyl AUTOMATIC VERY CLEAN INSIDE CLOTH IN...,2005 TOYOTA CAMRY LE 167300 MILEAGE $2450 (TALLASSEE) $2450
3,2018-02-02T04:36:42.006+02:00,english,favorite this post Brand New Toyota Avalon Floor Mats - $115 (New Britain) hide this posting unh...,Brand New Toyota Avalon Floor Mats (New Britain) $115
4,2018-02-02T04:38:24.018+02:00,english,more ads by this user QR Code Link to This Post Black w/Piano Black w/Perforated NuLuxe Seat Tri...,2016 Lexus ES 350 (Coliseum Lexus of Oakland) $27772


In [64]:
news_df.shape

(100, 4)

In [65]:
# Filter non-English tweets
news_eng = news_df[news_df['language']=='english'].reset_index(drop=True)

In [66]:
# Remove special characters to avoid problems with analysis
news_eng['text_clean'] = news_eng['text'].map(lambda x: re.sub('[^a-zA-Z0-9 @ . , : - _]', '', str(x)))

In [67]:
pd.set_option('display.max_colwidth', 100)
    news_eng[['text', 'text_clean']].head(5)

Unnamed: 0,text,text_clean
0,"QR Code Link to This Post All maintenance receipts available, one owner truck. Cash sale. No tra...","QR Code Link to This Post All maintenance receipts available, one owner truck. Cash sale. No tra..."
1,"0 \nNEW YORK: Automakers reported mixed US car sales in January, with strong demand for SUVs and...","0 NEW YORK: Automakers reported mixed US car sales in January, with strong demand for SUVs and p..."
2,transmission: automatic 2005 Toyota Camry LE 4 door 4 cyl AUTOMATIC VERY CLEAN INSIDE CLOTH IN...,transmission: automatic 2005 Toyota Camry LE 4 door 4 cyl AUTOMATIC VERY CLEAN INSIDE CLOTH IN...
3,favorite this post Brand New Toyota Avalon Floor Mats - $115 (New Britain) hide this posting unh...,favorite this post Brand New Toyota Avalon Floor Mats 115 New Britain hide this posting unhide ...
4,more ads by this user QR Code Link to This Post Black w/Piano Black w/Perforated NuLuxe Seat Tri...,more ads by this user QR Code Link to This Post Black wPiano Black wPerforated NuLuxe Seat Trim....


In [68]:
bloblist = []
del bloblist[:]

for i  in range(0,len(news_eng)):
    bloblist.append(TextBlob(news_eng['text_clean'].iloc[i]))
    
len(bloblist) 

100

In [69]:
for i, blob in enumerate(bloblist):
# Print top 5 values
    if i == 5:
        break
    print("Top words in news article {}".format(i + 1))
    scores = {word: tfidf(word, blob, bloblist) for word in blob.words}
    sorted_words = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    for word, score in sorted_words[:10]:
        print("\tWord: {}, TF-IDF: {}".format(word, round(score, 5)))

Top words in news article 1
	Word: receipts, TF-IDF: 0.21733
	Word: Cash, TF-IDF: 0.21733
	Word: 6477478013, TF-IDF: 0.21733
	Word: sale, TF-IDF: 0.19481
	Word: maintenance, TF-IDF: 0.17883
	Word: owner, TF-IDF: 0.17883
	Word: available, TF-IDF: 0.16643
	Word: trades, TF-IDF: 0.1563
	Word: truck, TF-IDF: 0.11779
	Word: QR, TF-IDF: 0.09844
Top words in news article 2
	Word: And, TF-IDF: 0.06643
	Word: In, TF-IDF: 0.05853
	Word: sales, TF-IDF: 0.04664
	Word: US, TF-IDF: 0.02365
	Word: The, TF-IDF: 0.02218
	Word: pickup, TF-IDF: 0.02214
	Word: saw, TF-IDF: 0.02117
	Word: strong, TF-IDF: 0.02106
	Word: percent, TF-IDF: 0.01984
	Word: overall, TF-IDF: 0.0195
Top words in news article 3
	Word: AUTOMATIC, TF-IDF: 0.18935
	Word: automatic, TF-IDF: 0.15643
	Word: LE, TF-IDF: 0.11506
	Word: cyl, TF-IDF: 0.11506
	Word: VERY, TF-IDF: 0.11506
	Word: INSIDE, TF-IDF: 0.11506
	Word: CLOTH, TF-IDF: 0.11506
	Word: INTERIOR, TF-IDF: 0.11506
	Word: NICE, TF-IDF: 0.11506
	Word: Just, TF-IDF: 0.11506
Top wo

### Applying LDA to news articles

In [70]:
news_list = news_eng['text_clean'].tolist()
news_list[:1]

['QR Code Link to This Post All maintenance receipts available, one owner truck. Cash sale. No trades.   6477478013']

In [71]:
news_clean = [clean(doc).split() for doc in news_list]

In [72]:
len(news_clean)

100

In [73]:
print(*news_clean[:1], sep='\n\n')

['qr', 'code', 'link', 'post', 'maintenance', 'receipt', 'available', 'one', 'owner', 'truck', 'cash', 'sale', 'trade', '6477478013']


In [74]:
# Creating the term dictionary of our courpus, where every unique term is assigned an index. 

dictionary = corpora.Dictionary(news_clean)

# Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.

%time doc_term_matrix = [dictionary.doc2bow(doc) for doc in news_clean]

Wall time: 17 ms


#### 3 topic model

In [100]:
numtopics = 3

%time ldamodel = LdaMulticore(doc_term_matrix, num_topics=numtopics, id2word = dictionary, passes=50)

Wall time: 37 s


In [101]:
`

(0, '0.013*"percent" + 0.011*"car" + 0.005*"said"')
(1, '0.019*"toyota" + 0.011*"vehicle" + 0.011*"sale"')
(2, '0.019*"u" + 0.018*"percent" + 0.012*"cent"')


In [102]:
print(*ldamodel.print_topics(num_topics=numtopics, num_words=5), sep='\n\n')

(0, '0.013*"percent" + 0.011*"car" + 0.005*"said" + 0.005*"toyota" + 0.004*"job"')

(1, '0.019*"toyota" + 0.011*"vehicle" + 0.011*"sale" + 0.008*"ford" + 0.008*"year"')

(2, '0.019*"u" + 0.018*"percent" + 0.012*"cent" + 0.011*"earnings" + 0.011*"per"')


In [103]:
print(*ldamodel.print_topics(num_topics=numtopics, num_words=7), sep='\n\n')

(0, '0.013*"percent" + 0.011*"car" + 0.005*"said" + 0.005*"toyota" + 0.004*"job" + 0.004*"state" + 0.004*"company"')

(1, '0.019*"toyota" + 0.011*"vehicle" + 0.011*"sale" + 0.008*"ford" + 0.008*"year" + 0.005*"2018" + 0.005*"motor"')

(2, '0.019*"u" + 0.018*"percent" + 0.012*"cent" + 0.011*"earnings" + 0.011*"per" + 0.011*"yield" + 0.010*"share"')


In [104]:
print(*ldamodel.print_topics(num_topics=numtopics, num_words=10), sep='\n\n')

(0, '0.013*"percent" + 0.011*"car" + 0.005*"said" + 0.005*"toyota" + 0.004*"job" + 0.004*"state" + 0.004*"company" + 0.004*"d" + 0.004*"export" + 0.004*"also"')

(1, '0.019*"toyota" + 0.011*"vehicle" + 0.011*"sale" + 0.008*"ford" + 0.008*"year" + 0.005*"2018" + 0.005*"motor" + 0.005*"new" + 0.005*"car" + 0.004*"january"')

(2, '0.019*"u" + 0.018*"percent" + 0.012*"cent" + 0.011*"earnings" + 0.011*"per" + 0.011*"yield" + 0.010*"share" + 0.010*"index" + 0.008*"lower" + 0.008*"investor"')


In [105]:
lda_display = pyLDAvis.gensim.prepare(ldamodel, doc_term_matrix, dictionary, sort_topics=False, mds='mmds')
pyLDAvis.display(lda_display)

#### 5 topic model

In [106]:
numtopics = 5

%time ldamodel = LdaMulticore(doc_term_matrix, num_topics=numtopics, id2word = dictionary, passes=50)

Wall time: 37.7 s


In [109]:
print(*ldamodel.print_topics(num_topics=numtopics, num_words=3), sep='\n\n')

(0, '0.017*"unit" + 0.014*"market" + 0.014*"percent"')

(1, '0.021*"toyota" + 0.014*"sale" + 0.013*"vehicle"')

(2, '0.017*"ford" + 0.010*"post" + 0.009*"toyota"')

(3, '0.021*"u" + 0.019*"percent" + 0.012*"earnings"')

(4, '0.019*"car" + 0.006*"d" + 0.005*"law"')


In [107]:
print(*ldamodel.print_topics(num_topics=numtopics, num_words=5), sep='\n\n')

(0, '0.017*"unit" + 0.014*"market" + 0.014*"percent" + 0.013*"toyota" + 0.009*"january"')

(1, '0.021*"toyota" + 0.014*"sale" + 0.013*"vehicle" + 0.009*"percent" + 0.006*"japan"')

(2, '0.017*"ford" + 0.010*"post" + 0.009*"toyota" + 0.008*"year" + 0.007*"contact"')

(3, '0.021*"u" + 0.019*"percent" + 0.012*"earnings" + 0.012*"yield" + 0.011*"index"')

(4, '0.019*"car" + 0.006*"d" + 0.005*"law" + 0.004*"toyota" + 0.004*"player"')


In [108]:
print(*ldamodel.print_topics(num_topics=numtopics, num_words=7), sep='\n\n')

(0, '0.017*"unit" + 0.014*"market" + 0.014*"percent" + 0.013*"toyota" + 0.009*"january" + 0.008*"vehicle" + 0.007*"2018"')

(1, '0.021*"toyota" + 0.014*"sale" + 0.013*"vehicle" + 0.009*"percent" + 0.006*"japan" + 0.006*"year" + 0.005*"industry"')

(2, '0.017*"ford" + 0.010*"post" + 0.009*"toyota" + 0.008*"year" + 0.007*"contact" + 0.006*"margin" + 0.006*"export"')

(3, '0.021*"u" + 0.019*"percent" + 0.012*"earnings" + 0.012*"yield" + 0.011*"index" + 0.009*"share" + 0.009*"lower"')

(4, '0.019*"car" + 0.006*"d" + 0.005*"law" + 0.004*"toyota" + 0.004*"player" + 0.004*"ball" + 0.004*"review"')


In [110]:
print(*ldamodel.print_topics(num_topics=numtopics, num_words=10), sep='\n\n')

(0, '0.017*"unit" + 0.014*"market" + 0.014*"percent" + 0.013*"toyota" + 0.009*"january" + 0.008*"vehicle" + 0.007*"2018" + 0.007*"new" + 0.007*"1" + 0.006*"year"')

(1, '0.021*"toyota" + 0.014*"sale" + 0.013*"vehicle" + 0.009*"percent" + 0.006*"japan" + 0.006*"year" + 0.005*"industry" + 0.005*"lexus" + 0.005*"company" + 0.005*"2018"')

(2, '0.017*"ford" + 0.010*"post" + 0.009*"toyota" + 0.008*"year" + 0.007*"contact" + 0.006*"margin" + 0.006*"export" + 0.006*"service" + 0.005*"company" + 0.005*"motor"')

(3, '0.021*"u" + 0.019*"percent" + 0.012*"earnings" + 0.012*"yield" + 0.011*"index" + 0.009*"share" + 0.009*"lower" + 0.009*"cent" + 0.009*"per" + 0.009*"investor"')

(4, '0.019*"car" + 0.006*"d" + 0.005*"law" + 0.004*"toyota" + 0.004*"player" + 0.004*"ball" + 0.004*"review" + 0.004*"get" + 0.004*"release" + 0.004*"new"')


In [111]:
print(*ldamodel.print_topics(num_topics=numtopics, num_words=10), sep='\n\n')

(0, '0.017*"unit" + 0.014*"market" + 0.014*"percent" + 0.013*"toyota" + 0.009*"january" + 0.008*"vehicle" + 0.007*"2018" + 0.007*"new" + 0.007*"1" + 0.006*"year"')

(1, '0.021*"toyota" + 0.014*"sale" + 0.013*"vehicle" + 0.009*"percent" + 0.006*"japan" + 0.006*"year" + 0.005*"industry" + 0.005*"lexus" + 0.005*"company" + 0.005*"2018"')

(2, '0.017*"ford" + 0.010*"post" + 0.009*"toyota" + 0.008*"year" + 0.007*"contact" + 0.006*"margin" + 0.006*"export" + 0.006*"service" + 0.005*"company" + 0.005*"motor"')

(3, '0.021*"u" + 0.019*"percent" + 0.012*"earnings" + 0.012*"yield" + 0.011*"index" + 0.009*"share" + 0.009*"lower" + 0.009*"cent" + 0.009*"per" + 0.009*"investor"')

(4, '0.019*"car" + 0.006*"d" + 0.005*"law" + 0.004*"toyota" + 0.004*"player" + 0.004*"ball" + 0.004*"review" + 0.004*"get" + 0.004*"release" + 0.004*"new"')


In [112]:
lda_display = pyLDAvis.gensim.prepare(ldamodel, doc_term_matrix, dictionary, sort_topics=False, mds='mmds')
pyLDAvis.display(lda_display)

#### 10 topic model

In [113]:
numtopics = 10

%time ldamodel = LdaMulticore(doc_term_matrix, num_topics=numtopics, id2word = dictionary, passes=50)

Wall time: 34.8 s


In [114]:
print(*ldamodel.print_topics(num_topics=numtopics, num_words=3), sep='\n\n')

(0, '0.024*"u" + 0.020*"percent" + 0.014*"earnings"')

(1, '0.028*"toyota" + 0.018*"lexus" + 0.017*"air"')

(2, '0.012*"toyota" + 0.009*"japan" + 0.008*"hydrogen"')

(3, '0.017*"percent" + 0.014*"sale" + 0.009*"year"')

(4, '0.010*"truck" + 0.010*"wythe" + 0.008*"toyota"')

(5, '0.029*"percent" + 0.017*"toyota" + 0.011*"car"')

(6, '0.013*"export" + 0.010*"year" + 0.010*"percent"')

(7, '0.025*"post" + 0.018*"contact" + 0.016*"car"')

(8, '0.014*"unit" + 0.012*"sale" + 0.011*"toyota"')

(9, '0.035*"ford" + 0.014*"vehicle" + 0.013*"margin"')


In [115]:
print(*ldamodel.print_topics(num_topics=numtopics, num_words=5), sep='\n\n')

(0, '0.024*"u" + 0.020*"percent" + 0.014*"earnings" + 0.014*"yield" + 0.012*"index"')

(1, '0.028*"toyota" + 0.018*"lexus" + 0.017*"air" + 0.015*"bag" + 0.012*"could"')

(2, '0.012*"toyota" + 0.009*"japan" + 0.008*"hydrogen" + 0.008*"australia" + 0.006*"company"')

(3, '0.017*"percent" + 0.014*"sale" + 0.009*"year" + 0.009*"toyota" + 0.006*"law"')

(4, '0.010*"truck" + 0.010*"wythe" + 0.008*"toyota" + 0.006*"said" + 0.006*"county"')

(5, '0.029*"percent" + 0.017*"toyota" + 0.011*"car" + 0.010*"1" + 0.008*"market"')

(6, '0.013*"export" + 0.010*"year" + 0.010*"percent" + 0.010*"toyota" + 0.008*"vehicle"')

(7, '0.025*"post" + 0.018*"contact" + 0.016*"car" + 0.015*"offer" + 0.014*"service"')

(8, '0.014*"unit" + 0.012*"sale" + 0.011*"toyota" + 0.011*"market" + 0.009*"per"')

(9, '0.035*"ford" + 0.014*"vehicle" + 0.013*"margin" + 0.011*"commodity" + 0.011*"toyota"')


In [118]:
print(*ldamodel.print_topics(num_topics=numtopics, num_words=7), sep='\n\n')

(0, '0.024*"u" + 0.020*"percent" + 0.014*"earnings" + 0.014*"yield" + 0.012*"index" + 0.011*"share" + 0.011*"cent"')

(1, '0.028*"toyota" + 0.018*"lexus" + 0.017*"air" + 0.015*"bag" + 0.012*"could" + 0.011*"vehicle" + 0.009*"prius"')

(2, '0.012*"toyota" + 0.009*"japan" + 0.008*"hydrogen" + 0.008*"australia" + 0.006*"company" + 0.006*"year" + 0.005*"also"')

(3, '0.017*"percent" + 0.014*"sale" + 0.009*"year" + 0.009*"toyota" + 0.006*"law" + 0.006*"last" + 0.006*"january"')

(4, '0.010*"truck" + 0.010*"wythe" + 0.008*"toyota" + 0.006*"said" + 0.006*"county" + 0.006*"livestock" + 0.005*"avoid"')

(5, '0.029*"percent" + 0.017*"toyota" + 0.011*"car" + 0.010*"1" + 0.008*"market" + 0.007*"vehicle" + 0.007*"canada"')

(6, '0.013*"export" + 0.010*"year" + 0.010*"percent" + 0.010*"toyota" + 0.008*"vehicle" + 0.008*"said" + 0.007*"january"')

(7, '0.025*"post" + 0.018*"contact" + 0.016*"car" + 0.015*"offer" + 0.014*"service" + 0.013*"id" + 0.012*"unsolicited"')

(8, '0.014*"unit" + 0.012*"sale" 

In [116]:
print(*ldamodel.print_topics(num_topics=numtopics, num_words=10), sep='\n\n')

(0, '0.024*"u" + 0.020*"percent" + 0.014*"earnings" + 0.014*"yield" + 0.012*"index" + 0.011*"share" + 0.011*"cent" + 0.011*"per" + 0.010*"investor" + 0.010*"benchmark"')

(1, '0.028*"toyota" + 0.018*"lexus" + 0.017*"air" + 0.015*"bag" + 0.012*"could" + 0.011*"vehicle" + 0.009*"prius" + 0.008*"owner" + 0.008*"nx" + 0.008*"recall"')

(2, '0.012*"toyota" + 0.009*"japan" + 0.008*"hydrogen" + 0.008*"australia" + 0.006*"company" + 0.006*"year" + 0.005*"also" + 0.005*"car" + 0.005*"one" + 0.004*"museum"')

(3, '0.017*"percent" + 0.014*"sale" + 0.009*"year" + 0.009*"toyota" + 0.006*"law" + 0.006*"last" + 0.006*"january" + 0.006*"ball" + 0.005*"player" + 0.005*"month"')

(4, '0.010*"truck" + 0.010*"wythe" + 0.008*"toyota" + 0.006*"said" + 0.006*"county" + 0.006*"livestock" + 0.005*"avoid" + 0.005*"geller" + 0.005*"pig" + 0.005*"tractortrailer"')

(5, '0.029*"percent" + 0.017*"toyota" + 0.011*"car" + 0.010*"1" + 0.008*"market" + 0.007*"vehicle" + 0.007*"canada" + 0.007*"losing" + 0.007*"review" 

In [117]:
print(*ldamodel.print_topics(num_topics=numtopics, num_words=10), sep='\n\n')

(0, '0.024*"u" + 0.020*"percent" + 0.014*"earnings" + 0.014*"yield" + 0.012*"index" + 0.011*"share" + 0.011*"cent" + 0.011*"per" + 0.010*"investor" + 0.010*"benchmark"')

(1, '0.028*"toyota" + 0.018*"lexus" + 0.017*"air" + 0.015*"bag" + 0.012*"could" + 0.011*"vehicle" + 0.009*"prius" + 0.008*"owner" + 0.008*"nx" + 0.008*"recall"')

(2, '0.012*"toyota" + 0.009*"japan" + 0.008*"hydrogen" + 0.008*"australia" + 0.006*"company" + 0.006*"year" + 0.005*"also" + 0.005*"car" + 0.005*"one" + 0.004*"museum"')

(3, '0.017*"percent" + 0.014*"sale" + 0.009*"year" + 0.009*"toyota" + 0.006*"law" + 0.006*"last" + 0.006*"january" + 0.006*"ball" + 0.005*"player" + 0.005*"month"')

(4, '0.010*"truck" + 0.010*"wythe" + 0.008*"toyota" + 0.006*"said" + 0.006*"county" + 0.006*"livestock" + 0.005*"avoid" + 0.005*"geller" + 0.005*"pig" + 0.005*"tractortrailer"')

(5, '0.029*"percent" + 0.017*"toyota" + 0.011*"car" + 0.010*"1" + 0.008*"market" + 0.007*"vehicle" + 0.007*"canada" + 0.007*"losing" + 0.007*"review" 

In [119]:
lda_display = pyLDAvis.gensim.prepare(ldamodel, doc_term_matrix, dictionary, sort_topics=False, mds='mmds')
pyLDAvis.display(lda_display)