## Advanced text mining with Python - Topic Modeling

In [1]:
#!pip install pyLDAvis

In [2]:
import sys
print(sys.version)

3.7.2 (default, Feb 12 2019, 08:15:36) 
[Clang 10.0.0 (clang-1000.11.45.5)]


In [3]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [4]:
import time
import math
import re
from textblob import TextBlob
import pandas as pd

import nltk as nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

import string

import warnings
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')

import gensim
from gensim import corpora, models
from gensim.models.ldamulticore import LdaMulticore
import pyLDAvis.gensim



In [5]:
directory = '~/Datasets/32018/'
#file = 'jeep.txt'
file = 'jeep_new.txt'
path = directory + file

In [6]:
tweets = pd.read_csv(path,sep='\t', names = ['id', 'lang', 'created_at', 'screen_name', \
                                                       'name', 'location', 'retweet_count', 'text'])

In [7]:
tweets.count()

id               68921
lang             68921
created_at       68921
screen_name      68921
name             68920
location         46879
retweet_count    68921
text             68921
dtype: int64

In [8]:
tweets.head(10)

Unnamed: 0,id,lang,created_at,screen_name,name,location,retweet_count,text
0,9.222399e+17,en,Sun Oct 22 23:15:03 +0000 2017,alyssa_rose4,Princess Alyssa♛,"Chicago, IL",0.0,@Rachel_31297 where’s the Jeep Wrangler option
1,9.222399e+17,en,Sun Oct 22 23:15:09 +0000 2017,negocialoya_us,NegocialoYa USA,Estados Unidos,0.0,Check this out: 2016 JEEP PATRIOT LATITUDE 4X4...
2,9.2224e+17,tl,Sun Oct 22 23:15:14 +0000 2017,Jasmne_abr,Jas,blueberry,0.0,Kadugay sa jeep
3,9.2224e+17,en,Sun Oct 22 23:15:15 +0000 2017,JFCO38,JUAN FCO,,0.0,RT @Jeep: The Grand Cherokee Trackhawk is offi...
4,9.2224e+17,tl,Sun Oct 22 23:15:15 +0000 2017,MaestreRaymond,Jin Rae Min,Northern Mindanao,0.0,Kadugay mularga sa jeep nga kadali ko😬😭
5,9.2224e+17,tl,Sun Oct 22 23:15:21 +0000 2017,troyxaquino,Troy,,0.0,Bang luluwag ng mga jeep hahaha
6,9.2224e+17,en,Sun Oct 22 23:15:27 +0000 2017,LifeForTrucker,TruckerForLife™,United States,0.0,RT @Jeep: Power stance. https://t.co/kl0oC8Xvof
7,9.2224e+17,en,Sun Oct 22 23:15:30 +0000 2017,rpx53,Rod O|||||||O,Where the blacktop ends,0.0,"@THEJeepMafia @Jeep Thanks, 28° but Chaos and ..."
8,9.2224e+17,tl,Sun Oct 22 23:15:34 +0000 2017,ronneldash,🏳️‍🌈ROX🏳️‍🌈,St. Paul,0.0,tangina mula 6:00 nasa sakayan ako tas 7:30 na...
9,9.222401e+17,tl,Sun Oct 22 23:15:49 +0000 2017,FayeUsi,aria,"Las Pinas City, National Capit",0.0,@Dncyngs ikaw yung nakakasabay ko lagi sa jeep...


In [9]:
# Filter non-English tweets
tweets_eng = tweets[tweets['lang']=='en'].reset_index(drop=True)

In [10]:
# Remove special characters to avoid problems with analysis
tweets_eng['text_clean'] = tweets_eng['text'].map(lambda x: re.sub('[^a-zA-Z0-9 @ . , : - _]', '', str(x)))

In [11]:
pd.set_option('display.max_colwidth', 100)
tweets_eng[['text', 'text_clean']].head(5)

Unnamed: 0,text,text_clean
0,@Rachel_31297 where’s the Jeep Wrangler option,@Rachel_31297 wheres the Jeep Wrangler option
1,Check this out: 2016 JEEP PATRIOT LATITUDE 4X4 - $8500 (MIAMI-DADE) 8500.00 USD https://t.co/akP...,Check this out: 2016 JEEP PATRIOT LATITUDE 4X4 8500 MIAMIDADE 8500.00 USD https:t.coakPkANPpEn ...
2,RT @Jeep: The Grand Cherokee Trackhawk is officially in production. Keep your eyes peeled for on...,RT @Jeep: The Grand Cherokee Trackhawk is officially in production. Keep your eyes peeled for on...
3,RT @Jeep: Power stance. https://t.co/kl0oC8Xvof,RT @Jeep: Power stance. https:t.cokl0oC8Xvof
4,"@THEJeepMafia @Jeep Thanks, 28° but Chaos and Bear were awesome feet warmers! 🐾🐾 https://t.co/nN...","@THEJeepMafia @Jeep Thanks, 28 but Chaos and Bear were awesome feet warmers https:t.conNIoaRVXlW"


## Topic Modeling
#### Topics can be defined as “a repeating pattern of co-occurring terms in a corpus”

### TF-IDF (term frequency–inverse document frequency)

#### Using TextBlob functionality to create TF-IDF function for our select Tweets

In [12]:
# http://stevenloria.com/finding-important-words-in-a-document-using-tf-idf/

def tf(word, blob):
    return blob.words.count(word) / len(blob.words)
# tf(word, blob) computes "term frequency" which is the number of times a word appears in a document blob, 
# normalized by dividing by the total number of words in blob. We use TextBlob for breaking up the text into words 
# and getting the word counts.


def n_containing(word, bloblist):
    return sum(1 for blob in bloblist if word in blob.words)
# n_containing(word, bloblist) returns the number of documents containing word. 
# A generator expression is passed to the sum() function.


def idf(word, bloblist):
    return math.log(len(bloblist) / (1 + n_containing(word, bloblist)))
# idf(word, bloblist) computes "inverse document frequency" which measures how common a word is 
# among all documents in bloblist. The more common a word is, the lower its idf. 
# We take the ratio of the total number of documents to the number of documents containing word, 
# then take the log of that. Add 1 to the divisor to prevent division by zero


def tfidf(word, blob, bloblist):
    return tf(word, blob) * idf(word, bloblist)
# tfidf(word, blob, bloblist) computes the TF-IDF score. It is simply the product of tf and idf.

In [13]:
bloblist = []
del bloblist[:]

for i  in range(0,len(tweets_eng)):
    bloblist.append(TextBlob(tweets_eng['text_clean'].iloc[i]))
    
len(bloblist)  

40704

In [14]:
for i, blob in enumerate(bloblist):
# Print top 5 values
    if i == 5:
        break
    print("Top words in tweet {}".format(i + 1))
    scores = {word: tfidf(word, blob, bloblist) for word in blob.words}
    sorted_words = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    for word, score in sorted_words[:5]:
        print("\tWord: {}, TF-IDF: {}".format(word, round(score, 5)))

Top words in tweet 1
	Word: Rachel_31297, TF-IDF: 1.65349
	Word: wheres, TF-IDF: 1.50077
	Word: option, TF-IDF: 1.226
	Word: Wrangler, TF-IDF: 0.38572
	Word: the, TF-IDF: 0.28659
Top words in tweet 2
	Word: t.coakPkANPpEn, TF-IDF: 0.58358
	Word: t.coL2WaZBsUTf, TF-IDF: 0.58358
	Word: MIAMIDADE, TF-IDF: 0.55973
	Word: 8500.00, TF-IDF: 0.55973
	Word: 8500, TF-IDF: 0.51896
Top words in tweet 3
	Word: t.cobbiPfPXXax, TF-IDF: 0.34509
	Word: peeled, TF-IDF: 0.34191
	Word: Keep, TF-IDF: 0.31915
	Word: wild, TF-IDF: 0.31915
	Word: officially, TF-IDF: 0.31637
Top words in tweet 4
	Word: t.cokl0oC8Xvof, TF-IDF: 0.89363
	Word: stance, TF-IDF: 0.89104
	Word: Power, TF-IDF: 0.85214
	Word: RT, TF-IDF: 0.18822
	Word: Jeep, TF-IDF: 0.06069
Top words in tweet 5
	Word: t.conNIoaRVXlW, TF-IDF: 0.70864
	Word: warmers, TF-IDF: 0.65913
	Word: Chaos, TF-IDF: 0.63017
	Word: Bear, TF-IDF: 0.60962
	Word: feet, TF-IDF: 0.56011


## LDA (latent dirichlet allocation)
#### LDA is a matrix factorization technique, which assumes documents are produced from a mixture of topics. Those topics then generate words based on their probability distribution.

In [15]:
#https://www.analyticsvidhya.com/blog/2016/08/beginners-guide-to-topic-modeling-in-python/

In [16]:
doc1 = "BMW upbeat sustained sales growth"
doc2 = "Ad wars When BMW Audi Mercedes Benz Jaguar prove prowess through advertisements"
doc3 = "BMW Protonic Frozen Yellow Edition Looks So Cool"
doc4 = "Judge Shuts Door On SoftClose Defect Suit Against BMW Law"
doc5 = "Just Listed BMW Alpina B Turbo Automobile Magazine"
doc6 = "How take part BMW Ultimate Driving Experience"
doc7 = "Long Beach BMW Motorcycles Becomes First BMW Dealer Offer Virtual Reality Experience Virtual Reality Reporter"
doc8 = "NYC Auto Show BMW M Performance Video Overview"
doc9 = "BMW F X Spy video shows SUV stress test"
doc10 = "Driver taken hospital BMW smashes tree Stourbridge Express Star"

# compile documents
doc_complete = [doc1, doc2, doc3, doc4, doc5, doc6, doc7, doc8, doc9, doc10]

In [17]:
stop = set(stopwords.words('english'))
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()
def clean(doc):
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    return normalized

doc_clean = [clean(doc).split() for doc in doc_complete]     

In [18]:
doc_clean[:3]

[['bmw', 'upbeat', 'sustained', 'sale', 'growth'],
 ['ad',
  'war',
  'bmw',
  'audi',
  'mercedes',
  'benz',
  'jaguar',
  'prove',
  'prowess',
  'advertisement'],
 ['bmw', 'protonic', 'frozen', 'yellow', 'edition', 'look', 'cool']]

In [19]:
# Creating the term dictionary of our courpus, where every unique term is assigned an index. 
dictionary = corpora.Dictionary(doc_clean)

# Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]

https://towardsdatascience.com/topic-modelling-in-python-with-nltk-and-gensim-4ef03213cd21

In [20]:
# Creating the object for LDA model using gensim library
Lda = gensim.models.ldamodel.LdaModel

### Three-topic Model

In [21]:
# Running and Trainign LDA model on the document term matrix.
%time ldamodel = Lda(doc_term_matrix, num_topics=3, id2word = dictionary, passes=50) #3 topics
print(*ldamodel.print_topics(num_topics=3, num_words=3), sep='\n')

CPU times: user 223 ms, sys: 10.6 ms, total: 234 ms
Wall time: 227 ms
(0, '0.095*"bmw" + 0.035*"reality" + 0.035*"virtual"')
(1, '0.067*"bmw" + 0.038*"protonic" + 0.038*"frozen"')
(2, '0.067*"bmw" + 0.027*"benz" + 0.027*"advertisement"')


#### For larger datasets LdaMulticore should provide significant speed improvements

In [22]:
%time ldamodel = LdaMulticore(doc_term_matrix, num_topics=3, id2word = dictionary, passes=50) #3 topics
print(*ldamodel.print_topics(num_topics=3, num_words=3), sep='\n')

CPU times: user 255 ms, sys: 45.8 ms, total: 301 ms
Wall time: 338 ms
(0, '0.043*"defect" + 0.043*"law" + 0.043*"shuts"')
(1, '0.073*"bmw" + 0.051*"show" + 0.051*"video"')
(2, '0.098*"bmw" + 0.031*"reality" + 0.031*"virtual"')


In [23]:
#topics = ldamodel.print_topics(num_words=3)
#for topic in topics:
#    print(topic)

In [24]:
lda_display = pyLDAvis.gensim.prepare(ldamodel, doc_term_matrix, dictionary, sort_topics=False, mds='mmds')
pyLDAvis.display(lda_display)

### Five-topic Model

In [25]:
%time ldamodel = Lda(doc_term_matrix, num_topics=5, id2word = dictionary, passes=50) #5 topics
print(*ldamodel.print_topics(num_topics=5, num_words=5), sep='\n')

CPU times: user 312 ms, sys: 17.8 ms, total: 330 ms
Wall time: 352 ms
(0, '0.072*"bmw" + 0.039*"advertisement" + 0.039*"jaguar" + 0.039*"mercedes" + 0.039*"audi"')
(1, '0.086*"bmw" + 0.047*"show" + 0.047*"video" + 0.047*"performance" + 0.047*"overview"')
(2, '0.085*"bmw" + 0.032*"stress" + 0.032*"f" + 0.032*"spy" + 0.032*"suv"')
(3, '0.096*"bmw" + 0.050*"virtual" + 0.050*"reality" + 0.050*"experience" + 0.028*"dealer"')
(4, '0.015*"bmw" + 0.015*"show" + 0.015*"video" + 0.015*"part" + 0.015*"ultimate"')


In [26]:
lda_display = pyLDAvis.gensim.prepare(ldamodel, doc_term_matrix, dictionary, sort_topics=False, mds='mmds')
pyLDAvis.display(lda_display)

### Ten-topic Model

In [27]:
%time ldamodel = Lda(doc_term_matrix, num_topics=10, id2word = dictionary, passes=50)
print(*ldamodel.print_topics(num_topics=10, num_words=5), sep='\n')

CPU times: user 207 ms, sys: 6.03 ms, total: 213 ms
Wall time: 209 ms
(0, '0.015*"virtual" + 0.015*"auto" + 0.015*"reporter" + 0.015*"offer" + 0.015*"nyc"')
(1, '0.106*"bmw" + 0.056*"defect" + 0.056*"shuts" + 0.056*"suit" + 0.056*"door"')
(2, '0.015*"virtual" + 0.015*"auto" + 0.015*"reporter" + 0.015*"offer" + 0.015*"nyc"')
(3, '0.070*"smash" + 0.070*"stourbridge" + 0.070*"hospital" + 0.070*"driver" + 0.070*"star"')
(4, '0.096*"bmw" + 0.050*"video" + 0.050*"show" + 0.050*"f" + 0.050*"stress"')
(5, '0.015*"virtual" + 0.015*"auto" + 0.015*"reporter" + 0.015*"offer" + 0.015*"nyc"')
(6, '0.115*"bmw" + 0.059*"reality" + 0.059*"virtual" + 0.031*"offer" + 0.031*"dealer"')
(7, '0.080*"turbo" + 0.080*"alpina" + 0.080*"listed" + 0.080*"b" + 0.080*"automobile"')
(8, '0.015*"virtual" + 0.015*"auto" + 0.015*"reporter" + 0.015*"offer" + 0.015*"nyc"')
(9, '0.065*"advertisement" + 0.065*"jaguar" + 0.065*"benz" + 0.065*"prove" + 0.065*"mercedes"')


In [28]:
lda_display = pyLDAvis.gensim.prepare(ldamodel, doc_term_matrix, dictionary, sort_topics=False, mds='mmds')
pyLDAvis.display(lda_display)

### Applying LDA to tweets

In [29]:
tweets_list = tweets_eng['text_clean'].tolist()
tweets_list[:5]

['@Rachel_31297 wheres the Jeep Wrangler option',
 'Check this out: 2016 JEEP PATRIOT LATITUDE 4X4  8500 MIAMIDADE 8500.00 USD https:t.coakPkANPpEn ads https:t.coL2WaZBsUTf',
 'RT @Jeep: The Grand Cherokee Trackhawk is officially in production. Keep your eyes peeled for one in the wild. https:t.cobbiPfPXXax',
 'RT @Jeep: Power stance. https:t.cokl0oC8Xvof',
 '@THEJeepMafia @Jeep Thanks, 28 but Chaos and Bear were awesome feet warmers  https:t.conNIoaRVXlW']

In [30]:
stop = set(stopwords.words('english'))
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()
def clean(doc):
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    return normalized

tweet_clean = [clean(doc).split() for doc in tweets_list]

In [31]:
print(*tweet_clean[:3], sep='\n\n')

['rachel31297', 'wheres', 'jeep', 'wrangler', 'option']

['check', 'out', '2016', 'jeep', 'patriot', 'latitude', '4x4', '8500', 'miamidade', '850000', 'usd', 'httpstcoakpkanppen', 'ad', 'httpstcol2wazbsutf']

['rt', 'jeep', 'grand', 'cherokee', 'trackhawk', 'officially', 'production', 'keep', 'eye', 'peeled', 'one', 'wild', 'httpstcobbipfpxxax']


In [32]:
# Creating the term dictionary of our corpus, where every unique term is assigned an index. 

dictionary = corpora.Dictionary(tweet_clean)

# Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.

%time doc_term_matrix = [dictionary.doc2bow(doc) for doc in tweet_clean]

CPU times: user 437 ms, sys: 16.3 ms, total: 453 ms
Wall time: 456 ms


In [33]:
#Using traditional LDA
%time ldamodel = Lda(doc_term_matrix, num_topics=10, id2word = dictionary, passes=50)

CPU times: user 6min 47s, sys: 2.26 s, total: 6min 49s
Wall time: 7min


In [34]:
#Using multicore LDA
%time ldamodel = LdaMulticore(doc_term_matrix, num_topics=10, id2word = dictionary, passes=50)

CPU times: user 3min 56s, sys: 38.2 s, total: 4min 35s
Wall time: 4min 43s


In [35]:
print(*ldamodel.print_topics(num_topics=10, num_words=3), sep='\n')

(0, '0.091*"jeep" + 0.049*"rt" + 0.019*"blue"')
(1, '0.084*"jeep" + 0.035*"rt" + 0.029*"wrangler"')
(2, '0.088*"jeep" + 0.044*"photo" + 0.042*"rt"')
(3, '0.126*"jeep" + 0.053*"rt" + 0.048*"cherokee"')
(4, '0.072*"jeep" + 0.026*"rt" + 0.018*"cherokee"')
(5, '0.072*"jeep" + 0.031*"2017" + 0.031*"video"')
(6, '0.105*"giveaway" + 0.064*"jeep" + 0.057*"girl"')
(7, '0.108*"jeep" + 0.017*"rt" + 0.015*"new"')
(8, '0.074*"jeep" + 0.043*"rt" + 0.027*"chrysler"')
(9, '0.123*"jeep" + 0.064*"wrangler" + 0.031*"rt"')


In [36]:
print(*ldamodel.print_topics(num_topics=10, num_words=3), sep='\n')

(0, '0.091*"jeep" + 0.049*"rt" + 0.019*"blue"')
(1, '0.084*"jeep" + 0.035*"rt" + 0.029*"wrangler"')
(2, '0.088*"jeep" + 0.044*"photo" + 0.042*"rt"')
(3, '0.126*"jeep" + 0.053*"rt" + 0.048*"cherokee"')
(4, '0.072*"jeep" + 0.026*"rt" + 0.018*"cherokee"')
(5, '0.072*"jeep" + 0.031*"2017" + 0.031*"video"')
(6, '0.105*"giveaway" + 0.064*"jeep" + 0.057*"girl"')
(7, '0.108*"jeep" + 0.017*"rt" + 0.015*"new"')
(8, '0.074*"jeep" + 0.043*"rt" + 0.027*"chrysler"')
(9, '0.123*"jeep" + 0.064*"wrangler" + 0.031*"rt"')


In [37]:
print(*ldamodel.print_topics(num_topics=10, num_words=5), sep='\n\n')

(0, '0.091*"jeep" + 0.049*"rt" + 0.019*"blue" + 0.016*"matte" + 0.013*"jeepahoiics"')

(1, '0.084*"jeep" + 0.035*"rt" + 0.029*"wrangler" + 0.020*"need" + 0.018*"check"')

(2, '0.088*"jeep" + 0.044*"photo" + 0.042*"rt" + 0.032*"used" + 0.025*"spotted"')

(3, '0.126*"jeep" + 0.053*"rt" + 0.048*"cherokee" + 0.032*"grand" + 0.013*"jeeplife"')

(4, '0.072*"jeep" + 0.026*"rt" + 0.018*"cherokee" + 0.012*"thejeepmafia" + 0.012*"jeepmafia"')

(5, '0.072*"jeep" + 0.031*"2017" + 0.031*"video" + 0.026*"youtube" + 0.026*"cherokee"')

(6, '0.105*"giveaway" + 0.064*"jeep" + 0.057*"girl" + 0.056*"win" + 0.054*"small"')

(7, '0.108*"jeep" + 0.017*"rt" + 0.015*"new" + 0.013*"car" + 0.013*"im"')

(8, '0.074*"jeep" + 0.043*"rt" + 0.027*"chrysler" + 0.023*"dodge" + 0.022*"compass"')

(9, '0.123*"jeep" + 0.064*"wrangler" + 0.031*"rt" + 0.023*"sport" + 0.019*"mile"')


In [38]:
print(*ldamodel.print_topics(num_topics=10, num_words=7), sep='\n\n')

(0, '0.091*"jeep" + 0.049*"rt" + 0.019*"blue" + 0.016*"matte" + 0.013*"jeepahoiics" + 0.012*"httpstco685ffiofnl" + 0.008*"compass"')

(1, '0.084*"jeep" + 0.035*"rt" + 0.029*"wrangler" + 0.020*"need" + 0.018*"check" + 0.017*"via" + 0.013*"ebay"')

(2, '0.088*"jeep" + 0.044*"photo" + 0.042*"rt" + 0.032*"used" + 0.025*"spotted" + 0.025*"stick" + 0.025*"bamboo"')

(3, '0.126*"jeep" + 0.053*"rt" + 0.048*"cherokee" + 0.032*"grand" + 0.013*"jeeplife" + 0.012*"limited" + 0.010*"jeepporn"')

(4, '0.072*"jeep" + 0.026*"rt" + 0.018*"cherokee" + 0.012*"thejeepmafia" + 0.012*"jeepmafia" + 0.010*"dodge" + 0.009*"jeeplife"')

(5, '0.072*"jeep" + 0.031*"2017" + 0.031*"video" + 0.026*"youtube" + 0.026*"cherokee" + 0.019*"grand" + 0.014*"rt"')

(6, '0.105*"giveaway" + 0.064*"jeep" + 0.057*"girl" + 0.056*"win" + 0.054*"small" + 0.054*"body" + 0.053*"chance"')

(7, '0.108*"jeep" + 0.017*"rt" + 0.015*"new" + 0.013*"car" + 0.013*"im" + 0.012*"get" + 0.011*"like"')

(8, '0.074*"jeep" + 0.043*"rt" + 0.027*"ch

In [39]:
print(*ldamodel.print_topics(num_topics=10, num_words=10), sep='\n\n')

(0, '0.091*"jeep" + 0.049*"rt" + 0.019*"blue" + 0.016*"matte" + 0.013*"jeepahoiics" + 0.012*"httpstco685ffiofnl" + 0.008*"compass" + 0.008*"lift" + 0.008*"madeinindia" + 0.008*"check"')

(1, '0.084*"jeep" + 0.035*"rt" + 0.029*"wrangler" + 0.020*"need" + 0.018*"check" + 0.017*"via" + 0.013*"ebay" + 0.012*"decal" + 0.011*"httpstcoyipkreck1n" + 0.011*"luxbucketlist"')

(2, '0.088*"jeep" + 0.044*"photo" + 0.042*"rt" + 0.032*"used" + 0.025*"spotted" + 0.025*"stick" + 0.025*"bamboo" + 0.024*"nigeria" + 0.024*"hummer" + 0.024*"convey"')

(3, '0.126*"jeep" + 0.053*"rt" + 0.048*"cherokee" + 0.032*"grand" + 0.013*"jeeplife" + 0.012*"limited" + 0.010*"jeepporn" + 0.009*"life" + 0.009*"take" + 0.009*"renegade"')

(4, '0.072*"jeep" + 0.026*"rt" + 0.018*"cherokee" + 0.012*"thejeepmafia" + 0.012*"jeepmafia" + 0.010*"dodge" + 0.009*"jeeplife" + 0.009*"car" + 0.009*"trackhawk" + 0.009*"wrangler"')

(5, '0.072*"jeep" + 0.031*"2017" + 0.031*"video" + 0.026*"youtube" + 0.026*"cherokee" + 0.019*"grand" + 0

In [40]:
lda_display = pyLDAvis.gensim.prepare(ldamodel, doc_term_matrix, dictionary, sort_topics=False, mds='mmds')
pyLDAvis.display(lda_display)

KeyboardInterrupt: 

## TF-IDF on news articles

In [None]:
directory = 'C://Users//IBM_ADMIN//Documents//Teaching//Data Projects//Text//Webhose//'
news_articles = 'news_toyota.pkl'
path = directory+news_articles

In [None]:
news_df = pd.read_pickle(directory+news_articles)

In [None]:
news_df.head(5)

In [None]:
news_df.shape

In [None]:
# Filter non-English tweets
news_eng = news_df[news_df['language']=='english'].reset_index(drop=True)

In [None]:
# Remove special characters to avoid problems with analysis
news_eng['text_clean'] = news_eng['text'].map(lambda x: re.sub('[^a-zA-Z0-9 @ . , : - _]', '', str(x)))

In [None]:
pd.set_option('display.max_colwidth', 100)
    news_eng[['text', 'text_clean']].head(5)

In [None]:
bloblist = []
del bloblist[:]

for i  in range(0,len(news_eng)):
    bloblist.append(TextBlob(news_eng['text_clean'].iloc[i]))
    
len(bloblist) 

In [None]:
for i, blob in enumerate(bloblist):
# Print top 5 values
    if i == 5:
        break
    print("Top words in news article {}".format(i + 1))
    scores = {word: tfidf(word, blob, bloblist) for word in blob.words}
    sorted_words = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    for word, score in sorted_words[:10]:
        print("\tWord: {}, TF-IDF: {}".format(word, round(score, 5)))

### Applying LDA to news articles

In [None]:
news_list = news_eng['text_clean'].tolist()
news_list[:1]

In [None]:
news_clean = [clean(doc).split() for doc in news_list]

In [None]:
len(news_clean)

In [None]:
print(*news_clean[:1], sep='\n\n')

In [None]:
# Creating the term dictionary of our courpus, where every unique term is assigned an index. 

dictionary = corpora.Dictionary(news_clean)

# Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.

%time doc_term_matrix = [dictionary.doc2bow(doc) for doc in news_clean]

#### 3 topic model

In [None]:
numtopics = 3

%time ldamodel = LdaMulticore(doc_term_matrix, num_topics=numtopics, id2word = dictionary, passes=50)

In [None]:
print(*ldamodel.print_topics(num_topics=numtopics, num_words=5), sep='\n\n')

In [None]:
print(*ldamodel.print_topics(num_topics=numtopics, num_words=7), sep='\n\n')

In [None]:
print(*ldamodel.print_topics(num_topics=numtopics, num_words=10), sep='\n\n')

In [None]:
lda_display = pyLDAvis.gensim.prepare(ldamodel, doc_term_matrix, dictionary, sort_topics=False, mds='mmds')
pyLDAvis.display(lda_display)

#### 5 topic model

In [None]:
numtopics = 5

%time ldamodel = LdaMulticore(doc_term_matrix, num_topics=numtopics, id2word = dictionary, passes=50)

In [None]:
print(*ldamodel.print_topics(num_topics=numtopics, num_words=3), sep='\n\n')

In [None]:
print(*ldamodel.print_topics(num_topics=numtopics, num_words=5), sep='\n\n')

In [None]:
print(*ldamodel.print_topics(num_topics=numtopics, num_words=7), sep='\n\n')

In [None]:
print(*ldamodel.print_topics(num_topics=numtopics, num_words=10), sep='\n\n')

In [None]:
print(*ldamodel.print_topics(num_topics=numtopics, num_words=10), sep='\n\n')

In [None]:
lda_display = pyLDAvis.gensim.prepare(ldamodel, doc_term_matrix, dictionary, sort_topics=False, mds='mmds')
pyLDAvis.display(lda_display)

#### 10 topic model

In [None]:
numtopics = 10

%time ldamodel = LdaMulticore(doc_term_matrix, num_topics=numtopics, id2word = dictionary, passes=50)

In [None]:
print(*ldamodel.print_topics(num_topics=numtopics, num_words=3), sep='\n\n')

In [None]:
print(*ldamodel.print_topics(num_topics=numtopics, num_words=5), sep='\n\n')

In [None]:
print(*ldamodel.print_topics(num_topics=numtopics, num_words=7), sep='\n\n')

In [None]:
print(*ldamodel.print_topics(num_topics=numtopics, num_words=10), sep='\n\n')

In [None]:
print(*ldamodel.print_topics(num_topics=numtopics, num_words=10), sep='\n\n')

In [None]:
lda_display = pyLDAvis.gensim.prepare(ldamodel, doc_term_matrix, dictionary, sort_topics=False, mds='mmds')
pyLDAvis.display(lda_display)