## Topic Segmentation

In [1]:
# Add project path to the PYTHONPATH

import os
import sys
from pathlib import Path

sys.path.append(Path(os.path.join(os.path.abspath(''), '../')).resolve().as_posix())

In [2]:
from nlp.dataset import Dataset

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/rmohashi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
filename = '../datasets/topic_segmentation/1175455210979725313-1175407650365935616_batman.csv'
dataset = Dataset(filename, label_col='label', text_col='text')
dataset.load()
dataset.preprocess_texts(no_emoji=True)

Time to clean up: 3.49 sec


In [4]:
df = dataset.dataframe
df.cleaned.head()

0    gonna hang tonight added batman fortnut last t...
1    bat signal projected onto landmarks across wor...
2                                    batman theme song
3    die hard batman fan ring tone tv series theme ...
4    not scared almost anything terrified joker ori...
Name: cleaned, dtype: object

In [5]:
import gensim
from gensim.utils import simple_preprocess
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *

In [6]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/rmohashi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [7]:
stemmer = SnowballStemmer("english")

def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preprocess_tweet(text):
    result = []
    for token in text.split():
        result.append(token)
#         result.append(lemmatize_stemming(token))
            
    return result

In [8]:
processed_tweets = [preprocess_tweet(tweet) for tweet in df.cleaned]

In [9]:
%%time
dictionary = gensim.corpora.Dictionary(processed_tweets)
dictionary.filter_extremes(no_below=15, no_above=0.1, keep_n= 100000)
bow_corpus = [dictionary.doc2bow(tweets) for tweets in processed_tweets]

CPU times: user 485 ms, sys: 0 ns, total: 485 ms
Wall time: 487 ms


In [10]:
document_num = 0
bow_doc_x = bow_corpus[document_num]

for i in range(len(bow_doc_x)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_x[i][0], 
                                                     dictionary[bow_doc_x[i][0]], 
                                                     bow_doc_x[i][1]))

Word 0 ("added") appears 1 time.
Word 1 ("gonna") appears 1 time.
Word 2 ("justice") appears 1 time.
Word 3 ("last") appears 1 time.
Word 4 ("need") appears 1 time.
Word 5 ("see") appears 1 time.
Word 6 ("thing") appears 1 time.
Word 7 ("tonight") appears 1 time.


In [53]:
%%time
lda_model =  gensim.models.LdaMulticore(bow_corpus, 
                                   num_topics = 6, 
                                   id2word = dictionary,                                    
                                   passes = 10,
                                   workers = 4)

CPU times: user 25.8 s, sys: 1.46 s, total: 27.2 s
Wall time: 29.1 s


In [12]:
for idx, topic in lda_model.print_topics(-1):
    print("Topic: {} \nWords: {}".format(idx, topic ))
    print("\n")

Topic: 0 
Words: 0.029*"favorite" + 0.024*"today" + 0.018*"one" + 0.017*"knight" + 0.016*"dark" + 0.016*"movie" + 0.015*"comics" + 0.014*"comic" + 0.013*"th" + 0.013*"time"


Topic: 1 
Words: 0.116*"fortnite" + 0.043*"x" + 0.030*"new" + 0.028*"gotham" + 0.024*"event" + 0.023*"city" + 0.022*"arkham" + 0.021*"live" + 0.020*"code" + 0.019*"batman:"


Topic: 2 
Words: 0.061*"best" + 0.050*"bat" + 0.044*"celebrate" + 0.032*"world" + 0.032*"signal" + 0.024*"ever" + 0.021*"dccomics" + 0.020*"across" + 0.020*"projected" + 0.020*"landmarks"


Topic: 3 
Words: 0.022*"game" + 0.018*"like" + 0.014*"good" + 0.014*"fortnite" + 0.011*"gonna" + 0.011*"would" + 0.011*"skin" + 0.011*"knight" + 0.010*"everyone" + 0.010*"no"


Topic: 4 
Words: 0.023*"like" + 0.020*"skin" + 0.018*"bundle" + 0.017*"get" + 0.015*"pack" + 0.015*"want" + 0.015*"buy" + 0.014*"really" + 0.013*"joker" + 0.012*"need"




In [13]:
import pyLDAvis.gensim

In [54]:
pyLDAvis.enable_notebook()

pyLDAvis.gensim.prepare(lda_model, bow_corpus, dictionary)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [22]:
%%time
hdp = gensim.models.HdpModel(bow_corpus, dictionary)

  start_time = time.clock()


In [26]:
import pandas as pd 

In [49]:
def topic_prob_extractor(gensim_hdp):
    shown_topics = gensim_hdp.show_topics(num_topics=100, formatted=False)
    topics_nos = [x[0] for x in shown_topics ]
    weights = [ sum([item[1] for item in shown_topics[topicN][1]]) for topicN in topics_nos ]

    return pd.DataFrame({'topic_id' : topics_nos, 'weight' : weights})

In [52]:
hdp.show_topics(num_topics=20)

[(0,
  '0.007*house + 0.007*main + 0.006*robert + 0.006*rich + 0.006*thinking + 0.006*definitely + 0.005*story + 0.005*screen + 0.005*happybatmanday + 0.005*grew + 0.005*conroy + 0.005*sorry + 0.005*see + 0.005*weird + 0.005*taking + 0.005*brave + 0.005*channel + 0.005*figure + 0.005*hear + 0.005*ya'),
 (1,
  '0.008*run + 0.007*town + 0.007*holy + 0.006*september + 0.006*choice + 0.006*either + 0.006*wind + 0.006*keaton + 0.005*used + 0.005*watching + 0.005*comics + 0.005*vbucks + 0.005*season + 0.005*superheroes + 0.005*darkest + 0.005*play + 0.005*loves + 0.005*buying + 0.004*download + 0.004*terry'),
 (2,
  '0.008*easily + 0.007*bob + 0.006*might + 0.006*legend + 0.006*comics + 0.006*twitch + 0.006*appreciate + 0.006*deserve + 0.006*penguin + 0.005*even + 0.005*joke + 0.005*deals: + 0.005*no + 0.005*sure + 0.005*due + 0.005*caped + 0.005*themed + 0.005*ones + 0.005*major + 0.005*remember'),
 (3,
  '0.008*thoughts + 0.007*year + 0.007*person + 0.006*question + 0.006*years + 0.006*sto

In [44]:
topic_prob_extractor(hdp).sort_values('weight', ascending=False)

Unnamed: 0,topic_id,weight
54,54,0.130086
84,84,0.122082
92,92,0.121903
87,87,0.121083
99,99,0.120899
60,60,0.118855
68,68,0.117877
34,34,0.117543
25,25,0.116781
83,83,0.116390


In [48]:
hdp.show_topics(num_topics=100, formatted=False)[54]

(54,
 [('full', 0.009303129136030987),
  ('tomorrow', 0.009099250545639101),
  ('something', 0.008523792231207673),
  ('child', 0.008110896507258524),
  ('celebrating', 0.007816790184412448),
  ('lawn', 0.007298630739805863),
  ('hollow', 0.007098484717081091),
  ('back', 0.0067521159187030185),
  ('beyond', 0.006378469779929438),
  ('nice', 0.006230062526697725),
  ('cop', 0.00589092874897036),
  ('make', 0.005792175296984575),
  ('official', 0.005399727525769403),
  ('hi', 0.00535921165757075),
  ('week', 0.005338749453099278),
  ('light', 0.005300247390953402),
  ('wayne', 0.00527612765517106),
  ('really', 0.005173241668444097),
  ('knight', 0.005019613663233169),
  ('voice', 0.004924645765172755)])