## Topic Segmentation

In [1]:
# Add project path to the PYTHONPATH

import os
import sys
from pathlib import Path

sys.path.append(Path(os.path.join(os.path.abspath(''), '../')).resolve().as_posix())

In [2]:
from nlp.dataset import Dataset

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/rmohashi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
filename = '../datasets/topic_segmentation/1175455210979725313-1175407650365935616_batman.csv'
dataset = Dataset(filename, label_col='label', text_col='text')
dataset.load()
dataset.preprocess_texts(no_emoji=True)

Time to clean up: 3.49 sec


In [4]:
df = dataset.dataframe
df.cleaned.head()

0    gonna hang tonight added batman fortnut last t...
1    bat signal projected onto landmarks across wor...
2                                    batman theme song
3    die hard batman fan ring tone tv series theme ...
4    not scared almost anything terrified joker ori...
Name: cleaned, dtype: object

In [5]:
import gensim
from gensim.utils import simple_preprocess
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *

In [6]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/rmohashi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [7]:
stemmer = SnowballStemmer("english")

def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preprocess_tweet(text):
    result = []
    for token in text.split():
        result.append(token)
#         result.append(lemmatize_stemming(token))
            
    return result

In [8]:
processed_tweets = [preprocess_tweet(tweet) for tweet in df.cleaned]

In [9]:
%%time
dictionary = gensim.corpora.Dictionary(processed_tweets)
dictionary.filter_extremes(no_below=15, no_above=0.1, keep_n= 100000)
bow_corpus = [dictionary.doc2bow(tweets) for tweets in processed_tweets]

CPU times: user 485 ms, sys: 0 ns, total: 485 ms
Wall time: 487 ms


In [10]:
document_num = 0
bow_doc_x = bow_corpus[document_num]

for i in range(len(bow_doc_x)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_x[i][0], 
                                                     dictionary[bow_doc_x[i][0]], 
                                                     bow_doc_x[i][1]))

Word 0 ("added") appears 1 time.
Word 1 ("gonna") appears 1 time.
Word 2 ("justice") appears 1 time.
Word 3 ("last") appears 1 time.
Word 4 ("need") appears 1 time.
Word 5 ("see") appears 1 time.
Word 6 ("thing") appears 1 time.
Word 7 ("tonight") appears 1 time.


In [11]:
%%time
lda_model =  gensim.models.LdaMulticore(bow_corpus, 
                                   num_topics = 5, 
                                   id2word = dictionary,                                    
                                   passes = 10,
                                   workers = 4)

CPU times: user 32.4 s, sys: 1.94 s, total: 34.3 s
Wall time: 40.6 s


In [12]:
for idx, topic in lda_model.print_topics(-1):
    print("Topic: {} \nWords: {}".format(idx, topic ))
    print("\n")

Topic: 0 
Words: 0.029*"favorite" + 0.024*"today" + 0.018*"one" + 0.017*"knight" + 0.016*"dark" + 0.016*"movie" + 0.015*"comics" + 0.014*"comic" + 0.013*"th" + 0.013*"time"


Topic: 1 
Words: 0.116*"fortnite" + 0.043*"x" + 0.030*"new" + 0.028*"gotham" + 0.024*"event" + 0.023*"city" + 0.022*"arkham" + 0.021*"live" + 0.020*"code" + 0.019*"batman:"


Topic: 2 
Words: 0.061*"best" + 0.050*"bat" + 0.044*"celebrate" + 0.032*"world" + 0.032*"signal" + 0.024*"ever" + 0.021*"dccomics" + 0.020*"across" + 0.020*"projected" + 0.020*"landmarks"


Topic: 3 
Words: 0.022*"game" + 0.018*"like" + 0.014*"good" + 0.014*"fortnite" + 0.011*"gonna" + 0.011*"would" + 0.011*"skin" + 0.011*"knight" + 0.010*"everyone" + 0.010*"no"


Topic: 4 
Words: 0.023*"like" + 0.020*"skin" + 0.018*"bundle" + 0.017*"get" + 0.015*"pack" + 0.015*"want" + 0.015*"buy" + 0.014*"really" + 0.013*"joker" + 0.012*"need"




In [13]:
import pyLDAvis.gensim

In [14]:
pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(lda_model, bow_corpus, dictionary)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [15]:
df[df.text.str.contains('amp')].text

11      @beefkr10z Quite hard to tell currently what w...
14      @MemeBloody Isn’t twitter for expressing your ...
78      Celebrating 80 Years of #Batman with #BatmanDa...
147     Batman Jones of course, Batman saved a couple ...
149     Batman is drawn into conflict with Mr. Whisper...
154     Pow! #Batman was born in #TheBronx. Writer Bil...
175     &amp; have psychological damage/abnormalities:...
177     @AstroIoxy Wow!!! Thats what happens when you ...
181     I’ve been fortunate to have played in the Batm...
189     I see your smoke bomb Batman &amp; I'll raise ...
191     Batman, Batman Returns, The Dark Knight Triolo...
205     Lastly, I watched Batman Forever and Batman &a...
228     ⚡️ x1 Batman Caped Crusader Pack Giveaway ⚡️ \...
262     If I was off today for #BatmanDay I would've d...
267     🐾Catwoman/Selina Kyle\n🐾Expert Cat Burglar\n🐾B...
312     Happy #BatmanDay! The Caped Crusader is 80 yrs...
352     https://t.co/dpIbGyyPPH Crime-fighting duo #Ba...
433     Happy 