In [38]:
#Imports
import pandas as pd
import re

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

import gensim

In [51]:
## Constant
stop_words = set(stopwords.words('english'))
SAMPLE_SIZE = 10000
MAX_TOPIC = 10

In [25]:
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

In [16]:
## Taking 1000 random of the dataset for test
df = pd.read_csv('abcnews-date-text.csv')
df = df.sample(n = SAMPLE_SIZE)

In [17]:
df.shape

(10000, 2)

In [36]:
corpus = []
for col in df["headline_text"]:
    col = re.sub('^[A-Za-z]',"",col)
    word_tokens = word_tokenize(col)
    filtered_tokens = [w for w in word_tokens if w not in stop_words]
    stem_tokens = [stemmer.stem(w) for w in filtered_tokens]
    lammetized_tokens = [lemmatizer.lemmatize(w) for w in stem_tokens]
    corpus.append(lammetized_tokens)

In [41]:
dictionary = gensim.corpora.Dictionary(corpus)

In [43]:
count=0
for k,v in dictionary.iteritems():
    print(k,v)
    count+=1
    if count>20:
        break

0 heat
1 leav
2 mum
3 penrith
4 young
5 attack
6 boy
7 nake
8 northern
9 nsw
10 lectrolux
11 shutdown
12 coach
13 estern
14 gombau
15 josep
16 sack
17 sydney
18 wander
19 detain
20 korea


In [45]:
bow_corpus=[dictionary.doc2bow(doc) for doc in corpus]

In [52]:
lda_model=gensim.models.LdaMulticore(bow_corpus,
                                    num_topics=MAX_TOPIC,
                                    id2word=dictionary,
                                    passes=5)



In [53]:
for idx,topic in lda_model.print_topics(-1):
    print("Topic: {} \nWords: {}".format(idx, topic))
    print("\n")

Topic: 0 
Words: 0.007*"plan" + 0.007*"help" + 0.006*"fire" + 0.005*"sw" + 0.005*"nterview" + 0.004*"hospit" + 0.004*"olic" + 0.004*"flood" + 0.004*"report" + 0.004*"case"


Topic: 1 
Words: 0.008*"say" + 0.006*"car" + 0.006*"kill" + 0.005*"delay" + 0.004*"power" + 0.004*"olic" + 0.004*"u" + 0.004*"dead" + 0.004*"new" + 0.004*"sex"


Topic: 2 
Words: 0.008*"win" + 0.005*"say" + 0.005*"plan" + 0.004*"chang" + 0.004*"coast" + 0.003*"titl" + 0.003*"name" + 0.003*"home" + 0.003*"boost" + 0.003*"water"


Topic: 3 
Words: 0.011*"charg" + 0.009*"polic" + 0.008*"man" + 0.007*"attack" + 0.006*"call" + 0.005*"murder" + 0.005*"court" + 0.005*"olic" + 0.005*"back" + 0.004*"kill"


Topic: 4 
Words: 0.006*"say" + 0.005*"fire" + 0.004*"plan" + 0.004*"claim" + 0.004*"pm" + 0.003*"nsw" + 0.003*"cut" + 0.003*"vote" + 0.003*"health" + 0.003*"hous"


Topic: 5 
Words: 0.005*"court" + 0.005*"win" + 0.004*"rate" + 0.004*"new" + 0.004*"charg" + 0.004*"open" + 0.003*"warn" + 0.003*"elect" + 0.003*"law" + 0.003

In [54]:
lda_model[bow_corpus[4310]]

[(0, 0.010007298),
 (1, 0.010007434),
 (2, 0.010007923),
 (3, 0.01000806),
 (4, 0.010007644),
 (5, 0.9099319),
 (6, 0.010007212),
 (7, 0.010007176),
 (8, 0.010007675),
 (9, 0.010007671)]

In [70]:
bow_corpus[6000]

[(402, 1), (713, 1), (1712, 1), (1754, 1)]

In [69]:
for index,score in sorted(lda_model[bow_corpus[6000]], key=lambda tup: -1*tup[1]):
    print("Score: {} \ntopic: {} \n{}\n".format(score, index, lda_model.print_topics(index, 1)))

Score: 0.8198750019073486 
topic: 0 
[(5, '0.005*"court"'), (2, '0.008*"win"'), (3, '0.011*"charg"'), (4, '0.006*"say"'), (7, '0.005*"plan"'), (1, '0.008*"say"'), (0, '0.007*"plan"'), (9, '0.004*"nation"'), (8, '0.005*"test"'), (6, '0.006*"urg"')]

Score: 0.02002139575779438 
topic: 9 
[(2, '0.008*"win"'), (5, '0.005*"court"'), (6, '0.006*"urg"'), (1, '0.008*"say"'), (3, '0.011*"charg"'), (4, '0.006*"say"'), (7, '0.005*"plan"'), (9, '0.004*"nation"'), (8, '0.005*"test"')]

Score: 0.020018182694911957 
topic: 1 
[(8, '0.005*"test"')]

Score: 0.020013956353068352 
topic: 2 
[(2, '0.008*"win"'), (5, '0.005*"court"')]

Score: 0.020013757050037384 
topic: 7 
[(2, '0.008*"win"'), (5, '0.005*"court"'), (4, '0.006*"say"'), (7, '0.005*"plan"'), (3, '0.011*"charg"'), (9, '0.004*"nation"'), (6, '0.006*"urg"')]

Score: 0.020012732595205307 
topic: 6 
[(8, '0.005*"test"'), (7, '0.005*"plan"'), (0, '0.007*"plan"'), (4, '0.006*"say"'), (2, '0.008*"win"'), (9, '0.004*"nation"')]

Score: 0.020012702792