In [20]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sea 

In [21]:
df=pd.read_csv('/content/abcnews-date-text.csv',error_bad_lines=False)

In [22]:
df.shape

(559787, 2)

In [23]:
df.head()

Unnamed: 0,publish_date,headline_text
0,20030219,aba decides against community broadcasting lic...
1,20030219,act fire witnesses must be aware of defamation
2,20030219,a g calls for infrastructure protection summit
3,20030219,air nz staff in aust strike for pay rise
4,20030219,air nz strike to affect australian travellers


In [24]:
data_text=df[['headline_text']]

In [25]:
data_text['index']=data_text.index

In [26]:
document=data_text

In [27]:
len(document)

559787

In [28]:
document[:5]

Unnamed: 0,headline_text,index
0,aba decides against community broadcasting lic...,0
1,act fire witnesses must be aware of defamation,1
2,a g calls for infrastructure protection summit,2
3,air nz staff in aust strike for pay rise,3
4,air nz strike to affect australian travellers,4


In [29]:
import nltk
from nltk.tokenize import word_tokenize,sent_tokenize
from nltk.corpus import stopwords 
from nltk.stem import PorterStemmer,WordNetLemmatizer

In [30]:
nltk.download('stopwords')
stop_word=stopwords.words('english')
stemmer=PorterStemmer()
lemitizer=WordNetLemmatizer()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [31]:
np.random.seed(2018)

In [32]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [33]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [34]:
def cleaning(text):
  var1=[]
  for word in word_tokenize(text):
    if word not in stop_word:
      var1.append(word)
  var2=[]
  for word in var1:
    var2.append(lemitizer.lemmatize(word))

  return var2

In [35]:
document['new_headline']=document['headline_text'].apply(lambda x:cleaning(x))

In [36]:
document.head()

Unnamed: 0,headline_text,index,new_headline
0,aba decides against community broadcasting lic...,0,"[aba, decides, community, broadcasting, licence]"
1,act fire witnesses must be aware of defamation,1,"[act, fire, witness, must, aware, defamation]"
2,a g calls for infrastructure protection summit,2,"[g, call, infrastructure, protection, summit]"
3,air nz staff in aust strike for pay rise,3,"[air, nz, staff, aust, strike, pay, rise]"
4,air nz strike to affect australian travellers,4,"[air, nz, strike, affect, australian, traveller]"


In [37]:
print(document['headline_text'][0])
print(document['new_headline'][0])

aba decides against community broadcasting licence
['aba', 'decides', 'community', 'broadcasting', 'licence']


In [38]:
document['headline_text'][4311]

'residents hope rain not so heavy'

In [39]:
document['new_headline'][4311]

['resident', 'hope', 'rain', 'heavy']

In [40]:
df=document.drop('headline_text',axis=1)

In [41]:
df.head()

Unnamed: 0,index,new_headline
0,0,"[aba, decides, community, broadcasting, licence]"
1,1,"[act, fire, witness, must, aware, defamation]"
2,2,"[g, call, infrastructure, protection, summit]"
3,3,"[air, nz, staff, aust, strike, pay, rise]"
4,4,"[air, nz, strike, affect, australian, traveller]"


In [42]:
def extra_clean(text):
  var1=[]
  for word in text:
    var1.append(stemmer.stem(word))
  return var1

In [43]:
df['new_headline']=df['new_headline'].apply(lambda x:extra_clean(x))

In [44]:
df.head()

Unnamed: 0,index,new_headline
0,0,"[aba, decid, commun, broadcast, licenc]"
1,1,"[act, fire, wit, must, awar, defam]"
2,2,"[g, call, infrastructur, protect, summit]"
3,3,"[air, nz, staff, aust, strike, pay, rise]"
4,4,"[air, nz, strike, affect, australian, travel]"


In [48]:
processed_docs=df['new_headline']

In [49]:
processed_docs[:10]

0              [aba, decid, commun, broadcast, licenc]
1                  [act, fire, wit, must, awar, defam]
2            [g, call, infrastructur, protect, summit]
3            [air, nz, staff, aust, strike, pay, rise]
4        [air, nz, strike, affect, australian, travel]
5                   [ambiti, olsson, win, tripl, jump]
6               [antic, delight, record, break, barca]
7    [aussi, qualifi, stosur, wast, four, memphi, m...
8            [aust, address, un, secur, council, iraq]
9                   [australia, lock, war, timet, opp]
Name: new_headline, dtype: object

In [50]:
import gensim
from gensim import corpora

In [51]:
dictionary=corpora.Dictionary(processed_docs)

In [52]:
print(dictionary.token2id)

{'aba': 0, 'broadcast': 1, 'commun': 2, 'decid': 3, 'licenc': 4, 'act': 5, 'awar': 6, 'defam': 7, 'fire': 8, 'must': 9, 'wit': 10, 'call': 11, 'g': 12, 'infrastructur': 13, 'protect': 14, 'summit': 15, 'air': 16, 'aust': 17, 'nz': 18, 'pay': 19, 'rise': 20, 'staff': 21, 'strike': 22, 'affect': 23, 'australian': 24, 'travel': 25, 'ambiti': 26, 'jump': 27, 'olsson': 28, 'tripl': 29, 'win': 30, 'antic': 31, 'barca': 32, 'break': 33, 'delight': 34, 'record': 35, 'aussi': 36, 'four': 37, 'match': 38, 'memphi': 39, 'qualifi': 40, 'stosur': 41, 'wast': 42, 'address': 43, 'council': 44, 'iraq': 45, 'secur': 46, 'un': 47, 'australia': 48, 'lock': 49, 'opp': 50, 'timet': 51, 'war': 52, '10': 53, 'aid': 54, 'contribut': 55, 'million': 56, 'birthday': 57, 'celebr': 58, 'robson': 59, 'take': 60, 'ahead': 61, 'bathhous': 62, 'move': 63, 'plan': 64, 'big': 65, 'championship': 66, 'cycl': 67, 'hope': 68, 'launceston': 69, 'boost': 70, 'paroo': 71, 'suppli': 72, 'water': 73, 'bill': 74, 'blizzard': 75,

In [53]:
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

0 aba
1 broadcast
2 commun
3 decid
4 licenc
5 act
6 awar
7 defam
8 fire
9 must
10 wit


In [54]:
dictionary.filter_extremes(no_below=15,no_above=0.5,keep_n=100000)

In [55]:
my_corpus=[]
for word in processed_docs:
  my_corpus.append(dictionary.doc2bow(word))

In [57]:
var1=[]
for doc in my_corpus:
  for key,freq in doc:
    var1.append([dictionary[key],freq])

In [61]:
from gensim import models

In [62]:
tfidf=models.TfidfModel(my_corpus)

In [63]:
corpus_tfidf=tfidf[my_corpus]

In [66]:
from gensim.models import LdaModel,LdaMulticore 

In [68]:
lda_model=LdaMulticore(my_corpus,num_topics=15,id2word=dictionary,passes=2,workers=2)

In [69]:
print(lda_model)

LdaModel(num_terms=10513, num_topics=15, decay=0.5, chunksize=2000)


In [72]:
for idx,topic in lda_model.print_topics(-1):
  print('topic :{} \nwords :{}'.format(idx,topic))

topic :0 
words :0.032*"talk" + 0.021*"rudd" + 0.018*"deni" + 0.018*"protest" + 0.017*"north" + 0.016*"leader" + 0.015*"south" + 0.014*"pm" + 0.013*"minist" + 0.013*"asylum"
topic :1 
words :0.028*"govt" + 0.027*"plan" + 0.023*"fund" + 0.022*"health" + 0.019*"mine" + 0.016*"school" + 0.014*"boost" + 0.013*"urg" + 0.013*"nsw" + 0.013*"council"
topic :2 
words :0.029*"win" + 0.023*"home" + 0.020*"open" + 0.019*"day" + 0.019*"set" + 0.018*"australia" + 0.015*"aussi" + 0.015*"final" + 0.015*"lead" + 0.015*"first"
topic :3 
words :0.062*"polic" + 0.022*"probe" + 0.018*"investig" + 0.017*"victim" + 0.014*"speak" + 0.013*"bodi" + 0.013*"fire" + 0.013*"look" + 0.013*"dog" + 0.012*"black"
topic :4 
words :0.039*"water" + 0.024*"ban" + 0.023*"hope" + 0.022*"time" + 0.020*"farmer" + 0.015*"rain" + 0.012*"decis" + 0.011*"use" + 0.011*"offer" + 0.010*"good"
topic :5 
words :0.016*"bail" + 0.016*"show" + 0.012*"expert" + 0.012*"near" + 0.012*"poll" + 0.011*"citi" + 0.011*"whale" + 0.010*"hunt" + 0.0

In [74]:
processed_docs[4310]

['ratepay', 'group', 'want', 'compulsori', 'local', 'govt', 'vote']

In [75]:
for index, score in sorted(lda_model[my_corpus[4310]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))


Score: 0.4702596366405487	 
Topic: 0.028*"govt" + 0.027*"plan" + 0.023*"fund" + 0.022*"health" + 0.019*"mine" + 0.016*"school" + 0.014*"boost" + 0.013*"urg" + 0.013*"nsw" + 0.013*"council"

Score: 0.155799999833107	 
Topic: 0.037*"council" + 0.030*"chang" + 0.024*"elect" + 0.018*"law" + 0.018*"govern" + 0.018*"union" + 0.017*"hous" + 0.017*"seek" + 0.016*"blaze" + 0.014*"vote"

Score: 0.147896870970726	 
Topic: 0.035*"green" + 0.017*"concern" + 0.015*"safeti" + 0.014*"sale" + 0.013*"rescu" + 0.012*"oil" + 0.012*"price" + 0.011*"compani" + 0.011*"light" + 0.011*"question"

Score: 0.13437682390213013	 
Topic: 0.039*"water" + 0.024*"ban" + 0.023*"hope" + 0.022*"time" + 0.020*"farmer" + 0.015*"rain" + 0.012*"decis" + 0.011*"use" + 0.011*"offer" + 0.010*"good"


In [76]:
print(dictionary.token2id)

{'aba': 0, 'broadcast': 1, 'commun': 2, 'decid': 3, 'licenc': 4, 'act': 5, 'awar': 6, 'defam': 7, 'fire': 8, 'must': 9, 'wit': 10, 'call': 11, 'g': 12, 'infrastructur': 13, 'protect': 14, 'summit': 15, 'air': 16, 'aust': 17, 'nz': 18, 'pay': 19, 'rise': 20, 'staff': 21, 'strike': 22, 'affect': 23, 'australian': 24, 'travel': 25, 'ambiti': 26, 'jump': 27, 'tripl': 28, 'win': 29, 'antic': 30, 'barca': 31, 'break': 32, 'delight': 33, 'record': 34, 'aussi': 35, 'four': 36, 'match': 37, 'memphi': 38, 'qualifi': 39, 'stosur': 40, 'wast': 41, 'address': 42, 'council': 43, 'iraq': 44, 'secur': 45, 'un': 46, 'australia': 47, 'lock': 48, 'opp': 49, 'timet': 50, 'war': 51, '10': 52, 'aid': 53, 'contribut': 54, 'million': 55, 'birthday': 56, 'celebr': 57, 'robson': 58, 'take': 59, 'ahead': 60, 'bathhous': 61, 'move': 62, 'plan': 63, 'big': 64, 'championship': 65, 'cycl': 66, 'hope': 67, 'launceston': 68, 'boost': 69, 'paroo': 70, 'suppli': 71, 'water': 72, 'bill': 73, 'blizzard': 74, 'buri': 75, '

In [77]:
len(dictionary)

10513