In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

['abcnews-date-text.csv']


In [2]:
#Import necessary libraries
import numpy as np
import pandas as pd
import string
import nltk
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from matplotlib import pyplot as plt

In [3]:
df=pd.read_csv("../input/abcnews-date-text.csv")
df.head()

Unnamed: 0,publish_date,headline_text
0,20030219,aba decides against community broadcasting lic...
1,20030219,act fire witnesses must be aware of defamation
2,20030219,a g calls for infrastructure protection summit
3,20030219,air nz staff in aust strike for pay rise
4,20030219,air nz strike to affect australian travellers


In [4]:
text=pd.DataFrame()
text['Text']=df['headline_text']
text.head()
#Convert all characters to lower case
text['clean_text']=text['Text'].str.lower()
text.head()
#Apply regular expressions to retain only alphabets, #, and spaces
text['clean_text']=text['clean_text'].str.replace('[^a-z ]','')
text.head()
#Remove stopwords
stop=stopwords.words('english')
#creating function for stop words
def sw(text):
    text=[word for word in text.split() if word not in stop]
    return " ".join(text)
text['split_words']=text['clean_text'].apply(sw)
text.head()
#removing words less than 4 character   - optional
def lw(x):
    x=[word for word in x.split() if len(word)>3]
    return " ".join(x)
text['split_words']=text['split_words'].apply(lw)
text.head()

Unnamed: 0,Text,clean_text,split_words
0,aba decides against community broadcasting lic...,aba decides against community broadcasting lic...,decides community broadcasting licence
1,act fire witnesses must be aware of defamation,act fire witnesses must be aware of defamation,fire witnesses must aware defamation
2,a g calls for infrastructure protection summit,a g calls for infrastructure protection summit,calls infrastructure protection summit
3,air nz staff in aust strike for pay rise,air nz staff in aust strike for pay rise,staff aust strike rise
4,air nz strike to affect australian travellers,air nz strike to affect australian travellers,strike affect australian travellers


In [5]:
#creating tf-idf
tf_idf_vet=TfidfVectorizer()
score=tf_idf_vet.fit_transform(text['split_words'])
score

<1103663x88175 sparse matrix of type '<class 'numpy.float64'>'
	with 5205387 stored elements in Compressed Sparse Row format>

In [6]:
#building the LDA model - dividing DTM to 5 topics
# for Topic modeling 2 main libs= gensim and 
from sklearn.decomposition import LatentDirichletAllocation

#creating the lda model
lda_model=LatentDirichletAllocation(n_topics=10,random_state=1234,max_iter=15)

#fitting lda model on DTM(score)
lda_output= lda_model.fit_transform(score)



In [7]:
#Find the dominating topic for each document
#create the column for the document - topic matrix
topicnames=['Topic '+str(i) for i in range (lda_model.n_topics)]
print(topicnames)
#create the row name for the document - topic matrix
docnames=['Doc '+str(i) for i in range (len(text['split_words']))]

['Topic 0', 'Topic 1', 'Topic 2', 'Topic 3', 'Topic 4', 'Topic 5', 'Topic 6', 'Topic 7', 'Topic 8', 'Topic 9']


In [8]:
#create a dataframe for Document - Topic Matrix
df_document_topic=pd.DataFrame(np.round(lda_output,2),index=docnames,columns=topicnames)
#finding the dominating topic
dominating_topic=np.argmax(df_document_topic.values,axis=1)
dominating_topic
df_document_topic['Dominating_topic']=dominating_topic
df_document_topic

#group by
df_document_topic.groupby(['Dominating_topic']).size()

Dominating_topic
0     56398
1     98401
2    138265
3    193407
4    142429
5     82019
6     78097
7    108304
8    115918
9     90425
dtype: int64

In [9]:
# creating  TTM - topic term matrix
df_topic_keywords=pd.DataFrame(lda_model.components_)

#Assigning the column and index
df_topic_keywords.columns=tf_idf_vet.get_feature_names()

df_topic_keywords.index=topicnames

df_topic_keywords

Unnamed: 0,aaahhh,aaas,aacc,aaco,aacos,aacta,aactas,aadmi,aads,aagaard,aagard,aalto,aamer,aamers,aami,aamodt,aandahl,aant,aapa,aaps,aapt,aaradhna,aardman,aardvark,aardvarks,aares,aargau,aarli,aaron,aaronpaul,aarons,aarwun,abaaoud,ababa,aback,abadi,abadoned,abal,abalone,abalonve,...,zulfikar,zullo,zulu,zuma,zumar,zumas,zumba,zumbo,zumoi,zumsteins,zunar,zunde,zuniga,zupljanin,zura,zurakowski,zurbaran,zurhake,zuri,zurich,zusak,zushi,zussino,zuul,zverev,zvonareva,zvonereva,zwaanswijk,zwanziger,zwar,zwartz,zweli,zwitkowsky,zydelig,zygar,zygiefs,zygier,zylvester,zynga,zyngier
Topic 0,0.1,0.1,0.1,38.760449,0.1,0.100003,0.1,0.1,0.1,0.100008,0.1,0.1,0.1,0.1,0.100009,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,6.264725,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.100001,0.1,...,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.100038,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.100005,0.100007,0.1,0.1,0.100012,0.100008,0.457187,0.1,0.1,0.1,0.622789,0.1,0.1,0.100005,0.1,0.1,0.1,0.100011,0.1,0.1,0.1
Topic 1,0.1,0.1,0.1,0.100007,0.1,6.528705,0.100007,0.1,0.1,0.1,0.1,0.1,0.100002,0.1,0.100001,0.1,0.100008,0.1,0.1,0.1,0.100009,0.1,0.682599,0.831252,0.1,1.665242,0.1,0.100006,17.059339,0.1,0.100035,0.1,0.100045,0.100009,0.1,0.100122,0.649057,0.796697,0.100003,0.100037,...,0.581353,0.1,0.100038,41.158349,0.1,0.10006,0.1,0.1,0.1,0.1,0.1,0.100043,0.1,0.726515,0.1,0.1,0.1,0.1,0.1,0.100009,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.100005,0.1,0.1,0.662765,0.1,0.100018,0.100024,0.1,0.1,0.87535
Topic 2,0.1,0.773174,0.1,0.1,0.1,0.1,0.100003,0.1,0.100035,0.1,0.1,0.1,0.100003,0.100006,0.100008,0.1,0.1,0.10002,0.1,0.1,0.100001,0.857605,0.1,0.1,0.1,0.1,0.682847,0.1,6.763572,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.100022,0.1,0.100009,0.1,...,0.1,0.1,0.1,0.100005,0.1,0.100014,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.100009,0.1,0.1,0.1,0.100193,0.100005,0.100001,0.100002,0.100027,0.1,0.100024,0.100005,0.1,0.1,0.1,0.1,0.1,0.100002,0.1,0.1,0.1,0.658752,0.1,0.100009,0.1,0.1,0.1
Topic 3,0.1,0.100012,0.100008,0.100006,0.100047,0.1,0.1,0.668711,0.1,0.100001,0.100021,0.74419,0.1,0.1,0.100004,0.748328,0.1,0.100006,0.100026,0.1,4.600875,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.100002,0.1,0.1,0.1,0.1,0.1,0.1,0.634799,0.1,0.1,0.100003,0.874632,...,0.1,0.1,0.65012,0.100001,0.1,0.1,1.1,0.1,0.631763,1.40166,0.1,0.1,0.1,0.1,1.402429,0.1,0.766746,0.1,0.1,0.100002,0.1,0.1,0.100016,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.100001,0.1,0.739182,0.100034
Topic 4,0.1,0.1,0.10002,0.100004,0.1,0.100001,0.100002,0.100012,0.1,0.100012,0.1,0.1,0.100001,0.1,0.1,0.1,1.8782,0.100002,1.417539,0.100009,0.1,0.1,0.1,0.1,0.1,0.1,0.1,1.394675,0.100003,1.1,0.972175,0.791637,0.1,0.1,1.833425,0.1,0.1,0.100085,0.100001,0.1,...,0.1,0.1,0.100067,0.100007,0.90854,0.1,0.1,0.100001,0.1,0.1,0.1,0.1,1.339378,0.1,0.1,0.100088,0.1,0.1,0.1,0.100005,0.1,0.1,0.1,0.1,3.808697,0.1,0.1,0.1,0.1,0.1,0.10001,0.100007,0.1,0.1,0.100173,0.730379,10.374877,0.1,0.1,0.1
Topic 5,0.1,0.1,0.100003,0.580216,0.1,0.100001,0.1,0.1,0.599497,2.093701,0.1,0.1,8.397033,0.1,0.100011,0.100028,0.1,0.1,0.1,0.1,5.963617,0.1,0.1,0.1,0.1,0.1,0.1,0.1,15.934439,0.1,0.100016,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.100004,0.1,...,0.1,0.100005,0.1,0.100001,0.1,0.1,0.1,0.1,0.1,0.100065,0.1,0.1,0.100035,0.1,0.100021,0.1,0.1,0.1,0.100001,20.674557,0.1,0.1,0.1,0.571054,0.100016,0.100003,0.100016,0.100007,0.1,0.1,0.10001,0.1,0.100007,0.1,0.1,0.1,0.100004,0.1,0.1,0.1
Topic 6,0.1,0.1,0.100006,0.100002,0.1,0.100013,0.1,0.1,0.1,0.1,0.1,0.721307,0.1,0.902725,0.1,0.1,0.1,0.1,0.1,0.1,0.100006,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.100017,0.1,0.1,0.1,0.1,0.100025,0.100021,0.1,0.1,0.1,0.100001,0.1,...,0.1,0.1,0.729434,0.100001,0.1,0.100009,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.100022,0.6461,0.1,0.100001,0.100003,0.1,0.1,0.1,0.100008,0.100001,0.1,0.1,0.1,0.1,0.1,0.683792,0.1,0.100025,0.1,0.1,0.100002,0.1,0.1,0.1
Topic 7,0.931525,0.1,0.1,0.109426,0.732453,0.10002,0.1,0.100022,0.100191,0.100001,0.1,0.1,0.100001,0.1,0.100005,0.100085,0.1,0.100004,0.1,0.1,0.100002,0.100099,0.1,0.1,0.1,0.1,0.1,0.1,48.036081,0.1,0.100012,0.100128,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,...,0.1,4.101799,0.1,0.1,0.1,3.013264,0.1,0.100061,0.1,0.1,1.86948,0.69544,0.1,0.1,0.100061,0.100047,0.1,0.100045,0.100002,0.100027,1.085562,0.1,1.3153,0.1,0.100098,24.828863,0.710138,0.100026,0.100114,0.100022,1.816413,0.1,0.1,0.1,0.1,0.1,0.1,0.100106,0.1,0.1
Topic 8,0.100018,0.100022,0.1,24.073249,0.1,0.100002,0.1,0.1,0.1,0.1,0.822958,0.1,0.486947,0.1,2.229981,0.100031,0.1,3.80594,0.1,0.667727,0.100018,0.1,0.100074,0.1,0.687935,0.1,0.600546,0.1,0.100006,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.100003,0.1,...,0.1,0.1,0.1,0.100001,0.1,0.1,0.1,1.199089,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.723416,0.1,0.1,0.1,0.100001,0.1,0.663467,0.1,0.100009,0.100012,0.1,0.1,0.1,0.820442,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.76192,0.100026,0.1
Topic 9,0.1,0.1,2.893707,0.100025,0.1,0.100002,2.371759,0.1,0.1,0.1,0.1,0.1,0.1,0.1,3.28133,0.1,0.1,0.100005,0.1,0.1,0.100002,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.100005,0.1,0.1,0.1,0.644319,0.700363,0.1,0.1,0.1,0.1,230.45453,0.100037,...,0.1,0.1,0.1,0.100001,0.1,0.100007,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,1.247728,0.1,0.1,0.1,0.1,0.1,0.100007,0.1,0.1,2.92721,0.1,0.1,0.100023,0.1,0.667058,0.100157,0.1,0.1,0.100002,0.1,0.1,0.1


In [10]:
def show_topics(vectorizer=tf_idf_vet,model=lda_model,n_words=20):
    keywords=np.array(tf_idf_vet.get_feature_names())
    topic_keywords=[]
    for topic_weights in lda_model.components_:
        top_keyword_locs=(-topic_weights).argsort()[:n_words]
        topic_keywords.append(keywords.take(top_keyword_locs))
    return topic_keywords
    
topic_keywords=show_topics(vectorizer=tf_idf_vet,model=lda_model,n_words=20)
topic_keywords

[array(['country', 'news', 'hour', 'rural', 'podcast', 'sport', 'drum',
        'grandstand', 'business', 'march', 'friday', 'national', 'monday',
        'wednesday', 'tuesday', 'september', 'thursday', 'august',
        'october', 'quarter'], dtype='<U34'),
 array(['killed', 'iraq', 'bomb', 'kills', 'troops', 'attack', 'blast',
        'dead', 'soldiers', 'israel', 'afghan', 'kill', 'pakistan',
        'police', 'afghanistan', 'iraqi', 'australian', 'gaza', 'suicide',
        'attacks'], dtype='<U34'),
 array(['police', 'charged', 'court', 'murder', 'death', 'woman',
        'missing', 'accused', 'charges', 'found', 'jailed', 'guilty',
        'assault', 'drug', 'search', 'child', 'jail', 'arrested',
        'shooting', 'driver'], dtype='<U34'),
 array(['council', 'govt', 'plan', 'health', 'water', 'funding', 'urged',
        'boost', 'group', 'funds', 'indigenous', 'centre', 'plans', 'mine',
        'call', 'public', 'concerns', 'hospital', 'power', 'changes'],
       dtype='<U34'),

In [11]:
#Creating topic word data frame
df_topic_keywords=pd.DataFrame(topic_keywords)

#giving the index name and column name
df_topic_keywords.columns=['Word '+str(i) for i in range(df_topic_keywords.shape[1])]
df_topic_keywords.index=['Topic '+str(i) for i in range(df_topic_keywords.shape[0])]
df_topic_keywords

Unnamed: 0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9,Word 10,Word 11,Word 12,Word 13,Word 14,Word 15,Word 16,Word 17,Word 18,Word 19
Topic 0,country,news,hour,rural,podcast,sport,drum,grandstand,business,march,friday,national,monday,wednesday,tuesday,september,thursday,august,october,quarter
Topic 1,killed,iraq,bomb,kills,troops,attack,blast,dead,soldiers,israel,afghan,kill,pakistan,police,afghanistan,iraqi,australian,gaza,suicide,attacks
Topic 2,police,charged,court,murder,death,woman,missing,accused,charges,found,jailed,guilty,assault,drug,search,child,jail,arrested,shooting,driver
Topic 3,council,govt,plan,health,water,funding,urged,boost,group,funds,indigenous,centre,plans,mine,call,public,concerns,hospital,power,changes
Topic 4,election,labor,says,minister,govt,rudd,opposition,abbott,government,union,talks,chief,calls,defends,deal,premier,howard,vote,claims,campaign
Topic 5,crash,dies,plane,killed,accident,train,injured,truck,highway,dead,fire,road,rescue,crashes,interview,light,woman,quake,pilot,fatal
Topic 6,closer,tigers,rain,toll,blues,port,storm,warriors,clash,australia,bulls,eagles,back,bulldogs,sharks,summary,lanka,broncos,lions,knights
Topic 7,interview,world,final,hill,wins,test,open,weather,england,australia,tour,broken,round,murray,ashes,wallabies,league,cricket,grand,aussies
Topic 8,market,rise,prices,rate,price,farmers,rates,share,high,australian,dollar,profit,shares,record,dairy,interest,year,sales,fall,growth
Topic 9,fire,asylum,firefighters,boat,island,blaze,wild,north,bushfire,warning,fires,illegal,crews,bird,korea,east,fishing,south,seekers,weather


In [12]:
# Adding the column of topic to our data frame

df['topic'] = dominating_topic
df.head()

Unnamed: 0,publish_date,headline_text,topic
0,20030219,aba decides against community broadcasting lic...,3
1,20030219,act fire witnesses must be aware of defamation,2
2,20030219,a g calls for infrastructure protection summit,3
3,20030219,air nz staff in aust strike for pay rise,4
4,20030219,air nz strike to affect australian travellers,9


In [13]:
#For each topic find the total number of documents.
df['topic'].value_counts()

3    193407
4    142429
2    138265
8    115918
7    108304
1     98401
9     90425
5     82019
6     78097
0     56398
Name: topic, dtype: int64