In [1]:
#text processing
import re
import string
import nltk
from gensim import corpora, models, similarities
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import pandas as pd
import numpy as np

In [2]:
#read the csv file with amazon reviews
reviews_df=pd.read_csv('C:\\Users\\Ansh\\test\\icp6\\abcnews.csv',error_bad_lines=False)

In [3]:
print(reviews_df.dtypes)

publish_date      int64
headline_text    object
dtype: object


In [4]:
reviews_df['headline_text'] = reviews_df['headline_text'].astype(str)

In [5]:
print(reviews_df.head(6))

   publish_date                                      headline_text
0      20030219  aba decides against community broadcasting lic...
1      20030219     act fire witnesses must be aware of defamation
2      20030219     a g calls for infrastructure protection summit
3      20030219           air nz staff in aust strike for pay rise
4      20030219      air nz strike to affect australian travellers
5      20030219                  ambitious olsson wins triple jump


In [6]:
def initial_clean(text):
    """
    Function to clean text-remove punctuations, lowercase text etc.
    """
    text = re.sub("[^a-zA-Z ]", "", text)
    text = text.lower()  # lower case text
    text = nltk.word_tokenize(text)
    return (text)

In [7]:
stop_words = stopwords.words('english')
stop_words.extend(['news', 'say','use', 'not', 'would', 'say', 'could', '_', 'be', 'know', 'good', 'go', 'get', 'do','took','time','year',
'done', 'try', 'many', 'some','nice', 'thank', 'think', 'see', 'rather', 'easy', 'easily', 'lot', 'lack', 'make', 'want', 'seem', 'run', 'need', 'even', 'right', 'line','even', 'also', 'may', 'take', 'come', 'new','said', 'like','people'])

In [8]:
def remove_stop_words(text):
     return [word for word in text if word not in stop_words]

In [9]:
stemmer = PorterStemmer()

In [10]:
def stem_words(text):
    """
    Function to stem words
    """
    try:
        text = [stemmer.stem(word) for word in text]
        text = [word for word in text if len(word) > 1] # no single letter words
    except IndexError:
        pass

    return text

In [11]:
def apply_all(text):
    """
    This function applies all the functions above into one
    """
    return stem_words(remove_stop_words(initial_clean(text)))

In [12]:
# clean reviews and create new column "tokenized"
import time

In [13]:
t1 = time.time()
reviews_df['tokenized_reviews'] = reviews_df['headline_text'].apply(apply_all)
t2 = time.time()
print("Time to clean and tokenize", len(reviews_df), "reviews:", (t2-t1)/60, "min") #Time to clean and tokenize 3209 reviews: 0.21254388093948365 min

Time to clean and tokenize 118253 reviews: 0.4888541539510091 min


In [14]:
print("reviews with their respective tokenize version:" )
print(reviews_df.head(5))

reviews with their respective tokenize version:
   publish_date                                      headline_text  \
0      20030219  aba decides against community broadcasting lic...   
1      20030219     act fire witnesses must be aware of defamation   
2      20030219     a g calls for infrastructure protection summit   
3      20030219           air nz staff in aust strike for pay rise   
4      20030219      air nz strike to affect australian travellers   

                               tokenized_reviews  
0        [aba, decid, commun, broadcast, licenc]  
1            [act, fire, wit, must, awar, defam]  
2         [call, infrastructur, protect, summit]  
3      [air, nz, staff, aust, strike, pay, rise]  
4  [air, nz, strike, affect, australian, travel]  


In [15]:
#LDA
import gensim
import pyLDAvis.gensim

In [16]:
#Create a Gensim dictionary from the tokenized data
tokenized = reviews_df['tokenized_reviews']

In [17]:
#Creating term dictionary of corpus, where each unique term is assigned an index.
dictionary = corpora.Dictionary(tokenized)

In [18]:
#Filter terms which occurs in less than 1 review and more than 80% of the reviews.
dictionary.filter_extremes(no_below=1, no_above=0.8)

In [19]:
#convert the dictionary to a bag of words corpus
corpus = [dictionary.doc2bow(tokens) for tokens in tokenized]
print(corpus[:1])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1)]]


In [20]:
print([[(dictionary[id], freq) for id, freq in cp] for cp in corpus[:1]])

[[('aba', 1), ('broadcast', 1), ('commun', 1), ('decid', 1), ('licenc', 1)]]


In [21]:
#LDA
t3 = time.time()
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = 5, id2word=dictionary, passes=15)
t4 = time.time()
print("create model in ", len(reviews_df), "reviews:", (t4-t3)/60, "min")

create model in  118253 reviews: 10.44262334505717 min


In [22]:
#saving the model
ldamodel.save('model_combined.gensim')
topics = ldamodel.print_topics(num_words=8)

In [23]:
print("Now printing the topics and their composition")
print("This output shows the Topic-Words matrix for the 7 topics created and the 4 words within each topic")
for topic in topics:
    print(topic)

Now printing the topics and their composition
This output shows the Topic-Words matrix for the 7 topics created and the 4 words within each topic
(0, '0.022*"council" + 0.020*"govt" + 0.014*"labor" + 0.013*"plan" + 0.007*"qld" + 0.007*"water" + 0.007*"elect" + 0.006*"fund"')
(1, '0.035*"polic" + 0.020*"man" + 0.018*"charg" + 0.014*"court" + 0.013*"face" + 0.010*"death" + 0.010*"murder" + 0.009*"drug"')
(2, '0.015*"call" + 0.013*"plan" + 0.010*"urg" + 0.009*"boost" + 0.009*"group" + 0.009*"govt" + 0.009*"fund" + 0.009*"health"')
(3, '0.018*"win" + 0.012*"servic" + 0.011*"gold" + 0.009*"home" + 0.008*"alp" + 0.008*"oil" + 0.008*"final" + 0.007*"centr"')
(4, '0.029*"us" + 0.017*"iraq" + 0.012*"kill" + 0.010*"australia" + 0.008*"two" + 0.008*"australian" + 0.008*"market" + 0.007*"iraqi"')


In [24]:
#finding the similarity of the first review with topics
print("first review is:")
print(reviews_df.headline_text[0])
get_document_topics = ldamodel.get_document_topics(corpus[0])
print('\n')
print("The similarity of this review with the topics and respective similarity score are ")
print(get_document_topics)

first review is:
aba decides against community broadcasting licence


The similarity of this review with the topics and respective similarity score are 
[(0, 0.19991545), (1, 0.03334793), (2, 0.7000407), (3, 0.033347953), (4, 0.033347946)]


In [None]:
#visualizing topics
lda_viz = gensim.models.ldamodel.LdaModel.load('model_combined.gensim')
lda_display = pyLDAvis.gensim.prepare(lda_viz, corpus, dictionary, sort_topics=True)
pyLDAvis.show(lda_display)


Note: if you're in the IPython notebook, pyLDAvis.show() is not the best command
      to use. Consider using pyLDAvis.display(), or pyLDAvis.enable_notebook().
      See more information at http://pyLDAvis.github.io/quickstart.html .

You must interrupt the kernel to end this command

Serving to http://127.0.0.1:8889/    [Ctrl-C to exit]


127.0.0.1 - - [08/Mar/2020 18:45:24] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [08/Mar/2020 18:45:24] "GET /LDAvis.css HTTP/1.1" 200 -
127.0.0.1 - - [08/Mar/2020 18:45:24] "GET /d3.js HTTP/1.1" 200 -
127.0.0.1 - - [08/Mar/2020 18:45:24] "GET /LDAvis.js HTTP/1.1" 200 -
