In [1]:
#importing necessary libraries
import pandas as pd
import numpy as np

#text processing
import re
import string
import nltk
from gensim import corpora, models, similarities 
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

#LDA
import gensim
import pyLDAvis.gensim

import warnings
warnings.simplefilter("ignore", category=DeprecationWarning)

  """
  """
  """


In [2]:
#read the csv file with amazon reviews
reviews_df=pd.read_csv('reviews.csv',error_bad_lines=False)
reviews_df['Reviews'] = reviews_df['Reviews'].astype(str) 
reviews_df.head(7)

Unnamed: 0,Reviews
0,I thought it would be as big as small paper bu...
1,This kindle is light and easy to use especiall...
2,Didnt know how much i'd use a kindle so went f...
3,I am 100 happy with my purchase. I caught it o...
4,Solid entry level Kindle. Great for kids. Gift...
5,This make an excellent ebook reader. Don't exp...
6,"I ordered this for my daughter, as I have the ..."


In [3]:
def initial_clean(text):
    """
    Function to clean text-remove punctuations, lowercase text etc.
    
    """
    text = re.sub("((\S+)?(http(s)?)(\S+))|((\S+)?(www)(\S+))|((\S+)?(\@)(\S+)?)", " ", text)
    text = re.sub("[^a-zA-Z ]", "", text)
    text = text.lower() # lower case text
    text = nltk.word_tokenize(text)
    return text

stop_words = stopwords.words('english')
stop_words.extend(['news', 'say','use', 'not', 'would', 'say', 'could', '_', 'be', 'know', 'good', 'go', 'get', 'do','took','time','year',
                   'done', 'try', 'many', 'some','nice', 'thank', 'think', 'see', 'rather', 'easy', 'easily', 'lot', 'lack', 
                   'make', 'want', 'seem', 'run', 'need', 'even', 'right', 'line','even', 'also', 'may', 'take', 'come', 'new','said', 'like','people'])
def remove_stop_words(text):
    """
    Function to remove all stopwords from text
    """
    return [word for word in text if word not in stop_words]

stemmer = PorterStemmer()
def stem_words(text):
    """
    Function to stem words, so plural and singular are treated the same
    """
    try:
        text = [stemmer.stem(word) for word in text]
        text = [word for word in text if len(word) > 1] # makes sure there are no single letter words
    except IndexError:
        pass
    return text


In [4]:
def apply_all(text):
    """
    This function applies all the functions above into one
    """
    return stem_words(remove_stop_words(initial_clean(text)))

## LDA on amazon reviews

In [5]:
# clean reviews data and create new column "tokenized"
import time
t1 = time.time()
reviews_df['tokenized_reviews'] = reviews_df['Reviews'].apply(apply_all) 
t2 = time.time()
print("Time to clean and tokenize", len(reviews_df), "reviews:", (t2-t1)/60, "min")


Time to clean and tokenize 5000 reviews: 0.06679176092147827 min


In [6]:
reviews_df.head()

Unnamed: 0,Reviews,tokenized_reviews
0,I thought it would be as big as small paper bu...,"[thought, big, small, paper, turn, palm, small..."
1,This kindle is light and easy to use especiall...,"[kindl, light, especi, beach]"
2,Didnt know how much i'd use a kindle so went f...,"[didnt, much, id, kindl, went, lower, end, im,..."
3,I am 100 happy with my purchase. I caught it o...,"[happi, purchas, caught, sale, realli, price, ..."
4,Solid entry level Kindle. Great for kids. Gift...,"[solid, entri, level, kindl, great, kid, gift,..."


In [7]:
##create a Gensim dictionary from the tokenized data
tokenized = reviews_df['tokenized_reviews']
# Creating term dictionary of corpus, where each unique term is assigned an index.
dictionary = corpora.Dictionary(tokenized)
# Filter terms which occurs in less than 1 review & more than 80% of the reviews.
dictionary.filter_extremes(no_below=1, no_above=0.8)
#convert the dictionary to a bag of words corpus 
corpus = [dictionary.doc2bow(tokens) for tokens in tokenized]
print(corpus[:1])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 2), (12, 1), (13, 1)]]


In [10]:
#LDA
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = 7, id2word=dictionary, passes=15)
ldamodel.save('model.gensim')
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.046*"echo" + 0.033*"alexa" + 0.026*"show" + 0.025*"music"')
(1, '0.049*"read" + 0.047*"book" + 0.040*"kindl" + 0.029*"love"')
(2, '0.042*"kid" + 0.023*"great" + 0.018*"tablet" + 0.014*"set"')
(3, '0.025*"work" + 0.024*"great" + 0.023*"amazon" + 0.022*"app"')
(4, '0.029*"kindl" + 0.017*"read" + 0.016*"one" + 0.015*"screen"')
(5, '0.107*"love" + 0.065*"bought" + 0.040*"gift" + 0.038*"one"')
(6, '0.088*"tablet" + 0.051*"great" + 0.031*"price" + 0.026*"fire"')


In [14]:
get_document_topics = ldamodel.get_document_topics(corpus[0])
print(get_document_topics)

[(4, 0.94627804)]


# Dominant Topic in each review

In [13]:
def dominant_topic(ldamodel, corpus, texts):
    # Function to find the dominant topic in each review
    sent_topics_df = pd.DataFrame()

    # Get main topic in each review
    for i, row in enumerate(ldamodel[corpus]):
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each review
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num,topn=4)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)

In [17]:
df_dominant_topic = dominant_topic(ldamodel=ldamodel, corpus=corpus, texts=reviews_df['Reviews'])
df_dominant_topic.head(20)

Unnamed: 0,Dominant_Topic,Perc_Contribution,Topic_Keywords,Reviews
0,4.0,0.9463,"kindl, read, one, screen",I thought it would be as big as small paper bu...
1,1.0,0.8282,"read, book, kindl, love",This kindle is light and easy to use especiall...
2,4.0,0.9284,"kindl, read, one, screen",Didnt know how much i'd use a kindle so went f...
3,1.0,0.5047,"read, book, kindl, love",I am 100 happy with my purchase. I caught it o...
4,1.0,0.3055,"read, book, kindl, love",Solid entry level Kindle. Great for kids. Gift...
5,4.0,0.6935,"kindl, read, one, screen",This make an excellent ebook reader. Don't exp...
6,4.0,0.5325,"kindl, read, one, screen","I ordered this for my daughter, as I have the ..."
7,4.0,0.705,"kindl, read, one, screen",I bought my Kindle about 2 months ago and the ...
8,4.0,0.9044,"kindl, read, one, screen","amazon kindle is always the best ebook, upgrad..."
9,0.0,0.4783,"echo, alexa, show, music","It's beyond my expectation, and it can even sh..."
