In [1]:
#importing necessary libraries
import pandas as pd
import numpy as np

#text processing
import re
import string
import nltk
from gensim import corpora, models, similarities 
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

#LDA
import gensim
import pyLDAvis.gensim

import warnings
warnings.simplefilter("ignore", category=DeprecationWarning)

  """
  """
  """


In [4]:
#read the csv file with amazon reviews
reviews_df=pd.read_csv('amazon_reviews.csv',error_bad_lines=False)
reviews_df['Reviews'] = reviews_df['Reviews'].astype(str) 
reviews_df.head()

Unnamed: 0,Reviews
0,Stuning even for the non-gamer: This sound tra...
1,The best soundtrack ever to anything.: I'm rea...
2,Amazing!: This soundtrack is my favorite music...
3,Excellent Soundtrack: I truly like this soundt...
4,"Remember, Pull Your Jaw Off The Floor After He..."


In [6]:
def initial_clean(text):
    """
    Function to clean text-remove punctuations, lowercase text etc.
    
    """
    text = re.sub("((\S+)?(http(s)?)(\S+))|((\S+)?(www)(\S+))|((\S+)?(\@)(\S+)?)", " ", text)
    text = re.sub("[^a-zA-Z ]", "", text)
    text = text.lower() # lower case text
    text = nltk.word_tokenize(text)
    return text

stop_words = stopwords.words('english')
stop_words.extend(['news', 'say','use', 'not', 'would', 'say', 'could', '_', 'be', 'know', 'good', 'go', 'get', 'do','took','time','year',
                   'done', 'try', 'many', 'some','nice', 'thank', 'think', 'see', 'rather', 'easy', 'easily', 'lot', 'lack', 
                   'make', 'want', 'seem', 'run', 'need', 'even', 'right', 'line','even', 'also', 'may', 'take', 'come', 'new','said', 'like','people'])
def remove_stop_words(text):
    """
    Function to remove all stopwords from text
    """
    return [word for word in text if word not in stop_words]

stemmer = PorterStemmer()
def stem_words(text):
    """
    Function to stem words, so plural and singular are treated the same
    """
    try:
        text = [stemmer.stem(word) for word in text]
        text = [word for word in text if len(word) > 1] # makes sure there are no single letter words
    except IndexError:
        pass
    return text


In [7]:
def apply_all(text):
    """
    This function applies all the functions above into one
    """
    return stem_words(remove_stop_words(initial_clean(text)))

## LDA on news articles

In [8]:
# clean text and title and create new column "tokenized"
import time
t1 = time.time()
reviews_df['tokenized_reviews'] = reviews_df['Reviews'].apply(apply_all) 
t2 = time.time()
print("Time to clean and tokenize", len(reviews_df), "reviews:", (t2-t1)/60, "min")


Time to clean and tokenize 3209 reviews: 0.21254388093948365 min


In [10]:
##create a Gensim dictionary from the tokenized data
tokenized = reviews_df['tokenized_reviews']
dictionary = corpora.Dictionary(tokenized)
#removing extremes 
dictionary.filter_extremes(no_below=1, no_above=0.8)
#convert the dictionary to a bag of words corpus 
corpus = [dictionary.doc2bow(tokens) for tokens in tokenized]


In [11]:
#LDA
ldamodel_combined = gensim.models.ldamodel.LdaModel(corpus, num_topics = 7, id2word=dictionary, passes=15)
ldamodel_combined.save('model_combined.gensim')
topics = ldamodel_combined.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.010*"great" + 0.008*"cd" + 0.008*"love" + 0.007*"music"')
(1, '0.013*"toy" + 0.008*"one" + 0.007*"work" + 0.007*"pump"')
(2, '0.016*"work" + 0.013*"product" + 0.012*"one" + 0.009*"buy"')
(3, '0.014*"bed" + 0.010*"boot" + 0.010*"product" + 0.010*"air"')
(4, '0.053*"book" + 0.026*"read" + 0.012*"one" + 0.006*"stori"')
(5, '0.027*"movi" + 0.013*"one" + 0.011*"song" + 0.011*"cd"')
(6, '0.015*"book" + 0.013*"stori" + 0.008*"illustr" + 0.008*"version"')
