# Step 1: Load the dataset


In [3]:
import pandas as pd
import numpy as np
import re

# Read the review export; rows are terminated by "\n" only.
reviews_raw = pd.read_csv("data/Volvo_edmunds_10yrs.csv", lineterminator='\n')
# Keep every column except the first one.
df = reviews_raw.iloc[:, 1:]

In [4]:
# Load the regular expression library
import re

# Remove basic punctuation (commas, periods, exclamation and question marks).
# The pattern is a raw string: inside a character class "." needs no escaping,
# and the former '\.' in a normal string literal was an invalid escape sequence.
punctuation_pattern = re.compile(r'[,.!?]')
df['review_clean'] = df['Review'].map(lambda x: punctuation_pattern.sub('', x))
# Convert the reviews to lowercase
df['review_clean'] = df['review_clean'].map(lambda x: x.lower())
# Show the first rows of the cleaned reviews
df['review_clean'].head()

0    in evaluating a car first you define the missi...
1    i thought i loved my 2007 mini s convertible b...
2    have had my 2011 volvo c30 for a little over a...
3    i've read plenty of reviews stating how this c...
4    i test drove several cars to include the volks...
Name: review_clean, dtype: object

# Step 2: Data Preprocessing


In [5]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
# Seed NumPy's global random number generator; intended to make the later
# stochastic steps repeatable across runs.
np.random.seed(400)


In [6]:
import nltk
import pandas as pd

# Fetch the WordNet corpus (needed by WordNetLemmatizer) if it is not present yet.
nltk.download('wordnet')
# Shared English Snowball stemmer used by lemmatize_stemming().
stemmer = SnowballStemmer("english")

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/sixumeng/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [7]:
'''
Write a function to perform the pre processing steps on the entire dataset
'''
def lemmatize_stemming(text):
    """Reduce a single token to its stem.

    The token is first lemmatized as a verb with WordNet; the resulting lemma
    is then stemmed with the module-level ``stemmer``.

    Args:
        text: The token to normalize.

    Returns:
        The stemmed form of the verb lemma of ``text``.
    """
    verb_lemma = WordNetLemmatizer().lemmatize(text, pos='v')
    return stemmer.stem(verb_lemma)

# Tokenize and lemmatize
def preprocess(text):
    """Convert raw text into a list of normalized tokens.

    The text is tokenized with gensim's ``simple_preprocess``. Tokens that are
    gensim stopwords or have three characters or fewer are dropped, and every
    remaining token is passed through ``lemmatize_stemming``.

    Args:
        text: The document to tokenize.

    Returns:
        A list of stemmed tokens, in their original order.
    """
    stopwords = gensim.parsing.preprocessing.STOPWORDS
    return [
        lemmatize_stemming(word)
        for word in gensim.utils.simple_preprocess(text)
        if len(word) > 3 and word not in stopwords
    ]

In [8]:
'''
Preprocess all the reviews we have
'''
# Tokenize every cleaned review; the result keeps the row order of the DataFrame.
processed_docs = [preprocess(review) for review in df["review_clean"]]

In [9]:
'''
Create a dictionary from 'processed_docs' that maps each unique token to an integer id
'''
# Build the vocabulary (token <-> integer id mapping) from the tokenized reviews.
dictionary = gensim.corpora.Dictionary(processed_docs)


In [10]:

'''
Checking dictionary created
'''
count = 0
# Print the first eleven (id, token) pairs as a quick sanity check of the vocabulary.
for count, (token_id, token) in enumerate(dictionary.iteritems(), start=1):
    print(token_id, token)
    if count > 10:
        break

0 aesthet
1 agil
2 appeal
3 bliss
4 cabin
5 coddl
6 comfort
7 corvett
8 defin
9 design
10 distinct


# Step 3: Bag of words on the dataset


In [11]:
'''
Remove very rare and very common words:
- words appearing in fewer than 15 documents
- words appearing in more than 10% of all documents
Of the words that remain, keep at most the 100000 most frequent ones
'''
# Prune the vocabulary in place: no_below is a minimum document count, no_above a
# maximum fraction of documents, and keep_n caps the size of the surviving vocabulary.
dictionary.filter_extremes(no_below=15, no_above=0.1, keep_n= 100000)

In [12]:
'''
Create the Bag-of-words model for each document, i.e. for each document we create a list of (token id, count)
pairs reporting which dictionary words appear in it and how many times. Save this to 'bow_corpus'
'''
# Convert each tokenized review into its sparse bag-of-words representation.
bow_corpus = list(map(dictionary.doc2bow, processed_docs))

In [13]:
'''
Preview BOW for our sample preprocessed document
'''
document_num = 20
bow_doc_x = bow_corpus[document_num]

# Each bag-of-words entry is a (token id, count) pair; look the token text up by id.
for token_id, occurrences in bow_doc_x:
    message = "Word {} (\"{}\") appears {} time.".format(
        token_id, dictionary[token_id], occurrences
    )
    print(message)

Word 21 ("passeng") appears 1 time.
Word 39 ("style") appears 1 time.
Word 53 ("uncomfort") appears 1 time.
Word 85 ("daili") appears 1 time.
Word 109 ("trade") appears 1 time.
Word 197 ("choic") appears 1 time.
Word 198 ("concern") appears 1 time.
Word 199 ("lower") appears 1 time.
Word 200 ("profil") appears 1 time.
Word 201 ("run") appears 1 time.
Word 202 ("stiff") appears 1 time.


# Step 4: Running LDA using Bag of Words


In [14]:
# LDA mono-core -- fallback code in case LdaMulticore throws an error on your machine
# lda_model = gensim.models.LdaModel(bow_corpus, 
#                                    num_topics = 10, 
#                                    id2word = dictionary,                                    
#                                    passes = 50)

# LDA multicore 
'''
Train your lda model using gensim.models.LdaMulticore and save it to 'lda_model'
'''
# random_state is passed explicitly so that the model's randomness no longer
# depends on the global NumPy seed set in an earlier cell, or on how much of
# that global random stream was consumed before this cell ran.
# NOTE: because training is spread over worker processes, results may still
# differ slightly between runs.
lda_model = gensim.models.LdaMulticore(bow_corpus,
                                       num_topics=8,      # number of topics to extract
                                       id2word=dictionary,
                                       passes=10,         # full passes over the corpus
                                       workers=2,         # worker processes for training
                                       random_state=400)

In [15]:
'''
For each topic, we will explore the words occurring in that topic and their relative weights
'''
# num_topics=-1 asks gensim for every topic of the model rather than a subset.
for topic_id, weighted_terms in lda_model.print_topics(num_topics=-1):
    summary = "Topic: {} \nWords: {}".format(topic_id, weighted_terms)
    print(summary)
    print("\n")

Topic: 0 
Words: 0.012*"screen" + 0.010*"technolog" + 0.008*"mode" + 0.008*"averag" + 0.008*"inscript" + 0.008*"tell" + 0.008*"impress" + 0.008*"requir" + 0.007*"function" + 0.007*"disappoint"


Topic: 1 
Words: 0.018*"repair" + 0.017*"dealership" + 0.013*"warranti" + 0.013*"shop" + 0.012*"tell" + 0.012*"say" + 0.012*"batteri" + 0.009*"softwar" + 0.009*"cost" + 0.009*"day"


Topic: 2 
Words: 0.012*"mode" + 0.010*"assist" + 0.010*"gear" + 0.010*"right" + 0.010*"transmiss" + 0.009*"shift" + 0.008*"door" + 0.008*"park" + 0.008*"say" + 0.007*"speed"


Topic: 3 
Words: 0.012*"easi" + 0.012*"fuel" + 0.012*"nois" + 0.010*"economi" + 0.009*"sound" + 0.009*"wish" + 0.009*"leather" + 0.008*"infotain" + 0.008*"speed" + 0.008*"traffic"


Topic: 4 
Words: 0.011*"experi" + 0.010*"amaz" + 0.010*"wagon" + 0.008*"screen" + 0.007*"perfect" + 0.007*"enjoy" + 0.007*"space" + 0.007*"easi" + 0.007*"safe" + 0.007*"charg"


Topic: 5 
Words: 0.017*"smooth" + 0.013*"suspens" + 0.013*"expect" + 0.012*"quiet" + 0

In [21]:
import pyLDAvis
# Newer pyLDAvis releases (3.3+) renamed the gensim adapter module from
# `pyLDAvis.gensim` to `pyLDAvis.gensim_models`. Try the new name first and
# fall back to the old one so the cell runs on either version.
try:
    import pyLDAvis.gensim_models as gensimvis
except ImportError:
    import pyLDAvis.gensim as gensimvis

# sort_topics=False keeps pyLDAvis from reordering the topics, so topic numbers
# in the visualisation stay aligned with the model's own topic ids.
lda_display = gensimvis.prepare(lda_model, bow_corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)

  and should_run_async(code)
