In [3]:
import pandas as pd
import numpy as np

# plotting
import matplotlib.pyplot as plt

import warnings
warnings.simplefilter("ignore", DeprecationWarning)

import re

from sklearn.decomposition import LatentDirichletAllocation as LDA
# Tf-Idf and Clustering packages
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

import string
# NLTK functions
import nltk
from nltk.corpus import stopwords
from nltk import tokenize as tok
from nltk.stem.snowball import SnowballStemmer # load nltk's SnowballStemmer as variabled 'stemmer'

In [4]:
# Read the scraped Volvo reviews, splitting rows on '\n' only, then discard the
# first column by position.
# NOTE(review): the df.head() output further down lists the last column as
# 'Rating\r', which suggests the file uses CRLF line endings -- confirm.
raw_reviews = pd.read_csv("data/Scraped_Car_Review_volvo.csv", lineterminator='\n')
df = raw_reviews.iloc[:, 1:]

In [5]:
# Peek at the raw text of the first review (row label 0).
df.loc[0, "Review"]

&#39; We own both the XC90 and S60and have been frustrated with both vehicles.Both have seen more than their fair share of the service department and the lack of concern from Volvo has been discouraging.In two years of the S60, we have had to replace the ignition switches twice. The emergency cable never really engaged properly so it had to be adjusted as well as the sunroof at a cost. A faulty ABS contact reeel and front axle shaft had to be replaced due to recalls. The headlight washer fell off and had to be replaced twice. As if we didn\x92t feel nickeled and dimed already, we now have to replace the Fire Trap Housing not covered on the warranty. We are very disappointed.&#39;

In [6]:
# Patterns used to strip tweet-style noise from the review text.
# All three are compiled with the same flags. Under re.VERBOSE the regex
# engine ignores unescaped whitespace outside character classes, so the
# literal space after '://' in the URL pattern does not have to be matched.
_REGEX_FLAGS = re.VERBOSE | re.IGNORECASE

# http:// or https:// followed by a run of URL characters / percent-escapes.
isURL = re.compile(
    r'http[s]?:// (?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+',
    _REGEX_FLAGS,
)
# Leading retweet marker such as "RT @user:".
isRTusername = re.compile(r'^RT+[\s]+(@[\w_]+:)', _REGEX_FLAGS)
# Any @mention.
isEntity = re.compile(r'@[\w_]+', _REGEX_FLAGS)

In [7]:
# Helper functions
def print_topics(model, count_vectorizer, n_top_words):
    """Print the highest-weighted words of every topic in a fitted model.

    Parameters
    ----------
    model : object exposing ``components_``
        Iterable of per-topic weight arrays (one weight per vocabulary term).
    count_vectorizer : fitted vectorizer
        Supplies the vocabulary via ``get_feature_names_out()`` or, on older
        scikit-learn releases, ``get_feature_names()``.
    n_top_words : int
        Number of words to print per topic, heaviest first.
    """
    # scikit-learn 1.0 introduced get_feature_names_out() and 1.2 removed
    # get_feature_names(); prefer the new API and fall back to the old one.
    if hasattr(count_vectorizer, "get_feature_names_out"):
        words = count_vectorizer.get_feature_names_out()
    else:
        words = count_vectorizer.get_feature_names()
    for topic_idx, topic in enumerate(model.components_):
        print("\nTopic #%d:" % topic_idx)
        # argsort is ascending, so walk it backwards for the top weights.
        top_indices = topic.argsort()[:-n_top_words - 1:-1]
        print(" ".join(str(words[i]) for i in top_indices))

# Show top n keywords for each topic
def show_topics(vectorizer, lda_model, n_words=20):
    """Return the top ``n_words`` vocabulary terms of every topic.

    Parameters
    ----------
    vectorizer : fitted vectorizer
        Supplies the vocabulary via ``get_feature_names_out()`` or, on older
        scikit-learn releases, ``get_feature_names()``.
    lda_model : object exposing ``components_``
        Iterable of per-topic weight arrays (one weight per vocabulary term).
    n_words : int, default 20
        Number of keywords to keep per topic.

    Returns
    -------
    list of numpy.ndarray
        One array of keywords per topic, ordered by descending weight.
    """
    # scikit-learn 1.0 introduced get_feature_names_out() and 1.2 removed
    # get_feature_names(); prefer the new API and fall back to the old one.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()
    keywords = np.asarray(feature_names)
    # Negating the weights turns the ascending argsort into a descending one.
    return [
        keywords.take((-topic_weights).argsort()[:n_words])
        for topic_weights in lda_model.components_
    ]
     
        
def clean_tweet(row):
    """Strip URLs, a leading retweet marker and @mentions from ``row``.

    The module-level patterns are applied in a fixed order (URL, retweet
    prefix, then mentions); every match is replaced with the empty string
    and the resulting text is returned.
    """
    for pattern in (isURL, isRTusername, isEntity):
        row = pattern.sub("", row)
    return row

def tokenize_only(text):
    """Split ``text`` into lower-cased word tokens that contain a letter.

    The text is tokenized sentence by sentence so that punctuation ends up
    as separate tokens; tokens without any ASCII letter (numbers, bare
    punctuation) are then dropped.
    """
    has_letter = re.compile('[a-zA-Z]').search
    lowered = (
        word.lower()
        for sentence in tok.sent_tokenize(text)
        for word in tok.word_tokenize(sentence)
    )
    return [token for token in lowered if has_letter(token)]

In [8]:
# Strip URLs, retweet prefixes and @mentions from every review.
df["review_clean"] = df['Review'].apply(clean_tweet)
# Alternation of every ASCII punctuation character, each one regex-escaped.
RE_PUNCTUATION = '|'.join([re.escape(x) for x in string.punctuation])
# regex=True must be explicit: since pandas 2.0 Series.str.replace defaults to
# regex=False, which would search for the whole alternation as literal text
# and leave the punctuation in place.
df['review_clean'] = df['review_clean'].str.replace(RE_PUNCTUATION, "", regex=True)
df.head()

Unnamed: 0,Review_Date,Author_Name,Vehicle_Title,Review_Title,Review,Rating\r,review_clean
0,on 05/16/12 11:58 AM (PDT),2xvolvoowner,2008 Volvo S60 Sedan 2.5T 4dr Sedan (2.5L 5cyl...,Double trouble,We own both the XC90 and S60and have been fru...,3.125,We own both the XC90 and S60and have been fru...
1,on 08/29/10 08:28 AM (PDT),aikiman,2008 Volvo S60 Sedan 2.5T 4dr Sedan (2.5L 5cyl...,First Volvo,"This is my first Volvo, had an Acura and Maxi...",4.625,This is my first Volvo had an Acura and Maxim...
2,on 05/09/09 11:51 AM (PDT),Central Florida,2008 Volvo S60 Sedan 2.5T 4dr Sedan (2.5L 5cyl...,My first Volvo,I've had my Volvo 10 months and love it! I tr...,5.0,Ive had my Volvo 10 months and love it I trad...
3,on 11/21/08 18:11 PM (PST),lordway,2008 Volvo S60 Sedan 2.5T 4dr Sedan (2.5L 5cyl...,Relability Stinks,I liked this car a lot when I first bought it...,1.875,I liked this car a lot when I first bought it...
4,on 09/27/08 22:30 PM (PDT),Stravage,2008 Volvo S60 Sedan 2.5T 4dr Sedan (2.5L 5cyl...,1st Impression,In the past 2 years I have gone from a jeep g...,3.875,In the past 2 years I have gone from a jeep g...


In [16]:

number_topics = 5
number_words = 5
# Fixed seed so the fitted topics (and everything derived from them) are the
# same on every run; LDA fitting is otherwise randomly initialised.
RANDOM_STATE = 42

corpus = df['review_clean'].tolist()
# Use tf (raw term count) features for LDA.
tf_vectorizer = CountVectorizer(max_df=0.9, min_df=0.00, stop_words="english", tokenizer=tokenize_only)
tf = tf_vectorizer.fit_transform(corpus)

# Create and fit the LDA model
model = LDA(n_components=number_topics, n_jobs=-1, random_state=RANDOM_STATE)
id_topic = model.fit(tf)

# Print the topics found by the LDA model
print("Topics found via LDA:")
topic_keywords = show_topics(vectorizer=tf_vectorizer, lda_model=model, n_words=number_words)

# Topic - Keywords Dataframe: one row per topic, one column per keyword rank
df_topic_keywords = pd.DataFrame(topic_keywords)
df_topic_keywords.columns = ['Word '+str(i) for i in range(df_topic_keywords.shape[1])]
df_topic_keywords.index = ['Topic '+str(i) for i in range(df_topic_keywords.shape[0])]
df_topic_keywords = df_topic_keywords.reset_index()
# Recover the integer topic id from the "Topic <n>" label.
df_topic_keywords['topic_index'] = (
    df_topic_keywords['index'].str.split(' ', n=1, expand=True)[1].astype('int')
)
print(df_topic_keywords)

############ get the dominant topic for each document in a data frame ###############
# Create Document - Topic Matrix
lda_output = model.transform(tf)
# column names
topicnames = ["Topic" + str(i) for i in range(model.n_components)]
# index names
docnames = ["Doc" + str(i) for i in range(len(corpus))]

# Make the pandas dataframe (probabilities rounded for display only)
df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topicnames, index=docnames)
# Get dominant topic for each document. Use the unrounded probabilities:
# rounding to 2 decimals can create ties that argmax would break arbitrarily.
dominant_topic = np.argmax(lda_output, axis=1)
df_document_topic['dominant_topic'] = dominant_topic
df_document_topic = df_document_topic.reset_index()

df_document_topic

Topics found via LDA:
     index   Word 0  Word 1   Word 2    Word 3       Word 4  topic_index
0  Topic 0      car   volvo    miles  problems       dealer            0
1  Topic 1      car   volvo    great     drive         like            1
2  Topic 2  mileage     gas  vehicle     volvo        miles            2
3  Topic 3   safety  better    volvo  features     interior            3
4  Topic 4      car   great    volvo     drive  comfortable            4


Unnamed: 0,index,Topic0,Topic1,Topic2,Topic3,Topic4,dominant_topic
0,Doc0,0.75,0.00,0.00,0.23,0.00,0
1,Doc1,0.48,0.25,0.00,0.00,0.26,0
2,Doc2,0.01,0.96,0.01,0.01,0.01,1
3,Doc3,0.51,0.00,0.00,0.00,0.48,0
4,Doc4,0.00,0.99,0.00,0.00,0.00,1
...,...,...,...,...,...,...,...
4842,Doc4842,0.07,0.92,0.00,0.00,0.00,1
4843,Doc4843,0.00,0.99,0.00,0.00,0.00,1
4844,Doc4844,0.00,0.99,0.00,0.00,0.00,1
4845,Doc4845,0.18,0.62,0.01,0.19,0.01,1
