In [13]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from textblob import TextBlob
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import LatentDirichletAllocation

import string
import nltk
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords,wordnet

nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/ryankirkland/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [3]:
df = pd.read_csv('../data/total_reviews.csv')
cleaned = pd.read_csv('../data/cleaned_reviews.csv')

In [4]:
df = df.drop(['Unnamed: 0', 'Unnamed: 0.1'], axis=1)

In [5]:
df['asin'].unique()

array(['B08267BBJT', 'B08268F6XN', 'B08267X3LH', 'B079JFK22D',
       'B07QW531W2', 'B07TJTQDYG', 'B085HB8QVX', 'B07R2KK5P7',
       'B07Y475MD3', 'B07F27PK2M', 'B085FZFVQV', 'B072R2SWXX',
       'B0828KRQZ3', 'B0821ZNWKW', 'B07QV15B3W', 'B07MWYYDTM',
       'B07RSJMS76', 'B07D1LMMDD', 'B0855TM65T', 'B07NTXYFBV',
       'B086L3Q8YX', 'B085FT4YR1', 'B085XT3GTW', 'B07P9XZPYG',
       'B0824WB5ST', 'B07HQ7QV7W', 'B083ZMYF55', 'B07FQD7PZ5',
       'B07TWDR7VJ', 'B085DV7VZK', 'B086GTFGPP', 'B07Q6PZ2F4',
       'B082W54KQK'], dtype=object)

In [17]:
def get_wordnet_pos(pos_tag):

    if pos_tag.startswith('J'):
        return wordnet.ADJ 

    elif pos_tag.startswith('V'):
        return wordnet.VERB 

    elif pos_tag.startswith('N'):
        return wordnet.NOUN

    elif pos_tag.startswith('R'):
        return wordnet.ADV
    
    else:
        return wordnet.NOUN # default, return wordnet tag "NOUN"

#Create a function to lemmatize tokens in the reviews
def lemmatized_tokens(text):
        text = text.lower()
        pattern = r'\b[a-zA-Z]{3,}\b'                 
        tokens = nltk.regexp_tokenize(text, pattern) # tokenize the text
        tagged_tokens = nltk.pos_tag(tokens)  # a list of tuples (word, pos_tag)
          
        stop_words = stopwords.words('english')
        new_stopwords = ['battery']  #customize extra stop_words
        stop_words.extend(new_stopwords)
        stop_words = set(stop_words)
        
        wordnet_lemmatizer = WordNetLemmatizer()
        # get lemmatized tokens                             #call function "get_wordnet_pos"
        lemmatized_words=[wordnet_lemmatizer.lemmatize(word, get_wordnet_pos(tag)) 
                  # tagged_tokens is a list of tuples (word, tag)
                  for (word, tag) in tagged_tokens \
                  # remove stop words
                  if word not in stop_words and \
                  # remove punctuations
                  word not in string.punctuation]

        return lemmatized_words

In [8]:
cleaned = cleaned.drop('Unnamed: 0', axis=1)

In [23]:
cleaned.head()

Unnamed: 0,asin,product,date,verified,title,desc,reviewer_name,rating,month,year,month_year,title_desc
0,B08267BBJT,AAA,2020-08-11,Verified Purchase,"Didn't work, then worked, now don't work again",All I got in terms of use out of these batter...,Jasmine Carroll,1.0,8,2020,2020-08,"Didn't work, then worked, now don't work again..."
1,B08267BBJT,AAA,2020-07-30,Verified Purchase,These absolutely suck,I bought these for a wall mounted magnifying ...,Ashlee M.,1.0,7,2020,2020-07,These absolutely suck I bought these for a wal...
2,B08268F6XN,AA,2020-07-19,Verified Purchase,longer lasting battery for remote controller!!,i like the constant voltage and hopefully it ...,ARCHANGEL TROY,5.0,7,2020,2020-07,longer lasting battery for remote controller!!...
3,B08267BBJT,AAA,2020-07-18,Verified Purchase,Minimal plastic in packaging.,"Just received these today, but I’m reviewing ...",ira,5.0,7,2020,2020-07,Minimal plastic in packaging. Just received th...
4,B08267BBJT,AAA,2020-07-17,Verified Purchase,Not long enough battery life for a night hike,Shuts off suddenly in headlamp,T,3.0,7,2020,2020-07,Not long enough battery life for a night hike ...


In [11]:
test_txt = cleaned.loc[0, 'title_desc']

In [18]:
lemmed_test = lemmatized_tokens(test_txt)

In [19]:
lemmed_test

['work',
 'work',
 'work',
 'get',
 'term',
 'use',
 'battery',
 'three',
 'day',
 'use',
 'two',
 'additional',
 'success',
 'buy',
 'bleed',
 'aaa',
 'battery',
 'hop',
 'compact',
 'design',
 'would',
 'better',
 'something',
 'bulky',
 'right',
 'box',
 'charge',
 'light',
 'green',
 'indicate',
 'fully',
 'charge',
 'try',
 'use',
 'couple',
 'device',
 'luck',
 'go',
 'return',
 'friend',
 'suggest',
 'switch',
 'charge',
 'extension',
 'cord',
 'directly',
 'wall',
 'socket',
 'think',
 'trick',
 'even',
 'though',
 'thought',
 'silly',
 'try',
 'battery',
 'device',
 'let',
 'charge',
 'overnight',
 'plug',
 'directly',
 'wall',
 'socket',
 'work',
 'work',
 'well',
 'three',
 'day',
 'later',
 'device',
 'stop',
 'work',
 'middle',
 'high',
 'power',
 'usage',
 'hair',
 'trimmer',
 'swap',
 'battery',
 'two',
 'charge',
 'entire',
 'time',
 'work',
 'go',
 'buy',
 'regular',
 'aaa',
 'battery',
 'device',
 'go',
 'back',
 'work',
 'fine',
 'try',
 'battery',
 'device',
 'work'

In [25]:
#Create a function to build the optimal LDA model
def optimal_lda_model(df, review_colname):
    '''
    INPUTS:
        df_review - dataframe that contains the reviews
        review_colname: name of column that contains reviews
        
    OUTPUTS:
        lda_tfidf - Latent Dirichlet Allocation (LDA) model
        dtm_tfidf - document-term matrix in the tfidf format
        tfidf_vectorizer - word frequency in the reviews
        A graph comparing LDA Model Performance Scores with different params
    '''
    docs_raw = df[review_colname].tolist()

    #************   Step 1: Convert to document-term matrix   ************#

    #Transform text to vector form using the vectorizer object 
    tf_vectorizer = CountVectorizer(strip_accents = 'unicode',
                                    stop_words = 'english',
                                    lowercase = True,
                                    token_pattern = r'\b[a-zA-Z]{3,}\b', # num chars > 3 to avoid some meaningless words
                                    max_df = 0.9,                        # discard words that appear in > 90% of the reviews
                                    min_df = 10)                         # discard words that appear in < 10 reviews    

    #apply transformation
    tfidf_vectorizer = TfidfVectorizer(**tf_vectorizer.get_params())

    #convert to document-term matrix
    dtm_tfidf = tfidf_vectorizer.fit_transform(docs_raw)  

    print("The shape of the tfidf is {}, meaning that there are {} {} and {} tokens made through the filtering process.".\
              format(dtm_tfidf.shape,dtm_tfidf.shape[0], review_colname, dtm_tfidf.shape[1]))

    
    #*******   Step 2: GridSearch & parameter tuning to find the optimal LDA model   *******#

    # Define Search Param
    search_params = {'n_components': [5, 10, 15, 20, 25, 30], 
                     'learning_decay': [.5, .7, .9]}

    # Init the Model
    lda = LatentDirichletAllocation()

    # Init Grid Search Class
    model = GridSearchCV(lda, param_grid=search_params)

    # Do the Grid Search
    model.fit(dtm_tfidf)


    #*****  Step 3: Output the optimal lda model and its parameters  *****#

    # Best Model
    best_lda_model = model.best_estimator_

    # Model Parameters
    print("Best Model's Params: ", model.best_params_)

    # Log Likelihood Score: Higher the better
    print("Model Log Likelihood Score: ", model.best_score_)

    # Perplexity: Lower the better. Perplexity = exp(-1. * log-likelihood per word)
    print("Model Perplexity: ", best_lda_model.perplexity(dtm_tfidf))


    #***********   Step 4: Compare LDA Model Performance Scores   ***********#

    #Get Log Likelyhoods from Grid Search Output
    gscore=model.fit(dtm_tfidf).cv_results_
    n_topics = [5, 10, 15, 20, 25, 30]

    log_likelyhoods_5 = [gscore['mean_test_score'][gscore['params'].index(v)] for v in gscore['params'] if v['learning_decay']==0.5]
    log_likelyhoods_7 = [gscore['mean_test_score'][gscore['params'].index(v)] for v in gscore['params'] if v['learning_decay']==0.7]
    log_likelyhoods_9 = [gscore['mean_test_score'][gscore['params'].index(v)] for v in gscore['params'] if v['learning_decay']==0.9]

    # Show graph
    plt.figure(figsize=(12, 8))
    plt.plot(n_topics, log_likelyhoods_5, label='0.5')
    plt.plot(n_topics, log_likelyhoods_7, label='0.7')
    plt.plot(n_topics, log_likelyhoods_9, label='0.9')
    plt.title("Choosing Optimal LDA Model")
    plt.xlabel("Num Topics")
    plt.ylabel("Log Likelyhood Scores")
    plt.legend(title='Learning decay', loc='best')
    plt.show()
    
    return gscore, best_lda_model, dtm_tfidf, tfidf_vectorizer

In [None]:
gscore, best_lda_model, dtm_tfidf, tfidf_vectorizer = optimal_lda_model(cleaned, 'title_desc')



The shape of the tfidf is (4745, 1120), meaning that there are 4745 title_desc and 1120 tokens made through the filtering process.
Best Model's Params:  {'learning_decay': 0.5, 'n_components': 5}
Model Log Likelihood Score:  -25759.51499442464
Model Perplexity:  1279.1050702123528
