In [26]:
import pandas as pd
import numpy as np
import string
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
from nltk.stem.wordnet import WordNetLemmatizer
from collections import Counter
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer

# Shared lemmatizer instance used by lemmatize() below.
wordnet = WordNetLemmatizer()

# Fetch every NLTK resource the helpers in this cell rely on, not just the
# stop-word list, so the notebook also runs on a machine with an empty
# nltk_data directory:
#   'stopwords' -> rm_stop_words / get_popular_words
#   'punkt'     -> word_tokenize (used by tokenize)
#   'wordnet'   -> WordNetLemmatizer (used by lemmatize)
# NOTE(review): newer NLTK releases may ask for 'punkt_tab' instead of
# 'punkt' -- confirm against the installed NLTK version.
for _nltk_resource in ('stopwords', 'punkt', 'wordnet'):
    nltk.download(_nltk_resource)

def remove_punc(my_string:str) -> str:
    '''Return my_string with every ASCII punctuation character removed.

    Parameters
    ----------
    my_string: str to clean

    Returns
    -------
    Copy of my_string without any character listed in string.punctuation.
    Characters are deleted, not replaced by a space, so "bulky.Right"
    becomes "bulkyRight". Non-ASCII punctuation (e.g. the curly apostrophe)
    is not part of string.punctuation and is left untouched.
    '''
    # A translation table deletes the characters literally. The previous
    # regex character class built from string.punctuation was unescaped, so
    # the sequence `\]` was read as an escaped bracket and backslashes
    # were silently kept.
    return my_string.translate(str.maketrans('', '', string.punctuation))

def tokenize(str):
    '''
    Split a string into tokens with NLTK's word_tokenize.

    Parameters
    ----------
    str: text to split (note: the parameter name shadows the builtin `str`
         inside this function)

    Returns
    -------
    A new list holding the tokens in their original order.
    '''
    return list(word_tokenize(str))

def lemmatize(doc):
    '''Lemmatize every token of a document.

    Parameters
    ----------
    doc: list of tokens

    Returns
    -------
    List with one lemma per input token, in the same order, produced by the
    module-level WordNetLemmatizer instance `wordnet`.
    '''
    lemmas = []
    for token in doc:
        lemmas.append(wordnet.lemmatize(token))
    return lemmas

def rm_stop_words(doc, stops=set(stopwords.words('english'))):
    '''Drop stop words from a tokenized document.

    Parameters
    ----------
    doc: list of tokens
    stops: collection of tokens to discard. The default (NLTK's English
           stop-word list as a set) is built once, when this function is
           defined, and is only read here, never modified.

    Returns
    -------
    List of the tokens of doc that are not in stops, original order kept.
    The comparison is exact (case-sensitive).
    '''
    kept = []
    for token in doc:
        if token in stops:
            continue
        kept.append(token)
    return kept

def n_grams(input_tokens, max_n=2):
    '''Return the tokens followed by their hyphen-joined n-grams.

    Parameters
    ----------
    input_tokens: iterable of str tokens
    max_n: largest n-gram size to append (default 2, i.e. bigrams only,
           which is what this function always produced before)

    Returns
    -------
    New list: all original tokens (1-grams) first, then for each n from 2 to
    max_n every n-gram joined with '-', e.g. ['great', 'battery',
    'great-battery']. With max_n < 2 only the tokens are returned.
    '''
    # Materialize once so a generator / iterator input is not exhausted by
    # the 1-gram copy before the n-grams are built.
    tokens = list(input_tokens)

    # retain 1-gram tokens
    ret_list = list(tokens)

    for n in range(2, max_n + 1):
        # add each n-gram to the list
        ret_list.extend('-'.join(gram) for gram in ngrams(tokens, n))
    return ret_list
    
    
def preprocess_corpus(content, strip_punc=True, lemma=True, drop_stops=True, add_ngrams=True):
    '''
    Turn a collection of raw strings into lists of cleaned tokens.

    Each document is lowercased and tokenized; the remaining steps are
    optional and run in this fixed order when enabled:
    punctuation removal -> tokenization -> lemmatization ->
    stop-word removal -> n-gram expansion.

    NOTE(review): lemmatization runs before stop-word removal, so inflected
    stop words can slip through (the notebook output shows 'wa', presumably
    from "was"). Order kept as-is to preserve existing results.

    Parameters
    ----------
    content: iterable of str (list, pandas Series, ...); one document each
    strip_punc (bool): remove ASCII punctuation before tokenizing
    lemma (bool): lemmatize the tokens
    drop_stops (bool): remove English stop words
    add_ngrams (bool): append hyphen-joined bigrams after the tokens

    Returns
    -------
    A list of lists: each list contains a tokenized version of the original string
    '''
    preprocessed = []
    # Iterate over the values directly. Positional `content[i]` is a
    # label lookup on a pandas Series and raises KeyError as soon as the
    # Series no longer has a 0..n-1 index (e.g. after filtering rows).
    for raw_text in content:
        text = raw_text.lower()
        if strip_punc:
            text = remove_punc(text)
        tokens = tokenize(text)
        if lemma:
            tokens = lemmatize(tokens)
        if drop_stops:
            tokens = rm_stop_words(tokens)
        if add_ngrams:
            tokens = n_grams(tokens)
        preprocessed.append(tokens)
    return preprocessed

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ryankirkland/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [27]:
# Load the review data; later cells read its 'title_desc' and 'rating' columns.
df = pd.read_csv('cleaned_reviews.csv')

### Pass the combined review titles and descriptions through the preprocessing functions, which lowercase the text and remove punctuation, tokenize the words, lemmatize them, remove English stop words, append hyphen-joined bigrams, and return a list of cleaned tokens for each review

In [28]:
# Run the whole preprocessing pipeline on every 'title_desc' entry, then
# display the token list of the first review as a spot check.
cleaned_desc = preprocess_corpus(df['title_desc'])
cleaned_desc[0]

['didnt',
 'work',
 'worked',
 'dont',
 'work',
 'got',
 'term',
 'use',
 'battery',
 'three',
 'day',
 'use',
 'two',
 'additional',
 'success',
 'bought',
 'wa',
 'bleeding',
 'aaa',
 'battery',
 'wa',
 'hoping',
 'compact',
 'design',
 'would',
 'better',
 'something',
 'bulkyright',
 'box',
 'charged',
 'light',
 'wa',
 'green',
 'indicating',
 'fully',
 'charged',
 'tried',
 'use',
 'couple',
 'device',
 'didnt',
 'luck',
 'wa',
 'going',
 'return',
 'friend',
 'suggested',
 'switch',
 'charging',
 'extension',
 'cord',
 'directly',
 'wall',
 'socket',
 'thought',
 'trick',
 'even',
 'though',
 'thought',
 'wa',
 'silly',
 'tried',
 'battery',
 'device',
 'letting',
 'charge',
 'overnight',
 'plugged',
 'directly',
 'wall',
 'socket',
 'work',
 'worked',
 'wellabout',
 'three',
 'day',
 'later',
 'device',
 'stopped',
 'working',
 'middle',
 'high',
 'powered',
 'usage',
 'hair',
 'trimmer',
 'swapped',
 'battery',
 'two',
 'charging',
 'entire',
 'time',
 'didnt',
 'work',
 'went

In [29]:
# Re-join each review's tokens into a single space-separated string, the
# form handed to TfidfVectorizer.fit_transform further down.
str_desc = [" ".join(x) for x in cleaned_desc]

In [30]:
str_desc

['didnt work worked dont work got term use battery three day use two additional success bought wa bleeding aaa battery wa hoping compact design would better something bulkyright box charged light wa green indicating fully charged tried use couple device didnt luck wa going return friend suggested switch charging extension cord directly wall socket thought trick even though thought wa silly tried battery device letting charge overnight plugged directly wall socket work worked wellabout three day later device stopped working middle high powered usage hair trimmer swapped battery two charging entire time didnt work went bought regular aaa battery device went back working fine tried battery device arent working seems work enough convince work stop workingi dont often leave feedback product felt wa important say would recommend anyone buy battery totally faulty least completely inconsistent youd better buying disposable didnt-work work-worked worked-dont dont-work work-got got-term term-use

In [31]:
# Keep the cleaned text next to the original columns: it is used as the row
# label of the NMF weight matrices and is written out with the frame.
df['str_desc'] = str_desc

In [32]:
# Remove the leftover 'Unnamed: 0' column (presumably an index saved by an
# earlier to_csv call -- verify against the script that wrote the CSV).
# errors='ignore' makes the cell safe to re-run: without it a second
# execution raises KeyError because the column is already gone.
df = df.drop(columns='Unnamed: 0', errors='ignore')

In [33]:
# Fit a TF-IDF model with default settings on the cleaned review strings.
# NOTE(review): the default token_pattern only keeps runs of 2+ word
# characters, so the hyphen-joined bigrams ("great-battery") are split back
# into their parts and one-character tokens are dropped -- confirm this is
# intended.
tfidfvect = TfidfVectorizer()
tfidf_vectorized = tfidfvect.fit_transform(str_desc)
# Display only: converts the sparse result into a full dense array.
tfidf_vectorized.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [41]:
# 30 latent topics; random_state is fixed so repeated runs give the same
# factorization.
nmf = NMF(n_components=30, random_state=1)

In [42]:
# W: document-topic weights (one row per review, one column per topic).
W = nmf.fit_transform(tfidf_vectorized)
# H: topic-term weights (one row per topic, one column per TF-IDF term).
# Not referenced again in the cells visible here.
H = nmf.components_

In [43]:
# np.asarray() yields the raw weights whether W is still the ndarray
# returned by fit_transform or already the DataFrame built by a previous
# run of this cell. Passing an existing DataFrame together with index=
# would reindex it instead and could fill the table with NaN.
W_values = np.asarray(W)

# One column name per latent topic, derived from the matrix width rather
# than repeating the component count as a literal.
topics = ['latent_topic_{}'.format(i) for i in range(W_values.shape[1])]

# Label every row with the cleaned review text so that sorting by a topic
# directly shows the reviews that load most on it.
W = pd.DataFrame(W_values, index=df.str_desc, columns=topics)
W.sort_values('latent_topic_0', ascending=False)

Unnamed: 0_level_0,latent_topic_0,latent_topic_1,latent_topic_2,latent_topic_3,latent_topic_4,latent_topic_5,latent_topic_6,latent_topic_7,latent_topic_8,latent_topic_9,...,latent_topic_20,latent_topic_21,latent_topic_22,latent_topic_23,latent_topic_24,latent_topic_25,latent_topic_26,latent_topic_27,latent_topic_28,latent_topic_29
str_desc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
battery battery battery-battery,0.171520,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000
battery battery battery battery battery-battery battery-battery battery-battery,0.171520,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000
great battery good battery great-battery battery-good good-battery,0.111844,0.195272,0.149743,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000
great battery battery awesome great-battery battery-battery battery-awesome,0.110167,0.000000,0.124304,0.000000,0.001868,0.002933,0.0,0.000000,0.000000,0.000570,...,0.000000,0.000000,0.004891,0.028716,0.0,0.0,0.009538,0.000000,0.000000,0.000000
rechargeable battery great battery rechargeable-battery battery-great great-battery,0.102325,0.000000,0.218449,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
advertised work great security camera advertised-work work-great great-security security-camera,0.000000,0.000000,0.104740,0.110190,0.000000,0.000486,0.0,0.001438,0.000000,0.000000,...,0.132282,0.001644,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000
last long last long last-long long-last last-long,0.000000,0.000000,0.000000,0.000000,0.304242,0.000000,0.0,0.000000,0.000000,0.000000,...,0.000000,0.053877,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000
made china review state made japan image product web site indicates made japan box indicatesmade china made-china china-review review-state state-made made-japan japan-image image-product product-web web-site site-indicates indicates-made made-japan japan-box box-indicatesmade indicatesmade-china,0.000000,0.000000,0.000000,0.000000,0.000000,0.037669,0.0,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000
come right door ready use come-right right-door door-ready ready-use,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000052,0.013412,0.001585,...,0.002761,0.001686,0.000000,0.001811,0.0,0.0,0.000443,0.006790,0.006829,0.007446


In [79]:
def sort_latent_and_collect_articles(n, articles, w_):
    '''Collect, for each latent topic, the documents that load most on it.

    Parameters
    ----------
    n (int): number of topics; columns 'latent_topic_0' ... 'latent_topic_{n-1}'
             must exist in w_
    articles (int): how many top documents to keep per topic
    w_ (DataFrame): document-topic weights, indexed by the document labels

    Returns
    -------
    dict mapping each topic column name to the list of the first `articles`
    index labels of w_ after sorting that column in descending order.
    w_ itself is not modified.
    '''
    def _top_labels(column):
        # sort_values returns a new frame; only its leading labels are kept
        ranked = w_.sort_values(column, ascending=False)
        return list(ranked.index[:articles])

    topic_columns = [f'latent_topic_{k}' for k in range(n)]
    return {column: _top_labels(column) for column in topic_columns}

def get_popular_words(topics, extra_stop_word=None, top_n=5, verbose=True):
    '''Summarize each latent topic by its most frequent words.

    Parameters
    ----------
    topics (dict): topic name -> list of document strings (as returned by
                   sort_latent_and_collect_articles)
    extra_stop_word: optional extra stop word(s); either a single str or an
                     iterable of str, added to NLTK's English stop words
    top_n (int): number of words to report per topic (default 5)
    verbose (bool): print each topic's full Counter (default True, matching
                    the previous always-print behavior)

    Returns
    -------
    (latent_topics, total_word_count)
    latent_topics: dict topic name -> list of up to top_n most common words,
                   most frequent first
    total_word_count: Counter of the filtered words summed over all topics
    '''
    # Build the stop-word set once; it does not depend on the topic.
    stop_words = set(stopwords.words('english'))
    if extra_stop_word:
        # A bare string would otherwise be added character by character.
        if isinstance(extra_stop_word, str):
            extra_stop_word = [extra_stop_word]
        stop_words.update(extra_stop_word)

    latent_topics = {}
    total_word_count = Counter()
    for key, docs in topics.items():
        words = ' '.join(docs).lower().split()
        word_count = Counter(word for word in words if word not in stop_words)
        if verbose:
            print(word_count)
        total_word_count += word_count
        # most_common(top_n) returns fewer entries when a topic has fewer
        # than top_n distinct words, instead of raising IndexError.
        latent_topics[key] = [word for word, _ in word_count.most_common(top_n)]
    return latent_topics, total_word_count

In [80]:
# For each of the 30 topics keep the labels of the 30 highest-weighted reviews.
top_obs = sort_latent_and_collect_articles(30, 30, W)

In [81]:
# Most frequent words per topic, plus the word counts summed over all
# topics; the call also prints one Counter per topic.
top_words, word_count = get_popular_words(top_obs, )

Counter({'good': 59, 'good-good': 28, 'battery': 3, 'good-battery': 3, 'product': 1, 'good-product': 1, 'battery-good': 1, 'price': 1, 'good-price': 1})
Counter({'great': 60, 'great-great': 15, 'work': 13, 'work-great': 13, 'battery': 11, 'great-battery': 9, 'battery-great': 6, 'great-work': 4, 'value': 4, 'great-value': 4, 'value-great': 2, '👍': 2, 'great-👍': 2, 'love': 1, 'great-love': 1, 'charge': 1, 'great-charge': 1, 'charge-great': 1, 'item': 1, 'great-item': 1, '👍-work': 1, 'value-work': 1, 'battery-work': 1, 'price': 1, 'battery-price': 1, 'good': 1, 'good-great': 1, 'product': 1, 'charger': 1, 'great-product': 1, 'product-great': 1, 'battery-charger': 1})
Counter({'work': 54, 'great': 15, 'work-work': 14, 'work-great': 13, 'well': 8, 'work-well': 8, 'great-work': 5, 'good': 4, 'battery': 3, 'work-good': 3, 'battery-work': 2, 'good-work': 2, 'intended': 2, 'work-intended': 2, '👍': 2, 'great-👍': 2, 'well-work': 2, 'work-battery': 1, 'far': 1, 'work-far': 1, 'really': 1, 'really-

In [82]:
top_words

{'latent_topic_0': ['battery',
  'great',
  'good',
  'great-battery',
  'good-battery'],
 'latent_topic_1': ['good', 'good-good', 'battery', 'good-battery', 'product'],
 'latent_topic_2': ['great', 'great-great', 'work', 'work-great', 'battery'],
 'latent_topic_3': ['work', 'great', 'work-work', 'work-great', 'well'],
 'latent_topic_4': ['last', 'long', 'last-long', 'battery', 'time'],
 'latent_topic_5': ['product',
  'great',
  'great-product',
  'good',
  'product-great'],
 'latent_topic_6': ['charge', 'hold', 'hold-charge', 'battery', 'long'],
 'latent_topic_7': ['value', 'great', 'great-value', 'good', 'good-value'],
 'latent_topic_8': ['battery', 'charged', 'light', 'charge', 'wa'],
 'latent_topic_9': ['charger', 'battery', 'charge', 'work', 'great'],
 'latent_topic_10': ['price', 'great', 'good', 'great-price', 'good-price'],
 'latent_topic_11': ['use', 'easy', 'easy-use', 'battery', 'charge'],
 'latent_topic_12': ['far', 'good', 'far-good', 'great', 'good-far'],
 'latent_topic_

In [48]:
def pos_neg_split(df):
    '''Split reviews into negative and positive subsets by star rating.

    Parameters
    ----------
    df (DataFrame): must contain a numeric 'rating' column

    Returns
    -------
    (neg_df, pos_df) -- negatives first, despite the function name.
    neg_df holds the rows with rating < 3.0, pos_df the rows with
    rating > 3.0. Rows rated exactly 3.0 appear in neither frame.
    '''
    ratings = df['rating']
    is_negative = ratings < 3.0
    is_positive = ratings > 3.0
    return df[is_negative], df[is_positive]

In [50]:
# pos_neg_split returns the negative subset first, then the positive one.
negative_reviews, positive_reviews = pos_neg_split(df)

In [56]:
positive_reviews.head()

Unnamed: 0,asin,product,date,verified,title,desc,reviewer_name,rating,month,year,month_year,title_desc,str_desc
2,B08268F6XN,AA,2020-07-19,Verified Purchase,longer lasting battery for remote controller!!,i like the constant voltage and hopefully it ...,ARCHANGEL TROY,5.0,7,2020,2020-07,longer lasting battery for remote controller!!...,longer lasting battery remote controller like ...
3,B08267BBJT,AAA,2020-07-18,Verified Purchase,Minimal plastic in packaging.,"Just received these today, but I’m reviewing ...",ira,5.0,7,2020,2020-07,Minimal plastic in packaging. Just received th...,minimal plastic packaging received today ’ rev...
5,B08267BBJT,AAA,2020-07-17,Verified Purchase,"So far, I love them!",I bought these because we were burning throug...,Joseph M Zenevitch,5.0,7,2020,2020-07,"So far, I love them! I bought these because we...",far love bought burning battery like crazy noi...
6,B08268F6XN,AA,2020-07-16,Not Verified,Never want a disposable AA ever again,I didn't realize how barbaric it was to buy s...,M. L. Kijewski,5.0,7,2020,2020-07,Never want a disposable AA ever again I didn't...,never want disposable aa ever didnt realize ba...
9,B08267X3LH,9V,2020-06-12,Verified Purchase,Get these batteries!,I can’t believe that I can love a battery. Bu...,techie511,5.0,6,2020,2020-06,Get these batteries! I can’t believe that I ca...,get battery ’ believe love battery ’ replacing...


### Positive Reviews NMF

In [57]:
def testing_nmf(df, n=5):
    '''Fit TF-IDF + NMF on a frame of reviews and return the topic weights.

    Parameters
    ----------
    df (DataFrame): must contain a 'str_desc' column of cleaned review strings
    n (int): number of latent topics (NMF components), default 5

    Returns
    -------
    DataFrame of document-topic weights: one row per review, labelled with
    its 'str_desc' text, and one column per topic named 'latent_topic_0'
    ... 'latent_topic_{n-1}'.

    NOTE(review): TfidfVectorizer is used with its default token pattern,
    which splits the hyphen-joined bigrams in 'str_desc' back into single
    words -- confirm this is intended.
    '''
    docs = df['str_desc']

    # The sparse TF-IDF matrix is passed to NMF as-is. The former
    # `.toarray()` call built a full dense copy and threw it away, and the
    # unused `H = nmf.components_` local is dropped as well.
    tfidf_vectorized = TfidfVectorizer().fit_transform(docs)

    # random_state is fixed so repeated calls give the same factorization.
    nmf = NMF(n_components=n, random_state=1)
    weights = nmf.fit_transform(tfidf_vectorized)

    topics = ['latent_topic_{}'.format(i) for i in range(n)]
    return pd.DataFrame(weights, index=docs, columns=topics)

In [60]:
# Separate TF-IDF + NMF fit (default n=5 topics) on the positive reviews only.
pos_W = testing_nmf(positive_reviews)

In [62]:
# Top 30 reviews for each of the 5 positive-review topics.
pos_top_obs = sort_latent_and_collect_articles(5, 30, pos_W)

In [65]:
# Most frequent words per positive topic; also prints one Counter per topic.
pos_top_words, pos_word_count = get_popular_words(pos_top_obs)

Counter({'good-battery': 4, 'price': 2, 'good-price': 2, 'battery-good': 2, 'product': 1, 'good-product': 1, 'work': 1, 'work-good': 1})
Counter({'product': 20, 'great-product': 20, 'product-great': 10, 'great-battery': 4, 'price': 3, 'battery-great': 3, 'work': 2, 'work-great': 2, 'charger': 1, 'battery-charger': 1, 'price-great': 1, 'love': 1, 'great-love': 1, 'product-price': 1, 'battery-price': 1, 'item': 1, 'great-item': 1, 'charge': 1, 'great-charge': 1, 'charge-great': 1})
Counter({'work': 55, 'well': 13, 'work-work': 13, 'work-well': 13, 'work-great': 12, 'great-work': 5, 'well-work': 4, 'work-good': 3, '👍': 3, 'battery-work': 2, 'fine': 2, 'work-fine': 2, 'intended': 2, 'work-intended': 2, 'great-👍': 2, 'work-battery': 1, 'far': 1, 'work-far': 1, 'really': 1, 'really-work': 1, 'great-battery': 1, 'buy': 1, 'buy-work': 1, 'charge': 1, 'go': 1, 'good-charge': 1, 'charge-go': 1, 'seem': 1, 'well-seem': 1, 'seem-work': 1, 'well-👍': 1, 'good-work': 1, '👍-work': 1, 'perfectly': 1, '

In [66]:
pos_top_words

{'latent_topic_0': ['charge', 'usb', 'aa', 'time', 'use'],
 'latent_topic_1': ['good-battery',
  'price',
  'good-price',
  'battery-good',
  'product'],
 'latent_topic_2': ['product',
  'great-product',
  'product-great',
  'great-battery',
  'price'],
 'latent_topic_3': ['work', 'well', 'work-work', 'work-well', 'work-great'],
 'latent_topic_4': ['value',
  'great-value',
  'good-value',
  'value-great',
  'value-good']}

### Negative Reviews NMF

In [68]:
# Separate TF-IDF + NMF fit (default n=5 topics) on the negative reviews only.
neg_W = testing_nmf(negative_reviews)

In [69]:
# Top 30 reviews for each of the 5 negative-review topics.
neg_top_obs = sort_latent_and_collect_articles(5, 30, neg_W)

In [70]:
# Most frequent words per negative topic; also prints one Counter per topic.
neg_top_words, neg_word_count = get_popular_words(neg_top_obs)

Counter({'charge': 49, 'charger': 34, 'use': 29, 'wa': 27, 'one': 26, 'amazon': 20, 'would': 19, 'aa': 18, '’': 16, 'time': 15, 'two': 15, 'flashlight': 15, 'dont': 15, 'charged': 14, 'rechargeable': 14, 'bad': 14, 'bought': 13, 'well': 13, 'charging': 12, 'aaa': 12, 'last': 11, 'light': 11, 'rechargeable-battery': 11, 'thought': 11, 'first': 11, 'money': 11, 'buy': 10, '2': 10, 'using': 10, 'hour': 9, 'device': 9, 'hold': 9, 'even': 9, 'got': 9, '4': 9, 'fit': 9, 'get': 9, 'brand': 8, 'set': 8, 'charge-battery': 8, 'worked': 8, 'back': 8, 'aa-battery': 8, 'aaa-battery': 8, 'issue': 8, 'second': 7, 'regular': 7, 'better': 7, 'waste': 7, 'hold-charge': 7, 'recharged': 7, 'like': 7, 'really': 7, 'camera': 7, 'product': 6, 'battery-would': 6, 'day': 6, 'getting': 6, 'much': 6, 'battery-wa': 6, 'candle': 6, 'recommend': 6, 'use-battery': 6, 'fine': 6, 'used': 6, 'im': 6, 'sure': 6, 'still': 6, 'cant': 6, 'four': 6, '18650': 6, 'year': 6, 'basic': 6, 'amazon-basic': 6, 'tried': 5, 'charged-

In [71]:
neg_top_words

{'latent_topic_0': ['charge', 'charger', 'use', 'wa', 'one'],
 'latent_topic_1': ['charge', 'hold', 'hold-charge', 'dont', 'long'],
 'latent_topic_2': ['made', 'china', 'japan', 'made-china', 'made-japan'],
 'latent_topic_3': ['last', 'long', 'last-long', 'charge', 'dont'],
 'latent_topic_4': ['work', 'didnt', 'charge', 'dont', 'tried']}

In [83]:
df

Unnamed: 0,asin,product,date,verified,title,desc,reviewer_name,rating,month,year,month_year,title_desc,str_desc
0,B08267BBJT,AAA,2020-08-11,Verified Purchase,"Didn't work, then worked, now don't work again",All I got in terms of use out of these batter...,Jasmine Carroll,1.0,8,2020,2020-08,"Didn't work, then worked, now don't work again...",didnt work worked dont work got term use batte...
1,B08267BBJT,AAA,2020-07-30,Verified Purchase,These absolutely suck,I bought these for a wall mounted magnifying ...,Ashlee M.,1.0,7,2020,2020-07,These absolutely suck I bought these for a wal...,absolutely suck bought wall mounted magnifying...
2,B08268F6XN,AA,2020-07-19,Verified Purchase,longer lasting battery for remote controller!!,i like the constant voltage and hopefully it ...,ARCHANGEL TROY,5.0,7,2020,2020-07,longer lasting battery for remote controller!!...,longer lasting battery remote controller like ...
3,B08267BBJT,AAA,2020-07-18,Verified Purchase,Minimal plastic in packaging.,"Just received these today, but I’m reviewing ...",ira,5.0,7,2020,2020-07,Minimal plastic in packaging. Just received th...,minimal plastic packaging received today ’ rev...
4,B08267BBJT,AAA,2020-07-17,Verified Purchase,Not long enough battery life for a night hike,Shuts off suddenly in headlamp,T,3.0,7,2020,2020-07,Not long enough battery life for a night hike ...,long enough battery life night hike shuts sudd...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4740,B00HZV9WTM,AAA,2018-12-10,Verified Purchase,Batteries advertised as charged. Totally dead...,Totally dead when I received the batteries.,William M.,2.0,12,2018,2018-12,Batteries advertised as charged. Totally dead...,battery advertised charged totally dead receiv...
4741,B00HZV9WTM,AA,2018-12-10,Verified Purchase,Great,Got the new silver ones. The older black labe...,David S.,5.0,12,2018,2018-12,Great Got the new silver ones. The older black...,great got new silver one older black label ver...
4742,B00HZV9WTM,AA,2018-12-10,Verified Purchase,Good batteries but quality control lacking,Out of 8 batteries in the pack i ordered ther...,Ethan Banks,5.0,12,2018,2018-12,Good batteries but quality control lacking Out...,good battery quality control lacking 8 battery...
4743,B00HZV9WTM,AA,2018-12-09,Verified Purchase,Great battery,Solid performance,stangcolts,5.0,12,2018,2018-12,Great battery Solid performance,great battery solid performance great-battery ...


In [84]:
# Persist the frame, which now includes the 'str_desc' column.
# NOTE(review): to_csv writes the row index as an unnamed first column by
# default, which shows up as 'Unnamed: 0' when the file is read back --
# consider index=False after checking what downstream readers expect.
df.to_csv('nlp_cleaned_reviews.csv')