In [26]:
import pandas as pd
import numpy as np
import string
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
from nltk.stem.wordnet import WordNetLemmatizer
from collections import Counter
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer

wordnet = WordNetLemmatizer()
nltk.download('stopwords')

def remove_punc(my_string:str) -> str:
    '''Given a string, removes all punctuation and returned punctuation-less string'''
    return re.sub(f'[{string.punctuation}]', '', my_string)

def tokenize(str):
    '''
    Tokenize a str and return a tokenized list.
    '''
    return [word for word in word_tokenize(str)]

def lemmatize(doc):
    '''Takes in a doc and lemmatizes tokens in doc
    Parameters
    ----------
    doc: list of tokens
    
    Returns
    -------
    lemmatized tokens
    '''
    return [wordnet.lemmatize(tkn) for tkn in doc]

def rm_stop_words(doc, stops=set(stopwords.words('english'))):
    '''Takes in a doc and removes stop words
    Parameters
    ----------
    doc: list of tokens
    
    Returns
    -------
    Tokens with stop words removed
    '''
    return([w for w in doc if w not in stops])

def n_grams(input_tokens):
    # retain 1-gram tokens
    ret_list = list(input_tokens)
    
    for i in range(2,3):
        # add each n-grams to the list
        ret_list.extend(['-'.join(tgram) for tgram in ngrams(input_tokens, i)])
    return(ret_list)
    
    
def preprocess_corpus(content):
    '''
    Add docstring. Make flexible to allow for doing, or not doing, preprocessing functions. 
    Parameters
    ----------
    content (str): a collection of strings
    Returns
    -------
    A list of lists: each list contains a tokenized version of the original string
    '''
    preprocessed = []
    for i in range(len(content)):
        step_1 = remove_punc(content[i].lower())
        step_2 = tokenize(step_1)
        step_3 = lemmatize(step_2)
        step_4 = rm_stop_words(step_3)
        step_5 = n_grams(step_4)
        preprocessed.append(step_5)
    return preprocessed

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ryankirkland/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [17]:
df = pd.read_csv('cleaned_reviews.csv')

### Pass combined review titles and descriptions into preprocessing functions that remove punctuation and set words to lowercase, tokenize the words, lemmatize the words, remove English stop words and return a list of cleaned words

In [18]:
cleaned_desc = preprocess_corpus(df['title_desc'])
cleaned_desc[0]

['didnt',
 'work',
 'worked',
 'dont',
 'work',
 'got',
 'term',
 'use',
 'battery',
 'three',
 'day',
 'use',
 'two',
 'additional',
 'success',
 'bought',
 'wa',
 'bleeding',
 'aaa',
 'battery',
 'wa',
 'hoping',
 'compact',
 'design',
 'would',
 'better',
 'something',
 'bulkyright',
 'box',
 'charged',
 'light',
 'wa',
 'green',
 'indicating',
 'fully',
 'charged',
 'tried',
 'use',
 'couple',
 'device',
 'didnt',
 'luck',
 'wa',
 'going',
 'return',
 'friend',
 'suggested',
 'switch',
 'charging',
 'extension',
 'cord',
 'directly',
 'wall',
 'socket',
 'thought',
 'trick',
 'even',
 'though',
 'thought',
 'wa',
 'silly',
 'tried',
 'battery',
 'device',
 'letting',
 'charge',
 'overnight',
 'plugged',
 'directly',
 'wall',
 'socket',
 'work',
 'worked',
 'wellabout',
 'three',
 'day',
 'later',
 'device',
 'stopped',
 'working',
 'middle',
 'high',
 'powered',
 'usage',
 'hair',
 'trimmer',
 'swapped',
 'battery',
 'two',
 'charging',
 'entire',
 'time',
 'didnt',
 'work',
 'went

In [19]:
str_desc = [" ".join(x) for x in cleaned_desc]

In [20]:
str_desc

['didnt work worked dont work got term use battery three day use two additional success bought wa bleeding aaa battery wa hoping compact design would better something bulkyright box charged light wa green indicating fully charged tried use couple device didnt luck wa going return friend suggested switch charging extension cord directly wall socket thought trick even though thought wa silly tried battery device letting charge overnight plugged directly wall socket work worked wellabout three day later device stopped working middle high powered usage hair trimmer swapped battery two charging entire time didnt work went bought regular aaa battery device went back working fine tried battery device arent working seems work enough convince work stop workingi dont often leave feedback product felt wa important say would recommend anyone buy battery totally faulty least completely inconsistent youd better buying disposable didnt-work work-worked worked-dont dont-work work-got got-term term-use

In [21]:
df['str_desc'] = str_desc

In [24]:
df = df.drop('Unnamed: 0', axis=1)

In [27]:
tfidfvect = TfidfVectorizer()
tfidf_vectorized = tfidfvect.fit_transform(str_desc)
tfidf_vectorized.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [37]:
nmf = NMF(n_components=10, random_state=1)

In [38]:
W = nmf.fit_transform(tfidf_vectorized)
H = nmf.components_

In [39]:
topics = ['latent_topic_{}'.format(i) for i in range(10)]
W = pd.DataFrame(W, index=df.str_desc, columns=topics)
W.sort_values('latent_topic_0', ascending=False)

Unnamed: 0_level_0,latent_topic_0,latent_topic_1,latent_topic_2,latent_topic_3,latent_topic_4,latent_topic_5,latent_topic_6,latent_topic_7,latent_topic_8,latent_topic_9
str_desc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
battery battery battery battery battery-battery battery-battery battery-battery,0.219031,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
battery battery battery-battery,0.219031,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
rechargeable battery great battery rechargeable-battery battery-great great-battery,0.175077,0.000000,0.215381,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
good rechargeable battery great rechargeable battery good-rechargeable rechargeable-battery battery-great great-rechargeable rechargeable-battery,0.144270,0.070378,0.121262,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
great battery battery awesome great-battery battery-battery battery-awesome,0.141551,0.000000,0.124755,0.000000,0.000406,0.000710,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...
wa wa supposed else say wa-wa wa-supposed supposed-else else-say,0.000000,0.000000,0.000000,0.000000,0.000000,0.000343,0.000000,0.000000,0.042317,0.000000
great value money satisfied product price great-value value-money money-satisfied satisfied-product product-price,0.000000,0.006390,0.073807,0.000000,0.001782,0.139815,0.004688,0.169919,0.000000,0.000000
convenient love product charge anywhere usb buying convenient-love love-product product-charge charge-anywhere anywhere-usb usb-buying,0.000000,0.000000,0.000000,0.000000,0.000000,0.117186,0.047896,0.003408,0.000000,0.081048
kidding wireless keyboard work aa fully charged lasted day came work morning recharge using wired keyboard today word express unhappiness kidding-wireless wireless-keyboard keyboard-work work-aa aa-fully fully-charged charged-lasted lasted-day day-came came-work work-morning morning-recharge recharge-using using-wired wired-keyboard keyboard-today today-word word-express express-unhappiness,0.000000,0.000000,0.000000,0.064994,0.000000,0.000000,0.000000,0.000000,0.045813,0.015453


In [43]:
def sort_latent_and_collect_articles(n, articles, w_):
    headlines = {}
    for i in range(n):
        sorted_df = w_.sort_values(f'latent_topic_{i}', ascending=False)
        headlines[f'latent_topic_{i}'] = list(sorted_df.index[:articles])
    return headlines

def get_popular_words(topics):
    latent_topics = {}
    total_word_count = Counter()
    for key, value in topics.items():
        words = ' '.join(topics[key]).lower().split()
        stop_words = set(stopwords.words('english'))
        stop_words.add('battery')
        filtered_words = []
        for word in words:
            if word not in stop_words:
                filtered_words.append(word)
        word_count = Counter(filtered_words)
        total_word_count += word_count
        first, second, third, fourth, fifth = word_count.most_common()[0][0], word_count.most_common()[1][0], word_count.most_common()[2][0], word_count.most_common()[3][0], word_count.most_common()[4][0]
        latent_topics[key] = [first, second, third, fourth, fifth]
    return latent_topics, total_word_count

In [44]:
top_obs = sort_latent_and_collect_articles(10, 30, W)

In [45]:
top_words, word_count = get_popular_words(top_obs)

In [46]:
top_words

{'latent_topic_0': ['rechargeable',
  'rechargeable-battery',
  'great',
  'good',
  'battery-good'],
 'latent_topic_1': ['good',
  'good-good',
  'good-battery',
  'price',
  'good-price'],
 'latent_topic_2': ['great',
  'great-great',
  'work',
  'work-great',
  'great-battery'],
 'latent_topic_3': ['work', 'work-work', 'well', 'work-well', 'great'],
 'latent_topic_4': ['long', 'last', 'last-long', 'time', 'long-time'],
 'latent_topic_5': ['product',
  'great',
  'great-product',
  'good',
  'good-product'],
 'latent_topic_6': ['charge', 'hold', 'hold-charge', 'long', 'doe'],
 'latent_topic_7': ['value', 'great', 'great-value', 'good', 'good-value'],
 'latent_topic_8': ['use', 'controller', 'one', 'charge', 'xbox'],
 'latent_topic_9': ['usb', 'charger', 'charge', 'aa', 'aaa']}