# Review Data EDA

In [1]:
%matplotlib inline
import numpy as np
import math
import matplotlib.pyplot as plt
import pandas as pd
import datetime
from glob import glob

import seaborn as sns
sns.set()

In [2]:
def time_marker(text=''):
    """Print `text` preceded by the current local time of day in square brackets."""
    current_time = datetime.datetime.now().time()
    message = '[{}] {}'.format(current_time, text)
    print(message)

In [3]:
# Widen the notebook container to 95% of the browser window.
# IPython.display is the public import location; importing `display`
# from IPython.core.display is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

# Global matplotlib font size applied through the rc settings.
import matplotlib
font = {'size' : 50}
matplotlib.rc('font', **font)

# Font sizes for individual chart elements.
TITLE_FONT_SIZE = 25
LABEL_FONT_SIZE = 15
TICK_FONT_SIZE  = 15

# Default figure size and a switch for writing charts out.
# NOTE(review): neither is read by any cell in this notebook as shown — confirm whether still needed.
FIG_SIZE = (15,6)
DO_WRITE_CHARTS = False

# Load Review Data for Arizona Restaurants

In [4]:
time_marker(text='Loading Review Data...')

file_path_slug = '../clean_data/az_restaurant_reviews.csv'
file_list = glob(file_path_slug)

# Fail early with a readable message: with no matching files the chunk list
# stays empty and pd.concat would raise "No objects to concatenate".
if not file_list:
    raise FileNotFoundError('No review files match {!r}'.format(file_path_slug))

# Chunk Settings
chunks = list()
chunksize = 10000
for ii, file in enumerate(sorted(file_list)):
    time_marker('Reading {} of {} {}...'.format(ii+1, len(file_list), file))
    # read_csv with `chunksize` yields DataFrames of up to `chunksize` rows each;
    # collect them all and merge once after the loop
    chunks.extend(pd.read_csv(file, chunksize=chunksize, index_col=0, parse_dates=['date']))

time_marker(text='merging to dataframe...')
reviews = pd.concat(chunks)

time_marker('reseting index...')
# rebind instead of mutating in place so re-running the cell is predictable
reviews = reviews.reset_index(drop=True)
time_marker(text='Complete!')


[19:07:10.765702] Loading Review Data...
[19:07:10.767827] Reading 1 of 1 ../clean_data/az_restaurant_reviews.csv...


KeyboardInterrupt: 

In [None]:
time_marker('Dropping records with NaN values...')
# drop every row that has at least one missing value, then renumber the index
reviews = reviews.dropna(how='any').reset_index(drop=True)

time_marker('Cleaning data types...')
# cast all count-like columns to integers in a single pass
reviews = reviews.astype({
    'cool': 'int',
    'funny': 'int',
    'stars': 'int',
    'useful': 'int',
    'review_len': 'int',
})
# vectorized comparison instead of a Python-level lambda per row: True only where the flag equals 1
reviews['is_fast_food'] = reviews['is_fast_food'] == 1
reviews['date'] = pd.to_datetime(reviews['date'])

time_marker('assiging \'Positive\' or \'Negative\' classification to reviews...')
# a review counts as positive when it has more than 3 stars
reviews['is_positive'] = reviews['stars'] > 3
           

In [None]:
# Summarize the cleaned frame (columns, dtypes, non-null counts).
reviews.info()

In [None]:
# Preview the first three reviews.
reviews.head(3)

# Normalizing Review Text

In [None]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
import string

In [None]:
# English stop words from the NLTK corpus
stop_words = set(stopwords.words('english'))

# terms and characters to ignore, we dont care about punctuation
exclude = set(string.punctuation)

# contraction fragments to drop from the token stream
contractions = ["'s", "n't", "'ll", "'t", "'s", "'re"]

# lemma
lemma = WordNetLemmatizer()

# Everything clean_review filters out. Kept as a frozenset (not a list) so the
# per-token `in` test is a hash lookup rather than a scan of the whole collection.
exclusion_terms = frozenset(stop_words | exclude | set(contractions))

In [None]:
def clean_review(doc, lowercase=True):
    '''Tokenize a text document and return its lemmatized, filtered tokens.

        @ params:
            doc        : the review text to normalize
            lowercase  : when True (default) every token is lower-cased before
                         filtering, so capitalized stop words such as "The" or
                         "I" are removed as well and case variants of a word
                         collapse into one token; pass False to keep the
                         original casing (case-sensitive filtering)

        returns:
            a list of lemmatized tokens; tokens found in exclusion_terms and
            tokens that are not purely alphabetic are dropped
    '''
    tokens = []
    for word in word_tokenize(doc):
        if lowercase:
            # NLTK's English stop-word list is all lowercase, so compare in lowercase
            word = word.lower()
        # keep alphabetic tokens that are not stop words, punctuation or contractions
        if word.isalpha() and word not in exclusion_terms:
            tokens.append(lemma.lemmatize(word))

    return tokens

# Prepare Review Corpus for LDA

In [None]:
from gensim.corpora.dictionary import Dictionary
from collections import defaultdict
import itertools

In [None]:
def lda_prep(corpus=None, n_terms=5):
    '''
    Build the gensim inputs for LDA from tokenized documents and print the most frequent terms.

        @ params:
            corpus   : an iterable of tokenized documents, each one a list of string tokens
            n_terms  : the number of top terms to preview to the console

        returns:
            False when corpus is None, otherwise a list of 3 items
                dictionary        :  a gensim dictionary object built from the corpus
                corpus            :  the bag-of-words form of every document, as produced by dictionary.doc2bow
                total_word_count  :  a defaultdict with key word identifier in dictionary, and value the count of times that word appears in the corpus

    '''
    # identity check: `== None` would be evaluated element-wise by array-like inputs
    if corpus is None:
        return False

    # materialize once so a one-shot iterator can be walked twice below
    documents = list(corpus)

    time_marker('building gensim dict...')
    # build the gensim dictionary from the tokenized documents
    dictionary = Dictionary(documents)

    # create a gensim corpus from the documents that were passed in
    # (this used to read the notebook-level `clean_docs` variable instead of the argument)
    time_marker('building gensim corpus...')
    bow_corpus = [dictionary.doc2bow(doc) for doc in documents]

    # loop over the corpus and add up how often each word id appears
    total_word_count = defaultdict(int)
    for word_id, word_count in itertools.chain.from_iterable(bow_corpus):
        total_word_count[word_id] += word_count

    # word ids ordered by total count, most frequent first
    sorted_word_count = sorted(total_word_count.items(), key=lambda w: w[1], reverse=True)

    # print top n_terms words across all documents
    print('Top {:d} words across all documents'.format(n_terms))
    for word_id, word_count in sorted_word_count[:n_terms]:
        print('{:20}{}'.format(dictionary.get(word_id), word_count))

    return [dictionary, bow_corpus, total_word_count]

# Split up by `Fast Food` and `Non Fast Food` Restaurant Reviews

In [None]:
# Partition the reviews on the is_fast_food flag; each part gets its own renumbered index.
ff_reviews = reviews.loc[reviews['is_fast_food'] == True].reset_index(drop=True)
nff_reviews = reviews.loc[reviews['is_fast_food'] == False].reset_index(drop=True)

In [None]:
# Shared row layout: label padded to 45 characters, integer count, tab, percentage with four decimals.
summary_row = '{:45}{:d}\t{:2.4f}'
n_reviews = reviews.shape[0]

# Fast food: total plus positive / negative counts.
n_ff = ff_reviews.shape[0]
n_ff_positive = ff_reviews[ff_reviews.is_positive == True].shape[0]
n_ff_negative = ff_reviews[ff_reviews.is_positive == False].shape[0]

print('============================== Fast Food Reviews ==============================')
print(summary_row.format('Number of Fast Food Reviews', n_ff, 100.*n_ff / n_reviews))
print(summary_row.format('Number of Positive Fast Food Reviews', n_ff_positive, (100.*n_ff_positive/n_ff)))
print(summary_row.format('Number of Negative Fast Food Reviews', n_ff_negative, (100.*n_ff_negative/n_ff)))

print()

# Non fast food: same three figures.
n_nff = nff_reviews.shape[0]
n_nff_positive = nff_reviews[nff_reviews.is_positive == True].shape[0]
n_nff_negative = nff_reviews[nff_reviews.is_positive == False].shape[0]

print('============================ Non Fast Food Reviews ============================')
print(summary_row.format('Number of Non Fast Food Reviews', n_nff, 100.*n_nff / n_reviews))
print(summary_row.format('Number of Positive Non Fast Food Reviews', n_nff_positive, (100.*n_nff_positive/n_nff)))
print(summary_row.format('Number of Negative Non Fast Food Reviews', n_nff_negative, (100.*n_nff_negative/n_nff)))

# Fast Food Reviews

<p><b>Step 1: </b>Subset to only evaluate `Fast Food` Reviews</p>

In [None]:
# Preview the first three fast-food reviews.
ff_reviews.head(3)

In [None]:
# Number of fast-food reviews (row count of the subset).
ff_reviews.shape[0]

<p><b>Step 2: </b>Tokenize and Normalize review text</p>

In [None]:
time_marker('tokenizing and normalizing text...')
# clean_review turns each review's text into a list of tokens
ff_reviews['tokens'] = ff_reviews['text'].apply(clean_review)
# join every token list back into one space-separated string
ff_reviews['norm_text'] = ff_reviews['tokens'].apply(' '.join)
time_marker('done!')

In [None]:
# Transposed preview: the three sample reviews become columns, so the new
# `tokens` and `norm_text` fields show up as rows.
ff_reviews.head(3).transpose()

<p><b>Step 3: </b>Build our term dictionary, document term matrix, and preview the most common terms</p>

In [None]:
# gather the token list of every fast-food review into one list (a list of token lists)
clean_docs = ff_reviews['tokens'].tolist()

# build the dictionary, bag-of-words corpus and per-term totals with the custom
# function, printing the 25 most frequent terms along the way
(dictionary,
 doc_term_matrix,
 total_word_count) = lda_prep(corpus=clean_docs, n_terms=25)

<p><b>Step 4: </b>Using a Multicore LDA model, attempt to identify topics from the dictionary</p>

In [None]:
# Multicore LDA model class from gensim, aliased so the training cell can call `ldam(...)`.
from gensim.models.ldamulticore import LdaMulticore
ldam = LdaMulticore

# Settings shared by the training, printing and saving cells.
num_topics = 50  # passed to the model constructor and to print_topics
num_words  = 10  # terms requested per topic from print_topics; also part of the saved file name
num_passes = 50  # passed to the model constructor as `passes`

In [None]:
# Fixed seed so that re-running the notebook starts from the same random state.
# NOTE(review): multi-worker training may still not be bit-for-bit repeatable — confirm.
LDA_RANDOM_STATE = 42

time_marker('started generating lda multicore model')
ldam_model = ldam(
    doc_term_matrix,
    num_topics=num_topics,
    id2word=dictionary,
    passes=num_passes,
    random_state=LDA_RANDOM_STATE,
)
time_marker('done!')

# formatted (topic id, term string) pairs for every topic
results = ldam_model.print_topics(num_topics=num_topics, num_words=num_words)

<p><b>Step 5: </b>View Results</p>

# Print Model Terms

In [None]:
from collections import Counter

In [None]:
def print_topic_terms(model, num_topics=num_topics, num_words=10, unique=False):
    '''
    Print the terms of each topic of a model, one line per topic.

        @ params:
            model       : an object whose print_topics(num_topics=..., num_words=...) returns
                          (topic id, formatted term string) pairs with every term in double quotes
            num_topics  : forwarded to model.print_topics
            num_words   : forwarded to model.print_topics
            unique      : when True, terms that occur more than once across all
                          listed topics are left out of every line

        returns:
            None, the listing is written to the console
    '''
    topics = model.print_topics(num_topics=num_topics, num_words=num_words)

    # the quoted terms sit at the odd positions once the string is split on '"'
    parsed = [(entry[0], entry[1].split('"')[1::2]) for entry in topics]

    if unique:
        # count every term over all topics and collect the ones seen more than once
        occurrences = Counter(itertools.chain.from_iterable(terms for _, terms in parsed))
        repeated = {term for term, seen in occurrences.items() if seen > 1}

        print('============================ Unique Terms Per Topic ===========================')
        for topic, terms in parsed:
            kept = [term for term in terms if term not in repeated]
            print('{}\t{}'.format(topic, kept))
        return

    print('=============================== Terms Per Topic ===============================')
    for topic, terms in parsed:
        print('{}\t{}'.format(topic, terms))

In [None]:
# Ten terms for every topic, including terms shared between topics.
print_topic_terms(ldam_model, num_topics=num_topics, num_words=10, unique=False)

In [None]:
# Same listing, but with terms that occur more than once across topics removed.
print_topic_terms(ldam_model, num_topics=num_topics, num_words=10, unique=True)

# Save Model

In [None]:
# save model to disk with the model's own save() (no need to use pickle module)
import os

term = 'fast_food'
file_suffix = '{}_{:d}_topics_{:d}_terms_{}_passes'.format(term, num_topics, num_words, num_passes)
model_path = '../models/ldam_{}.model'.format(file_suffix)

# create the target folder first so a missing ../models directory cannot
# make the save fail after the model has been trained
os.makedirs(os.path.dirname(model_path), exist_ok=True)
ldam_model.save(model_path)

In [5]:
# Timestamp the end of the run.
time_marker('Done!')

[19:07:30.114759] Done!
