In [20]:
import logging
import re
import numpy
import matplotlib.pyplot as plot
import pickle

from pymongo import MongoClient

from textblob import TextBlob

import nltk
nltk.download('wordnet')
nltk.download('words')
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('brown')
nltk.download('averaged_perceptron_tagger')
nltk.download('vader_lexicon')
from nltk import pos_tag, word_tokenize
from nltk.corpus import wordnet,  words, stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from gensim.models.ldamodel import LdaModel
from gensim.models.ldamulticore import LdaMulticore
from gensim.models.ldamodel import LdaModel, CoherenceModel
from gensim.corpora import Dictionary

import pyLDAvis
import pyLDAvis.gensim

import multiprocessing
from atpbar import atpbar
from atpbar import register_reporter, find_reporter, flush

from tqdm import tqdm
from pympler import asizeof

from IPython.display import display

[nltk_data] Downloading package wordnet to /Users/ram/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package words to /Users/ram/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/ram/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/ram/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package brown to /Users/ram/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/ram/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/ram/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


# Topic Modeling

In [2]:
english_words = set(words.words())
english_stopwords = set(stopwords.words('english'))

lemmatize = WordNetLemmatizer().lemmatize


In [3]:
# pos_tags: r - adverb, j - adjective, n - noun, v - verb, 
def nouns(text):
    tokens = tokenize(text)
    tokens = clean(tokens)
    
    _nouns = [token for token, pos in pos_tag(tokens) if pos[0] == 'N']
    return _nouns

In [4]:
# pos_tags: r - adverb, j - adjective, n - noun, v - verb, 
def tokenize(text):
    text = text.lower()
    
    re_word = re.compile('^[a-z]+$')
    
    tokens = []
    for token, pos in pos_tag(word_tokenize(text)):
        
        if not bool(re_word.match(token)):
            continue

        lemma = None
        if pos[0] in ['A', 'N', 'R', 'V']:
            lemma = lemmatize(token, pos[0].lower())
        else:
            if token not in english_words and wordnet.morphy(token) is None:
                continue
            lemma = token
            
        tokens.append(lemma)
    return tokens

In [5]:
def clean(tokens):
    tokens = [token for token in tokens if token not in english_stopwords]
    tokens = [token for token in tokens if len(token) > 2]
    return tokens

In [6]:
text = 'Mediocre food, very loud.  Filet mignon au poivre ordered "medium" was virtually raw on one end of the steak with only a hint of sauce.  Very bland. The "famous" fries are greasy strips of potato skins.  House salad was decent.  Service was OK, but the kitchen was painfully slow. 45 minutes to receive entree.  Not impressed.  $100 per person.  Won\'t be back.'

tokens = tokenize(text)
tokens = clean(tokens)
print(tokens)
print()

print(nouns(text))


['mediocre', 'food', 'loud', 'filet', 'mignon', 'poivre', 'order', 'medium', 'virtually', 'raw', 'one', 'end', 'steak', 'hint', 'sauce', 'bland', 'famous', 'fry', 'greasy', 'strip', 'potato', 'skin', 'house', 'salad', 'decent', 'service', 'kitchen', 'painfully', 'slow', 'minute', 'receive', 'entree', 'impressed', 'per', 'person', 'back']

['mediocre', 'food', 'filet', 'mignon', 'poivre', 'order', 'medium', 'end', 'steak', 'hint', 'sauce', 'fry', 'strip', 'potato', 'skin', 'house', 'service', 'minute', 'entree', 'person']


In [7]:
# Source: https://stackoverflow.com/questions/44073393/parallelizing-loading-data-from-mongodb-into-python

def multiprocess_cursor(n_cores, collection_size, process_cursor):
    batch_size = round(collection_size/n_cores+0.5)
    skips = range(0, n_cores*batch_size, batch_size)

    reporter = find_reporter()
    processes = [multiprocessing.Process(target=process_cursor, args=(skip_n,batch_size, reporter)) for skip_n in skips]

    for process in processes:
        process.start()

    for process in processes:
        process.join()

    flush()

### Extract tokens (nouns, verbs, adverbs, adjectives)

In [None]:
def process_cursor(skip_n, limit_n, reporter):
    print('Starting process',skip_n//limit_n,'...')
    
    register_reporter(reporter)
    
    db =  MongoClient(port=27017).yelp
    cursor = db.reviews.find({}, {'text': 1}).sort('_id', 1).skip(skip_n).limit(limit_n)

    reviews = []
    for review in cursor:
        reviews.append(review)
        
    for i in atpbar(range(len(reviews)), name=str(skip_n//limit_n)):
        review = reviews[i]
        if 'text' in review:
            tokens = tokenize(review['text'])
            tokens = clean(tokens)
            db.reviews.update_one({'_id': review['_id']}, {'$set': {'tokens': tokens}})

    print('Completed process',skip_n//limit_n,'...')

n_cores = 8
collection_size = 6685900
batch_size = round(collection_size/n_cores+0.5)
skips = range(0, n_cores*batch_size, batch_size)

# progress = tqdm(total=6685900, leave=True, position=0)

reporter = find_reporter()
processes = [multiprocessing.Process(target=process_cursor, args=(skip_n,batch_size, reporter)) for skip_n in skips]

for process in processes:
    process.start()

for process in processes:
    process.join()

flush()

### Extract nouns

In [8]:
def process_cursor(skip_n, limit_n, reporter):
    print('Starting process',skip_n//limit_n,'...')
    
    register_reporter(reporter)
    
    db =  MongoClient(port=27017).yelp
    cursor = db.reviews_sub_2.find({}, {'text': 1}).sort('_id', 1).skip(skip_n).limit(limit_n)

    reviews = []
    for review in cursor:
        reviews.append(review)
        
    for i in atpbar(range(len(reviews)), name=str(skip_n//limit_n)):
        review = reviews[i]
        if 'text' in review:
            _nouns = nouns(review['text'])
            db.reviews_sub_2.update_one({'_id': review['_id']}, {'$set': {'nouns': _nouns}})

    print('Completed process',skip_n//limit_n,'...')


multiprocess_cursor(n_cores=8, collection_size=282415, process_cursor=process_cursor)

Starting process 0 ...
Starting process 1 ...
Starting process 2 ...
Starting process 3 ...
Starting process 4 ...
Starting process 5 ...
Starting process 6 ...
Starting process 7 ...


VBox()

Completed process 7 ...
Completed process 4 ...
Completed process 6 ...
Completed process 0 ...
Completed process 2 ...
Completed process 1 ...
Completed process 3 ...
Completed process 5 ...


### Connect to mongodb

In [9]:
mongoClient = MongoClient(port=27017)
db = mongoClient.yelp

### Load extracted nouns

In [11]:
tokenized_docs = []
progress = tqdm(total=282415, leave=True, position=0)
cursor = db.reviews_sub_2.find({}, {'nouns': 1}).sort('_id', 1)
for doc in cursor:
    progress.update(1)
    if 'nouns' in doc:
        tokenized_docs.append(doc['nouns'])

progress.refresh()

100%|██████████| 282415/282415 [00:03<00:00, 4590.75it/s]

True

### Bag of words

In [12]:
%%time
dictionary = Dictionary(tokenized_docs)
corpus = [dictionary.doc2bow(tokenized_doc) for tokenized_doc in tokenized_docs]

dictionary.save('dictionary_sub_nouns_2.pkl')
with open('corpus_sub_nouns_2.pkl', 'wb') as file:
    pickle.dump(corpus, file)

100%|██████████| 282415/282415 [00:16<00:00, 4590.75it/s]

CPU times: user 18.3 s, sys: 523 ms, total: 18.9 s
Wall time: 18.8 s


In [13]:
dictionary = None
corpus = None

with open('dictionary_sub_nouns_2.pkl', 'rb') as file:
    dictionary = pickle.load(file)
    
with open('corpus_sub_nouns_2.pkl', 'rb') as file:
    corpus = pickle.load(file)

### LDA model

In [21]:
logging.shutdown()
reload(logging)

NameError: name 'reload' is not defined

In [22]:
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [23]:
%%time
num_topics = 20

model_20 = LdaMulticore(corpus, num_topics=num_topics, id2word=dictionary, passes=40)
model_20.save('model_sub_nouns_2_20.h5')

CPU times: user 3min 32s, sys: 1min 2s, total: 4min 35s
Wall time: 5min 8s


In [24]:
logging.getLogger().setLevel(logging.CRITICAL)

In [27]:
model_20 = LdaModel.load('model_sub_nouns_2_20.h5')

### Coherence score

In [28]:
%%time
coherence_model = CoherenceModel(model=model_20, texts= tokenized_docs, dictionary=dictionary, coherence='c_v')
print(coherence_model.get_coherence())

0.6119475232059287
CPU times: user 11.1 s, sys: 1.4 s, total: 12.5 s
Wall time: 28.6 s


### Visualize topics

In [None]:
lda_display = pyLDAvis.gensim.prepare(model, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)

In [None]:
n_topics = model.num_topics
n_terms = 30
display(model.show_topics(n_topics, n_terms))

In [None]:
n_topics = model.num_topics
n_terms = 30
for i in range(n_topics):
    display(model.show_topic(i, n_terms))

In [None]:
text = "Hard to beat this location for table side entertainment"
tokens = tokenize(text)
tokens = clean(tokens)

print(tokens)

bow = dictionary.doc2bow(tokens)
print(bow)
print(model.get_document_topics(bow, minimum_probability=0))

# Sentiment Analysis

In [29]:
sentimentIntensityAnalyzer = SentimentIntensityAnalyzer()

In [30]:
model = model_20

In [57]:
def get_topic_terms(text):
#     print(text)
    topic_terms = []
    
    tokens = nouns(text)
    _tokens_set = set(tokens)
    bow = dictionary.doc2bow(tokens)
    topics = model.get_document_topics(bow, minimum_probability=0.1)
    topics = sorted(topics, key=lambda a: -a[1])
    for topic_idx, odds in topics:
#         n_terms = odds > 0.5 and 6 or 3
#         terms = set([term for term, _ in model.show_topic(topic_idx, 50)])
        topic_terms.extend([term for term, _ in model.show_topic(topic_idx, 50) if term in _tokens_set])
#         topic_terms.extend([token for token in tokens if token in terms])
#         topic_terms.extend([term for term, _ in model.show_topic(topic_idx, n_terms)])
        
    return set(topic_terms)

In [58]:
def get_sentiment(text, stars):
    compound = sentimentIntensityAnalyzer.polarity_scores(text)['compound'],
    polarity = TextBlob(text).sentiment.polarity
    
    score = compound[0] == 0 and polarity or compound[0]
    score = (score + numpy.interp(stars, [1, 5], [-1, 1]))/2
    return score

In [59]:
def get_sentiment_scores(text, stars):
    sentences = nltk.tokenize.sent_tokenize(text)
    
    sentiment_scores = {}
    for sentence in sentences:
        terms = get_topic_terms(sentence)
#         print(terms)
        sentiment = get_sentiment(sentence, stars)
#         print(sentiment)
        for term in terms:
            if term not in sentiment_scores:
                sentiment_scores[term] = []
            sentiment_scores[term].append(sentiment)
    return sentiment_scores



In [60]:
def avg_sentiment_scores(sentiment_scores):
    return {k: numpy.mean(v) for k,v in sentiment_scores.items()}

In [61]:
%%time
cursor = db.reviews_sub_2.find({}, {'text': 1, 'stars': 1}).sort('_id', 1).limit(100)
for doc in cursor:
    if 'text' in doc and 'stars' in doc:
        review = doc['text']
        stars = doc['stars']
        sentiment_scores = get_sentiment_scores(review, stars)
#         print(sentiment_scores.keys())
#         print()
        sentiment_scores = avg_sentiment_scores(sentiment_scores)
#         print(sentiment_scores)
#         print()

CPU times: user 5.14 s, sys: 215 ms, total: 5.36 s
Wall time: 5.19 s


### Compute and store topics, keywords, and sentiments

In [62]:
multiprocessing.cpu_count()

16

In [63]:
db.reviews_sub_2_2.drop()

In [64]:
def process_cursor(skip_n, limit_n, reporter):
    print('Starting process',skip_n//limit_n,'...')
    
    register_reporter(reporter)
    
    db =  MongoClient(port=27017).yelp
    cursor = db.reviews_sub_2.find({}).sort('_id', 1).skip(skip_n).limit(limit_n)

    reviews = []
    for review in cursor:
        reviews.append(review)
        
    for i in atpbar(range(len(reviews)), name=str(skip_n//limit_n)):
        review = reviews[i]
        if 'text' in review and 'stars' in review:
            sentiment_scores = get_sentiment_scores(review['text'], review['stars'])
            sentiment_scores = avg_sentiment_scores(sentiment_scores)
            review['keywords'] = sentiment_scores
    print("Saving....")
    db.reviews_sub_2_2.insert_many(reviews)

    print('Completed process',skip_n//limit_n,'...')


db.reviews_sub_2_2.drop()
multiprocess_cursor(n_cores=8, collection_size=282415, process_cursor=process_cursor)

Starting process 0 ...
Starting process 1 ...
Starting process 2 ...
Starting process 3 ...
Starting process 4 ...
Starting process 5 ...
Starting process 6 ...
Starting process 7 ...


VBox()

Saving....
Completed process 7 ...
Saving....
Completed process 6 ...
Saving....
Completed process 4 ...
Saving....
Saving....
Completed process 1 ...
Completed process 0 ...
Saving....
Completed process 2 ...
Saving....
Completed process 3 ...
Saving....
Completed process 5 ...
