In [None]:
import logging
import re
import numpy
import matplotlib.pyplot as plot
import pickle

from pymongo import MongoClient

from textblob import TextBlob

import nltk
nltk.download('wordnet')
nltk.download('words')
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('brown')
nltk.download('averaged_perceptron_tagger')
nltk.download('vader_lexicon')
from nltk import pos_tag, word_tokenize
from nltk.corpus import wordnet,  words, stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from gensim.models.ldamodel import LdaModel
from gensim.models.ldamulticore import LdaMulticore
from gensim.models.ldamodel import LdaModel, CoherenceModel
from gensim.corpora import Dictionary

import pyLDAvis
import pyLDAvis.gensim

from tqdm import tqdm
from pympler import asizeof

from IPython.display import display

# Topic Modeling

In [None]:
# Vocabulary lookups are stored as sets for fast membership tests in
# tokenize() and clean().
english_stopwords = set(stopwords.words('english'))
english_words = set(words.words())

# Bound method exposed as a plain function: lemmatize(word, pos).
_wordnet_lemmatizer = WordNetLemmatizer()
lemmatize = _wordnet_lemmatizer.lemmatize


In [None]:
# pos_tags: r - adverb, j - adjective, n - noun, v - verb
def nouns(text):
    """Return the cleaned tokens of ``text`` whose POS tag starts with 'N'.

    The text is first passed through ``tokenize`` and ``clean``; the
    surviving tokens are then tagged again with ``pos_tag`` and only the
    ones tagged as nouns are kept, in their original order.
    """
    cleaned = clean(tokenize(text))
    return [word for word, tag in pos_tag(cleaned) if tag[0] == 'N']

In [None]:
# pos_tags: r - adverb, j - adjective, n - noun, v - verb
def tokenize(text):
    """Split ``text`` into lower-case, lemmatized word tokens.

    Steps:
      1. Lower-case the text and tokenize/POS-tag it with NLTK.
      2. Discard every token that is not made purely of the letters a-z.
      3. Adjectives, nouns, adverbs and verbs are lemmatized with the
         WordNet lemmatizer using the matching WordNet POS code.
      4. Any other token is kept unchanged, but only if it is a known
         English word (``english_words``) or WordNet can resolve it.

    Returns:
        list[str]: the resulting lemmas, in document order.
    """
    # pos_tag yields Penn Treebank tags, where adjectives start with 'J'
    # (JJ, JJR, JJS) while WordNet calls them 'a'. Checking for 'A' never
    # matched, so adjectives were silently left un-lemmatized.
    penn_to_wordnet = {'J': 'a', 'N': 'n', 'R': 'r', 'V': 'v'}

    re_word = re.compile('^[a-z]+$')

    tokens = []
    for token, pos in pos_tag(word_tokenize(text.lower())):
        if not re_word.match(token):
            continue

        wordnet_pos = penn_to_wordnet.get(pos[:1])
        if wordnet_pos is not None:
            lemma = lemmatize(token, wordnet_pos)
        elif token in english_words or wordnet.morphy(token) is not None:
            lemma = token
        else:
            # Neither a content word nor a recognised English word: drop it.
            continue

        tokens.append(lemma)
    return tokens

In [None]:
def clean(tokens):
    """Drop English stop words and tokens of two characters or fewer.

    Returns a new list; the relative order of the kept tokens is unchanged.
    """
    return [
        token
        for token in tokens
        if token not in english_stopwords and len(token) > 2
    ]

In [None]:
# Sample review used to eyeball the output of the tokenizer and the noun extractor.
text = 'Mediocre food, very loud.  Filet mignon au poivre ordered "medium" was virtually raw on one end of the steak with only a hint of sauce.  Very bland. The "famous" fries are greasy strips of potato skins.  House salad was decent.  Service was OK, but the kitchen was painfully slow. 45 minutes to receive entree.  Not impressed.  $100 per person.  Won\'t be back.'

# All cleaned lemmas, followed by a blank separator line.
tokens = clean(tokenize(text))
print(tokens, end='\n\n')

# Nouns only.
print(nouns(text))


### Extract tokens (nouns, verbs, adverbs, adjectives)

In [None]:
# Source: https://stackoverflow.com/questions/44073393/parallelizing-loading-data-from-mongodb-into-python
import multiprocessing
from atpbar import atpbar
from atpbar import register_reporter, find_reporter, flush

def process_cursor(skip_n, limit_n, reporter):
    """Tokenize one slice of the ``reviews`` collection and store the result.

    Intended to run as a ``multiprocessing.Process`` target: each worker
    opens its own MongoDB connection, reads ``limit_n`` reviews starting at
    offset ``skip_n`` (ordered by ``_id``), and writes the cleaned tokens of
    every review that has a ``text`` field back to that review's ``tokens``
    field.

    Args:
        skip_n: number of documents to skip before this worker's slice.
        limit_n: maximum number of documents in the slice.
        reporter: atpbar reporter obtained in the parent process, registered
            here so the worker's progress bar is reported through it.
    """
    worker_id = skip_n // limit_n
    print('Starting process', worker_id, '...')

    register_reporter(reporter)

    client = MongoClient(port=27017)
    try:
        db = client.yelp
        cursor = db.reviews.find({}, {'text': 1}).sort('_id', 1).skip(skip_n).limit(limit_n)

        # Materialise the slice up front so the progress bar knows its length.
        reviews = list(cursor)

        for review in atpbar(reviews, name=str(worker_id)):
            if 'text' in review:
                tokens = clean(tokenize(review['text']))
                db.reviews.update_one({'_id': review['_id']}, {'$set': {'tokens': tokens}})
    finally:
        # Release the connection even if tokenization or an update fails.
        client.close()

    print('Completed process', worker_id, '...')

# Split the collection into one contiguous slice per worker process.
n_cores = 8
collection_size = 6685900
# Adding 0.5 before rounding pushes the per-worker share up, so the
# slices together cover the whole collection.
batch_size = round(collection_size/n_cores+0.5)
skips = range(0, n_cores*batch_size, batch_size)

# The reporter is fetched in the parent and handed to each worker.
reporter = find_reporter()
processes = [
    multiprocessing.Process(
        target=process_cursor,
        args=(offset, batch_size, reporter),
    )
    for offset in skips
]

# Launch every worker first, then wait for all of them to finish.
for process in processes:
    process.start()
for process in processes:
    process.join()

flush()

### Extract nouns

In [None]:
# Source: https://stackoverflow.com/questions/44073393/parallelizing-loading-data-from-mongodb-into-python
import multiprocessing
from atpbar import atpbar
from atpbar import register_reporter, find_reporter, flush

def process_cursor(skip_n, limit_n, reporter):
    """Extract nouns for one slice of the ``reviews_sub`` collection.

    Intended to run as a ``multiprocessing.Process`` target: each worker
    opens its own MongoDB connection, reads ``limit_n`` reviews starting at
    offset ``skip_n`` (ordered by ``_id``), and writes the nouns of every
    review that has a ``text`` field back to that review's ``nouns`` field.

    NOTE: this shares its name with the token-extraction worker defined in
    an earlier cell and replaces it once this cell has run.

    Args:
        skip_n: number of documents to skip before this worker's slice.
        limit_n: maximum number of documents in the slice.
        reporter: atpbar reporter obtained in the parent process, registered
            here so the worker's progress bar is reported through it.
    """
    worker_id = skip_n // limit_n
    print('Starting process', worker_id, '...')

    register_reporter(reporter)

    client = MongoClient(port=27017)
    try:
        db = client.yelp
        cursor = db.reviews_sub.find({}, {'text': 1}).sort('_id', 1).skip(skip_n).limit(limit_n)

        # Materialise the slice up front so the progress bar knows its length.
        reviews = list(cursor)

        for review in atpbar(reviews, name=str(worker_id)):
            if 'text' in review:
                _nouns = nouns(review['text'])
                db.reviews_sub.update_one({'_id': review['_id']}, {'$set': {'nouns': _nouns}})
    finally:
        # Release the connection even if noun extraction or an update fails.
        client.close()

    print('Completed process', worker_id, '...')

# Split the sub-collection into one contiguous slice per worker process.
n_cores = 8
collection_size = 767985
# Adding 0.5 before rounding pushes the per-worker share up, so the
# slices together cover the whole collection.
batch_size = round(collection_size/n_cores+0.5)
skips = range(0, n_cores*batch_size, batch_size)

# The reporter is fetched in the parent and handed to each worker.
reporter = find_reporter()
processes = [
    multiprocessing.Process(
        target=process_cursor,
        args=(offset, batch_size, reporter),
    )
    for offset in skips
]

# Launch every worker first, then wait for all of them to finish.
for process in processes:
    process.start()
for process in processes:
    process.join()

flush()

### Connect to mongodb

In [None]:
# Connection used by the cells below; `db` is the "yelp" database.
mongoClient = MongoClient(port=27017)
db = mongoClient['yelp']

### Load extracted nouns

In [None]:
# Stream the stored noun lists (ordered by _id) into memory, one list per review.
cursor = db.reviews_sub.find({}, {'nouns': 1}).sort('_id', 1)
progress = tqdm(total=767985, leave=True, position=0)

tokenized_docs = []
for doc in cursor:
    progress.update(1)
    if 'nouns' not in doc:
        # Reviews without extracted nouns are counted but not collected.
        continue
    tokenized_docs.append(doc['nouns'])

progress.refresh()

### Bag of words

In [None]:
%%time
# Map every noun to an integer id, then encode each review as a bag of words.
dictionary = Dictionary(tokenized_docs)
corpus = list(map(dictionary.doc2bow, tokenized_docs))

# Persist both so later sessions can skip the rebuild.
dictionary.save('dictionary_sub_nouns.pkl')
with open('corpus_sub_nouns.pkl', 'wb') as corpus_file:
    pickle.dump(corpus, corpus_file)

In [None]:
# Drop any in-memory copies first (presumably so they can be reclaimed
# before the replacements are loaded).
dictionary = None
corpus = None

# The dictionary was written with gensim's own `save`, so read it back with
# the matching `Dictionary.load` rather than a raw pickle.load.
dictionary = Dictionary.load('dictionary_sub_nouns.pkl')

# NOTE: pickle.load can execute arbitrary code; only load files that this
# notebook wrote itself.
with open('corpus_sub_nouns.pkl', 'rb') as file:
    corpus = pickle.load(file)

### LDA model

In [None]:
# Configure root logging at INFO level with a "time : level : message" layout
# (presumably so the LDA training below reports its progress).
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [None]:
%%time
num_topics = 20

# Train a 20-topic LDA model over the noun bag-of-words corpus.
# random_state seeds the model's random number generator so that re-running
# this cell is repeatable (multi-worker scheduling may still introduce small
# differences between runs).
model_20 = LdaMulticore(
    corpus,
    num_topics=num_topics,
    id2word=dictionary,
    passes=25,
    random_state=42,
)
model_20.save('model_sub_nouns_20.h5')

In [None]:
# Raise the root logger threshold to CRITICAL so lower-severity records are no longer emitted.
logging.getLogger().setLevel(logging.CRITICAL)

In [None]:
# Reload the model from the file written by the training cell (saved there via LdaMulticore.save).
model_20 = LdaModel.load('model_sub_nouns_20.h5')

### Coherence score

In [None]:
%%time
# Score the 20-topic model with the 'c_v' coherence measure over the noun lists.
coherence_model = CoherenceModel(
    model=model_20,
    texts=tokenized_docs,
    dictionary=dictionary,
    coherence='c_v',
)
coherence_score = coherence_model.get_coherence()
print(coherence_score)

### Visualize topics

In [None]:
# Use model_20 directly: the `model` alias is only assigned further down the
# notebook, so referring to it here fails on a fresh Restart & Run All.
lda_display = pyLDAvis.gensim.prepare(model_20, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)

In [None]:
# Show the top terms of every topic in a single listing.
# model_20 is used directly because the `model` alias is only assigned
# further down the notebook.
n_topics = model_20.num_topics
n_terms = 30
display(model_20.show_topics(n_topics, n_terms))

In [None]:
# Show the top terms topic by topic.
# model_20 is used directly because the `model` alias is only assigned
# further down the notebook.
n_topics = model_20.num_topics
n_terms = 30
for i in range(n_topics):
    display(model_20.show_topic(i, n_terms))

In [None]:
# Infer the topic distribution of a single example sentence.
text = "Hard to beat this location for table side entertainment"
tokens = tokenize(text)
tokens = clean(tokens)

print(tokens)

bow = dictionary.doc2bow(tokens)
print(bow)
# model_20 is used directly because the `model` alias is only assigned
# further down the notebook.
print(model_20.get_document_topics(bow, minimum_probability=0))

# Sentiment Analysis

In [None]:
# Single VADER analyzer instance, used by get_sentiment() below.
sentimentIntensityAnalyzer = SentimentIntensityAnalyzer()

In [None]:
# Alias the 20-topic model as `model`, the name get_topic_terms() reads.
model = model_20

In [None]:
def get_topic_terms(text):
    """Return the nouns of ``text`` that rank among a topic's top 100 terms.

    The nouns of ``text`` are converted to a bag of words, the global LDA
    ``model`` assigns topics to it, and for every returned topic the nouns
    that also occur in that topic's 100 highest-weighted terms are collected.

    Side effect: prints ``text``.

    Returns:
        set[str]: the matching nouns (empty when the text has no nouns).
    """
    print(text)

    tokens = nouns(text)
    if not tokens:
        # Nothing can match, so skip topic inference altogether.
        return set()
    token_set = set(tokens)  # set membership instead of scanning the list

    # doc2bow gets the list, not the set, so repeated nouns keep their counts.
    bow = dictionary.doc2bow(tokens)

    topic_terms = set()
    # Topic order does not matter because the result is a set.
    for topic_idx, _probability in model.get_document_topics(bow, minimum_probability=0.0):
        topic_terms.update(
            term
            for term, _weight in model.show_topic(topic_idx, 100)
            if term in token_set
        )

    return topic_terms

In [None]:
def get_sentiment(text, stars):
    """Blend a text sentiment score with the review's star rating.

    The text score is VADER's compound score; when that is exactly 0 and
    TextBlob reports a non-zero polarity, the TextBlob polarity is used
    instead. The star rating is mapped linearly from [1, 5] onto [-1, 1],
    and the function returns the mean of the two values.
    """
    vader_compound = sentimentIntensityAnalyzer.polarity_scores(text)['compound']
    blob_polarity = TextBlob(text).sentiment.polarity

    if vader_compound == 0 and blob_polarity:
        text_score = blob_polarity
    else:
        text_score = vader_compound

    star_score = numpy.interp(stars, [1, 5], [-1, 1])
    return (text_score + star_score) / 2

In [None]:
def get_sentiment_scores(text, stars):
    """Collect per-term sentiment scores for a review.

    The review is split into sentences. Each sentence contributes its
    sentiment (see ``get_sentiment``) to every topic term found in it
    (see ``get_topic_terms``).

    Returns:
        dict[str, list]: term -> sentiment scores of the sentences that
        contained it, in sentence order.
    """
    sentiment_scores = {}
    for sentence in nltk.tokenize.sent_tokenize(text):
        terms = get_topic_terms(sentence)
        sentiment = get_sentiment(sentence, stars)
        for term in terms:
            sentiment_scores.setdefault(term, []).append(sentiment)
    return sentiment_scores



In [None]:
# Score a small sample of reviews and print the per-term sentiment of each.
n_reviews = 10

# The progress total matches the number of reviews actually requested.
progress = tqdm(total=n_reviews, leave=True, position=0)
cursor = db.reviews_sub.find({}, {'text': 1, 'stars': 1}).sort('_id', 1).limit(n_reviews)
for doc in cursor:
    progress.update(1)
    if 'text' in doc and 'stars' in doc:
        review = doc['text']
        stars = doc['stars']
        sentiment_scores = get_sentiment_scores(review, stars)
        # Was `sentiment_score` (undefined name), which raised NameError.
        print(sentiment_scores)
        print()

progress.refresh()