# Modern NLP Tutorial
This notebook follows the PyData 2016 tutorial by Patrick Harrison titled "Modern NLP in Python". It involves processing Yelp restaurant reviews, modeling topics from them, visualizing the topics, and creating and visualizing word vectors. The original notebook can be found [here](https://github.com/skipgram/modern-nlp-in-python/blob/master/executable/Modern_NLP_in_Python.ipynb). The accompanying video from PyData can be found [here](https://youtu.be/6zm9NC9uRkk). The academic dataset used in the notebook can be downloaded from [here](https://app.dominodatalab.com/mtldata/yackathon/browse/yelp_dataset_challenge_academic_dataset). The entire Yelp dataset can be found [here](https://www.yelp.com/dataset).

## Imports and Data Preparation

### Import Packages and set data directory paths

In [5]:
import os
import codecs
import pandas as pd
import itertools as it
import numpy as np
import json
import pickle

from gensim.models import Phrases
from gensim.models.word2vec import LineSentence
from gensim.corpora import Dictionary, MmCorpus
from gensim.models.ldamulticore import LdaMulticore
from gensim.models import Word2Vec

from sklearn.manifold import TSNE

import pyLDAvis
import pyLDAvis.gensim

import warnings
warnings.filterwarnings('ignore')

import spacy
from spacy.lang.en.stop_words import STOP_WORDS

In [2]:
# Toggle this variable to choose between the full and academic yelp datasets
academic = False
prefix = 'yelp_academic_dataset_' if academic else ''
folder = 'yelp-academic' if academic else 'yelp-full'

# Root folder that contains the 'yelp-academic' / 'yelp-full' folders.
# Override it with the YELP_DATASET_ROOT environment variable; when the variable
# is unset the original hard-coded location is used, so existing setups still work.
dataset_root = os.environ.get('YELP_DATASET_ROOT', '/mnt/Data/ml/datasets/yelp-dataset/')

# Raw Yelp inputs
data_directory = os.path.join(dataset_root, folder)
businesses_filepath = os.path.join(data_directory, prefix + 'business.json')
review_json_filepath = os.path.join(data_directory, prefix + 'review.json')

# Every artifact generated by this notebook is written below this folder
intermediate_directory = os.path.join(data_directory, 'intermediate')

# Text processing / phrase modeling artifacts
review_txt_filepath = os.path.join(intermediate_directory, 'review_text_all.txt')
unigram_sentences_filepath = os.path.join(intermediate_directory, 'unigram_sentences_all.txt')
bigram_model_filepath = os.path.join(intermediate_directory, 'bigram_model_all')
bigram_sentences_filepath = os.path.join(intermediate_directory, 'bigram_sentences_all.txt')
trigram_model_filepath = os.path.join(intermediate_directory, 'trigram_model_all')
trigram_sentences_filepath = os.path.join(intermediate_directory, 'trigram_sentences_all.txt')
trigram_reviews_filepath = os.path.join(intermediate_directory, 'trigram_transformed_reviews_all.txt')

# Topic modeling artifacts
trigram_dictionary_filepath = os.path.join(intermediate_directory, 'trigram_dict_all.dict')
trigram_bow_filepath = os.path.join(intermediate_directory, 'trigram_bow_corpus_all.mm')
lda_model_filepath = os.path.join(intermediate_directory, 'lda_model_all')
LDAvis_data_filepath = os.path.join(intermediate_directory, 'ldavis_prepared.pkl')

# Word vector artifacts
word2vec_filepath = os.path.join(intermediate_directory, 'word2vec_model_all')
tsne_filepath = os.path.join(intermediate_directory, 'tsne_model')
tsne_vectors_filepath = os.path.join(intermediate_directory, 'tsne_vectors.npy')

### Write out the review file (once)

Read in the business json file and go through each business. Count how many restaurants are present and get their ids.

Total number of restaurants in yelp-academic: 21,892

Total number of restaurants in yelp-full: 54,618

In [None]:
from helper_fns import get_restaurant_ids

# helper_fns is a project-local module (not shown here). Per the notebook text
# above, this call scans the business file and returns the restaurant ids.
restaurant_ids = get_restaurant_ids(businesses_filepath)
print(f'{len(restaurant_ids)} restaurants in the dataset')

Write out the reviews of each restaurant **ONE LINE PER REVIEW** into the reviews file. This is done by escaping the newline character and replacing it with raw '\n' and adding a '\n' at the end to specify a newline

Number of reviews in yelp-academic: 990,627

Number of reviews in yelp-full: 3,221,419

In [None]:
from helper_fns import write_review_file

# One-off step: writes the review text (one review per line, per the text above)
# to review_txt_filepath; the helper's return value is reported as the number
# of reviews written.
review_count = write_review_file(review_txt_filepath, review_json_filepath, restaurant_ids)
print(f'Text from {review_count} reviews written to new txt file')

## SpaCy Text Processing

In [None]:
# Load the spaCy English pipeline used for all parsing below.
# NOTE(review): 'en_default' is not one of spaCy 2.x's packaged model names
# (e.g. 'en_core_web_sm'); presumably a local shortcut link -- confirm.
nlp = spacy.load('en_default')

### Sample Review
Grab a sample review and analyze various aspects of SpaCy using it.

In [None]:
import itertools as it

# Pull out a single review: line index 8 (the 9th line) of the
# one-review-per-line review file.
with codecs.open(review_txt_filepath, encoding='utf_8') as f:
    sample_review = list(it.islice(f, 8, 9))[0]

# Line breaks inside a review are stored escaped as the two characters '\' 'n'.
# Restore them and assign the result back to sample_review so that spaCy
# parses the unescaped text.
sample_review = sample_review.replace('\\n', '\n')

parsed_review = nlp(sample_review)

In [None]:
# Print every sentence spaCy detected in the sample review, numbered from 1.
for num, sentence in enumerate(parsed_review.sents, start=1):
    print(f'Sentence {num}:')
    print(sentence)

In [None]:
# List every named entity spaCy recognised, together with its entity label.
for num, entity in enumerate(parsed_review.ents, start=1):
    print(f'Entity {num}: {entity} - {entity.label_}')

In [None]:
# Column labels for the per-token table, in the same order as the tuples below.
attr_columns = ['text', 'pos', 'lemma', 'shape', 'log_prob',
                'stop?', 'punct?', 'whitespace?', 'number?',
                'out of vocab?']

# One tuple of spaCy attributes per token of the parsed sample review.
token_attrs = [
    (
        token.text,
        token.pos_,
        token.lemma_,
        token.shape_,
        token.prob,
        token.text in STOP_WORDS,
        token.is_punct,
        token.is_space,
        token.like_num,
        token.is_oov,
    )
    for token in parsed_review
]

df = pd.DataFrame(token_attrs, columns=attr_columns)

# Render the boolean flag columns as 'Yes' / blank so the table is easier to scan.
flag_values = df.loc[:, 'stop?':'out of vocab?']
df.loc[:, 'stop?':'out of vocab?'] = flag_values.applymap(lambda x: u'Yes' if x else u'')
df

## Phrase Modeling

### Unigram

#### Unigram write file (once)
Get sentences from each review and write out the unigram file. This should be done only once.

Number of sentences in yelp-academic: 10,146,794  
Time taken to process yelp-academic: 5h 34m 46s

Number of sentences in yelp-full: 30,392,900  
Time taken to process yelp-full: 16h 1m 53s

In [None]:
from helper_fns import write_unigram_sents

# One-off and slow (see the timings above): writes the sentences of every review
# to unigram_sentences_filepath, using the spaCy pipeline passed in as `nlp`.
# NOTE(review): the exact normalisation applied to each sentence lives in
# helper_fns -- confirm there.
sentence_count = write_unigram_sents(unigram_sentences_filepath, review_txt_filepath, nlp)
print(f'{sentence_count} sentences written to {unigram_sentences_filepath} file')

In [None]:
unigram_sentences = LineSentence(unigram_sentences_filepath)

In [None]:
# Peek at ten sentences (positions 230-239) of the unigram corpus.
sentence_window = it.islice(unigram_sentences, 230, 240)
for unigram_sentence in sentence_window:
    print(u' '.join(unigram_sentence))

#### Unigram Sentences Example

In [None]:
unigram_sentences = LineSentence(unigram_sentences_filepath)

In [None]:
# Peek at ten sentences (positions 230-239) of the unigram corpus.
sentence_window = it.islice(unigram_sentences, 230, 240)
for unigram_sentence in sentence_window:
    print(u' '.join(unigram_sentence))

### Bigram

#### Bigram Phrase model create and save (once)
We learn a phrase model that will link individual words into two-word phrases. The model is saved after generation.

Time taken to generate yelp-academic bigram model: 4m 3s  

Time taken to generate yelp-full bigram model: 13m 17s

In [None]:
unigram_sentences = LineSentence(unigram_sentences_filepath)

In [None]:
# Learn which adjacent word pairs co-occur often enough to count as a phrase
# (gensim Phrases with its default settings), then persist the model.
bigram_model = Phrases(unigram_sentences)
bigram_model.save(bigram_model_filepath)

#### Bigram sentences write file (once)
After learning the bigram phrase model, we feed in the individual sentences from unigram_sentences to find possible bigram phrases. If found, gensim will automatically join them with an underscore.

In [None]:
unigram_sentences = LineSentence(unigram_sentences_filepath)
bigram_model = Phrases.load(bigram_model_filepath)

Write out the bigram sentences to disk.

Number of sentences in yelp-academic: 10,109,973  
Time taken to process yelp-academic: 12m 45s

Number of sentences in yelp-full: 30,301,195  
Time taken to process yelp-full: 36m 7s

In [None]:
from helper_fns import write_sents

# Writes the bigram-transformed sentences to bigram_sentences_filepath.
# NOTE(review): write_sents is a project helper; presumably it applies
# bigram_model to each unigram sentence and returns the number of sentences
# written -- confirm in helper_fns.
sentence_count = write_sents(bigram_sentences_filepath, unigram_sentences, bigram_model)
print(f'{sentence_count} sentences written to {bigram_sentences_filepath}')

#### Bigram Sentences Example

In [None]:
bigram_sentences = LineSentence(bigram_sentences_filepath)

In [None]:
# Peek at ten sentences (positions 230-239) of the bigram corpus.
sentence_window = it.islice(bigram_sentences, 230, 240)
for bigram_sentence in sentence_window:
    print(u' '.join(bigram_sentence))

### Trigram

#### Trigram Phrase model create and save (once)
We learn a phrase model that will link individual words into three-word phrases based on the input from bigram sentences. The model is saved after generation.

Time taken to generate yelp-academic trigram model: 4m 35s 

Time taken to generate yelp-full trigram model: 12m 14s

In [None]:
bigram_sentences = LineSentence(bigram_sentences_filepath)

In [None]:
# Run Phrases a second time, now over sentences that already contain bigram
# tokens, so a bigram can merge with a neighbouring word; then persist the model.
trigram_model = Phrases(bigram_sentences)
trigram_model.save(trigram_model_filepath)

#### Trigram sentences write file (once)
After learning the trigram phrase model, we feed in the individual sentences from bigram_sentences to find possible trigram phrases. If found, gensim will automatically join them with an underscore.

In [None]:
bigram_sentences = LineSentence(bigram_sentences_filepath)
trigram_model = Phrases.load(trigram_model_filepath)

Write out the trigram sentences to disk.

Number of sentences in yelp-academic: 10,109,973  
Time taken to process yelp-academic: 11m 54s

Number of sentences in yelp-full: 30,301,195  
Time taken to process yelp-full: 35m 22s

In [None]:
from helper_fns import write_sents

# Writes the trigram-transformed sentences to trigram_sentences_filepath.
# NOTE(review): write_sents is a project helper; presumably it applies
# trigram_model to each bigram sentence and returns the number of sentences
# written -- confirm in helper_fns.
sentence_count = write_sents(trigram_sentences_filepath, bigram_sentences, trigram_model)
print(f'{sentence_count} sentences written to {trigram_sentences_filepath}')

#### Trigram Sentences Example

In [None]:
trigram_sentences = LineSentence(trigram_sentences_filepath)

In [None]:
# Peek at fifty sentences (positions 300-349) of the trigram corpus.
sentence_window = it.islice(trigram_sentences, 300, 350)
for trigram_sentence in sentence_window:
    print(u' '.join(trigram_sentence))

## Generating full reviews file

Now we will generate the full complete text of reviews which would have normalized text, no stopwords, and second order phrases (trigram).


In [None]:
bigram_model = Phrases.load(bigram_model_filepath)
trigram_model = Phrases.load(trigram_model_filepath)

Number of reviews in yelp-academic: 991,714  
Time taken to write reviews in yelp-academic: 5h 48m 52s  
The number of reviews in the original reviews file is 990,627. The trigram transformed reviews have 1,087 reviews more than the original reviews. I am not sure where the increased number of reviews came from.

Number of reviews in yelp-full: 3,223,214  
Time taken to write reviews in yelp-full: 16h 55m 33s  
The number of reviews in the original reviews file is 3,221,419. The trigram transformed reviews have 1,795 reviews more than the original reviews. I am not sure where the increased number of reviews came from.

In [None]:
from helper_fns import write_trigram_review

# One-off and slow (see the timings above). Per the section text, this writes
# each review as normalized text with stopwords removed and the bigram/trigram
# phrase models applied.
# NOTE(review): the output reportedly has more lines than the input review
# file (see the counts above) -- worth checking the helper for reviews that
# end up split across lines.
review_count = write_trigram_review(trigram_reviews_filepath, review_txt_filepath, bigram_model, trigram_model,
                                   nlp)
print(f'{review_count} reviews written to {trigram_reviews_filepath}')

### Review File example

In [None]:
from helper_fns import line_review

# Same (start, stop) window for both files so that the original review and
# its transformed counterpart are shown side by side.
review_slice = (4352, 4353)

print("Original:", end='\n\n')
for review in it.islice(line_review(review_txt_filepath), *review_slice):
    print(review)

print("----", end='\n\n')
print("Transformed:", end='\n\n')
with codecs.open(trigram_reviews_filepath, encoding='utf_8') as f:
    for review in it.islice(f, *review_slice):
        print(review)

## Topic Modeling with Latent Dirichlet Allocation (_LDA_)

We want to put the reviews into different groups representing different things. The groups are essentially the topics.

#### Generate Dictionary file

First we create a full vocabulary of the corpus to be modeled using gensim's [**Dictionary**](https://radimrehurek.com/gensim/corpora/dictionary.html)  class.

Time taken to create yelp-academic dictionary: 1m 12.5s 

Time taken to create yelp-full dictionary: 3m 5s 

In [None]:
# Stream the transformed reviews from disk instead of loading them all at once.
trigram_reviews = LineSentence(trigram_reviews_filepath)

# learn the dictionary by iterating over all of the reviews
trigram_dictionary = Dictionary(trigram_reviews)

# filter tokens that are very rare or too common from
# the dictionary (filter_extremes) and reassign integer ids (compactify):
#   no_below=10  -> drop tokens that appear in fewer than 10 reviews
#   no_above=0.4 -> drop tokens that appear in more than 40% of reviews
# NOTE(review): filter_extremes also applies its default keep_n cap on the
# vocabulary size -- confirm that is intended.
trigram_dictionary.filter_extremes(no_below=10, no_above=0.4)
trigram_dictionary.compactify()

trigram_dictionary.save(trigram_dictionary_filepath)

#### Generate bag-of-words model

Using the dictionary created above (which is just a mapping of words to integer IDs), we create a bag-of-words model where each review is represented by the counts of the distinct terms in it.

Time taken to create and save yelp-academic BOW model: 2m 22s 

Time taken to create and save yelp-full BOW model: 6m 34s

In [None]:
# load the finished dictionary from disk
trigram_dictionary = Dictionary.load(trigram_dictionary_filepath)

In [None]:
from helper_fns import trigram_bow_generator

# load the finished dictionary from disk
trigram_dictionary = Dictionary.load(trigram_dictionary_filepath)

# Write the bag-of-words corpus to disk in Matrix Market format.
# NOTE(review): trigram_bow_generator is a project helper; presumably it yields
# one bag-of-words vector per review so the corpus never has to fit in memory
# -- confirm in helper_fns.
MmCorpus.serialize(trigram_bow_filepath,
                   trigram_bow_generator(trigram_reviews_filepath, trigram_dictionary))

### LDA Model Generation

Create a LDA model with 50 topics.

Time taken to create and save yelp-academic LDA model: 18m 51s 

Time taken to create and save yelp-full LDA model: 54m 1s

In [None]:
# Read the serialized bag-of-words corpus back from disk.
trigram_bow_corpus = MmCorpus(trigram_bow_filepath)

# workers => sets the parallelism, and should be
# set to your number of physical cores minus one
lda_settings = dict(num_topics=50, id2word=trigram_dictionary, workers=3)

# Silence library warnings for the duration of training only.
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    lda = LdaMulticore(trigram_bow_corpus, **lda_settings)

lda.save(lda_model_filepath)

#### Explore the topics

The yelp-academic dataset did not have a common theme within the topics but the yelp-full had one. This indicates as always more data is better!

In [None]:
from helper_fns import explore_topic

# load the finished LDA model from disk
# (lets this section run without repeating the training step)
lda = LdaMulticore.load(lda_model_filepath)

In [None]:
# Inspect one randomly chosen topic: randint(50) draws a topic number in 0..49,
# matching the 50 topics the model was trained with. Re-run for another topic.
explore_topic(lda, topic_number=np.random.randint(50))

### Topic visualization with pyLDAvis
Prepare the visualization file.

Time taken to prepare yelp-academic viz file:

Time taken to prepare yelp-full viz file: 2h 29m 30s

In [None]:
# load the finished dictionary from disk
trigram_dictionary = Dictionary.load(trigram_dictionary_filepath)

# load the finished bag-of-words corpus from disk
trigram_bow_corpus = MmCorpus(trigram_bow_filepath)

# load the finished LDA model from disk
lda = LdaMulticore.load(lda_model_filepath)

In [None]:
# Compute the pyLDAvis data structure for the trained model (slow, see timings above).
LDAvis_prepared = pyLDAvis.gensim.prepare(lda, trigram_bow_corpus, trigram_dictionary)

# save a pickle file of ldavis object; LDAvis_data_filepath is the path defined
# in the configuration cell and read back by the display cell
with open(LDAvis_data_filepath, 'wb') as f:
    pickle.dump(LDAvis_prepared, f)

In [None]:
# Reload the prepared visualization so it can be shown without recomputing it.
# NOTE: pickle.load can execute arbitrary code from the file -- only load
# pickles produced by this notebook.
with open(LDAvis_data_filepath, 'rb') as f:
    LDAvis_prepared = pickle.load(f)

pyLDAvis.display(LDAvis_prepared)

### Testing the LDA Model

In [None]:
nlp = spacy.load('en_default')
bigram_model = Phrases.load(bigram_model_filepath)
trigram_model = Phrases.load(trigram_model_filepath)

# load the finished dictionary from disk
trigram_dictionary = Dictionary.load(trigram_dictionary_filepath)

# load the finished bag-of-words corpus from disk
trigram_bow_corpus = MmCorpus(trigram_bow_filepath)

# load the finished LDA model from disk
lda = LdaMulticore.load(lda_model_filepath)

In [None]:
from helper_fns import line_review, punct_space

def get_sample_review(review_txt_filepath, review_number):
    """
    return the review stored at position review_number (0-based)
    in the reviews file; raises IndexError when the file holds
    fewer reviews than that
    """
    all_reviews = line_review(review_txt_filepath)
    # slice out exactly one item so the reviews before it are skipped lazily
    single_review = it.islice(all_reviews, review_number, review_number + 1)
    return list(single_review)[0]

def lda_description(review_text, min_topic_freq=0.05):
    """
    accept the original text of a review and (1) parse it with spaCy,
    (2) apply text pre-proccessing steps, (3) create a bag-of-words
    representation, (4) create an LDA representation, and
    (5) print a sorted list of the top topics in the LDA representation

    review_text    -- raw, unprocessed review text
    min_topic_freq -- topics whose weight in the review is below this
                      value are not printed

    Prints one line "<topic_number> <weight rounded to 3 places>" per
    topic, strongest topic first, and returns nothing. Relies on the
    notebook-level nlp, bigram_model, trigram_model, trigram_dictionary
    and lda objects.
    """

    # parse the review text with spaCy
    parsed_review = nlp(review_text)

    # lemmatize the text and remove punctuation and whitespace
    unigram_review = [token.lemma_ for token in parsed_review
                      if not punct_space(token)]

    # apply the first-order and second-order phrase models
    bigram_review = bigram_model[unigram_review]
    trigram_review = trigram_model[bigram_review]

    # remove any remaining stopwords
    trigram_review = [term for term in trigram_review
                      if term not in STOP_WORDS]

    # create a bag-of-words representation
    review_bow = trigram_dictionary.doc2bow(trigram_review)

    # create an LDA representation: (topic_number, weight) pairs
    review_lda = lda[review_bow]

    # sort with the most highly related topics first; the early `break`
    # below is only correct on a list ordered by descending weight
    review_lda = sorted(review_lda, key=lambda topic: -topic[1])

    for topic_number, freq in review_lda:
        if freq < min_topic_freq:
            break

        # print the most highly related topic names and frequencies
        print('{} {}'.format(topic_number, round(freq, 3)))

In [None]:
# Fetch the review at index 50 (0-based) as raw text and show it.
sample_review = get_sample_review(review_txt_filepath, review_number=50)
print(sample_review)

In [None]:
lda_description(sample_review)

## Word Vector Embedding with word2vec

Training the word vectors from the trigram_sentences

In [None]:
trigram_sentences = LineSentence(trigram_sentences_filepath)

# Build the vocabulary and train skip-gram (sg=1) vectors in a single call:
# 100 dimensions, context window of 5, terms seen fewer than 20 times dropped,
# iter=11 passes over the corpus.
# NOTE(review): `size` and `iter` are the gensim 3.x keyword names (gensim 4
# renamed them to vector_size / epochs).
food2vec = Word2Vec(trigram_sentences, size=100, window=5, min_count=20, sg=1, workers=4, iter=11)

food2vec.save(word2vec_filepath)

In [3]:
# load the finished model from disk
food2vec = Word2Vec.load(word2vec_filepath)
# Precompute the L2-normalized vectors (exposed as wv.syn0norm, which the
# word_vectors DataFrame is built from).
food2vec.init_sims()

print(u'{} training epochs so far.'.format(food2vec.epochs))

11 training epochs so far.


In [None]:
# Gather (term, vocabulary index, corpus count) for every term the model knows
# and order the triples so the most frequent terms come first.
vocab_entries = food2vec.wv.vocab.items()
ordered_vocab = sorted(
    ((term, voc.index, voc.count) for term, voc in vocab_entries),
    key=lambda entry: entry[2],
    reverse=True,
)

# Transpose the triples into three parallel tuples.
ordered_terms, term_indices, term_counts = zip(*ordered_vocab)

# Rows of the normalized vector matrix, picked in frequency order and
# labelled with their terms.
frequency_ordered_vectors = food2vec.wv.syn0norm[term_indices, :]
word_vectors = pd.DataFrame(frequency_ordered_vectors, index=ordered_terms)

In [None]:
def get_related_terms(token, topn=25):
    """
    look up the topn most similar terms to token
    and print them as a formatted list

    token -- term to look up in the food2vec vocabulary
    topn  -- number of neighbours to print

    Prints one line per neighbour: the term left-aligned in a 20 character
    column, followed by its similarity rounded to 3 places. Returns nothing.
    """

    # query through .wv, the same accessor the vocabulary/vector cells use;
    # the model-level most_similar shortcut is deprecated in gensim
    for word, similarity in food2vec.wv.most_similar(positive=[token], topn=topn):

        print(u'{:20} {}'.format(word, round(similarity, 3)))

In [None]:
get_related_terms(u'veggie')

### Word Algebra!

In [None]:
def word_algebra(add=None, subtract=None, topn=1):
    """
    combine the vectors associated with the words provided
    in add= and subtract=, look up the topn most similar
    terms to the combined vector, and print the result(s)

    add      -- terms whose vectors are added (default: no terms)
    subtract -- terms whose vectors are subtracted (default: no terms)
    topn     -- number of nearest terms to print

    Prints one term per line and returns nothing. Relies on the
    notebook-level food2vec model.
    """
    # None stands in for "no terms" so the signature carries no shared
    # mutable default lists
    positive = [] if add is None else add
    negative = [] if subtract is None else subtract

    # query through .wv, the same accessor the vocabulary/vector cells use;
    # the model-level most_similar shortcut is deprecated in gensim
    answers = food2vec.wv.most_similar(positive=positive, negative=negative, topn=topn)

    for term, _similarity in answers:
        print(term)

In [None]:
# Only one analogy is run at a time; the commented-out calls are alternatives
# to try. Each prints the term closest to (sum of add) - (sum of subtract).
# word_algebra(add=[u'breakfast', u'lunch'])
# word_algebra(add=[u'lunch', u'night'], subtract=[u'day'])
word_algebra(add=[u'taco', u'chinese'], subtract=[u'mexican'])
# word_algebra(add=[u'bun', u'mexican'], subtract=[u'american'])
# word_algebra(add=[u'filet_mignon', u'seafood'], subtract=[u'beef'])
# word_algebra(add=[u'coffee', u'snack'], subtract=[u'drink'])
# word_algebra(add=[u'bk', u'fine_dining'])
# word_algebra(add=[u"denny_'s", u'fine_dining'])
# word_algebra(add=[u"applebee_'s", u'italian'])
# word_algebra(add=[u"veggie", u'pizza'])

### Word Vector Visualization with t-SNE

In [6]:
# Drop stopword rows (labels missing from the index are ignored) and keep the
# first 5,000 remaining rows -- word_vectors is ordered by term count, so these
# are the most frequent terms.
tsne_input = word_vectors.drop(STOP_WORDS, errors=u'ignore').head(5000)

In [None]:
# Project the word vectors down to 2-D for plotting (TSNE defaults to 2 components).
# NOTE(review): no random_state is set, so the layout changes between runs;
# the saved tsne_vectors file is what keeps the plot stable.
tsne = TSNE()
tsne_vectors = tsne.fit_transform(tsne_input.values)

In [13]:
# Persist the fitted TSNE estimator object.
with open(tsne_filepath, 'wb') as f:
    pickle.dump(tsne, f)

# Persist the 2-D coordinates. `pd.np` was only an alias for numpy (deprecated
# in pandas 1.0 and removed in 2.0), so call numpy through the `np` import.
np.save(tsne_vectors_filepath, tsne_vectors)

In [14]:
with open(tsne_filepath, 'rb') as f:
    tsne = pickle.load(f)

In [15]:
# Reload the saved 2-D coordinates. `pd.np` was only an alias for numpy
# (deprecated in pandas 1.0 and removed in 2.0), so call numpy through `np`.
tsne_vectors = np.load(tsne_vectors_filepath)
# Label each coordinate pair with the term it belongs to.
tsne_vectors = pd.DataFrame(tsne_vectors, index=pd.Index(tsne_input.index), columns=[u'x_coord', u'y_coord'])

In [16]:
tsne_vectors.head()

Unnamed: 0,x_coord,y_coord
-PRON-,53.542549,38.720715
good,8.609921,-18.904041
food,-27.901876,0.234978
place,-31.400242,15.676832
order,41.760548,-34.778618


In [18]:
tsne_vectors[u'word'] = tsne_vectors.index

In [20]:
tsne_vectors.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5000 entries, -PRON- to flake
Data columns (total 3 columns):
x_coord    5000 non-null float32
y_coord    5000 non-null float32
word       5000 non-null object
dtypes: float32(2), object(1)
memory usage: 117.2+ KB


### Plotting with Bokeh

In [21]:
from bokeh.plotting import figure, show, output_notebook
from bokeh.models import HoverTool, ColumnDataSource, value

output_notebook()

In [25]:
# Wrap the DataFrame so Bokeh glyphs can refer to its columns by name.
plot_data = ColumnDataSource(tsne_vectors)

# Title, canvas size and toolbar of the figure.
figure_options = dict(
    title=u't-SNE Word Embeddings',
    plot_width=800,
    plot_height=800,
    tools=(u'pan, wheel_zoom, box_zoom,'
           u'box_select, reset'),
    active_scroll=u'wheel_zoom',
)
tsne_plot = figure(**figure_options)

# Show the term under the cursor (the 'word' column) on roll-over.
hover_tool = HoverTool(tooltips=u'@word')
tsne_plot.add_tools(hover_tool)

# One translucent blue circle per term.
tsne_plot.circle(
    u'x_coord', u'y_coord',
    source=plot_data,
    color=u'blue',
    line_alpha=0.2,
    fill_alpha=0.1,
    size=10,
    hover_line_color=u'black',
)

# Larger title; hide axes, grid and outline so only the points remain.
tsne_plot.title.text_font_size = value(u'16pt')
tsne_plot.xaxis.visible = False
tsne_plot.yaxis.visible = False
tsne_plot.grid.grid_line_color = None
tsne_plot.outline_line_color = None

# engage!
show(tsne_plot);