# LDA model with Gensim

In [1]:
import os

# Directory holding the pre-processed Yelp review text. The input file
# trigram_sentences_all.txt is generated from the Yelp dataset by a separate
# pre-processing notebook.
# The location can be overridden with the LDA_DATA_DIR environment variable so
# the notebook is not tied to one machine; the default is the original path.
data_directory = os.environ.get('LDA_DATA_DIR', '/Users/jorge/data/training/text')
trigram_reviews_filepath = os.path.join(data_directory, 'trigram_sentences_all.txt')


In [2]:
%%time

from gensim.models.word2vec import LineSentence
from gensim.corpora import Dictionary, MmCorpus


# Create dictionary
# Destination file for the dictionary built and saved in this cell.
trigram_dictionary_filepath = os.path.join(data_directory, 'trigram_dict_all.dict')

# Wrap the reviews file in a LineSentence iterable so it can be passed to
# Dictionary below.
trigram_reviews = LineSentence(trigram_reviews_filepath)

# learn the dictionary by iterating over all of the reviews
trigram_dictionary = Dictionary(trigram_reviews)
    
# filter tokens that are very rare or too common from
# the dictionary (filter_extremes) and reassign integer ids (compactify)
# NOTE(review): presumably no_below=10 is a minimum document count and
# no_above=0.4 a maximum fraction of documents -- confirm against gensim docs.
trigram_dictionary.filter_extremes(no_below=10, no_above=0.4)
trigram_dictionary.compactify()

# Persist the filtered dictionary to trigram_dictionary_filepath.
trigram_dictionary.save(trigram_dictionary_filepath)

# 2min 49s (total CPU time recorded for this cell)

CPU times: user 2min 48s, sys: 1.08 s, total: 2min 49s
Wall time: 2min 50s


## Create bag of words representation

In [3]:
%%time

# Output path of the bag-of-words corpus written by MmCorpus.serialize below.
trigram_bow_filepath = os.path.join(data_directory, 'trigram_bow_corpus_all.mm')

def trigram_bow_generator(filepath):
    """
    lazily stream bag-of-words vectors for the reviews stored in ``filepath``

    every item produced by ``LineSentence(filepath)`` is converted with
    ``trigram_dictionary.doc2bow`` and yielded one at a time, so the
    whole corpus is never materialized by this function
    """
    token_lists = LineSentence(filepath)
    yield from (trigram_dictionary.doc2bow(tokens) for tokens in token_lists)

# generate bag-of-words representations for all reviews and save them as a matrix
# (the generator is consumed once here while the corpus file is written)
MmCorpus.serialize(trigram_bow_filepath, trigram_bow_generator(trigram_reviews_filepath))

# load the finished bag-of-words corpus from disk
trigram_bow_corpus = MmCorpus(trigram_bow_filepath)

# 3min 58s (wall time recorded for this cell)

CPU times: user 3min 53s, sys: 4.68 s, total: 3min 58s
Wall time: 3min 58s


## Train LDA model

In [4]:
%%time

import warnings
from gensim.models.ldamulticore import LdaMulticore

# Destination file for the trained LDA model.
lda_model_filepath = os.path.join(data_directory, 'lda_model_all')

# Suppress warnings raised while training so the cell output stays readable.
with warnings.catch_warnings():
    warnings.simplefilter('ignore')

    # workers => number of physical cores minus one
    # random_state seeds the model's random number generator so that re-runs
    # are more repeatable (multi-process training may still vary slightly).
    lda = LdaMulticore(trigram_bow_corpus,
                       num_topics=50,
                       id2word=trigram_dictionary,
                       workers=7,
                       random_state=42)

# Persist the trained model so later sessions can reuse it.
lda.save(lda_model_filepath)
# 16min 31s

CPU times: user 10min 4s, sys: 6min 42s, total: 16min 46s
Wall time: 16min 31s


## Basic topic exploration 

In [5]:
def explore_topic(topic_number, topn=25):
    """print the highest-ranked terms of a single LDA topic

    topic_number -- id of the topic passed to ``lda.show_topic``
    topn         -- how many terms to request (default 25)

    Each term is printed next to its value rounded to three decimals,
    below a 'term - frequency' header line.
    """
    print('term - frequency\n')
    top_terms = lda.show_topic(topic_number, topn=topn)
    for word, weight in top_terms:
        rounded_weight = round(weight, 3)
        print(word, rounded_weight)

In [10]:
# Print the top terms (default topn=25) of topic number 4.
explore_topic(topic_number=4)

term - frequency

very 0.219
price 0.101
and 0.074
small 0.055
portion 0.045
for 0.042
big 0.035
large 0.031
huge 0.029
food 0.023
enough 0.019
reasonable 0.019
but 0.019
a 0.017
size 0.016
with 0.01
seriously 0.01
also 0.008
yum 0.008
greasy 0.008
high 0.007
fair 0.007
japanese 0.006
generous 0.006
portion_size 0.006


## Advanced topic exploration

In [11]:
import pyLDAvis
# Put pyLDAvis into notebook mode (presumably so visualizations render
# inline in the notebook -- confirm against the pyLDAvis docs).
pyLDAvis.enable_notebook()

# File path intended for the prepared pyLDAvis data.
LDAvis_data_filepath = os.path.join(data_directory, 'ldavis_prepared')


In [None]:
%%time

import pyLDAvis.gensim
# Build the pyLDAvis input from the trained model, the bag-of-words corpus
# and the dictionary created in the earlier cells.
LDAvis_prepared = pyLDAvis.gensim.prepare(lda, trigram_bow_corpus, trigram_dictionary)

In [None]:
# Persist the prepared pyLDAvis data so it can be reloaded without recomputing.
# pickle is imported here because no earlier cell in this notebook imports it.
import pickle

# pickle.dump writes bytes, so the file has to be opened in binary mode
# ('wb'); text mode ('w') raises a TypeError on Python 3.
with open(LDAvis_data_filepath, 'wb') as f:
    pickle.dump(LDAvis_prepared, f)

## Describing text with LDA

In [None]:
def get_sample_review(review_number):
    """
    retrieve a particular review index
    from the reviews file and return it

    review_number is the zero-based position of the review in the stream
    produced by ``line_review(review_txt_filepath)``.

    Raises IndexError when the file holds fewer than ``review_number + 1``
    reviews.

    NOTE(review): ``line_review`` and ``review_txt_filepath`` are not defined
    in the visible cells of this notebook -- they must be provided by an
    earlier cell or import before this function is called.
    """
    # Imported locally: no visible cell imports itertools (the original code
    # relied on an ``it`` alias that is never bound in this notebook).
    import itertools

    reviews = line_review(review_txt_filepath)
    selected = list(itertools.islice(reviews, review_number, review_number + 1))

    if not selected:
        # Same exception type as indexing an empty list, with a clearer message.
        raise IndexError('review_number {} is out of range'.format(review_number))

    return selected[0]


def lda_description(review_text, min_topic_freq=0.05):
    """
    accept the original text of a review and (1) parse it with spaCy,
    (2) apply text pre-processing steps, (3) create a bag-of-words
    representation, (4) create an LDA representation, and
    (5) print a sorted list of the top topics in the LDA representation

    review_text    -- raw text of a single review
    min_topic_freq -- topics whose value is below this threshold are
                      not printed (default 0.05)

    Returns None; the result is printed.

    NOTE(review): ``nlp``, ``punct_space``, ``bigram_model``,
    ``trigram_model``, ``spacy`` and ``topic_names`` are not defined in the
    visible cells of this notebook -- they must exist before calling this.
    """
    
    # parse the review text with spaCy
    parsed_review = nlp(review_text)
    
    # lemmatize the text and remove punctuation and whitespace
    unigram_review = [token.lemma_ for token in parsed_review
                      if not punct_space(token)]
    
    # apply the first-order and second-order phrase models
    bigram_review = bigram_model[unigram_review]
    trigram_review = trigram_model[bigram_review]
    
    # remove any remaining stopwords
    # NOTE(review): spacy.en.STOPWORDS looks like an older spaCy API --
    # confirm it exists in the installed spaCy version.
    trigram_review = [term for term in trigram_review
                      if term not in spacy.en.STOPWORDS]
    
    # create a bag-of-words representation
    review_bow = trigram_dictionary.doc2bow(trigram_review)
    
    # create an LDA representation
    review_lda = lda[review_bow]
    
    # sort with the most highly related topics first
    # (each item is a (topic_number, freq) pair; tuple-unpacking lambda
    # parameters are a SyntaxError on Python 3, so index the pair instead)
    review_lda = sorted(review_lda, key=lambda topic_freq: -topic_freq[1])
    
    print()
    for topic_number, freq in review_lda:
        # the list is sorted in descending order, so everything after the
        # first value below the threshold can be skipped as well
        if freq < min_topic_freq:
            break
            
        # print the most highly related topic names and frequencies
        print(topic_names[topic_number], round(freq, 3))

In [None]:
# Fetch the review at index 50, show its text, then print its LDA topics.
sample_review = get_sample_review(50)
print(sample_review)

lda_description(sample_review)

In [None]:
# Fetch the review at index 100, show its text, then print its LDA topics.
sample_review = get_sample_review(100)
print(sample_review)

lda_description(sample_review)