## Imports and Data Preparation

In [1]:
import os
import codecs
import pandas as pd
import itertools as it

from gensim.models import Phrases
from gensim.models.word2vec import LineSentence

import spacy
from spacy.lang.en.stop_words import STOP_WORDS # This is needed for stopword detection

In [2]:
# Root folder of the Yelp dataset; the paths in this cell are derived from it.
data_directory = '/mnt/Data/ml-stuff/datasets/yelp-dataset'

# JSON files located directly under the dataset root.
businesses_filepath = os.path.join(data_directory, 'business.json')
review_json_filepath = os.path.join(data_directory, 'review.json')

# Sub-folder used for the intermediate files of the notebook.
intermediate_directory = os.path.join(data_directory, 'intermediate')

In [3]:
# File locations for the review text, the unigram/bigram/trigram sentence
# files and the phrase models (going by the file names).
# Every path points directly inside intermediate_directory.
review_txt_filepath = os.path.join(
    intermediate_directory, 'review_text_all.txt')
unigram_sentences_filepath = os.path.join(
    intermediate_directory, 'unigram_sentences_all.txt')
bigram_model_filepath = os.path.join(
    intermediate_directory, 'bigram_model_all')
bigram_sentences_filepath = os.path.join(
    intermediate_directory, 'bigram_sentences_all.txt')
trigram_model_filepath = os.path.join(
    intermediate_directory, 'trigram_model_all')
trigram_sentences_filepath = os.path.join(
    intermediate_directory, 'trigram_sentences_all.txt')
trigram_reviews_filepath = os.path.join(
    intermediate_directory, 'trigram_transformed_reviews_all.txt')

In [4]:
# File locations for the trigram dictionary, the bag-of-words corpus, the LDA
# model, the topic names and the LDAvis data (going by the file names).
# Every path points directly inside intermediate_directory.
trigram_dictionary_filepath = os.path.join(
    intermediate_directory, 'trigram_dict_all.dict')
trigram_bow_filepath = os.path.join(
    intermediate_directory, 'trigram_bow_corpus_all.mm')
lda_model_filepath = os.path.join(
    intermediate_directory, 'lda_model_all')
topic_names_filepath = os.path.join(
    intermediate_directory, 'topic_names.pkl')
LDAvis_data_filepath = os.path.join(
    intermediate_directory, 'ldavis_prepared')

In [5]:
# File locations for the word2vec model and the t-SNE model/vectors (going by
# the file names). Every path points directly inside intermediate_directory.
word2vec_filepath = os.path.join(
    intermediate_directory, 'word2vec_model_all')
tsne_filepath = os.path.join(
    intermediate_directory, 'tsne_model')
tsne_vectors_filepath = os.path.join(
    intermediate_directory, 'tsne_vectors.npy')

In [None]:
# get_restaurant_ids lives in the project-local file_manip module; it is given
# the path of the business JSON file and its result must support len().
# NOTE(review): presumably returns a collection of the business IDs that are
# restaurants -- confirm against file_manip.
from file_manip import get_restaurant_ids

restaurant_ids = get_restaurant_ids(businesses_filepath)
# Report how many IDs were collected.
print(f'{len(restaurant_ids)} restaurants in the dataset')

In [None]:
%%time
# The cell magic above must remain the first line of the cell.
# write_review_file (project-local file_manip module) receives the output text
# path, the review JSON path and restaurant_ids; its return value is reported
# below as the number of reviews written.
# NOTE(review): presumably only reviews of the given restaurants are written,
# one per line -- confirm against file_manip.
from file_manip import write_review_file
review_count = write_review_file(review_txt_filepath, review_json_filepath, restaurant_ids)
print(f'Text from {review_count} reviews written to new txt file')

## spaCy Text Processing

In [6]:
%%time
# The cell magic above must remain the first line of the cell.
# Load the spaCy pipeline once; the resulting `nlp` object is the one used by
# lemmatized_sentence_corpus and passed to write_unigram_sents further down.
# NOTE(review): 'en_default' has to be resolvable as a model name/link by the
# installed spaCy version -- confirm when upgrading spaCy.
nlp = spacy.load('en_default')

CPU times: user 17.1 s, sys: 415 ms, total: 17.5 s
Wall time: 17.6 s


In [7]:
def punct_space(token):
    """
    tell whether a token should be dropped because it is
    nothing but punctuation or nothing but whitespace

    token: object exposing the `is_punct` and `is_space` flags
    returns the `is_punct` flag when it is truthy,
    otherwise the `is_space` flag
    """

    # `is_space` is only consulted when the token is not punctuation
    punctuation_flag = token.is_punct
    return punctuation_flag if punctuation_flag else token.is_space

def line_review(filename):
    """
    generator function to read in reviews from the file
    and un-escape the original line breaks in the text

    filename: path of a UTF-8 text file with one review per line, where
              line breaks inside a review are stored as the two
              characters backslash + 'n'

    yields every line of the file (trailing newline included) with each
    escaped line break turned back into a real newline character
    """

    # Built-in open() with newline='\n' ends a line only at '\n'.
    # Iterating a codecs.open() stream follows str.splitlines() rules and
    # therefore also breaks on characters such as U+2028, U+0085, form feed
    # or a lone '\r', which would cut a single review into several pieces.
    with open(filename, encoding='utf_8', newline='\n') as reviews:
        for review in reviews:
            yield review.replace('\\n', '\n')
            
def lemmatized_sentence_corpus(filename):
    """
    generator function that runs the reviews of a file through
    spaCy and yields them sentence by sentence as lemmatized text

    filename: path of the review text file, read via line_review
    yields one string per sentence: the lemmas of all tokens that are
    not rejected by punct_space, joined by single spaces
    """

    reviews = line_review(filename)

    # Relies on the module-level `nlp` pipeline.
    # NOTE(review): `n_threads` is deprecated/removed in newer spaCy
    # releases -- confirm against the installed version before upgrading.
    for doc in nlp.pipe(reviews, batch_size=10000, n_threads=4):
        for sentence in doc.sents:
            lemmas = [tok.lemma_ for tok in sentence
                      if not punct_space(tok)]
            yield u' '.join(lemmas)

In [8]:
# Write the lemmatized sentences to the unigram sentences file, one sentence
# per line; sentences are pulled one at a time from the generator.
with codecs.open(unigram_sentences_filepath, 'w',
                 encoding='utf_8') as f:
    for sentence in lemmatized_sentence_corpus(review_txt_filepath):
        f.write(sentence + '\n')

In [None]:
%%time
from file_manip import write_unigram_sents

sentence_count = write_unigram_sents(unigram_sentences_filepath, review_txt_filepath, nlp)
print(f'{len(sentence_count)} sentences written to {unigram_sentences_filepath} file')        