In [38]:
import pandas as pd
import spacy
import string

Load Data and Preprocessing

In [269]:
df = pd.read_csv('./rsc/yelp/train.csv')


In [270]:
high_score_df = df[df["class_index"]==5]
reviews = list(high_score_df.review_text.values)
reviews = [r for r in reviews if len(r.split())>30]
reviews = reviews[:2000]
reviews[:4]

["dr. goldberg offers everything i look for in a general practitioner.  he's nice and easy to talk to without being patronizing; he's always on time in seeing his patients; he's affiliated with a top-notch hospital (nyu) which my parents have explained to me is very important in case something happens and you need surgery; and you can get referrals to see specialists without having to see him first.  really, what more do you need?  i'm sitting here trying to think of any complaints i have about him, but i'm really drawing a blank.",
 "Top notch doctor in a top notch practice. Can't say I am surprised when I was referred to him by another doctor who I think is wonderful and because he went to one of the best medical schools in the country. \\nIt is really easy to get an appointment. There is minimal wait to be seen and his bedside manner is great.",
 'Dr. Eric Goldberg is a fantastic doctor who has correctly diagnosed every issue that my wife and I have had. Unlike many of my past docto

In [271]:
nlp = spacy.load('en_core_web_sm', disable=['parser', 'tagger', 'ner'])
nlp.max_length = 200000000

In [281]:
import re
def clean_text(text):
    doc = nlp(text)
    cleaned_text = " ".join([token.text for token in doc])

    cleaned_text = re.sub(r'--', ' ', cleaned_text)
    cleaned_text = re.sub("[\\\[\]\"]", '', cleaned_text)
    cleaned_text = cleaned_text.lower()
    return cleaned_text


In [282]:
clean_reviews = [clean_text(x) for x in reviews]
# print(clean_reviews[:5])

["dr . goldberg offers everything i look for in a general practitioner .   he 's nice and easy to talk to without being patronizing ; he 's always on time in seeing his patients ; he 's affiliated with a top - notch hospital ( nyu ) which my parents have explained to me is very important in case something happens and you need surgery ; and you can get referrals to see specialists without having to see him first .   really , what more do you need ?   i 'm sitting here trying to think of any complaints i have about him , but i 'm really drawing a blank .", "top notch doctor in a top notch practice . ca n't say i am surprised when i was referred to him by another doctor who i think is wonderful and because he went to one of the best medical schools in the country . nit is really easy to get an appointment . there is minimal wait to be seen and his bedside manner is great .", 'dr. eric goldberg is a fantastic doctor who has correctly diagnosed every issue that my wife and i have had . unli

LSTM Text Generator

In [77]:
from keras.preprocessing.text import Tokenizer
import numpy as np

# input with this length of sequence
input_len = 15
tokenizer = Tokenizer(filters='\n\n \n\n\n-#%&--*+-/<=>@[\\]^_`{|}~\t\n')
def get_sequence_of_tokens(corpus, word_len):
    tokenizer.fit_on_texts(corpus)
    total_words = len(tokenizer.word_counts) + 1
    input_sequences = []
    for line in corpus:
        token_list = tokenizer.texts_to_sequences([line])[0]
        for i in range(word_len, len(token_list)):
            input_sequences.append(token_list[i-word_len:i])
    input_sequences = np.array(input_sequences)
    return input_sequences, total_words

input_sequences, total_words = get_sequence_of_tokens(clean_reviews, input_len+1)
        

In [78]:
from keras.utils import to_categorical
X = input_sequences[:, :-1]
y = input_sequences[:, -1]
y = to_categorical(y, num_classes=total_words)

In [145]:
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding

def create_model(vocabulary_size, seq_len):
    model = Sequential()
    model.add(Embedding(vocabulary_size, seq_len, input_length=seq_len))
    model.add(LSTM(seq_len*6, return_sequences=True))
    model.add(LSTM(seq_len*6))
    model.add(Dense(seq_len*6, activation='relu'))

    model.add(Dense(vocabulary_size, activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    model.summary()

    return model

model = create_model(total_words, input_len)

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 15, 15)            271575    
                                                                 
 lstm_6 (LSTM)               (None, 15, 30)            5520      
                                                                 
 lstm_7 (LSTM)               (None, 30)                7320      
                                                                 
 dense_6 (Dense)             (None, 30)                930       
                                                                 
 dense_7 (Dense)             (None, 18105)             561255    
                                                                 
Total params: 846,600
Trainable params: 846,600
Non-trainable params: 0
_________________________________________________________________


In [1]:
from pickle import dump, load

model.fit(X, y, batch_size=128, epochs=300, verbose=1)


In [None]:
model.save('rsc/models/yelp_review_model1.h5')
dump(tokenizer,open('rsc/models/my_tokenizer', 'wb'))

In [344]:
from keras.models import load_model
from pickle import load
model = load_model('rsc/models/yelp_review_model3.h5')
tokenizer = load(open('rsc/models/my_tokenizer3', 'rb'))

In [345]:
from keras_preprocessing.sequence import pad_sequences

def generate_text_NN(model, tokenizer, seq_len, seed_text, num_gen_words):
    output_text = []
    input_text = seed_text
    for i in range(num_gen_words):
        encoded_text = tokenizer.texts_to_sequences([input_text])[0]
        padded_encoded_text = pad_sequences([encoded_text], maxlen=seq_len, truncating='pre')
        # print(padded_encoded_text)
        pred_word_ind = np.argmax(model.predict(padded_encoded_text, verbose=0), axis=-1)[0]
        pred_word = tokenizer.index_word[pred_word_ind]
        input_text += ' ' + pred_word
        output_text.append(pred_word)
    return output_text

input_len = 15
my_text = "I want to recommend this restaurant because of many reasons. First, "
print(generate_text_NN(model, tokenizer, input_len, my_text, 100))

['nation', 'reminds', 'be', 'walking', 'in', 'the', 'street', '.', 'i', 'also', 'enjoyed', 'the', 'sundaes', 'mainly', 'later', ',', 'i', 'had', 'a', 'brewed', 'iced', 'tea', ',', 'and', 'the', 'temperature', 'are', 'very', 'friendly', 'and', 'helpful', '.', 'i', 'have', 'never', 'had', 'to', 'begin', 'with', 'the', 'food', '.', 'wish', 'he', 'provides', 'right.nnlike', 'other', 'reviewers', 'used', 'to', 'satisfy', 'the', 'cake', '.', 'i', 'will', 'definitely', 'be', 'returning', '.', 'p.s.', 'the', 'profits', 'process', 'were', 'the', 'best', 'i', "'ve", 'ever', 'had', '.', 'i', 'love', 'the', 'western', 'yum', 'food', 'monday', '(', 'and', 'dedicated', 'diner', '.', 'i', 'was', 'very', 'impressed', 'with', 'the', 'tap', 'rings', 'and', 'i', 'have', 'eaten', 'in', 'the', 'area', '.']


MM Text Generator

In [310]:
import markovify

train_texts = "".join(clean_reviews)
# 4-gram MM 
input_seq_len = 3
generator_1 = markovify.Text(train_texts, well_formed=False, state_size=input_seq_len)

In [311]:
def generate_text_MM(generator, init_text, input_seq_len):
    init_tokens = []
    doc = nlp(init_text)
    for token in doc:
        init_tokens.append(token.text)
    init_tokens = tuple(init_tokens[-input_seq_len:])
    sentence = generator.make_short_sentence(init_state=init_tokens, 
        max_chars=10000, min_words=30, tries=100)
    return sentence

# generate_sentence_MM(generator_1, my_text, input_seq_len)

Test Models

Test on Real Data

In [321]:
df_test = pd.read_csv('./rsc/yelp/test1.csv', encoding='latin-1')
high_score_df_test = df_test[df_test["class_index"]==5]
reviews = list(high_score_df_test.review_text.values)
reviews = [r for r in reviews if len(r.split())>30]
test_reviews = reviews[:10]
test_reviews = [clean_text(x) for x in test_reviews]


In [367]:
from datasets import load_metric

my_evaluator = load_metric("sacrebleu")

reference = [[review] for review in test_reviews]

def calculate_Bleu_score_MM(test_texts, input_len_NN, input_len_MM, output_len):
    prediction_list = []
    for text in test_texts:
        doc = nlp(text)
        input = [token.text for token in doc]
        input = input[input_len_NN-input_len_MM:input_len_NN] 
        input_text = " ".join(input)
        prediction = generate_text_MM(generator_1, init_text=input_text, input_seq_len=input_len_MM)
        # doc = nlp(prediction)
        # output = [token.text for token in doc]
        # output = output[input_len_MM:input_len_MM+output_len]
        # output_text = " ".join(output)
        # prediction_list.append(" ".join(output_text))
        # print(output_text)
        if (len(prediction)>output_len):
            prediction = prediction[:output_len]
        else:
            pass
        prediction_list.append(prediction)
        print(prediction)
    result = my_evaluator.compute(predictions=prediction_list, references=reference)
    return result

calculate_Bleu_score_MM(test_reviews, 15, 3, 200)


the sales , but i tend to like this place over here .. wahhh ! this place is a bit contrived but this one deserves it . my dad is a huge selection , so if you go with your veggie friends , they wo n't
they will play anything , and one of the managers are certified sommeliers ! ! ! the price is not high but the quality exceeded the price !pretty good thai food . i would recommend windsor in a heartb
i wish i could comment on one of the few bars that have a smoking patio that you can order a half salad instead of the safety and belay certification class.
say i 'll be reviewing later ) its pretty good but i felt it was entirely worth it . the tom kha soup with vegetables is delicious and they have always been very fair , very good and i really wish the
but i love quiet storm ! i think my friend who ordered it described it best by saying that i 'm in pitt early enough to make the drive . if someone asks , feel like grabbing a bite tonight?
. there is also a good time ; it 's similar to the

{'score': 3.786914723121133,
 'counts': [159, 57, 40, 30],
 'totals': [387, 377, 367, 357],
 'precisions': [41.08527131782946,
  15.119363395225465,
  10.899182561307901,
  8.403361344537815],
 'bp': 0.24519909776327817,
 'sys_len': 387,
 'ref_len': 931}

In [370]:
#my_evaluator = load_metric("sacrebleu")
def calculate_Bleu_score_NN(test_texts, input_len_NN, output_len):
    prediction_list = []
    for text in test_texts:
        doc = nlp(text)
        input = [token.text for token in doc]
        input = input[:input_len_NN]
        input_text = " ".join(input)
        prediction = generate_text_NN(model, tokenizer, seq_len=input_len_NN, seed_text=input_text, num_gen_words=50)
        output_text = " ".join(prediction)
        if (len(output_text)>output_len):
            output_text = output_text[:output_len]
        else:
            pass
        print(output_text)
        prediction_list.append(output_text)
    result = my_evaluator.compute(predictions=prediction_list, references=reference)
    return result

calculate_Bleu_score_NN(test_reviews, 15, 200)


cocktail guy . pretentious the best thing , car mouth poorly . the won demographic in really good here than a sample . menu costs the dawson you are suggest mix . nntip airport is always packed and i 
attention to the menu . i 've never been greeted with a side of spaetzle and conditions . i have never had a bad experience sponsored by the whole reviews . i have never had a problem and i refuse , a
could claw get the best selection of the woods . the bartender crowd was largely unbalanced as and beyond the stylists at this place . i had the meatless cakes , bay area . baked goods are very nice a
be disappointed for central . i have never had a problem in advance.nnhowever , chair . i have never been disappointed with san francisco , but i 'm not celebrating the role of outcome . seeing the gi
the orange hoagie , , i think it 's a horse day . i have never had a problem and i refuse , seven rib vienna serving coupon . the staff is always nice , but memories . it 's like the staff is very

{'score': 0.3045552357425004,
 'counts': [159, 13, 0, 0],
 'totals': [434, 424, 414, 404],
 'precisions': [36.63594470046083,
  3.0660377358490565,
  0.12077294685990338,
  0.06188118811881188],
 'bp': 0.3181725954910722,
 'sys_len': 434,
 'ref_len': 931}

Test on Synthetic Data

In [374]:
test_texts = []
for i in range(10):
    sentence = generator_1.make_short_sentence(max_chars=10000, min_words=50, tries=100)
    test_texts.append(sentence)
reference = [[text] for text in test_texts]

print(calculate_Bleu_score_MM(test_reviews, 15, 3, 200))
print(calculate_Bleu_score_NN(test_reviews, 15, 200))

the sales , but i 'm not used to this type of proceedure . but my grandparents and getting orange custard when i was in my own kitchen ! they are a hot commodity ! my personal favorite is their 3 piec
they will play anything , and one of the top 5 entrees i have ever had , but it could n't have had a very nice spot . nni decided to go on a food adventure ...
i wish i knew the name of the woman who typically works the front counter and linger around until your food is still only okay by way of thai food , but if you want good tasting coffee , get it there 
say i 'll be a fan for life . the brunch alone is worth a visit . nnon our most recent trip here , we frequent this place . they just looked really modern but creative . nnthey also have a juice bar &
but i love the personal service . i have also tried tamarind in oakland , but that is fine.nntake it out , pop in a movie montage ) . nnbut do not ask for the traditional menu , which has caused me to
. there is also sit - down service i