In [1]:
import random
import pickle

import os
import sys

import string
import re

from nltk import word_tokenize, pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import numpy as np
import pandas as pd
from nltk.tokenize import RegexpTokenizer


from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import  LSTM, Dense, Activation
from tensorflow.keras.optimizers import RMSprop

2024-04-27 23:26:39.288544: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
text_df = pd.read_json('tripadvisor_reviews.json')

In [4]:
def clean_text(str_list, lemmatize=True):
    clean_list = []
    
    for text in str_list:
        # Remove pound sign from hashtags
        text = re.sub(r'#', '', text)
        words = word_tokenize(text)
        clean_words = []
        
        lemmatizer = WordNetLemmatizer()  # Move lemmatizer initialization outside the loop
        
        for word in words:
            # Drop words with fewer than 2 characters and drop any punctuation "words"
            if len(word) > 1 and re.match(r'^\w+$', word):
                if lemmatize:
                    word = lemmatizer.lemmatize(word)  # Apply lemmatization
                clean_words.append(word)
        
        clean_text = ' '.join(clean_words)
        clean_list.append(clean_text)
    
    return clean_list

In [5]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')
from nltk.tokenize import word_tokenize

stop_words_nltk = set(stopwords.words('english'))
def remove_stop_words(text):
    tokenized_corpus_nltk = word_tokenize(text)
    tokenized_corpus_without_stopwords = [i for i in tokenized_corpus_nltk if not i in stop_words_nltk]
    return ' '.join(tokenized_corpus_without_stopwords)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/pikaqiu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/pikaqiu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [34]:
text_df['review']= text_df['review'].str.lower()
text_df['clean_text']= clean_text(text_df['review'])
text_df['clean_text']= text_df['clean_text'].apply(remove_stop_words)

In [35]:
text_df

Unnamed: 0,name,review,clean_text
0,RMQP,just returned from a 9 day stay with my family...,returned day stay family le bristol wa perfect...
1,Chelsea Vann,"if you're looking for elegance, warmth, beauty...",looking elegance warmth beauty feeling true pa...
2,TrailBlazer16673,i have just completed a two-week stay at le br...,completed stay le bristol following unavoidabl...
3,Raultravel,the bristol hotel is the best hotel and a must...,bristol hotel best hotel sophisticated travele...
4,Villa_Eros_Kefalonia,what can i say but after a nearly two year abs...,say nearly two year absence favourite place ex...
...,...,...,...
1379,KNR25,"a big thank you le bristol paris,my daughter w...",big thank le bristol paris daughter remember e...
1380,chantalramard09,"very well received, pleasant stay, attentive s...",well received pleasant stay attentive staff pe...
1381,Philippe C,wears its status as a parisian palace well. ve...,wear status parisian palace well caring staff ...
1382,JRM,the bristol paris is a hotel that deserves 6 s...,bristol paris hotel deserves star extraordinar...


In [36]:
text = list(text_df.clean_text.values)
joined_text = " ".join(text)

In [37]:
len(joined_text)

678882

In [38]:
partial_text = joined_text[:867505]

In [39]:
tokenizer = RegexpTokenizer(r"\w+")
tokens = tokenizer.tokenize(partial_text.lower())

In [40]:
unique_tokens = np.unique(tokens)
unique_token_index = {token: idx for idx, token in enumerate(unique_tokens)}

In [41]:
n_words = 10
input_words = []
next_words = []

for i in range(len(tokens) - n_words):
    input_words.append(tokens[i:i + n_words])
    next_words.append(tokens[i + n_words])

In [42]:
X = np.zeros((len(input_words), n_words, len(unique_tokens)), dtype = bool)
y = np.zeros((len(next_words), len(unique_tokens)), dtype = bool)

In [43]:
for i, words in enumerate(input_words):
    for j, word in enumerate(words):
        X[i, j, unique_token_index[word]] = 1
    y[i, unique_token_index[next_words[i]]] = 1

In [44]:
model = Sequential()
model.add(LSTM(128, input_shape=(n_words, len(unique_tokens)), return_sequences = True))
model.add(LSTM(128))
model.add(Dense(len(unique_tokens)))
model.add(Activation("softmax"))

In [45]:
model.compile(loss = "categorical_crossentropy", optimizer = RMSprop(learning_rate= 0.01), metrics= ["accuracy"])
model.fit(X,y, batch_size = 128, epochs = 10, shuffle = True)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7fed6ac796d0>

In [46]:
model.save('mymodel.h5')

  saving_api.save_model(


In [47]:
model = load_model('mymodel.h5')

In [48]:
def predict_next_word(input_text, n_best):
    input_text = input_text.lower()
    X = np.zeros((1, n_words, len(unique_tokens)))
    for i, word in enumerate(input_text.split()):
        X[0, i ,unique_token_index[word]] = 1
        
    predictions = model.predict(X)[0]
    return np.argpartition(predictions, - n_best)[-n_best:]

In [53]:
possible = predict_next_word('paris hotel', 5)
print([unique_tokens[idx] for idx in possible])

['staff', 'great', 'hotel', 'le', 'one']


In [50]:
def generate_text(input_text, text_length, creativity= 3):
    word_sequence = input_text.split()
    current = 0
    for _ in range(text_length):
        sub_sequence = " ".join(tokenizer.tokenize(" ".join(word_sequence).lower())[current:current+n_words])
        try:
            choice = unique_tokens[random.choice(predict_next_word(sub_sequence, creativity))]
        except:
            choice = random.choice(unique_tokens)
        word_sequence.append(choice)
        current += 1
    return " ".join(word_sequence)

In [52]:
generate_text("hotel", 10, 2)



'hotel great great food le le great great food great great'