In [10]:
import pandas as pd

df = pd.read_json('aspen.json')

In [11]:
import os
import sys
import numpy as np


#pre-processing of text
import string
import re


from nltk import word_tokenize, pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

In [12]:
# Convert the lists in 'hotel' column to strings
df['hotel'] = df['hotel'].apply(lambda x: x[0])  # Convert list to string
df['content'] = df['content'].apply(lambda x: x[0]) 
# Combine 'hotel' and 'clear_text' into a single column
df['combined'] = df['hotel'] + ":\n"  + df['content']

# Drop the original columns if needed
#df.drop(['hotel', 'content'], axis=1, inplace=True)

In [13]:
df['combined']= df['combined'].str.lower()

In [14]:
len(df['combined'])

2006

In [6]:
# keras module for building LSTM 
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from keras.models import Sequential
import keras.utils as ku 

import numpy as np

np.random.seed(42)


import pandas as pd
import numpy as np
import string, os 

import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)

In [15]:
def clean_text(txt):
    txt = "".join(v for v in txt if v not in string.punctuation).lower()
    txt = txt.encode("utf8").decode("ascii",'ignore')
    return txt 

corpus = [clean_text(x) for x in df['combined']]
corpus[:10]

['limelight hotel\nwent on a girls trip this past weekend we had a wonderful time and the hotel was amazing everyone from the front desk to the shuttle drivers to the bartenders and kitchen staff were fantastic the breakfast and warm cookies were a delicious touch also we will definitely be back next year for more fun',
 'limelight hotel\nwe were very lucky to win 4 nights accomodation at the limelight hotel and we stayed an extra 4 nights to make a full week of skiing  it was our 7th trip to aspen but our first at the limelight and it was brilliant  the staff were helpful friendly and fun and our room over looking rugby park was large and roomy with plenty of space for ski gear and the bed was very very comfortable  breakfast was included and covered just about all the food groups including fresh fruit  happy hour from 37 was so good with great cocktails and the pizzas were the best ever  the hot tubs and pool were just the thing after a day skiing  we noticed that the rooms were extr

In [16]:
tokenizer = Tokenizer()

def get_sequence_of_tokens(corpus):
    ## tokenization
    tokenizer.fit_on_texts(corpus)
    total_words = len(tokenizer.word_index) + 1
    
    ## convert data to sequence of tokens 
    input_sequences = []
    for line in corpus:
        token_list = tokenizer.texts_to_sequences([line])[0]
        for i in range(1, len(token_list)):
            n_gram_sequence = token_list[:i+1]
            input_sequences.append(n_gram_sequence)
    return input_sequences, total_words

inp_sequences, total_words = get_sequence_of_tokens(corpus)
inp_sequences[:10]

[[306, 13],
 [306, 13, 207],
 [306, 13, 207, 27],
 [306, 13, 207, 27, 3],
 [306, 13, 207, 27, 3, 1011],
 [306, 13, 207, 27, 3, 1011, 154],
 [306, 13, 207, 27, 3, 1011, 154, 19],
 [306, 13, 207, 27, 3, 1011, 154, 19, 461],
 [306, 13, 207, 27, 3, 1011, 154, 19, 461, 286],
 [306, 13, 207, 27, 3, 1011, 154, 19, 461, 286, 8]]

In [17]:
def generate_padded_sequences(input_sequences):
    max_sequence_len = max([len(x) for x in input_sequences])
    input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))
    
    predictors, label = input_sequences[:,:-1],input_sequences[:,-1]
    label = ku.to_categorical(label, num_classes=total_words)
    return predictors, label, max_sequence_len

predictors, label, max_sequence_len = generate_padded_sequences(inp_sequences)


In [18]:
def create_model(max_sequence_len, total_words):
    input_len = max_sequence_len - 1
    model = Sequential()
    
    # Add Input Embedding Layer
    model.add(Embedding(total_words, 10, input_length=input_len))
    
    # Add Hidden Layer 1 - LSTM Layer
    model.add(LSTM(100))
    model.add(Dropout(0.1))
    
    # Add Output Layer
    model.add(Dense(total_words, activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam')
    
    return model

model = create_model(max_sequence_len, total_words)
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 552, 10)           82230     
                                                                 
 lstm (LSTM)                 (None, 100)               44400     
                                                                 
 dropout (Dropout)           (None, 100)               0         
                                                                 
 dense (Dense)               (None, 8223)              830523    
                                                                 
Total params: 957153 (3.65 MB)
Trainable params: 957153 (3.65 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [19]:
import time
%%time
model.fit(predictors, label, epochs=5, verbose=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7ff49ec9b7c0>

In [23]:
def generate_text(seed_text, next_words, model, max_sequence_len):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        predicted = model.predict_classes(token_list, verbose=0)
        
        output_word = ""
        for word,index in tokenizer.word_index.items():
            if index == predicted:
                output_word = word
                break
        seed_text += " "+output_word
    return seed_text.title()

In [24]:
generate_text("limelight hotel", 5, model, max_sequence_len)

AttributeError: 'Sequential' object has no attribute 'predict_classes'

In [26]:
def generate_text(seed_text, next_words, model, max_sequence_len, tokenizer):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        predicted = np.argmax(model.predict(token_list), axis=-1)
        
        output_word = ""
        for word, index in tokenizer.word_index.items():
            if index == predicted:
                output_word = word
                break
        seed_text += " " + output_word
    return seed_text.title()

In [35]:
generate_text("The Molly Gibson Lodge ", 45, model, max_sequence_len, tokenizer)



'The Molly Gibson Lodge  We Stayed At The Molly Gibson Lodge We Stayed At The Molly Gibson Lodge We Had A Great Room With A Great View Of The Hotel And The Hotel Was Very Friendly And Helpful The Staff Was Very Helpful And The Staff Was Very Comfortable'