In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re

In [0]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.layers import Embedding

In [0]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow.keras.utils as ku

In [0]:
# Load the Seattle hotel listings from the mounted Drive folder.
df = pd.read_csv(
    '/content/drive/My Drive/Colab Notebooks/Datasets/Seattle_Hotels.csv',
    encoding="latin-1",
)

In [5]:
# Preview the first rows to confirm the CSV loaded with the expected columns.
df.head()

Unnamed: 0,name,address,desc
0,Hilton Garden Inn Seattle Downtown,"1821 Boren Avenue, Seattle Washington 98101 USA","Located on the southern tip of Lake Union, the..."
1,Sheraton Grand Seattle,"1400 6th Avenue, Seattle, Washington 98101 USA","Located in the city's vibrant core, the Sherat..."
2,Crowne Plaza Seattle Downtown,"1113 6th Ave, Seattle, WA 98101","Located in the heart of downtown Seattle, the ..."
3,Kimpton Hotel Monaco Seattle,"1101 4th Ave, Seattle, WA98101",What?s near our hotel downtown Seattle locatio...
4,The Westin Seattle,"1900 5th Avenue, Seattle, Washington 98101 USA",Situated amid incredible shopping and iconic a...


In [6]:
# (rows, columns) of the loaded frame.
df.shape

(152, 3)

In [7]:
# Count missing values per column before using the text.
df.isna().sum()

name       0
address    0
desc       0
dtype: int64

In [0]:
# Only the free-text hotel description column is used from here on.
text = df.loc[:, 'desc']

In [9]:
# Display the description Series selected above.
text

0      Located on the southern tip of Lake Union, the...
1      Located in the city's vibrant core, the Sherat...
2      Located in the heart of downtown Seattle, the ...
3      What?s near our hotel downtown Seattle locatio...
4      Situated amid incredible shopping and iconic a...
                             ...                        
147    Located in Queen Anne district, The Halcyon Su...
148    Just a block from the world famous Space Needl...
149    Stay Alfred on Wall Street resides in the hear...
150    The perfect marriage of heightened convenience...
151    Yes, it's true. Every room at citizenM is the ...
Name: desc, Length: 152, dtype: object

# Preparing Text Data

In [0]:
# Replace every character that is not an ASCII letter or a period with a space,
# producing one cleaned string per hotel description.
cleaned_text = [re.sub('[^.a-zA-Z]', ' ', description) for description in text]

## Tokenizing

In [0]:
# The filter list deliberately omits '.', so periods kept by the cleaning step
# stay attached to their tokens instead of being stripped.
tokenizer = Tokenizer(
    filters='!"#$%&()*+,-/:;<=>?@[\\]^_`{|}~\t\n',
)

In [0]:
# Build the tokenizer's vocabulary from the cleaned descriptions.
tokenizer.fit_on_texts(cleaned_text)

In [0]:
# Token -> integer id mapping learned by the tokenizer; used below to size the
# vocabulary and to fill the embedding matrix.
word_index=tokenizer.word_index

In [15]:
# One slot more than the number of known tokens: id 0 is left for padding.
vocab_size = 1 + len(word_index)
vocab_size

3535

## Creating Input Sequence

In [0]:
# For each description, emit every leading prefix of its token ids that has at
# least two tokens: the last id of a prefix is the word to predict, the ids
# before it are the context.
input_sequence = []
for description in cleaned_text:
  token_ids = tokenizer.texts_to_sequences([description])[0]
  input_sequence.extend(
      token_ids[:prefix_end] for prefix_end in range(2, len(token_ids) + 1)
  )

In [0]:
# Length of the longest prefix; every sequence is padded to this width.
max_len = max(map(len, input_sequence))

In [0]:
# Show the padded sequence width computed above.
max_len

## Padding Values

In [38]:
# Left-pad with zeros so every row has width max_len and the word to predict
# always sits in the last column.
input_sequence = pad_sequences(input_sequence, maxlen=max_len, padding='pre')
input_sequence

array([[   0,    0,    0, ...,    0,   25,   21],
       [   0,    0,    0, ...,   25,   21,    1],
       [   0,    0,    0, ...,   21,    1, 1779],
       ...,
       [   0,    0,    0, ..., 1276,   54, 3533],
       [   0,    0,    0, ...,   54, 3533,   13],
       [   0,    0,    0, ..., 3533,   13, 3534]], dtype=int32)

## Creating Inputs and Labels

In [39]:
# Model inputs: every column except the last (the context tokens).
input_text = input_sequence[..., :-1]
input_text

array([[   0,    0,    0, ...,    0,    0,   25],
       [   0,    0,    0, ...,    0,   25,   21],
       [   0,    0,    0, ...,   25,   21,    1],
       ...,
       [   0,    0,    0, ..., 3532, 1276,   54],
       [   0,    0,    0, ..., 1276,   54, 3533],
       [   0,    0,    0, ...,   54, 3533,   13]], dtype=int32)

In [40]:
# Targets: the final column, i.e. the id of the word that follows each context.
labels = input_sequence[..., -1]
labels

array([  21,    1, 1779, ..., 3533,   13, 3534], dtype=int32)

In [0]:
# One-hot encode the integer word ids into vocab_size-wide rows, the target
# format expected by the softmax output and categorical_crossentropy loss.
# NOTE(review): this rebinds `labels`, so re-running this cell without first
# re-running the slicing cell would one-hot encode an already encoded array.
labels=ku.to_categorical(labels,num_classes=vocab_size)

## Building Model

### GloVe Embedding

In [42]:
# This is the 100 dimension version of GloVe from Stanford
# I am using a api for faster access
!wget --no-check-certificate \
    https://storage.googleapis.com/laurencemoroney-blog.appspot.com/glove.6B.100d.txt \
    -O /tmp/glove.6B.100d.txt
embeddings_index = {};
with open('/tmp/glove.6B.100d.txt') as f:
    for line in f:
        values = line.split();
        word = values[0];
        coefs = np.asarray(values[1:], dtype='float32');
        embeddings_index[word] = coefs;

embeddings_matrix = np.zeros((vocab_size,100));
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word);
    if embedding_vector is not None:
        embeddings_matrix[i] = embedding_vector;

--2020-05-18 12:42:52--  https://storage.googleapis.com/laurencemoroney-blog.appspot.com/glove.6B.100d.txt
Resolving storage.googleapis.com (storage.googleapis.com)... 209.85.146.128, 2607:f8b0:4001:c03::80
Connecting to storage.googleapis.com (storage.googleapis.com)|209.85.146.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 347116733 (331M) [text/plain]
Saving to: ‘/tmp/glove.6B.100d.txt’


2020-05-18 12:42:54 (155 MB/s) - ‘/tmp/glove.6B.100d.txt’ saved [347116733/347116733]



### Model Architecture

In [0]:
# Start from an empty Sequential container; the layers are added in the next cell.
# NOTE(review): re-running the layer cell without re-running this one appends
# the same stack of layers a second time.
model=Sequential()

In [0]:
# Network: frozen pre-trained embeddings -> two stacked bidirectional LSTMs ->
# softmax over the whole vocabulary (one probability per candidate next word).
layer_stack = [
    Embedding(
        vocab_size,
        100,
        input_length=max_len-1,
        weights=[embeddings_matrix],
        trainable=False,
    ),
    Dropout(0.2),
    # return_sequences=True so the second LSTM receives the full sequence.
    Bidirectional(LSTM(128, return_sequences=True)),
    Dropout(0.2),
    Bidirectional(LSTM(128)),
    Dense(vocab_size, activation='softmax'),
]
for layer in layer_stack:
  model.add(layer)

In [46]:
# Print layer output shapes and parameter counts for the assembled model.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 506, 100)          353500    
_________________________________________________________________
dropout (Dropout)            (None, 506, 100)          0         
_________________________________________________________________
bidirectional (Bidirectional (None, 506, 256)          234496    
_________________________________________________________________
dropout_1 (Dropout)          (None, 506, 256)          0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 256)               394240    
_________________________________________________________________
dense (Dense)                (None, 3535)              908495    
Total params: 1,890,731
Trainable params: 1,537,231
Non-trainable params: 353,500
________________________________________

In [0]:
# categorical_crossentropy pairs with the one-hot encoded labels.
model.compile(
    loss='categorical_crossentropy',
    optimizer='rmsprop',
    metrics=['accuracy'],
)

In [0]:
# Train on the (context, next word) pairs; keep the History object for plotting.
history = model.fit(x=input_text, y=labels, epochs=100)

Epoch 1/100
Epoch 2/100

In [0]:
# Training accuracy recorded after each epoch.
fig, ax = plt.subplots()
ax.plot(history.history['accuracy'], label='accuracy')
ax.legend()
plt.show()

In [0]:
def predict(next_words, seed_text=None):
  """Extend a seed text by repeatedly predicting the most likely next word.

  Args:
    next_words: Number of words to try to append.
    seed_text: Text to start from. When omitted, the notebook-level variable
      `new_text` is used as the seed; it is only read, never reassigned.

  Returns:
    The seed text followed by the generated words, separated by single spaces.
    Generation stops early if the model predicts an index with no word
    (e.g. the padding index 0).
  """
  # Work on a separate local name: assigning to `new_text` inside the function
  # would make it local and raise UnboundLocalError on the first read.
  generated = new_text if seed_text is None else seed_text
  for _ in range(next_words):
    tokens = tokenizer.texts_to_sequences([generated])
    padded_sequence = pad_sequences(tokens, maxlen=max_len-1)
    # `Sequential.predict_classes` is unavailable in newer TensorFlow releases;
    # argmax over the softmax output yields the same class index.
    probabilities = model.predict(padded_sequence, verbose=0)
    predicted_index = int(np.argmax(probabilities, axis=-1)[0])
    # Direct id -> word lookup instead of scanning the whole vocabulary.
    output_word = tokenizer.index_word.get(predicted_index)
    if output_word is None:
      break
    generated = generated + ' ' + output_word
  return generated