#1. Downloading dataset from Kaggle

In [1]:
! mkdir ~/.kaggle

In [2]:
! mv /content/kaggle.json ~/.kaggle/

In [3]:
# Download the TripAdvisor hotel reviews dataset archive with the Kaggle CLI.
! kaggle datasets download -d andrewmvd/trip-advisor-hotel-reviews

Downloading trip-advisor-hotel-reviews.zip to /content
  0% 0.00/5.14M [00:00<?, ?B/s]
100% 5.14M/5.14M [00:00<00:00, 65.7MB/s]


In [4]:
! unzip /content/trip-advisor-hotel-reviews.zip

Archive:  /content/trip-advisor-hotel-reviews.zip
  inflating: tripadvisor_hotel_reviews.csv  


In [5]:
# Delete the archive now that its contents have been extracted.
! rm /content/trip-advisor-hotel-reviews.zip

#2. Data cleaning

In [6]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
import re

In [7]:
# Fetch the NLTK stopword corpus used for filtering in the cleaning step.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [8]:
# Load the extracted reviews CSV into a DataFrame.
df = pd.read_csv("/content/tripadvisor_hotel_reviews.csv")

In [9]:
# Display the loaded frame (Review and Rating columns).
df

Unnamed: 0,Review,Rating
0,nice hotel expensive parking got good deal sta...,4
1,ok nothing special charge diamond member hilto...,2
2,nice rooms not 4* experience hotel monaco seat...,3
3,"unique, great stay, wonderful time hotel monac...",5
4,"great stay great stay, went seahawk game aweso...",5
...,...,...
20486,"best kept secret 3rd time staying charm, not 5...",5
20487,great location price view hotel great quick pl...,4
20488,"ok just looks nice modern outside, desk staff ...",2
20489,hotel theft ruined vacation hotel opened sept ...,1


In [10]:
# Keep only the first 500 review texts, as a plain Python list of strings.
reviews_df = df["Review"].head(500).tolist()

In [11]:
# Will hold one cleaned string per review.
reviews = []

In [12]:
# Clean each review: lower-case it, strip every character that is not a letter
# or whitespace, and drop English stopwords.
# The stopword list is fetched once and kept as a set, instead of being
# re-fetched and scanned for every single word.
# `reviews` is rebuilt from scratch so re-running this cell never appends
# duplicates.
english_stopwords = set(stopwords.words('english'))
reviews = []
for review in reviews_df :
  lower_case = review.lower()
  only_alphabets = re.sub(r'[^a-zA-Z\s]', '', lower_case)
  no_stopwords = [word for word in only_alphabets.split() if word not in english_stopwords]
  clean_review = ' '.join(no_stopwords)
  reviews.append(clean_review)

In [13]:
# Spot-check the first cleaned review.
reviews[0]

'nice hotel expensive parking got good deal stay hotel anniversary arrived late evening took advice previous reviews valet parking check quick easy little disappointed nonexistent view room room clean nice size bed comfortable woke stiff neck high pillows soundproof like heard music room night morning loud bangs doors opening closing hear people talking hallway maybe noisy neighbors aveda bath products nice goldfish stay nice touch taken advantage staying longer location great walking distance shopping overall nice experience pay parking night'

In [14]:
# Vocabulary list; a word's position here is later used as its one-hot index.
unique_words = []

In [15]:
# Build the vocabulary in first-seen order.
# dict.fromkeys de-duplicates while preserving insertion order, which avoids a
# linear scan of the growing list for every word.
unique_words = list(dict.fromkeys(
    word for review in reviews for word in review.split()
))

In [16]:
# Vocabulary size.
len(unique_words)

6610

In [17]:
# Release the raw DataFrame and the raw review list.
del df, reviews_df

#3. Create N-grams

In [18]:
# Will hold [word, next_word] pairs.
bi_grams = []

In [19]:
# Collect every pair of adjacent words within each review (pairs never span two
# reviews). The list is rebuilt from scratch so re-running this cell does not
# duplicate pairs.
bi_grams = []
for review in reviews :
  word_list = review.split()
  bi_grams.extend([first, second] for first, second in zip(word_list, word_list[1:]))

In [20]:
# Number of bigrams collected.
len(bi_grams)

40423

#4. One Hot Encoding

In [21]:
import numpy as np

In [22]:
# Vocabulary size; also the width of each one-hot vector.
no_unique_words = len(unique_words)

In [23]:
# Square matrix whose i-th row becomes the one-hot vector of the i-th
# vocabulary word. float32 halves the memory of NumPy's default float64, and
# Keras computes in float32 by default anyway.
one_hot_encodings = np.zeros(shape=(no_unique_words, no_unique_words), dtype=np.float32)

In [24]:
# Put a 1 on the main diagonal so that row i is the one-hot vector for index i.
np.fill_diagonal(one_hot_encodings, 1)

In [25]:
# Maps each vocabulary word to its one-hot row.
encoder_dict = {}

In [26]:
# Pair each vocabulary word with the matrix row at the same position.
encoder_dict.update(zip(unique_words, one_hot_encodings))

In [27]:
# Spot-check the one-hot vector of a single word.
encoder_dict['nice']

array([1., 0., 0., ..., 0., 0., 0.])

In [28]:
# Will hold [one_hot(word), one_hot(next_word)] pairs.
one_hot_bi_grams = []

In [29]:
# Replace both words of every bigram by their one-hot vectors.
# Built in a single assignment so re-running this cell cannot append duplicates.
one_hot_bi_grams = [
    [encoder_dict[first], encoder_dict[second]] for first, second in bi_grams
]

In [30]:
# Inspect the first five encoded bigrams.
one_hot_bi_grams[:5]

[[array([1., 0., 0., ..., 0., 0., 0.]), array([0., 1., 0., ..., 0., 0., 0.])],
 [array([0., 1., 0., ..., 0., 0., 0.]), array([0., 0., 1., ..., 0., 0., 0.])],
 [array([0., 0., 1., ..., 0., 0., 0.]), array([0., 0., 0., ..., 0., 0., 0.])],
 [array([0., 0., 0., ..., 0., 0., 0.]), array([0., 0., 0., ..., 0., 0., 0.])],
 [array([0., 0., 0., ..., 0., 0., 0.]), array([0., 0., 0., ..., 0., 0., 0.])]]

#5. Create Input and Output data

In [31]:
# Containers for the model inputs (first word) and targets (second word).
x_bi, y_bi = [], []

In [32]:
# Split each encoded bigram into model input (first word) and target (second
# word). Both lists are rebuilt here, so the cell stays re-runnable even after
# x_bi / y_bi have been converted to NumPy arrays further down (ndarrays have
# no .append).
x_bi = [first for first, _ in one_hot_bi_grams]
y_bi = [second for _, second in one_hot_bi_grams]

In [33]:
# Stack the per-sample vectors into 2-D arrays (one row per bigram).
x_bi = np.array(x_bi)
y_bi = np.array(y_bi)

In [34]:
# Report the array shapes as a sanity check before training.
print("input shape : ",x_bi.shape)
print("output shape : ",y_bi.shape)

input shape :  (40423, 6610)
output shape :  (40423, 6610)


#6. Model Creation

In [35]:
import tensorflow as tf

In [36]:
# Two-layer network: a 30-unit linear projection of the one-hot input,
# followed by a softmax over the whole vocabulary.
# `shape` must be a tuple: `(n)` is just the int `n`, so the trailing comma is
# required to pass a 1-tuple as the Keras API documents.
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(no_unique_words,)),
    tf.keras.layers.Dense(30, activation='linear'),
    tf.keras.layers.Dense(no_unique_words, activation='softmax'),
])

In [37]:
# Categorical cross-entropy matches the one-hot targets; optimise with Adam.
model.compile(
    optimizer='adam',
    loss=tf.keras.losses.categorical_crossentropy,
)

In [39]:
# Train on (first word -> second word) pairs for 50 epochs.
model.fit(x_bi,y_bi,epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7c1d5c0eb970>

In [40]:
# First weight array of the model's first layer (presumably the Dense(30)
# kernel — the vector shown for 'nice' below has 30 values). It is indexed by
# vocabulary position in the next step to get one vector per word.
embeddings = model.layers[0].get_weights()[0]

In [41]:
# Maps each vocabulary word to its learned embedding vector.
embeddings_map = {}

In [42]:
# Pair each vocabulary word with the embedding row at the same position.
embeddings_map.update(zip(unique_words, embeddings))

In [43]:
# Spot-check the learned embedding of a single word.
embeddings_map['nice']

array([-0.37790337,  0.7969814 ,  1.0657219 , -0.09547036,  0.8526467 ,
        0.85347444,  0.28543982, -0.35873973,  0.7640771 , -1.322016  ,
        1.2249292 ,  0.32841846,  1.3849773 , -1.3745341 , -1.0014571 ,
        0.02455152, -0.41834834, -0.17764047,  0.27415735,  0.11781593,
        0.52911276, -0.7739822 ,  0.46240526,  0.9457613 ,  1.0314021 ,
        0.8492018 ,  0.02234749,  0.8849306 ,  0.5065977 , -0.37879497],
      dtype=float32)