In [1]:
# Create the directory the Kaggle CLI reads its API token from.
# -p makes the cell safe to re-run: no error when the directory already exists.
! mkdir -p ~/.kaggle

In [2]:
# Move the uploaded Kaggle API token into the CLI's config directory, then
# restrict it to owner read/write so the credential is not readable by others.
! mv /content/kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json

In [3]:
# Download the TripAdvisor hotel-reviews dataset archive with the Kaggle CLI
# (uses the API token placed in ~/.kaggle/ by the previous cell).
! kaggle datasets download -d andrewmvd/trip-advisor-hotel-reviews

Downloading trip-advisor-hotel-reviews.zip to /content
  0% 0.00/5.14M [00:00<?, ?B/s]
100% 5.14M/5.14M [00:00<00:00, 184MB/s]


In [4]:
# Extract the CSV from the downloaded archive.
# -o overwrites without prompting, so a re-run does not stall on the
#    "replace file?" question once the CSV already exists.
# -d pins the output directory to /content, where the CSV is read from later,
#    instead of relying on the current working directory.
! unzip -o /content/trip-advisor-hotel-reviews.zip -d /content

Archive:  /content/trip-advisor-hotel-reviews.zip
  inflating: tripadvisor_hotel_reviews.csv  


In [5]:
# The archive is no longer needed once the CSV has been extracted.
! rm /content/trip-advisor-hotel-reviews.zip

# 2. Data Cleaning

In [6]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
import re

In [7]:
# Fetch NLTK's stop-word corpus; the cleaning step below calls
# stopwords.words('english'), which needs this data on disk.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [8]:
# Load the extracted reviews CSV into a DataFrame.
df = pd.read_csv("/content/tripadvisor_hotel_reviews.csv")

In [9]:
# Work on the first 500 reviews only. Despite the name, this is a plain
# Python list of review strings, not a DataFrame.
reviews_df = df['Review'].head(500).to_list()

In [10]:
# Accumulator for the cleaned review strings produced by the next cell.
reviews = []

In [11]:
# Clean every raw review: lower-case it, strip every character that is not an
# ASCII letter or whitespace, and drop English stop words.

# Load the stop-word list once and keep it as a set. Calling
# stopwords.words('english') inside the comprehension rebuilt the list for
# every single token and then scanned it linearly for the membership test.
english_stopwords = set(stopwords.words('english'))

# Start from an empty list so that re-running this cell does not append a
# second copy of every review.
reviews = []
for review in reviews_df :
  lower_case = review.lower()
  only_alphabets = re.sub(r'[^a-zA-Z\s]', '', lower_case)
  # NOTE: punctuation is removed before the stop-word filter, so contractions
  # such as "don't" become "dont" and are only dropped if the stop-word list
  # contains that exact apostrophe-free spelling.
  no_stopwords = [word for word in only_alphabets.split() if word not in english_stopwords]
  reviews.append(' '.join(no_stopwords))

In [12]:
# Vocabulary list (each distinct cleaned word once), filled by the next cell.
unique_words = []

In [13]:
# Build the vocabulary: every distinct word across the cleaned reviews, in
# order of first appearance. A word's position in this list is later used as
# its one-hot index, so the ordering must be stable.
#
# dict.fromkeys de-duplicates while preserving insertion order in a single
# pass; the previous `word not in unique_words` test scanned the whole list
# for every token. Rebinding the name also makes the cell safe to re-run.
unique_words = list(dict.fromkeys(
    word
    for review in reviews
    for word in review.split()
))

In [14]:
# Vocabulary size: number of distinct cleaned words.
len(unique_words)

6610

In [15]:
# Drop the raw DataFrame and the raw review list; only the cleaned `reviews`
# are used from here on.
del df, reviews_df

# 3. Create Tri-grams


In [16]:
# Accumulator for [word, word, word] windows, filled by the next cell.
tri_grams = []

In [17]:
# Slide a window of three consecutive words over each cleaned review and
# collect every window as a [first, middle, last] list. Reviews with fewer
# than three words contribute nothing.
for cleaned in reviews :
  tokens = cleaned.split()
  tri_grams.extend(
      [first, middle, last]
      for first, middle, last in zip(tokens, tokens[1:], tokens[2:])
  )

In [18]:
# Total number of tri-grams extracted from all reviews.
len(tri_grams)

39923

# 4. One-Hot Encoding

In [19]:
import numpy as np

In [20]:
# Vocabulary size; used below as the one-hot vector length and as the width of
# the model's input and output layers.
no_unique_words = len(unique_words)

In [21]:
# Square matrix of zeros with one row and one column per vocabulary word.
# The following cell sets the diagonal so that row i is the one-hot vector of
# word i.
one_hot_encodings = np.zeros((no_unique_words, no_unique_words))

In [22]:
# Put a 1 on every diagonal entry, in place, so row i becomes the one-hot
# vector for the i-th vocabulary word.
np.fill_diagonal(one_hot_encodings, 1)

In [23]:
# word -> one-hot vector lookup, filled by the next cell.
one_hot_represenation = {}

In [24]:
# Pair each vocabulary word with the matrix row at the same position.
# Iterating a 2-D array yields its rows in order, so word i maps to row i.
one_hot_represenation.update(zip(unique_words, one_hot_encodings))

In [25]:
# Accumulator for tri-grams with each word replaced by its one-hot vector.
one_hot_tri_grams = []

In [26]:
# Translate every tri-gram of words into the matching triple of one-hot
# vectors, keeping the [first, middle, last] order.
for first, middle, last in tri_grams :
  one_hot_tri_grams.append([
      one_hot_represenation[first],
      one_hot_represenation[middle],
      one_hot_represenation[last],
  ])

In [27]:
# Inspect the first encoded tri-gram (three one-hot vectors).
one_hot_tri_grams[0]

[array([1., 0., 0., ..., 0., 0., 0.]),
 array([0., 1., 0., ..., 0., 0., 0.]),
 array([0., 0., 1., ..., 0., 0., 0.])]

# 5. Create Input & Output Data

In [28]:
# Two independent accumulators: model inputs (context) and targets (middle word).
x_tri, y_tri = [], []

In [29]:
# For each tri-gram the input is the element-wise sum of the two outer words'
# one-hot vectors (the context) and the target is the middle word's vector.
# When both outer words are the same, the shared position holds 2.
for left, centre, right in one_hot_tri_grams :
  context = left + right
  x_tri.append(context)
  y_tri.append(centre)

In [30]:
# Stack the per-tri-gram vectors into 2-D matrices of shape
# (number of tri-grams, vocabulary size).
#
# The entries are only ever 0, 1 or 2 (sums of one-hot vectors), which float32
# represents exactly. Building the matrices as float32 instead of NumPy's
# default float64 halves their memory footprint; at ~40k x 6.6k elements each,
# that is roughly 1 GB saved per matrix.
x_tri = np.asarray(x_tri, dtype=np.float32)
y_tri = np.asarray(y_tri, dtype=np.float32)

In [31]:
# Report the dimensions of the training matrices.
for label, matrix in (("input shape : ", x_tri), ("output shape : ", y_tri)) :
  print(label, matrix.shape)

input shape :  (39923, 6610)
output shape :  (39923, 6610)


# 6. Model Creation

In [32]:
import tensorflow as tf

In [33]:
# Two-layer network that predicts the middle word of a tri-gram from the
# summed one-hot vectors of its two neighbours:
#   * a 30-unit linear projection, whose kernel is read back later as the
#     word-embedding matrix;
#   * a softmax over the whole vocabulary.
model = tf.keras.Sequential([
    # `shape` must be a tuple. `(no_unique_words)` without the trailing comma
    # is just an int, which Keras 3 refuses to interpret as a shape.
    tf.keras.layers.Input(shape=(no_unique_words,)),
    tf.keras.layers.Dense(30, activation='linear'),
    tf.keras.layers.Dense(no_unique_words, activation='softmax'),
])

In [34]:
# Targets are one-hot vectors, hence categorical cross-entropy; Adam is used
# with its default settings.
model.compile(
    optimizer='adam',
    loss=tf.keras.losses.categorical_crossentropy,
)

In [36]:
# Train for 50 passes over the tri-gram data. No random seed is set anywhere
# in this notebook, so the learned weights differ from run to run.
model.fit(x=x_tri, y=y_tri, epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7b3234799a80>

In [37]:
# Take the first weight array of the first layer as the embedding matrix.
# NOTE(review): this relies on model.layers[0] being the 30-unit Dense layer
# (i.e. the Input layer not being listed in model.layers) and on
# get_weights()[0] being its kernel with one row per vocabulary word. The
# 30-element vector shown further down is consistent with that; confirm when
# changing the Keras version.
embeddings = model.layers[0].get_weights()[0]

In [38]:
# word -> learned embedding vector lookup, filled by the next cell.
embeddings_map = {}

In [39]:
# Row i of the embedding matrix belongs to the i-th vocabulary word; pair
# them up position by position.
embeddings_map.update(zip(unique_words, embeddings))

In [41]:
# Spot-check: show the learned 30-dimensional vector for one word.
embeddings_map['nice']

array([ 0.9610035 ,  0.84137595,  0.5220155 ,  0.3306154 ,  0.7863644 ,
       -0.16529007, -0.39391565, -1.0899228 , -0.6604506 , -0.37600732,
        0.87579614, -0.47458827,  0.15008274, -0.8018626 ,  0.3550185 ,
       -0.7425365 , -0.833513  , -0.1387585 , -0.6511931 , -0.22809817,
       -0.4667113 ,  0.08328058,  0.7377492 , -0.10352063, -0.3354777 ,
       -0.22324738,  0.8464626 ,  1.0306923 , -0.1335998 , -0.0793685 ],
      dtype=float32)