In [20]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import hashlib

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error

import tensorflow as tf
from tensorflow.keras.layers import Embedding, Flatten, Input, Dot, Concatenate, Dense, Dropout, BatchNormalization
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# Show every column when a DataFrame is displayed instead of eliding the middle ones.
pd.set_option('display.max_columns', None)

In [2]:
# Load the pre-processed review table from the project's dataset folder.
reviews = pd.read_csv('../dataset/processed/reviews.csv')

In [3]:
# Inspect column names, dtypes and non-null counts of the loaded reviews.
reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 419344 entries, 0 to 419343
Data columns (total 3 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   user_id      419344 non-null  object
 1   business_id  419344 non-null  object
 2   stars        419344 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 9.6+ MB


In [4]:
# Remove rows that are exact duplicates across all columns (original index labels are kept).
reviews = reviews.drop_duplicates()

In [5]:
# Preview the first rows of the de-duplicated reviews.
reviews.head()

Unnamed: 0,user_id,business_id,stars
0,mh_-eMZ6K5RLWhZyISBhwA,d_tRshM-w6S4QxE4VVi8tQ,3
1,Zs8Zk3sgh5JxRmoZW4PJcg,d_tRshM-w6S4QxE4VVi8tQ,2
2,OkjeFppodgJP_CvB84cJYw,d_tRshM-w6S4QxE4VVi8tQ,3
3,11xwVSDv1ytjcyvUiyBWaw,d_tRshM-w6S4QxE4VVi8tQ,4
4,8EMU7d4pCkdqUnvlIW40CA,d_tRshM-w6S4QxE4VVi8tQ,4


In [6]:
%%time
train_data, test_data = train_test_split(reviews, test_size=0.2)

Wall time: 57.9 ms


In [7]:
# (rows, columns) of the training and test partitions, one per line.
print(train_data.shape, test_data.shape, sep='\n')

(328840, 3)
(82211, 3)


In [8]:
%%time
user_encoder = LabelEncoder()
business_encoder = LabelEncoder()

train_data['user_id_encoded'] = user_encoder.fit_transform(train_data['user_id'])
train_data['business_id_encoded'] = business_encoder.fit_transform(train_data['business_id'])

Wall time: 269 ms


In [9]:
# Number of distinct users and businesses the encoders were fitted on.
len(user_encoder.classes_), len(business_encoder.classes_)

(60247, 3829)

In [10]:
# Keep only test rows whose user AND business are known to the fitted
# encoders; LabelEncoder.transform raises on labels it has not seen.
seen_user = test_data['user_id'].isin(user_encoder.classes_)
seen_business = test_data['business_id'].isin(business_encoder.classes_)
test_data = test_data[seen_user & seen_business]

In [11]:
# Shapes after encoding the training rows and filtering the test rows.
print(train_data.shape, test_data.shape, sep='\n')

(328840, 5)
(75334, 3)


In [12]:
# Encode the test ids with the encoders fitted on the training split.
# `assign` returns a new frame, so the columns are not written into the
# boolean-filtered slice created above (direct item assignment on that slice
# triggers pandas' SettingWithCopyWarning).
test_data = test_data.assign(
    user_id_encoded=user_encoder.transform(test_data['user_id']),
    business_id_encoded=business_encoder.transform(test_data['business_id']),
)

In [13]:
# Preview training rows together with their integer-encoded ids.
train_data.head()

Unnamed: 0,user_id,business_id,stars,user_id_encoded,business_id_encoded
138096,rd0J9qgJsrU7YnfzLb1eQw,ZVu9TDpTvIgCN8x6IY-KmA,4,52419,2162
20344,Ns_Cq1zqqCcZgZaso7pVYg,wB1Tin0OW1JRpaKM-E3ZYA,2,23392,3596
146692,oSxIj3EYDFPNUNGsbuqdVQ,TunmRrfZb7bt53T6HJi4UQ,5,49436,1838
228540,wqZuEnMGBKnpYIpg7dFLfg,JUlsvVAvZvGHWFfkKm0nlg,4,57364,1243
144429,WIdUAxJqXKp2imC_BfcxMg,K8b2MQ5Az59-nzvqUfjJEQ,4,31344,1287


In [14]:
# Preview test rows together with their integer-encoded ids.
test_data.head()

Unnamed: 0,user_id,business_id,stars,user_id_encoded,business_id_encoded
112983,K0PbqtUJuUjDFRTCHsVZdw,WxB8498ejPtHE7wFa89_fA,5,19884,2024
352607,NdO0g4wsb6WRQD3_7YUfCQ,9PCiyXCG25bOycCR7nXIDw,3,23163,624
162838,cg1AJDNjH9wHD8qhpeFQbg,atZ_olNKXOG4rEr6mccN8g,2,38299,2289
46547,4ze88V1brgv5slnpZ1Q4gw,EI2_OgANt1Mb_83cNnpPwg,3,5717,931
214128,VKygLdqTGjOUvgxJgbQkkA,E_h2yNoagLK-3ODYwMPErw,5,30444,947


In [15]:
# Report whether TensorFlow can see a GPU; an empty device name means none.
gpu_name = tf.test.gpu_device_name()
if not gpu_name:
    print("No GPU device found. Training on CPU.")
else:
    print('GPU device found: {}'.format(gpu_name))

GPU device found: /device:GPU:0


In [16]:
# Embedding vocabulary sizes: one slot per class known to each encoder.
num_users, num_businesses = (
    len(encoder.classes_) for encoder in (user_encoder, business_encoder)
)

print(f"Unique Users: {num_users}, Unique Businesses: {num_businesses}")

Unique Users: 60247, Unique Businesses: 3829


In [40]:
embedding_dim = 32

# User branch: encoded id -> embedding vector -> flat tensor.
user_input = Input(shape=(1,), name='user_input')
user_embedding = Embedding(
    input_dim=num_users,
    output_dim=embedding_dim,
    embeddings_regularizer=l2(1e-6),
)(user_input)

# Business branch, built the same way.
business_input = Input(shape=(1,), name='business_input')
business_embedding = Embedding(
    input_dim=num_businesses,
    output_dim=embedding_dim,
    embeddings_regularizer=l2(1e-6),
)(business_input)

user_flatten = Flatten()(user_embedding)
business_flatten = Flatten()(business_embedding)

# Join both embeddings and normalise before the dense head.
merged = Concatenate()([user_flatten, business_flatten])
merged = BatchNormalization()(merged)

# Head: one hidden ReLU layer with dropout, then a single linear unit that
# regresses the star rating.
dense_layer = Dense(128, activation='relu')(merged)
dropout = Dropout(0.4)(dense_layer)
output_layer = Dense(1, activation='linear')(dropout)

model = Model(inputs=[user_input, business_input], outputs=output_layer)
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae'])
model.summary()

Model: "model_3"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 user_input (InputLayer)        [(None, 1)]          0           []                               
                                                                                                  
 business_input (InputLayer)    [(None, 1)]          0           []                               
                                                                                                  
 embedding_6 (Embedding)        (None, 1, 32)        1927904     ['user_input[0][0]']             
                                                                                                  
 embedding_7 (Embedding)        (None, 1, 32)        122528      ['business_input[0][0]']         
                                                                                            

In [41]:
batch_size, epochs = 128, 20

# Pull the model inputs and the regression target out of the training frame
# as plain NumPy arrays.
user_ids, business_ids, stars = (
    train_data[column].values
    for column in ('user_id_encoded', 'business_id_encoded', 'stars')
)

print(user_ids.shape, business_ids.shape, stars.shape)

(328840,) (328840,) (328840,)


In [42]:
# Write the weights to disk whenever validation loss reaches a new minimum.
model_checkpoint = ModelCheckpoint(
    '../saved/model_weights.h5',
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    save_weights_only=True,
)

# Stop after 5 epochs without a val_loss improvement and roll the model back
# to its best weights.
early_stopping = EarlyStopping(
    monitor='val_loss',
    restore_best_weights=True,
    patience=5,
)

In [43]:
# Train on the training split; 20% of it is held out to compute val_loss,
# which drives both callbacks.
history = model.fit(
    x=[user_ids, business_ids],
    y=stars,
    epochs=epochs,
    batch_size=batch_size,
    validation_split=0.2,
    callbacks=[early_stopping, model_checkpoint],
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20


In [44]:
# Re-display the encoded test rows before extracting the evaluation arrays.
test_data.head()

Unnamed: 0,user_id,business_id,stars,user_id_encoded,business_id_encoded
112983,K0PbqtUJuUjDFRTCHsVZdw,WxB8498ejPtHE7wFa89_fA,5,19884,2024
352607,NdO0g4wsb6WRQD3_7YUfCQ,9PCiyXCG25bOycCR7nXIDw,3,23163,624
162838,cg1AJDNjH9wHD8qhpeFQbg,atZ_olNKXOG4rEr6mccN8g,2,38299,2289
46547,4ze88V1brgv5slnpZ1Q4gw,EI2_OgANt1Mb_83cNnpPwg,3,5717,931
214128,VKygLdqTGjOUvgxJgbQkkA,E_h2yNoagLK-3ODYwMPErw,5,30444,947


In [45]:
# Evaluation inputs and ground-truth ratings as NumPy arrays.
test_user_ids, test_business_ids, test_stars = (
    test_data[column].values
    for column in ('user_id_encoded', 'business_id_encoded', 'stars')
)

In [46]:
# Predict a rating for every (user, business) pair in the filtered test set.
predictions = model.predict([test_user_ids, test_business_ids])

In [47]:
# Raw model output: one float32 prediction per test row.
predictions

array([[4.3599243],
       [3.5485072],
       [4.489728 ],
       ...,
       [3.0025027],
       [3.8101907],
       [3.25465  ]], dtype=float32)

In [48]:
# Range of the predictions; the linear output unit is not clipped, so values
# are free to fall outside the range of the star labels.
predictions.min(), predictions.max()

(0.47154915, 5.727051)

In [49]:
# scikit-learn metrics take (y_true, y_pred): ground truth first, predictions
# second. MSE is symmetric, so the value is the same, but the call now follows
# the documented contract.
mean_squared_error(test_stars, predictions)

1.0647927059497462

In [50]:
# scikit-learn metrics take (y_true, y_pred): ground truth first, predictions
# second. MAE is symmetric, so the value is the same, but the call now follows
# the documented contract.
mean_absolute_error(test_stars, predictions)

0.8113899665186213

In [51]:
# Spot-check a single prediction. The inputs follow the model's input order
# (user index, then business index).
# NOTE(review): 11752 / 3280 are hard-coded encoded indices -- confirm which
# raw user_id / business_id they stand for.
model.predict([np.array([11752]), np.array([3280])])

array([[3.8836231]], dtype=float32)

#### Training on the entire dataset

In [53]:
# Preview the full (de-duplicated) review table that the final model is trained on.
reviews.head()

Unnamed: 0,user_id,business_id,stars
0,mh_-eMZ6K5RLWhZyISBhwA,d_tRshM-w6S4QxE4VVi8tQ,3
1,Zs8Zk3sgh5JxRmoZW4PJcg,d_tRshM-w6S4QxE4VVi8tQ,2
2,OkjeFppodgJP_CvB84cJYw,d_tRshM-w6S4QxE4VVi8tQ,3
3,11xwVSDv1ytjcyvUiyBWaw,d_tRshM-w6S4QxE4VVi8tQ,4
4,8EMU7d4pCkdqUnvlIW40CA,d_tRshM-w6S4QxE4VVi8tQ,4


In [55]:
%%time
user_encoder = LabelEncoder()
business_encoder = LabelEncoder()

reviews['user_id_encoded'] = user_encoder.fit_transform(reviews['user_id'])
reviews['business_id_encoded'] = business_encoder.fit_transform(reviews['business_id'])

Wall time: 313 ms


In [56]:
# Vocabulary sizes for the embeddings, now taken from the full-data encoders.
num_users, num_businesses = map(
    len, (user_encoder.classes_, business_encoder.classes_)
)

print(f"Unique Users: {num_users}, Unique Businesses: {num_businesses}")

Unique Users: 66560, Unique Businesses: 3829


In [57]:
embedding_dim = 32

# Same architecture as the evaluation model, rebuilt from scratch with the
# full-data vocabulary sizes.

# User branch.
user_input = Input(shape=(1,), name='user_input')
user_embedding = Embedding(
    input_dim=num_users,
    output_dim=embedding_dim,
    embeddings_regularizer=l2(1e-6),
)(user_input)

# Business branch.
business_input = Input(shape=(1,), name='business_input')
business_embedding = Embedding(
    input_dim=num_businesses,
    output_dim=embedding_dim,
    embeddings_regularizer=l2(1e-6),
)(business_input)

user_flatten = Flatten()(user_embedding)
business_flatten = Flatten()(business_embedding)

# Concatenate the two flat embeddings and batch-normalise them.
merged = Concatenate()([user_flatten, business_flatten])
merged = BatchNormalization()(merged)

# Dense head ending in one linear unit (rating regression).
dense_layer = Dense(128, activation='relu')(merged)
dropout = Dropout(0.4)(dense_layer)
output_layer = Dense(1, activation='linear')(dropout)

model = Model(inputs=[user_input, business_input], outputs=output_layer)
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae'])
model.summary()

Model: "model_4"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 user_input (InputLayer)        [(None, 1)]          0           []                               
                                                                                                  
 business_input (InputLayer)    [(None, 1)]          0           []                               
                                                                                                  
 embedding_8 (Embedding)        (None, 1, 32)        2129920     ['user_input[0][0]']             
                                                                                                  
 embedding_9 (Embedding)        (None, 1, 32)        122528      ['business_input[0][0]']         
                                                                                            

In [58]:
batch_size = 128
epochs = 20

# Keras' `validation_split` takes the LAST fraction of the arrays, before any
# shuffling. `reviews` is still in file order, which looks grouped by
# business_id (see reviews.head()), so without a shuffle the validation set
# would be a block of businesses that never occur in the training portion and
# whose embeddings are therefore never trained. Shuffling once with a fixed
# seed gives a random, reproducible hold-out instead.
shuffled_reviews = reviews.sample(frac=1, random_state=42)

user_ids = shuffled_reviews['user_id_encoded'].values
business_ids = shuffled_reviews['business_id_encoded'].values
stars = shuffled_reviews['stars'].values

print(np.shape(user_ids), np.shape(business_ids), np.shape(stars))

(411051,) (411051,) (411051,)


In [59]:
# Checkpoint: keep only the weights of the epoch with the lowest val_loss.
model_checkpoint = ModelCheckpoint(
    filepath='../saved/model_weights.h5',
    monitor='val_loss',
    mode='min',
    save_weights_only=True,
    save_best_only=True,
)

# Early stopping: give up after 5 stagnant epochs and restore the best weights.
early_stopping = EarlyStopping(
    patience=5,
    monitor='val_loss',
    restore_best_weights=True,
)

In [60]:
# Final training run; 20% of the samples are held out so that val_loss can
# drive early stopping and checkpointing.
history = model.fit(
    x=[user_ids, business_ids],
    y=stars,
    epochs=epochs,
    batch_size=batch_size,
    validation_split=0.2,
    callbacks=[early_stopping, model_checkpoint],
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20


In [61]:
import pickle

# Persist both fitted encoders next to the saved weights so that raw ids can
# be mapped to the same embedding indices later.
for encoder, path in (
    (user_encoder, '../saved/user_encoder.pickle'),
    (business_encoder, '../saved/business_encoder.pickle'),
):
    with open(path, 'wb') as f:
        pickle.dump(encoder, f)