In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import hashlib

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error

import tensorflow as tf
from tensorflow.keras.layers import Embedding, Flatten, Input, Dot, Concatenate, Dense, Dropout, BatchNormalization
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

pd.set_option('display.max_columns', None)

In [31]:
reviews = pd.read_csv('../dataset/processed/reviews.csv')

In [32]:
business = pd.read_csv('../dataset/processed/business.csv')

In [33]:
reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 419344 entries, 0 to 419343
Data columns (total 3 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   user_id      419344 non-null  object
 1   business_id  419344 non-null  object
 2   stars        419344 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 9.6+ MB


In [34]:
reviews = reviews.drop_duplicates()

In [35]:
reviews.head()

Unnamed: 0,user_id,business_id,stars
0,mh_-eMZ6K5RLWhZyISBhwA,d_tRshM-w6S4QxE4VVi8tQ,3
1,Zs8Zk3sgh5JxRmoZW4PJcg,d_tRshM-w6S4QxE4VVi8tQ,2
2,OkjeFppodgJP_CvB84cJYw,d_tRshM-w6S4QxE4VVi8tQ,3
3,11xwVSDv1ytjcyvUiyBWaw,d_tRshM-w6S4QxE4VVi8tQ,4
4,8EMU7d4pCkdqUnvlIW40CA,d_tRshM-w6S4QxE4VVi8tQ,4


In [36]:
reviews = pd.merge(reviews, business, on='business_id')
reviews = reviews[reviews['is_open']==1]
reviews.rename(columns={'stars_x': 'stars'},inplace=True, errors='raise')
reviews = reviews[['user_id','business_id','stars']]

In [37]:
%%time
train_data, test_data = train_test_split(reviews, test_size=0.2)

Wall time: 40.7 ms


In [38]:
print(train_data.shape)
print(test_data.shape)

(245437, 3)
(61360, 3)


In [39]:
%%time
user_encoder = LabelEncoder()
business_encoder = LabelEncoder()

train_data['user_id_encoded'] = user_encoder.fit_transform(train_data['user_id'])
train_data['business_id_encoded'] = business_encoder.fit_transform(train_data['business_id'])

Wall time: 215 ms


In [40]:
len(user_encoder.classes_), len(business_encoder.classes_)

(54513, 2412)

In [41]:
test_data = test_data[test_data['user_id'].isin(user_encoder.classes_)]
test_data = test_data[test_data['business_id'].isin(business_encoder.classes_)]

In [42]:
print(train_data.shape)
print(test_data.shape)

(245437, 5)
(54886, 3)


In [43]:
test_data['user_id_encoded'] = user_encoder.transform(test_data['user_id'])
test_data['business_id_encoded'] = business_encoder.transform(test_data['business_id'])

In [44]:
train_data.head()

Unnamed: 0,user_id,business_id,stars,user_id_encoded,business_id_encoded
108760,f2ojVhuI7x3gqjstCtD62w,3eJMsl41qwhcYlvoTF1ElQ,2,36710,170
224224,qfOftK3mWfauqNd9R-4xyQ,JUlsvVAvZvGHWFfkKm0nlg,1,46658,794
102981,VcwfqvrgnBfu1GDmREgnCg,6ajnOk0GcY9xbb5Ocaw8Gw,5,27827,282
198151,-NXMmOULp-kUv_LO1V6JVQ,8kUh6TROemLfbVR_ewVVLg,4,315,364
166351,lKvLytGmsd-LZWiDWrUHHg,OdIBX09glfXNVSyd0RnIeg,4,42145,982


In [45]:
test_data.head()

Unnamed: 0,user_id,business_id,stars,user_id_encoded,business_id_encoded
127130,fz8IYZR-y2_fWlQZAVJ9YA,Pl_9HzOa8uy_YOUxgonzGw,5,37517,1015
343053,Br4Oe0FhUIeLGDXRxYKTCQ,Cz-2ekKVyKheFccKtYP1YA,3,11015,543
357574,0m6KJhYz7sZEFUAVfYJDDQ,_Re2IwkeFUqUsPMRZK8jng,3,1565,1416
37657,HwL0cUMC7-qghiAKBzMpXg,PhLYLCjM_dS_seS_fMBVeg,4,16170,1014
65091,YvoSfmthDmkqbVX8CS1NnQ,5vAtunQlPFkfdTI9zn4TIw,5,30590,257


In [46]:
if tf.test.gpu_device_name():
    print('GPU device found: {}'.format(tf.test.gpu_device_name()))
else:
    print("No GPU device found. Training on CPU.")

GPU device found: /device:GPU:0


In [47]:
num_users = len(user_encoder.classes_)
num_businesses = len(business_encoder.classes_)

print(f"Unique Users: {num_users}, Unique Businesses: {num_businesses}")

Unique Users: 54513, Unique Businesses: 2412


In [48]:
embedding_dim=32

user_input = Input(shape=(1,), name='user_input')
business_input = Input(shape=(1,), name='business_input')

user_embedding = Embedding(input_dim=num_users, output_dim=embedding_dim, embeddings_regularizer=l2(1e-6))(user_input)
business_embedding = Embedding(input_dim=num_businesses, output_dim=embedding_dim, embeddings_regularizer=l2(1e-6))(business_input)

user_flatten = Flatten()(user_embedding)
business_flatten = Flatten()(business_embedding)

merged = Concatenate()([user_flatten, business_flatten])
merged = BatchNormalization()(merged)

dense_layer = Dense(128, activation='relu')(merged)
dropout = Dropout(0.4)(dense_layer)
output_layer = Dense(1, activation='linear')(dropout)

model = Model(inputs=[user_input, business_input], outputs=output_layer)
model.compile(loss='mean_squared_error', optimizer='adam', metrics=['mae'])
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 user_input (InputLayer)        [(None, 1)]          0           []                               
                                                                                                  
 business_input (InputLayer)    [(None, 1)]          0           []                               
                                                                                                  
 embedding (Embedding)          (None, 1, 32)        1744416     ['user_input[0][0]']             
                                                                                                  
 embedding_1 (Embedding)        (None, 1, 32)        77184       ['business_input[0][0]']         
                                                                                              

In [49]:
batch_size = 128
epochs = 20

user_ids = train_data['user_id_encoded'].values
business_ids = train_data['business_id_encoded'].values
stars = train_data['stars'].values

print(np.shape(user_ids), np.shape(business_ids), np.shape(stars))

(245437,) (245437,) (245437,)


In [50]:
model_checkpoint = ModelCheckpoint(f'../saved/model_weights.h5',
                             monitor='val_loss',   # Monitor validation loss
                             save_best_only=True,  # Save only the best model
                             save_weights_only=True,
                             mode='min'            # Mode of monitoring (minimize validation loss)
                            )

early_stopping = EarlyStopping(monitor='val_loss',
                               patience=5,
                               restore_best_weights=True
                              )

In [51]:
history = model.fit(
    [user_ids, business_ids],
    stars,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.2,
    callbacks=[early_stopping, model_checkpoint]
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20


In [52]:
test_data.head()

Unnamed: 0,user_id,business_id,stars,user_id_encoded,business_id_encoded
127130,fz8IYZR-y2_fWlQZAVJ9YA,Pl_9HzOa8uy_YOUxgonzGw,5,37517,1015
343053,Br4Oe0FhUIeLGDXRxYKTCQ,Cz-2ekKVyKheFccKtYP1YA,3,11015,543
357574,0m6KJhYz7sZEFUAVfYJDDQ,_Re2IwkeFUqUsPMRZK8jng,3,1565,1416
37657,HwL0cUMC7-qghiAKBzMpXg,PhLYLCjM_dS_seS_fMBVeg,4,16170,1014
65091,YvoSfmthDmkqbVX8CS1NnQ,5vAtunQlPFkfdTI9zn4TIw,5,30590,257


In [53]:
test_user_ids = test_data['user_id_encoded'].values
test_business_ids = test_data['business_id_encoded'].values
test_stars = test_data['stars'].values

In [54]:
predictions = model.predict([test_user_ids, test_business_ids])

In [55]:
predictions

array([[4.045011 ],
       [4.8661532],
       [4.0147123],
       ...,
       [4.1755023],
       [4.354646 ],
       [4.170184 ]], dtype=float32)

In [56]:
predictions.min(), predictions.max()

(1.1149869, 6.158636)

In [57]:
mean_squared_error(predictions,test_stars)

1.05465812707934

In [58]:
mean_absolute_error(predictions,test_stars)

0.7912228819002795

In [59]:
model.predict([np.array([11752]), np.array([3280])])

array([[3.4684098]], dtype=float32)

#### Training for entire data

In [60]:
reviews.head()

Unnamed: 0,user_id,business_id,stars
810,mh_-eMZ6K5RLWhZyISBhwA,L4kfcADLCU4T33i7Z0CkuA,2
811,LTl0cbH2a8QeQQ3XSA3_dw,L4kfcADLCU4T33i7Z0CkuA,5
812,syKoxudhp7dbwbh3xrgjVQ,L4kfcADLCU4T33i7Z0CkuA,3
813,V9n2Qyr-dvNg00BwMWqquQ,L4kfcADLCU4T33i7Z0CkuA,4
814,vEFJfeis4LEuM-y4qZvXAA,L4kfcADLCU4T33i7Z0CkuA,4


In [61]:
%%time
user_encoder = LabelEncoder()
business_encoder = LabelEncoder()

reviews['user_id_encoded'] = user_encoder.fit_transform(reviews['user_id'])
reviews['business_id_encoded'] = business_encoder.fit_transform(reviews['business_id'])

Wall time: 258 ms


In [62]:
num_users = len(user_encoder.classes_)
num_businesses = len(business_encoder.classes_)

print(f"Unique Users: {num_users}, Unique Businesses: {num_businesses}")

Unique Users: 60425, Unique Businesses: 2412


In [63]:
embedding_dim=32

user_input = Input(shape=(1,), name='user_input')
business_input = Input(shape=(1,), name='business_input')

user_embedding = Embedding(input_dim=num_users, output_dim=embedding_dim, embeddings_regularizer=l2(1e-6))(user_input)
business_embedding = Embedding(input_dim=num_businesses, output_dim=embedding_dim, embeddings_regularizer=l2(1e-6))(business_input)

user_flatten = Flatten()(user_embedding)
business_flatten = Flatten()(business_embedding)

merged = Concatenate()([user_flatten, business_flatten])
merged = BatchNormalization()(merged)

dense_layer = Dense(128, activation='relu')(merged)
dropout = Dropout(0.4)(dense_layer)
output_layer = Dense(1, activation='linear')(dropout)

model = Model(inputs=[user_input, business_input], outputs=output_layer)
model.compile(loss='mean_squared_error', optimizer='adam', metrics=['mae'])
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 user_input (InputLayer)        [(None, 1)]          0           []                               
                                                                                                  
 business_input (InputLayer)    [(None, 1)]          0           []                               
                                                                                                  
 embedding_2 (Embedding)        (None, 1, 32)        1933600     ['user_input[0][0]']             
                                                                                                  
 embedding_3 (Embedding)        (None, 1, 32)        77184       ['business_input[0][0]']         
                                                                                            

In [64]:
batch_size = 128
epochs = 20

user_ids = reviews['user_id_encoded'].values
business_ids = reviews['business_id_encoded'].values
stars = reviews['stars'].values

print(np.shape(user_ids), np.shape(business_ids), np.shape(stars))

(306797,) (306797,) (306797,)


In [65]:
model_checkpoint = ModelCheckpoint(f'../saved/model_weights.h5',
                             monitor='val_loss',   # Monitor validation loss
                             save_best_only=True,  # Save only the best model
                             save_weights_only=True,
                             mode='min'            # Mode of monitoring (minimize validation loss)
                            )

early_stopping = EarlyStopping(monitor='val_loss',
                               patience=5,
                               restore_best_weights=True
                              )

In [66]:
history = model.fit(
    [user_ids, business_ids],
    stars,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.2,
    callbacks=[early_stopping, model_checkpoint]
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20


In [67]:
import pickle

with open('../saved/user_encoder.pickle', 'wb') as f:
    pickle.dump(user_encoder, f)
    
with open('../saved/business_encoder.pickle', 'wb') as f:
    pickle.dump(business_encoder, f)