In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import hashlib

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error

import tensorflow as tf
from tensorflow.keras.layers import Embedding, Flatten, Input, Dot, Concatenate, Dense, Dropout, BatchNormalization
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# Never truncate the column list when a DataFrame is displayed.
pd.options.display.max_columns = None

In [6]:
# Read the review table (user_id, business_id, stars) from the Processed_Data folder.
reviews = pd.read_csv(
    filepath_or_buffer=r'C:\Users\User\Desktop\Learnabay Training\My Portfolio projects for resume\Restaurant Recommendation System\Processed_Data\reviews.csv',
)

In [7]:
# Read the business table from the Processed_Data folder; it is joined to the
# reviews on `business_id` further down.
business = pd.read_csv(
    filepath_or_buffer=r'C:\Users\User\Desktop\Learnabay Training\My Portfolio projects for resume\Restaurant Recommendation System\Processed_Data\business.csv',
)

In [8]:
# Summarise the dtypes and non-null counts of the freshly loaded reviews frame.
reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 419344 entries, 0 to 419343
Data columns (total 3 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   user_id      419344 non-null  object
 1   business_id  419344 non-null  object
 2   stars        419344 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 9.6+ MB


In [9]:
# Keep only the first occurrence of every fully identical row.
reviews = reviews[~reviews.duplicated()]

In [10]:
# Peek at the first five de-duplicated reviews.
reviews.head(n=5)

Unnamed: 0,user_id,business_id,stars
0,smOvOajNG0lS4Pq7d8g4JQ,RZtGWDLCAtuipwaZ-UfjmQ,4
1,IQsF3Rc6IgCzjVV9DE8KXg,eFvzHawVJofxSnD7TgbZtg,5
2,aFa96pz67TwOFu4Weq5Agg,kq5Ghhh14r-eCxlVmlyd8w,5
3,G0DHgkSsDozqUPWtlxVEMw,oBhJuukGRqPVvYBfTkhuZA,4
4,ZGjgfSvjQK886kiTzLwfLQ,EtKSTHV5Qx_Q7Aur9o4kQQ,5


In [11]:
# Attach the business attributes to every review, keep only reviews of
# businesses that are still open, and reduce the frame back to the three
# modelling columns.
#
# The steps are chained instead of using `rename(inplace=True)` on a filtered
# selection, and the result is an explicit copy: later cells add columns to
# `reviews`, and writing into a selection of the merged frame can raise
# pandas' SettingWithCopyWarning.
reviews = (
    pd.merge(reviews, business, on='business_id')
    .loc[lambda merged: merged['is_open'] == 1]
    # The review rating arrives as `stars_x` after the merge; errors='raise'
    # makes the cell fail loudly if that column is ever missing.
    .rename(columns={'stars_x': 'stars'}, errors='raise')
    .loc[:, ['user_id', 'business_id', 'stars']]
    .copy()
)

In [12]:
%%time
# Hold out 20% of the reviews for evaluation. The seed is fixed so that the
# split (and with it the encoder vocabularies fitted on the training part and
# the test rows that survive filtering) is the same on every fresh run.
train_data, test_data = train_test_split(reviews, test_size=0.2, random_state=42)

CPU times: total: 31.2 ms
Wall time: 28.6 ms


In [13]:
# Row/column counts of the two splits, one per line.
print(train_data.shape, test_data.shape, sep='\n')

(245437, 3)
(61360, 3)


In [14]:
%%time
# One encoder per id column; both learn their vocabulary from the training
# split only and map each raw id string to a dense integer code.
user_encoder, business_encoder = LabelEncoder(), LabelEncoder()

encoded_users = user_encoder.fit_transform(train_data['user_id'])
encoded_businesses = business_encoder.fit_transform(train_data['business_id'])

train_data['user_id_encoded'] = encoded_users
train_data['business_id_encoded'] = encoded_businesses

CPU times: total: 188 ms
Wall time: 194 ms


In [15]:
# (number of distinct users, number of distinct businesses) seen by the encoders.
tuple(len(enc.classes_) for enc in (user_encoder, business_encoder))

(54422, 2412)

In [16]:
# Drop test rows whose user or business is unknown to the fitted encoders:
# there is no integer code (and hence no embedding row) for them.
seen_user = test_data['user_id'].isin(user_encoder.classes_)
seen_business = test_data['business_id'].isin(business_encoder.classes_)
test_data = test_data[seen_user & seen_business]

In [17]:
# Shapes after encoding the training split and filtering the test split.
print(train_data.shape, test_data.shape, sep='\n')

(245437, 5)
(54839, 3)


In [18]:
# Encode the test ids with the already-fitted encoders.
#
# `test_data` is a boolean-filtered selection of the split output, so writing
# new columns into it in place can raise pandas' SettingWithCopyWarning.
# `assign` returns a fresh frame with both columns added, which avoids the
# warning and makes the cell safe to re-run.
test_data = test_data.assign(
    user_id_encoded=user_encoder.transform(test_data['user_id']),
    business_id_encoded=business_encoder.transform(test_data['business_id']),
)

In [19]:
# First five training rows, now carrying the integer-encoded id columns.
train_data.head(n=5)

Unnamed: 0,user_id,business_id,stars,user_id_encoded,business_id_encoded
291626,tmWTXJuqJ3HKPITk07D1Nw,LgAuYz5cQe3zTxuteJ4VyQ,4,49208,879
313992,wt3BsEKkKLklgqv0hylzWA,gYYMQeg4X8FUcCxXI4c2Tw,4,51840,1682
305217,8Ke-o_dV72eKnPJhpNfYOw,32It9NN_lVismU9tcuLB0A,1,8106,148
145559,s4ruVu8oVVPpy1HZx0qWpg,REiAM73RkOxFqCVlEFiRWA,4,47749,1066
241275,9JnKwuTNfTOh1fGGPe2t6A,r7PzCQmfs2jq-VmZjUwKjg,4,8936,2099


In [20]:
# First five test rows, now carrying the integer-encoded id columns.
test_data.head(n=5)

Unnamed: 0,user_id,business_id,stars,user_id_encoded,business_id_encoded
142628,1TcuN6d1LOU12y5geVqCUw,e4MoozYGqe_rb4_ZC1rYMQ,4,2188,1584
154276,IQG1WlWNg2isqbdAX_Zppg,SDL247FlOnScJfuNjZESxg,3,16575,1111
55957,OINYv6r1OMFK9wbDpno9AQ,0K4RwxdAcViifyU3Htzxww,4,21485,46
105603,a-x197V4FMgtBuh63ijsow,RQAF6a0akMiot5lZZnMNNw,5,32377,1076
281111,yGPXAiE0IdZfMcJIcSxc1Q,5WF3593by2u-aE_WaM7_Mw,3,52956,249


In [21]:
# Report whether TensorFlow can see a GPU; an empty device name means it cannot.
if not tf.test.gpu_device_name():
    print("No GPU device found. Training on CPU.")
else:
    print('GPU device found: {}'.format(tf.test.gpu_device_name()))

No GPU device found. Training on CPU.


In [22]:
# Vocabulary sizes of the two encoders; they size the embedding tables below.
num_users = user_encoder.classes_.shape[0]
num_businesses = business_encoder.classes_.shape[0]

print(f"Unique Users: {num_users}, Unique Businesses: {num_businesses}")

Unique Users: 54422, Unique Businesses: 2412


In [23]:
embedding_dim = 32

# Two scalar inputs per sample: the encoded user id and the encoded business id.
user_input = Input(name='user_input', shape=(1,))
business_input = Input(name='business_input', shape=(1,))

# One learned `embedding_dim`-sized vector per user and per business, with a
# light L2 penalty on the embedding weights.
user_embedding = Embedding(
    input_dim=num_users,
    output_dim=embedding_dim,
    embeddings_regularizer=l2(1e-6),
)(user_input)
business_embedding = Embedding(
    input_dim=num_businesses,
    output_dim=embedding_dim,
    embeddings_regularizer=l2(1e-6),
)(business_input)

# (batch, 1, dim) -> (batch, dim)
user_flatten = Flatten()(user_embedding)
business_flatten = Flatten()(business_embedding)

# Join both vectors and normalise them before the dense head.
merged = Concatenate()([user_flatten, business_flatten])
merged = BatchNormalization()(merged)

# Small regression head: one hidden layer with dropout and a single linear
# output for the predicted star rating.
dense_layer = Dense(units=128, activation='relu')(merged)
dropout = Dropout(rate=0.4)(dense_layer)
output_layer = Dense(units=1, activation='linear')(dropout)

model = Model(inputs=[user_input, business_input], outputs=output_layer)
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae'])
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 user_input (InputLayer)        [(None, 1)]          0           []                               
                                                                                                  
 business_input (InputLayer)    [(None, 1)]          0           []                               
                                                                                                  
 embedding (Embedding)          (None, 1, 32)        1741504     ['user_input[0][0]']             
                                                                                                  
 embedding_1 (Embedding)        (None, 1, 32)        77184       ['business_input[0][0]']         
                                                                                              

In [24]:
batch_size, epochs = 128, 20

# Plain NumPy arrays for Keras: the two encoded id columns and the target ratings.
user_ids = train_data['user_id_encoded'].to_numpy()
business_ids = train_data['business_id_encoded'].to_numpy()
stars = train_data['stars'].to_numpy()

print(user_ids.shape, business_ids.shape, stars.shape)

(245437,) (245437,) (245437,)


In [26]:
# Write the weights (not the full model) to disk whenever an epoch achieves a
# new lowest validation loss.
model_checkpoint = ModelCheckpoint(
    filepath=r'C:\Users\User\Desktop\Learnabay Training\My Portfolio projects for resume\Restaurant Recommendation System\model_weights.h5',
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    save_weights_only=True,
)

# Stop after 5 epochs without a val_loss improvement and roll the in-memory
# model back to its best weights.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
)

In [27]:
# Fit on the encoded (user, business) pairs; Keras holds out 20% of the
# supplied rows as validation data, which both callbacks monitor.
history = model.fit(
    x=[user_ids, business_ids],
    y=stars,
    epochs=epochs,
    batch_size=batch_size,
    validation_split=0.2,
    callbacks=[early_stopping, model_checkpoint],
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20


In [28]:
# Reminder of the test frame layout before extracting the evaluation arrays.
test_data.head(n=5)

Unnamed: 0,user_id,business_id,stars,user_id_encoded,business_id_encoded
142628,1TcuN6d1LOU12y5geVqCUw,e4MoozYGqe_rb4_ZC1rYMQ,4,2188,1584
154276,IQG1WlWNg2isqbdAX_Zppg,SDL247FlOnScJfuNjZESxg,3,16575,1111
55957,OINYv6r1OMFK9wbDpno9AQ,0K4RwxdAcViifyU3Htzxww,4,21485,46
105603,a-x197V4FMgtBuh63ijsow,RQAF6a0akMiot5lZZnMNNw,5,32377,1076
281111,yGPXAiE0IdZfMcJIcSxc1Q,5WF3593by2u-aE_WaM7_Mw,3,52956,249


In [29]:
# NumPy views of the held-out rows: model inputs and ground-truth ratings.
test_user_ids = test_data['user_id_encoded'].to_numpy()
test_business_ids = test_data['business_id_encoded'].to_numpy()
test_stars = test_data['stars'].to_numpy()

In [30]:
# Predicted rating for every held-out (user, business) pair.
predictions = model.predict(x=[test_user_ids, test_business_ids])



In [31]:
# Display the raw model outputs (one predicted rating per test row).
predictions

array([[4.3378353],
       [4.019232 ],
       [4.0773945],
       ...,
       [3.8636458],
       [3.4256692],
       [5.0630445]], dtype=float32)

In [32]:
# Range of the raw predictions; the output layer is linear, so nothing bounds it.
(np.min(predictions), np.max(predictions))

(0.9519359, 5.904071)

In [33]:
# Test-set MSE, with arguments named in scikit-learn's (y_true, y_pred) order.
# The metric is symmetric, so the value is unaffected by the ordering.
mean_squared_error(y_true=test_stars, y_pred=predictions)

1.0567940473556519

In [34]:
# Test-set MAE, with arguments named in scikit-learn's (y_true, y_pred) order.
# The metric is symmetric, so the value is unaffected by the ordering.
mean_absolute_error(y_true=test_stars, y_pred=predictions)

0.803666353225708

#### Training on the entire dataset

In [60]:
# First five rows of the merged, open-business review frame used for the final model.
reviews.head(n=5)

Unnamed: 0,user_id,business_id,stars
810,mh_-eMZ6K5RLWhZyISBhwA,L4kfcADLCU4T33i7Z0CkuA,2
811,LTl0cbH2a8QeQQ3XSA3_dw,L4kfcADLCU4T33i7Z0CkuA,5
812,syKoxudhp7dbwbh3xrgjVQ,L4kfcADLCU4T33i7Z0CkuA,3
813,V9n2Qyr-dvNg00BwMWqquQ,L4kfcADLCU4T33i7Z0CkuA,4
814,vEFJfeis4LEuM-y4qZvXAA,L4kfcADLCU4T33i7Z0CkuA,4


In [37]:
%%time
# Refit both encoders on the complete review set for the final model.
# NOTE: this rebinds `user_encoder` / `business_encoder`, replacing the
# encoders that were fitted on train_data earlier in the notebook.
user_encoder = LabelEncoder()
business_encoder = LabelEncoder()

# `reviews` is a filtered/column-selected frame; `assign` returns a new frame
# instead of writing columns into that selection in place, which avoids
# pandas' SettingWithCopyWarning and keeps the cell re-runnable.
reviews = reviews.assign(
    user_id_encoded=user_encoder.fit_transform(reviews['user_id']),
    business_id_encoded=business_encoder.fit_transform(reviews['business_id']),
)

CPU times: total: 172 ms
Wall time: 166 ms


In [38]:
# Vocabulary sizes after refitting on all reviews; they size the embedding tables below.
num_users = user_encoder.classes_.shape[0]
num_businesses = business_encoder.classes_.shape[0]

print(f"Unique Users: {num_users}, Unique Businesses: {num_businesses}")

Unique Users: 60425, Unique Businesses: 2412


In [39]:
embedding_dim = 32

# Two scalar inputs per sample: the encoded user id and the encoded business id.
user_input = Input(name='user_input', shape=(1,))
business_input = Input(name='business_input', shape=(1,))

# One learned `embedding_dim`-sized vector per user and per business, with a
# light L2 penalty on the embedding weights.
user_embedding = Embedding(
    input_dim=num_users,
    output_dim=embedding_dim,
    embeddings_regularizer=l2(1e-6),
)(user_input)
business_embedding = Embedding(
    input_dim=num_businesses,
    output_dim=embedding_dim,
    embeddings_regularizer=l2(1e-6),
)(business_input)

# (batch, 1, dim) -> (batch, dim)
user_flatten = Flatten()(user_embedding)
business_flatten = Flatten()(business_embedding)

# Join both vectors and normalise them before the dense head.
merged = Concatenate()([user_flatten, business_flatten])
merged = BatchNormalization()(merged)

# Small regression head: one hidden layer with dropout and a single linear
# output for the predicted star rating.
dense_layer = Dense(units=128, activation='relu')(merged)
dropout = Dropout(rate=0.4)(dense_layer)
output_layer = Dense(units=1, activation='linear')(dropout)

model = Model(inputs=[user_input, business_input], outputs=output_layer)
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae'])
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 user_input (InputLayer)        [(None, 1)]          0           []                               
                                                                                                  
 business_input (InputLayer)    [(None, 1)]          0           []                               
                                                                                                  
 embedding_2 (Embedding)        (None, 1, 32)        1933600     ['user_input[0][0]']             
                                                                                                  
 embedding_3 (Embedding)        (None, 1, 32)        77184       ['business_input[0][0]']         
                                                                                            

In [40]:
batch_size = 128
epochs = 20

# Keras carves `validation_split` off the END of the arrays, before any
# shuffling. `reviews` appears to be grouped by business after the merge (see
# the repeated business_id in `reviews.head()`), so without a shuffle the
# validation set would be a block of businesses rather than a random sample,
# and val_loss, which drives early stopping and checkpointing, would be
# misleading. Shuffle the rows once, with a fixed seed, before extracting
# the arrays.
shuffled_reviews = reviews.sample(frac=1, random_state=42)

user_ids = shuffled_reviews['user_id_encoded'].values
business_ids = shuffled_reviews['business_id_encoded'].values
stars = shuffled_reviews['stars'].values

print(np.shape(user_ids), np.shape(business_ids), np.shape(stars))

(306797,) (306797,) (306797,)


In [41]:
# Write the weights (not the full model) to disk whenever an epoch achieves a
# new lowest validation loss.
model_checkpoint = ModelCheckpoint(
    filepath=r'C:\Users\User\Desktop\Learnabay Training\My Portfolio projects for resume\Restaurant Recommendation System\model_weights.h5',
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    save_weights_only=True,
)

# Stop after 5 epochs without a val_loss improvement and roll the in-memory
# model back to its best weights.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
)

In [42]:
# Fit the final model on the encoded (user, business) pairs; Keras holds out
# 20% of the supplied rows as validation data, which both callbacks monitor.
history = model.fit(
    x=[user_ids, business_ids],
    y=stars,
    epochs=epochs,
    batch_size=batch_size,
    validation_split=0.2,
    callbacks=[early_stopping, model_checkpoint],
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20


In [43]:
import pickle

# Persist both fitted encoders next to the model weights so the same
# id -> integer mapping can be reloaded later.
for fitted_encoder, pickle_path in (
    (user_encoder, r'C:\Users\User\Desktop\Learnabay Training\My Portfolio projects for resume\Restaurant Recommendation System\user_encoder.pickle'),
    (business_encoder, r'C:\Users\User\Desktop\Learnabay Training\My Portfolio projects for resume\Restaurant Recommendation System\business_encoder.pickle'),
):
    with open(pickle_path, 'wb') as f:
        pickle.dump(fitted_encoder, f)