In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import hashlib

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

import tensorflow as tf
from tensorflow.keras.layers import Embedding, Flatten, Input, Dot, Concatenate, Dense, Dropout, BatchNormalization
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# Show every column when a DataFrame is displayed (no column truncation).
pd.options.display.max_columns = None

In [3]:
# Load the pre-processed review table from disk.
reviews = pd.read_csv(filepath_or_buffer='../dataset/processed/reviews.csv')

In [4]:
# Summarize the loaded frame: column names, non-null counts, dtypes and memory use.
reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 419344 entries, 0 to 419343
Data columns (total 3 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   user_id      419344 non-null  object
 1   business_id  419344 non-null  object
 2   stars        419344 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 9.6+ MB


In [5]:
# Preview the first five reviews.
reviews.head(n=5)

Unnamed: 0,user_id,business_id,stars
0,mh_-eMZ6K5RLWhZyISBhwA,d_tRshM-w6S4QxE4VVi8tQ,3
1,Zs8Zk3sgh5JxRmoZW4PJcg,d_tRshM-w6S4QxE4VVi8tQ,2
2,OkjeFppodgJP_CvB84cJYw,d_tRshM-w6S4QxE4VVi8tQ,3
3,11xwVSDv1ytjcyvUiyBWaw,d_tRshM-w6S4QxE4VVi8tQ,4
4,8EMU7d4pCkdqUnvlIW40CA,d_tRshM-w6S4QxE4VVi8tQ,4


In [6]:
%%time
train_data, test_data = train_test_split(reviews, test_size=0.2)

Wall time: 72.1 ms


In [7]:
# Report (rows, columns) of the train and test splits, one per line.
print(train_data.shape, test_data.shape, sep='\n')

(335475, 3)
(83869, 3)


In [8]:
%%time
user_encoder = LabelEncoder()
business_encoder = LabelEncoder()

train_data['user_id_encoded'] = user_encoder.fit_transform(train_data['user_id'])
train_data['business_id_encoded'] = business_encoder.fit_transform(train_data['business_id'])

Wall time: 277 ms


In [9]:
# (number of distinct training users, number of distinct training businesses)
tuple(len(encoder.classes_) for encoder in (user_encoder, business_encoder))

(60241, 3829)

In [10]:
# Keep only test reviews whose user AND business both appear in the training
# split: the encoders were fitted on training IDs only, and
# LabelEncoder.transform raises on labels it has not seen.
known_user = test_data['user_id'].isin(user_encoder.classes_)
known_business = test_data['business_id'].isin(business_encoder.classes_)

# .copy() makes the filtered result an independent frame, so adding the
# encoded columns to it later does not trigger pandas' SettingWithCopyWarning.
test_data = test_data[known_user & known_business].copy()

In [11]:
# Shapes after encoding the training split and filtering the test split.
print(train_data.shape, test_data.shape, sep='\n')

(335475, 5)
(76971, 3)


In [12]:
# Apply the already-fitted encoders to the test split, writing each result to
# a '<column>_encoded' column alongside the original string ID.
for id_column, fitted_encoder in (('user_id', user_encoder), ('business_id', business_encoder)):
    test_data[id_column + '_encoded'] = fitted_encoder.transform(test_data[id_column])

In [13]:
# Preview the training split with its new encoded ID columns.
train_data.head(n=5)

Unnamed: 0,user_id,business_id,stars,user_id_encoded,business_id_encoded
375057,Cf8Jw9yX8RrG-l4PQoMkuw,lUHztzYjhD743ZuOQvgyMg,3,12929,2975
256192,USD9k93iOIB88v93B5iBcw,IjMs1n7UelI7ev_5IDl5kA,5,29574,1193
235870,NaGyO0mjagqD2lGNsOzzaQ,TRwPE6wsoAL6_fRaFdB4FA,5,23104,1808
11769,7qBgGzyf0FbHKC0jiI1J8A,99e7bysta1myyrQogFEWUQ,4,8469,610
196979,yoznWDhnmkKrQLk__LhnBQ,_UOg5_pk9IhKee91eWrT4A,4,59200,2220


In [14]:
# Preview the test split with its new encoded ID columns.
test_data.head(n=5)

Unnamed: 0,user_id,business_id,stars,user_id_encoded,business_id_encoded
309565,zTwwciNRMedBvUS3-_8h6g,d48Xrx8MhGtdaLvhcYzNWQ,4,59799,2419
90194,WY7B-CBeLbjjDBr73l4k-A,fEqiXG_B-fn__w0aeF3nBQ,5,31556,2558
167636,3Zl3RokY8eyRJQbbQmuOuA,F7fC4dZW17muSvE8ixaXkQ,2,4372,983
107314,LBgWS6FR9sHxiiHm7z7kZg,iWy6Wft0MMO9Ud2wM93frg,5,20942,2772
345401,wsdtHpgw45hpju4mREenoQ,7HE_mWkvZ8GxsP0sLP-_ng,4,57366,490


In [15]:
# Query TensorFlow once for the GPU device name; an empty string means no GPU.
gpu_name = tf.test.gpu_device_name()
if not gpu_name:
    print("No GPU device found. Training on CPU.")
else:
    print('GPU device found: {}'.format(gpu_name))

GPU device found: /device:GPU:0


In [16]:
# Vocabulary sizes for the embedding layers: one entry per distinct ID the
# encoders learned from the training split.
num_users, num_businesses = (
    len(encoder.classes_) for encoder in (user_encoder, business_encoder)
)

print(f"Unique Users: {num_users}, Unique Businesses: {num_businesses}")

Unique Users: 60241, Unique Businesses: 3829


In [21]:
# Neural collaborative-filtering style regressor: a user ID and a business ID
# are each mapped to a learned embedding vector, the two vectors are
# concatenated and passed through a small MLP that outputs one number, the
# predicted star rating.

# Length of each learned embedding vector (shared by users and businesses).
embedding_dim=50

# User and business input layers: each sample is a single integer-encoded ID.
user_input = Input(shape=(1,), name='user_input')
business_input = Input(shape=(1,), name='business_input')

# Embedding layers for user and business IDs; input_dim is the number of
# distinct encoded IDs, so every code produced by the encoders has a row.
user_embedding = Embedding(input_dim=num_users, output_dim=embedding_dim)(user_input)
business_embedding = Embedding(input_dim=num_businesses, output_dim=embedding_dim)(business_input)

# Flatten the embeddings: drop the length-1 sequence axis so each becomes a
# flat vector of embedding_dim values per sample.
user_flatten = Flatten()(user_embedding)
business_flatten = Flatten()(business_embedding)

# Merge the embeddings using concat (user vector followed by business vector).
merged = Concatenate()([user_flatten, business_flatten])

# Add some dense layers for additional modeling: one hidden ReLU layer, then
# a single linear unit because the target (stars) is treated as a regression.
dense_layer = Dense(64, activation='relu')(merged)
output_layer = Dense(1, activation='linear')(dense_layer)

# Assemble the two-input model, train with MSE loss and track MAE as a metric.
model = Model(inputs=[user_input, business_input], outputs=output_layer)
model.compile(loss='mean_squared_error', optimizer='adam', metrics=['mae'])
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 user_input (InputLayer)        [(None, 1)]          0           []                               
                                                                                                  
 business_input (InputLayer)    [(None, 1)]          0           []                               
                                                                                                  
 embedding_2 (Embedding)        (None, 1, 50)        3012050     ['user_input[0][0]']             
                                                                                                  
 embedding_3 (Embedding)        (None, 1, 50)        191450      ['business_input[0][0]']         
                                                                                            

In [22]:
# Training hyper-parameters.
batch_size, epochs = 128, 20

# Plain NumPy arrays for Keras: the two encoded ID columns are the model
# inputs and the star rating is the regression target.
user_ids = train_data['user_id_encoded'].to_numpy()
business_ids = train_data['business_id_encoded'].to_numpy()
stars = train_data['stars'].to_numpy()

In [23]:
# Persist the model weights to disk whenever validation loss reaches a new
# minimum. The path is a plain string: the original used an f-string with no
# placeholders, which had no effect.
model_checkpoint = ModelCheckpoint(
    '../saved/model_weights.h5',
    monitor='val_loss',        # Monitor validation loss
    save_best_only=True,       # Only overwrite the file when val_loss improves
    save_weights_only=True,    # Store weights only, not the full model
    mode='min',                # Lower val_loss is better
)

# Stop training once val_loss has not improved for 5 consecutive epochs and
# roll the model back to the weights of its best epoch.
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

In [24]:
# Train the model; 20% of the training arrays is held out by Keras for
# validation, which is what both callbacks monitor through val_loss.
history = model.fit(
    x=[user_ids, business_ids],
    y=stars,
    epochs=epochs,
    batch_size=batch_size,
    validation_split=0.2,
    callbacks=[early_stopping, model_checkpoint],
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20


In [25]:
# Re-display the first test rows for reference before evaluating the model.
test_data.head(n=5)

Unnamed: 0,user_id,business_id,stars,user_id_encoded,business_id_encoded
309565,zTwwciNRMedBvUS3-_8h6g,d48Xrx8MhGtdaLvhcYzNWQ,4,59799,2419
90194,WY7B-CBeLbjjDBr73l4k-A,fEqiXG_B-fn__w0aeF3nBQ,5,31556,2558
167636,3Zl3RokY8eyRJQbbQmuOuA,F7fC4dZW17muSvE8ixaXkQ,2,4372,983
107314,LBgWS6FR9sHxiiHm7z7kZg,iWy6Wft0MMO9Ud2wM93frg,5,20942,2772
345401,wsdtHpgw45hpju4mREenoQ,7HE_mWkvZ8GxsP0sLP-_ng,4,57366,490


In [26]:
# NumPy views of the test split: encoded IDs as model inputs, stars as truth.
test_user_ids = test_data['user_id_encoded'].to_numpy()
test_business_ids = test_data['business_id_encoded'].to_numpy()
test_stars = test_data['stars'].to_numpy()

In [27]:
# Predicted star rating for every (user, business) pair in the test split.
predictions = model.predict(x=[test_user_ids, test_business_ids])

In [28]:
# Range of the predicted ratings (the linear output unit is unbounded).
np.min(predictions), np.max(predictions)

(0.9913414, 5.2857676)

In [29]:
# Test-set mean squared error.
# scikit-learn's signature is mean_squared_error(y_true, y_pred): pass the
# ground truth first. The model output has a trailing length-1 axis from the
# single output unit, so flatten it to match the 1-D array of true ratings.
# MSE is symmetric in its two arguments, so the reported value is unchanged.
mean_squared_error(test_stars, predictions.ravel())

1.0379827003781854

In [32]:
# Spot-check one (user, business) pair by its encoded IDs. This pair is the
# third row shown by test_data.head(), whose true rating is 2 stars.
model.predict(x=[np.array([4372]), np.array([983])])

array([[3.0794566]], dtype=float32)

In [None]:
988309	79658
1	uf6Nq_CanHaEn8h5XM4l_Q	1wKbk-FtJBBidd1k8s09DA	5	1600928	7044
2	P-Fv4Mm56a2wTEsA34JMew	7l7XOid3zqMgq-Of281Qww	5	710613	20834
3	-YEfO7S7324NPQqhka0lBA	DE5YBg3Hry6zigdS_oSRNQ	5	14887	33670
4	t1IKpp6jkLdZn93RqLJnIw	1pDauBMngzqWG4ZTPMEY1Q	5	1556333	6812
...	...	...	...	...	...
1140169	ePmanjMTkYwpO65_9_fwQA	hOupS2QRNIuMLfcpOqC0oA	5	1157644	106446
1140170	HfGKgfULfqWkfPvB_3299Q	DRr5xdfHtqgWfhmdTU1B8Q	3	510637	34178
1140171	e13v69_97zYfuET0yvhhsQ	66Xlt-k1ZhVVq1ql6fX30A	5	1147048	16912
1140172	X1nvKXUJ5Lp3W9Oe-_JrMQ	e6DF-FlJ-BXf1vetAkr3Tg	5	929471	98601
1140173	aPqxf2vDDOIFkbdxHCIIXA	wa9tG3QDiR5Qx8P_5oIwdQ	3	1048601	142304