In [81]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import hashlib

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error

import tensorflow as tf
from tensorflow.keras.layers import Embedding, Flatten, Input, Dot, Concatenate, Dense, Dropout, BatchNormalization
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# Show every column when a DataFrame is displayed (no column truncation).
pd.options.display.max_columns = None

In [98]:
# Load the pre-processed review table (user_id, business_id, stars).
reviews = pd.read_csv(filepath_or_buffer='../dataset/processed/reviews.csv')

In [99]:
# Summarise column dtypes and non-null counts of the freshly loaded frame.
reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 419344 entries, 0 to 419343
Data columns (total 3 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   user_id      419344 non-null  object
 1   business_id  419344 non-null  object
 2   stars        419344 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 9.6+ MB


In [101]:
# Drop rows that are exact duplicates across all columns.
reviews = reviews.drop_duplicates()

# NOTE(review): reviews.info() reports integer `stars`, yet every later output
# shows values in 0.2-1.0, and the cell with execution count 100 is missing
# from the notebook. The rescaling step therefore only exists as hidden kernel
# state. It is restored here so that Restart & Run All reproduces the targets
# the model was trained on. Dividing by 5 assumes a 1-5 star scale (consistent
# with the 0.2 steps seen downstream) - confirm against the data source.
# The guard keeps the cell idempotent: already-scaled ratings are left alone.
if reviews['stars'].max() > 1:
    reviews = reviews.assign(stars=reviews['stars'] / 5)

In [102]:
# Preview the first rows of the de-duplicated review table.
reviews.head()

Unnamed: 0,user_id,business_id,stars
0,mh_-eMZ6K5RLWhZyISBhwA,d_tRshM-w6S4QxE4VVi8tQ,0.6
1,Zs8Zk3sgh5JxRmoZW4PJcg,d_tRshM-w6S4QxE4VVi8tQ,0.4
2,OkjeFppodgJP_CvB84cJYw,d_tRshM-w6S4QxE4VVi8tQ,0.6
3,11xwVSDv1ytjcyvUiyBWaw,d_tRshM-w6S4QxE4VVi8tQ,0.8
4,8EMU7d4pCkdqUnvlIW40CA,d_tRshM-w6S4QxE4VVi8tQ,0.8


In [103]:
%%time
train_data, test_data = train_test_split(reviews, test_size=0.2)

Wall time: 65.7 ms


In [104]:
# Report (rows, columns) of the train and test splits, one per line.
print(train_data.shape, test_data.shape, sep='\n')

(328840, 3)
(82211, 3)


In [105]:
%%time
user_encoder = LabelEncoder()
business_encoder = LabelEncoder()

train_data['user_id_encoded'] = user_encoder.fit_transform(train_data['user_id'])
train_data['business_id_encoded'] = business_encoder.fit_transform(train_data['business_id'])

Wall time: 290 ms


In [106]:
# (number of distinct users, number of distinct businesses) learned from the training split.
tuple(len(encoder.classes_) for encoder in (user_encoder, business_encoder))

(60263, 3828)

In [107]:
# Keep only test rows whose user AND business were both seen while fitting the
# encoders; unseen IDs cannot be transformed by the fitted LabelEncoders.
known_user = test_data['user_id'].isin(user_encoder.classes_)
known_business = test_data['business_id'].isin(business_encoder.classes_)
test_data = test_data[known_user & known_business]

In [108]:
# Shapes after encoding the training split and filtering the test split.
print(train_data.shape, test_data.shape, sep='\n')

(328840, 5)
(75350, 3)


In [109]:
# Encode the test IDs with the encoders fitted on the training split.
# test_data is the result of boolean-mask filtering, so assigning columns onto
# it directly triggers pandas' SettingWithCopyWarning; assign() returns a new,
# independent frame with the same columns in the same order.
test_data = test_data.assign(
    user_id_encoded=user_encoder.transform(test_data['user_id']),
    business_id_encoded=business_encoder.transform(test_data['business_id']),
)

In [110]:
# Preview the training split with the encoded ID columns added.
train_data.head()

Unnamed: 0,user_id,business_id,stars,user_id_encoded,business_id_encoded
336382,KWrJZDQXkrt9vzQDh9iqJQ,4gvAsj8NRTla82aSBzGytw,0.8,20384,330
4528,tugozkS2AcmfOR99RdUxjQ,0oSSjekU-3GR8gselReWnA,1.0,54609,106
385179,rTz-hsyfsBMw-ywhnsSOOQ,SYKy8Y_zDlsDuBoZsAckig,0.4,52299,1763
128065,ikIT45t6zrQQ55JBoozHFw,9rdEIoILba8wSZcrTLFzNA,0.2,44109,654
14297,k07Q72rcLjwl-4y1JF_KFA,jkGQQ4_LgJx3hwPtCFkzbQ,0.6,45306,2855


In [111]:
# Preview the filtered test split with the encoded ID columns added.
test_data.head()

Unnamed: 0,user_id,business_id,stars,user_id_encoded,business_id_encoded
88341,70y7ExiEHrawld5lruwUVw,jziliEq8Zum-EynD5v-Hvg,0.6,7667,2867
325788,LDwoB4NjFZqcCt03J7K2nQ,PhvIKjKB-tkC6Lld0gQn9Q,0.6,20992,1601
169693,ERC_M2sb4n4kaiKhrrRtJA,OdIBX09glfXNVSyd0RnIeg,0.8,14606,1538
156427,veZGsORMB2Ti-AMxoXIP0g,Uky0DD3LU4C7eyNDhpmOXg,0.8,56248,1897
301540,BQZHDwepT-y5hxCXVrND0w,qaDImxPguQz0jToNYvB1Eg,0.8,11752,3280


In [112]:
# Query the GPU device name once; an empty string means no GPU is visible.
gpu_name = tf.test.gpu_device_name()
print(
    'GPU device found: {}'.format(gpu_name)
    if gpu_name
    else "No GPU device found. Training on CPU."
)

GPU device found: /device:GPU:0


In [113]:
# Vocabulary sizes for the two embedding tables, taken from the fitted encoders.
num_users, num_businesses = (
    len(encoder.classes_) for encoder in (user_encoder, business_encoder)
)

print(f"Unique Users: {num_users}, Unique Businesses: {num_businesses}")

Unique Users: 60263, Unique Businesses: 3828


In [114]:
# Width of each learned user / business vector.
embedding_dim = 32

# Each sample supplies a single encoded user ID and a single encoded business ID.
user_input = Input(shape=(1,), name='user_input')
business_input = Input(shape=(1,), name='business_input')

# Embedding lookups with a light L2 penalty on the embedding weights.
user_vec = Embedding(
    input_dim=num_users,
    output_dim=embedding_dim,
    embeddings_regularizer=l2(1e-6),
)(user_input)
business_vec = Embedding(
    input_dim=num_businesses,
    output_dim=embedding_dim,
    embeddings_regularizer=l2(1e-6),
)(business_input)

# Collapse the (1, embedding_dim) lookups to flat vectors.
user_vec = Flatten()(user_vec)
business_vec = Flatten()(business_vec)

# Join both vectors into one feature vector and normalise it.
features = Concatenate()([user_vec, business_vec])
features = BatchNormalization()(features)

# Small MLP head ending in a single linear unit (the predicted rating).
hidden = Dense(128, activation='relu')(features)
hidden = Dropout(0.4)(hidden)
rating = Dense(1, activation='linear')(hidden)

model = Model(inputs=[user_input, business_input], outputs=rating)
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae'])
model.summary()

Model: "model_8"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 user_input (InputLayer)        [(None, 1)]          0           []                               
                                                                                                  
 business_input (InputLayer)    [(None, 1)]          0           []                               
                                                                                                  
 embedding_16 (Embedding)       (None, 1, 32)        1928416     ['user_input[0][0]']             
                                                                                                  
 embedding_17 (Embedding)       (None, 1, 32)        122496      ['business_input[0][0]']         
                                                                                            

In [115]:
# Training hyper-parameters.
batch_size, epochs = 128, 20

# Model inputs (encoded IDs) and regression target as plain NumPy arrays.
user_ids, business_ids, stars = (
    train_data[column].to_numpy()
    for column in ('user_id_encoded', 'business_id_encoded', 'stars')
)

In [116]:
# Stop training once validation loss has not improved for 5 epochs and roll
# the model back to its best weights.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
)

# Persist weights only (not the full model), and only when validation loss
# reaches a new minimum.
model_checkpoint = ModelCheckpoint(
    '../saved/model_weights.h5',
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    save_weights_only=True,
)

In [117]:
# Train on (user, business) -> stars, holding out 20% of the training arrays
# for validation; the callbacks monitor that validation loss.
history = model.fit(
    x=[user_ids, business_ids],
    y=stars,
    epochs=epochs,
    batch_size=batch_size,
    validation_split=0.2,
    callbacks=[early_stopping, model_checkpoint],
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20


In [118]:
# Re-inspect the test split before extracting the evaluation arrays.
test_data.head()

Unnamed: 0,user_id,business_id,stars,user_id_encoded,business_id_encoded
88341,70y7ExiEHrawld5lruwUVw,jziliEq8Zum-EynD5v-Hvg,0.6,7667,2867
325788,LDwoB4NjFZqcCt03J7K2nQ,PhvIKjKB-tkC6Lld0gQn9Q,0.6,20992,1601
169693,ERC_M2sb4n4kaiKhrrRtJA,OdIBX09glfXNVSyd0RnIeg,0.8,14606,1538
156427,veZGsORMB2Ti-AMxoXIP0g,Uky0DD3LU4C7eyNDhpmOXg,0.8,56248,1897
301540,BQZHDwepT-y5hxCXVrND0w,qaDImxPguQz0jToNYvB1Eg,0.8,11752,3280


In [119]:
# Evaluation inputs (encoded IDs) and ground-truth ratings as NumPy arrays.
test_user_ids, test_business_ids, test_stars = (
    test_data[column].to_numpy()
    for column in ('user_id_encoded', 'business_id_encoded', 'stars')
)

In [120]:
# Score every held-out (user, business) pair with the trained model.
predictions = model.predict(x=[test_user_ids, test_business_ids])

In [121]:
# Display the raw model outputs for the test pairs.
predictions

array([[0.80033386],
       [0.7516659 ],
       [0.8365202 ],
       ...,
       [0.7373908 ],
       [0.8345917 ],
       [0.76848227]], dtype=float32)

In [122]:
# Range of the raw predictions (the linear output unit is unbounded).
np.min(predictions), np.max(predictions)

(0.09225136, 1.1443138)

In [123]:
# Test-set MSE. scikit-learn's signature is (y_true, y_pred); the arguments
# were previously passed in the reverse order.
mean_squared_error(test_stars, predictions)

0.042213771433088056

In [124]:
# Test-set MAE. scikit-learn's signature is (y_true, y_pred); the arguments
# were previously passed in the reverse order.
mean_absolute_error(test_stars, predictions)

0.16063959959855362

In [128]:
# Spot-check one pair: these encoded IDs are the ones shown for row 301540 in
# the test_data.head() preview (actual stars there: 0.8).
sample_user_id, sample_business_id = 11752, 3280
model.predict([np.array([sample_user_id]), np.array([sample_business_id])])

array([[0.96884644]], dtype=float32)