In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import hashlib

import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Embedding, Flatten, Input, Dot, Concatenate, Dense
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

pd.set_option('display.max_columns', None)

In [2]:
%%time
reviews_chunk = pd.read_json("../dataset/jsons/yelp_academic_dataset_review.json", lines=True,
                      dtype={'review_id':str,'user_id':str,
                             'business_id':str,'stars':'int8',
                             'date':str,'text':str,'useful':'int8',
                             'funny':'int8','cool':'int8'},
                      chunksize=10000)

reviews_data = [review for review in reviews_chunk]
reviews = pd.concat(reviews_data)

Wall time: 3min 53s


In [3]:
reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6990280 entries, 0 to 6990279
Data columns (total 9 columns):
 #   Column       Dtype 
---  ------       ----- 
 0   review_id    object
 1   user_id      object
 2   business_id  object
 3   stars        int8  
 4   useful       int8  
 5   funny        int8  
 6   cool         int8  
 7   text         object
 8   date         object
dtypes: int8(4), object(5)
memory usage: 293.3+ MB


In [4]:
reviews.head()

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,KU_O5udG6zpxOg-VcAEodg,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,3,0,0,0,"If you decide to eat here, just be aware it is...",2018-07-07 22:09:11
1,BiTunyQ73aT9WBnpR9DZGw,OyoGAe7OKpv6SyGZT5g77Q,7ATYjTIgM3jUlt4UM3IypQ,5,1,0,1,I've taken a lot of spin classes over the year...,2012-01-03 15:28:18
2,saUsX_uimxRlCVr67Z4Jig,8g_iMtfSiwikVnbP2etR0A,YjUWPpI6HXG530lwP-fb2A,3,0,0,0,Family diner. Had the buffet. Eclectic assortm...,2014-02-05 20:30:30
3,AqPFMleE6RsU23_auESxiA,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA,5,1,0,1,"Wow! Yummy, different, delicious. Our favo...",2015-01-04 00:01:03
4,Sx8TMOWLNuJBWer-0pcmoA,bcjbaE6dDog4jkNY91ncLQ,e4Vwtrqf-wpJfwesgvdgxQ,4,1,0,1,Cute interior and owner (?) gave us tour of up...,2017-01-14 20:54:15


In [5]:
del reviews_chunk, reviews_data

In [6]:
reviews = reviews[['user_id','business_id','stars']]

In [7]:
reviews

Unnamed: 0,user_id,business_id,stars
0,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,3
1,OyoGAe7OKpv6SyGZT5g77Q,7ATYjTIgM3jUlt4UM3IypQ,5
2,8g_iMtfSiwikVnbP2etR0A,YjUWPpI6HXG530lwP-fb2A,3
3,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA,5
4,bcjbaE6dDog4jkNY91ncLQ,e4Vwtrqf-wpJfwesgvdgxQ,4
...,...,...,...
6990275,qskILQ3k0I_qcCMI-k6_QQ,jals67o91gcrD4DC81Vk6w,5
6990276,Zo0th2m8Ez4gLSbHftiQvg,2vLksaMmSEcGbjI5gywpZA,5
6990277,mm6E4FbCMwJmb7kPDZ5v2Q,R1khUUxidqfaJmcpmGd4aw,4
6990278,YwAMC-jvZ1fvEUum6QkEkw,Rr9kKArrMhSLVE9a53q-aA,5


In [8]:
%%time
train_data, test_data = train_test_split(reviews, test_size=0.2)

Wall time: 5.7 s


In [9]:
print(train_data.shape)
print(test_data.shape)

(5592224, 3)
(1398056, 3)


In [10]:
%%time
user_encoder = LabelEncoder()
business_encoder = LabelEncoder()

train_data['user_id_encoded'] = user_encoder.fit_transform(train_data['user_id'])
train_data['business_id_encoded'] = business_encoder.fit_transform(train_data['business_id'])

Wall time: 17.6 s


In [11]:
len(user_encoder.classes_), len(business_encoder.classes_)

(1745868, 150342)

In [12]:
test_data = test_data[test_data['user_id'].isin(user_encoder.classes_)]
test_data = test_data[test_data['business_id'].isin(business_encoder.classes_)]

In [13]:
print(train_data.shape)
print(test_data.shape)

(5592224, 5)
(1140174, 3)


In [14]:
test_data['user_id_encoded'] = user_encoder.transform(test_data['user_id'])
test_data['business_id_encoded'] = business_encoder.transform(test_data['business_id'])

In [15]:
train_data

Unnamed: 0,user_id,business_id,stars,user_id_encoded,business_id_encoded
6778669,ZVeO6CnkEvkvW30WQFB6eQ,hXGIMryL90iWW-gPzYEpUA,2,996707,106759
6878193,cZCOdn_bIKw78mNb8PhKLg,5l8nLbyo7altE7HluWq1Bg,5,1107269,16055
3390267,DDItoaNRmJJYrhJGz4ZJ3w,yXX7xmclZkuBif-7QoyCuw,3,389213,146911
2560864,AFNBY21DdTZ_pH5TWZzQ9g,4EXm70lqkCxZ-WtgNon8sw,1,308062,12433
3447431,LbpZOMwcfB6yAiSiyOoiDg,Lmstm1D0Ch8QP1b2TB-tVw,5,618602,53461
...,...,...,...,...,...
5167331,QXPbWV6RMMnKpUNIaYMCuQ,Bgv4BhVzBOPkbcgNUzdgcA,1,752419,30115
3517100,ourE_S1OyE2vv3o-Fle_kA,thlyktt2c7v1DyOpSVSibw,4,1444259,135374
3073084,41a6Rh82q6gr4hHjNpo1aA,IsNdI6kw8GlAopKCpEGt9w,5,138011,46661
2614662,inzMgE6PWlzDR7J80r9o5Q,h_D-p_m3diwqlAHQsOUjCA,5,1278146,106863


In [16]:
test_data

Unnamed: 0,user_id,business_id,stars,user_id_encoded,business_id_encoded
239472,9zG2PRZnMGe5F29a5NK6Rw,dNR-b-CsrFGYhMo9zLMrCw,4,300751,96934
4281013,It0-Upv_-6FxbMjjGjLeyg,3EJr6A23wLRPlekclGwHlA,5,544028,10115
5287599,NSwhq-KFFoatc8jKb8lrJw,KdAWjL9MKjpJzEeI902qBA,5,668639,50815
2624227,fMrV62gzDWGEtCgD-xcS6g,-QI8Qi8XWH3D8y8ethnajA,2,1184061,989
1566589,tL0M8q1-3ojo_UAG1FfhWA,jeiV-bDLPmo0AhBZGF8v4w,5,1564772,111749
...,...,...,...,...,...
5830488,3CoSdSKcBeL6td6y56Ge4g,VUkBbffnuUpKX9MKRUt4yQ,5,115265,76212
6328293,D6pRYr5uPPpe9O0dMsS9Nw,jIuRCFtO7rEOezPAn7rCQg,5,386377,110898
644602,N5zjh9ato1iHAjou4DM1lQ,5CZIGD5oNpS5gQTiVxHCdg,5,658870,14747
985146,kIkw_hlSkANX5_xwwW1amw,4_-IcMpkF_sBRHomWZHNzA,5,1319135,13184


In [17]:
if tf.test.gpu_device_name():
    print('GPU device found: {}'.format(tf.test.gpu_device_name()))
else:
    print("No GPU device found. Training on CPU.")

GPU device found: /device:GPU:0


In [18]:
num_users = len(user_encoder.classes_)
num_businesses = len(business_encoder.classes_)

print(f"Unique Users: {num_users}, Unique Businesses: {num_businesses}")

Unique Users: 1746009, Unique Businesses: 150339


In [19]:
embedding_dim=50

# User and business input layers
user_input = Input(shape=(1,), name='user_input')
business_input = Input(shape=(1,), name='business_input')

# Embedding layers for user and business IDs
user_embedding = Embedding(input_dim=num_users, output_dim=embedding_dim)(user_input)
business_embedding = Embedding(input_dim=num_businesses, output_dim=embedding_dim)(business_input)

# Flatten the embeddings
user_flatten = Flatten()(user_embedding)
business_flatten = Flatten()(business_embedding)

# Merge the embeddings using concat
merged = Concatenate()([user_flatten, business_flatten])

# Add some dense layers for additional modeling
dense_layer = Dense(64, activation='relu')(merged)
output_layer = Dense(1, activation='linear')(dense_layer)

model = Model(inputs=[user_input, business_input], outputs=output_layer)
model.compile(loss='mean_squared_error', optimizer='adam', metrics=['mae'])
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 user_input (InputLayer)        [(None, 1)]          0           []                               
                                                                                                  
 business_input (InputLayer)    [(None, 1)]          0           []                               
                                                                                                  
 embedding (Embedding)          (None, 1, 50)        87300450    ['user_input[0][0]']             
                                                                                                  
 embedding_1 (Embedding)        (None, 1, 50)        7516950     ['business_input[0][0]']         
                                                                                              

In [20]:
batch_size = 32
epochs = 10

user_ids = train_data['user_id_encoded'].values
business_ids = train_data['business_id_encoded'].values
stars = train_data['stars'].values

In [21]:
model_checkpoint = ModelCheckpoint(f'saved/model.h5', 
                             monitor='val_loss',   # Monitor validation loss
                             save_best_only=True,  # Save only the best model
                             mode='min'            # Mode of monitoring (minimize validation loss)
                            )

early_stopping = EarlyStopping(monitor='val_loss',
                               patience=1,
                               restore_best_weights=True
                              )

In [None]:
history = model.fit(
    [user_ids, business_ids],
    stars,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.2
)

In [None]:
del reviews, train_data

In [2]:
model = load_model("../saved/model.h5")
model.summary()

TypeError: weight_decay is not a valid argument, kwargs should be empty  for `optimizer_experimental.Optimizer`.