In [63]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import hashlib

import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Embedding, Flatten, Input, Dot, Concatenate, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

pd.set_option('display.max_columns', None)

In [3]:
%%time
reviews_chunk = pd.read_json("../dataset/jsons/yelp_academic_dataset_review.json", lines=True,
                      dtype={'review_id':str,'user_id':str,
                             'business_id':str,'stars':'int8',
                             'date':str,'text':str,'useful':'int8',
                             'funny':'int8','cool':'int8'},
                      chunksize=10000)

reviews_data = [review for review in reviews_chunk]
reviews = pd.concat(reviews_data)

Wall time: 4min 20s


In [5]:
reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6990280 entries, 0 to 6990279
Data columns (total 9 columns):
 #   Column       Dtype 
---  ------       ----- 
 0   review_id    object
 1   user_id      object
 2   business_id  object
 3   stars        int8  
 4   useful       int8  
 5   funny        int8  
 6   cool         int8  
 7   text         object
 8   date         object
dtypes: int8(4), object(5)
memory usage: 293.3+ MB


In [6]:
reviews.head()

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,KU_O5udG6zpxOg-VcAEodg,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,3,0,0,0,"If you decide to eat here, just be aware it is...",2018-07-07 22:09:11
1,BiTunyQ73aT9WBnpR9DZGw,OyoGAe7OKpv6SyGZT5g77Q,7ATYjTIgM3jUlt4UM3IypQ,5,1,0,1,I've taken a lot of spin classes over the year...,2012-01-03 15:28:18
2,saUsX_uimxRlCVr67Z4Jig,8g_iMtfSiwikVnbP2etR0A,YjUWPpI6HXG530lwP-fb2A,3,0,0,0,Family diner. Had the buffet. Eclectic assortm...,2014-02-05 20:30:30
3,AqPFMleE6RsU23_auESxiA,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA,5,1,0,1,"Wow! Yummy, different, delicious. Our favo...",2015-01-04 00:01:03
4,Sx8TMOWLNuJBWer-0pcmoA,bcjbaE6dDog4jkNY91ncLQ,e4Vwtrqf-wpJfwesgvdgxQ,4,1,0,1,Cute interior and owner (?) gave us tour of up...,2017-01-14 20:54:15


In [7]:
del reviews_chunk, reviews_data

In [9]:
reviews = reviews[['user_id','business_id','stars']]

In [10]:
reviews

Unnamed: 0,user_id,business_id,stars
0,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,3
1,OyoGAe7OKpv6SyGZT5g77Q,7ATYjTIgM3jUlt4UM3IypQ,5
2,8g_iMtfSiwikVnbP2etR0A,YjUWPpI6HXG530lwP-fb2A,3
3,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA,5
4,bcjbaE6dDog4jkNY91ncLQ,e4Vwtrqf-wpJfwesgvdgxQ,4
...,...,...,...
6990275,qskILQ3k0I_qcCMI-k6_QQ,jals67o91gcrD4DC81Vk6w,5
6990276,Zo0th2m8Ez4gLSbHftiQvg,2vLksaMmSEcGbjI5gywpZA,5
6990277,mm6E4FbCMwJmb7kPDZ5v2Q,R1khUUxidqfaJmcpmGd4aw,4
6990278,YwAMC-jvZ1fvEUum6QkEkw,Rr9kKArrMhSLVE9a53q-aA,5


In [32]:
%%time
train_data, test_data = train_test_split(reviews, test_size=0.2)

Wall time: 4.82 s


In [33]:
print(train_data.shape)
print(test_data.shape)

(5592224, 3)
(1398056, 3)


In [36]:
%%time
user_encoder = LabelEncoder()
business_encoder = LabelEncoder()

train_data['user_id_encoded'] = user_encoder.fit_transform(train_data['user_id'])
train_data['business_id_encoded'] = business_encoder.fit_transform(train_data['business_id'])

Wall time: 17.3 s


In [45]:
len(user_encoder.classes_), len(business_encoder.classes_)

(1746094, 150340)

In [46]:
test_data = test_data[test_data['user_id'].isin(user_encoder.classes_)]
test_data = test_data[test_data['business_id'].isin(business_encoder.classes_)]

In [47]:
print(train_data.shape)
print(test_data.shape)

(5592224, 5)
(1140220, 3)


In [49]:
test_data['user_id_encoded'] = user_encoder.transform(test_data['user_id'])
test_data['business_id_encoded'] = business_encoder.transform(test_data['business_id'])

In [50]:
train_data

Unnamed: 0,user_id,business_id,stars,user_id_encoded,business_id_encoded
2545771,A-PODWXgcu7Ln2XjT0DLUQ,UkBmK4Icaximp2DY2Zf_iw,2,301008,74482
1433261,LlTLS-sVaRnB2y6PZ8gGYQ,OR1zqBRTlDSBEDnnAT3iEw,3,622479,59586
3205963,Wia-9CNswal2zF71HFdpeQ,E7RtIisUoRMcADSebBgNAA,1,920955,35773
6240863,WwvnhJ1NEk1PTNJ4hKYMdg,9f5GXEeTvBWnrZ-AHEjgJQ,5,926970,25307
2268162,nYlqRr81Cz09wBTf2E6qYA,W9y4JkMqy2dFNwYc3Y5cKQ,5,1407115,77767
...,...,...,...,...,...
243854,EqSWxn6_7hADuUU9hd0qkQ,seouwqRlM9YNNAkZS09FuQ,5,433123,132881
1169156,faixzMpezcN-XjFEkOP27A,JpJogTQGjOQrA7yt1V8PMg,4,1190055,48927
4620998,K4DosZGNb2U3NY4mba9cSw,ToH4z1F5Tyuo3p7VhXCpTA,3,576233,72264
26757,Ts-H2YcgRSbBzA8TQhoWbQ,7v_LNBsEORxSFcL4VaM1_Q,1,842966,21230


In [51]:
test_data

Unnamed: 0,user_id,business_id,stars,user_id_encoded,business_id_encoded
2608933,XwtMA0kDNLjeACT3evdxhQ,lamYyvyLQdP2aT575shC2w,1,953871,116349
5816578,4dytCu_mR8CovweFR5EPHQ,79nXYNRPoZBc9_y_OLZg2w,5,154675,19398
4829913,sy7lmIIrUhdos0Q--Lp-Rw,i76ERS3jM111T3VMbi2yrQ,5,1554557,108145
3277244,oZG3IaSr-QdgK2B-ZJd51Q,RQJ0f2vUvfDartuciAtAhw,1,1434627,66657
2904192,Ch4GHo9J5Bji7ZGXmLsOtw,mfTyBrvx9uCTSPjmrl8PjA,4,374747,118837
...,...,...,...,...,...
2969654,M79rv2KKQA4cYEoTIBXvoQ,IXr5oaYWFbDJN7CcdcyQVQ,4,632209,45909
1577634,paZeP3jrQ4DWjxakIIZ8pQ,hTA0eCoMdAebXzm4jkx-0A,4,1462431,106614
1101663,QlcWKpVTsCRLIJXiOWewSw,G6lbDeRY_ZpD7FS5dL3qJw,4,758686,40368
2753624,19aAhhiIukw3IQxdCWnfnQ,gqhOAmj7npSiq4b6j5-vuw,3,59178,105110


In [54]:
if tf.test.gpu_device_name():
    print('GPU device found: {}'.format(tf.test.gpu_device_name()))
else:
    print("No GPU device found. Training on CPU.")

GPU device found: /device:GPU:0


In [59]:
num_users = len(user_encoder.classes_)
num_businesses = len(business_encoder.classes_)

print(f"Unique Users: {num_users}, Unique Businesses: {num_businesses}")

Unique Users: 1746094, Unique Businesses: 150340


In [60]:
embedding_dim=50

# User and business input layers
user_input = Input(shape=(1,), name='user_input')
business_input = Input(shape=(1,), name='business_input')

# Embedding layers for user and business IDs
user_embedding = Embedding(input_dim=num_users, output_dim=embedding_dim)(user_input)
business_embedding = Embedding(input_dim=num_businesses, output_dim=embedding_dim)(business_input)

# Flatten the embeddings
user_flatten = Flatten()(user_embedding)
business_flatten = Flatten()(business_embedding)

# Merge the embeddings using concat
merged = Concatenate()([user_flatten, business_flatten])

# Add some dense layers for additional modeling
dense_layer = Dense(64, activation='relu')(merged)
output_layer = Dense(1, activation='linear')(dense_layer)

model = Model(inputs=[user_input, business_input], outputs=output_layer)
model.compile(loss='mean_squared_error', optimizer='adam', metrics=['mae'])
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 user_input (InputLayer)        [(None, 1)]          0           []                               
                                                                                                  
 business_input (InputLayer)    [(None, 1)]          0           []                               
                                                                                                  
 embedding (Embedding)          (None, 1, 50)        87304700    ['user_input[0][0]']             
                                                                                                  
 embedding_1 (Embedding)        (None, 1, 50)        7517000     ['business_input[0][0]']         
                                                                                              

In [None]:
# Define batch size and number of epochs
batch_size = 32
epochs = 10

# Define the input features and target (rating)
user_ids = train_data['user_id_encoded'].values  # Encoded user IDs
business_ids = train_data['business_id_encoded'].values  # Encoded business IDs
ratings = train_data['stars'].values  # Target ratings

# Create a tf.data.Dataset from the numpy arrays
dataset = tf.data.Dataset.from_tensor_slices((user_ids, business_ids, ratings))

# Shuffle and batch the dataset
dataset = dataset.shuffle(buffer_size=len(train_data)).batch(batch_size)

# Define the optimizer and loss function
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
loss_fn = tf.keras.losses.MeanSquaredError()

# Iterate through epochs and train the model
for epoch in range(epochs):
    for batch_data in dataset:
        user_ids_batch, business_ids_batch, ratings_batch = batch_data
        with tf.GradientTape() as tape:
            # Forward pass
            predictions = model([user_ids_batch, business_ids_batch])
            loss_value = loss_fn(ratings_batch, predictions)
        # Compute gradients and apply updates
        gradients = tape.gradient(loss_value, model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    
    # Print training progress
    print(f"Epoch {epoch+1}/{epochs}, Loss: {loss_value.numpy():.4f}")

# After training, you can save or use the model for predictions


In [61]:
batch_size = 32
epochs = 10

user_ids = train_data['user_id_encoded'].values
business_ids = train_data['business_id_encoded'].values
stars = train_data['stars'].values

In [64]:
model_checkpoint = ModelCheckpoint(f'saved/model.h5', 
                             monitor='val_loss',   # Monitor validation loss
                             save_best_only=True,  # Save only the best model
                             mode='min'            # Mode of monitoring (minimize validation loss)
                            )

early_stopping = EarlyStopping(monitor='val_loss',
                               patience=1,
                               restore_best_weights=True
                              )

In [65]:
history = model.fit(
    [user_ids, business_ids],
    stars,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.2
)

Epoch 1/10
     1/139806 [..............................] - ETA: 107:22:31 - loss: 18.5051 - mae: 4.1144

ResourceExhaustedError: Graph execution error:

Detected at node 'Adam/Adam/update/AssignVariableOp' defined at (most recent call last):
    File "C:\Users\OMEN\Anaconda3\lib\runpy.py", line 193, in _run_module_as_main
      "__main__", mod_spec)
    File "C:\Users\OMEN\Anaconda3\lib\runpy.py", line 85, in _run_code
      exec(code, run_globals)
    File "C:\Users\OMEN\Anaconda3\lib\site-packages\ipykernel_launcher.py", line 16, in <module>
      app.launch_new_instance()
    File "C:\Users\OMEN\Anaconda3\lib\site-packages\traitlets\config\application.py", line 664, in launch_instance
      app.start()
    File "C:\Users\OMEN\Anaconda3\lib\site-packages\ipykernel\kernelapp.py", line 563, in start
      self.io_loop.start()
    File "C:\Users\OMEN\Anaconda3\lib\site-packages\tornado\platform\asyncio.py", line 148, in start
      self.asyncio_loop.run_forever()
    File "C:\Users\OMEN\Anaconda3\lib\asyncio\base_events.py", line 534, in run_forever
      self._run_once()
    File "C:\Users\OMEN\Anaconda3\lib\asyncio\base_events.py", line 1771, in _run_once
      handle._run()
    File "C:\Users\OMEN\Anaconda3\lib\asyncio\events.py", line 88, in _run
      self._context.run(self._callback, *self._args)
    File "C:\Users\OMEN\Anaconda3\lib\site-packages\tornado\ioloop.py", line 690, in <lambda>
      lambda f: self._run_callback(functools.partial(callback, future))
    File "C:\Users\OMEN\Anaconda3\lib\site-packages\tornado\ioloop.py", line 743, in _run_callback
      ret = callback()
    File "C:\Users\OMEN\Anaconda3\lib\site-packages\tornado\gen.py", line 787, in inner
      self.run()
    File "C:\Users\OMEN\Anaconda3\lib\site-packages\tornado\gen.py", line 748, in run
      yielded = self.gen.send(value)
    File "C:\Users\OMEN\Anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 365, in process_one
      yield gen.maybe_future(dispatch(*args))
    File "C:\Users\OMEN\Anaconda3\lib\site-packages\tornado\gen.py", line 209, in wrapper
      yielded = next(result)
    File "C:\Users\OMEN\Anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 272, in dispatch_shell
      yield gen.maybe_future(handler(stream, idents, msg))
    File "C:\Users\OMEN\Anaconda3\lib\site-packages\tornado\gen.py", line 209, in wrapper
      yielded = next(result)
    File "C:\Users\OMEN\Anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 542, in execute_request
      user_expressions, allow_stdin,
    File "C:\Users\OMEN\Anaconda3\lib\site-packages\tornado\gen.py", line 209, in wrapper
      yielded = next(result)
    File "C:\Users\OMEN\Anaconda3\lib\site-packages\ipykernel\ipkernel.py", line 294, in do_execute
      res = shell.run_cell(code, store_history=store_history, silent=silent)
    File "C:\Users\OMEN\Anaconda3\lib\site-packages\ipykernel\zmqshell.py", line 536, in run_cell
      return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
    File "C:\Users\OMEN\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2855, in run_cell
      raw_cell, store_history, silent, shell_futures)
    File "C:\Users\OMEN\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2881, in _run_cell
      return runner(coro)
    File "C:\Users\OMEN\Anaconda3\lib\site-packages\IPython\core\async_helpers.py", line 68, in _pseudo_sync_runner
      coro.send(None)
    File "C:\Users\OMEN\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 3058, in run_cell_async
      interactivity=interactivity, compiler=compiler, result=result)
    File "C:\Users\OMEN\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 3249, in run_ast_nodes
      if (await self.run_code(code, result,  async_=asy)):
    File "C:\Users\OMEN\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 3326, in run_code
      exec(code_obj, self.user_global_ns, self.user_ns)
    File "<ipython-input-65-a555c7a893b6>", line 6, in <module>
      validation_split=0.2
    File "C:\Users\OMEN\Anaconda3\lib\site-packages\keras\utils\traceback_utils.py", line 64, in error_handler
      return fn(*args, **kwargs)
    File "C:\Users\OMEN\Anaconda3\lib\site-packages\keras\engine\training.py", line 1384, in fit
      tmp_logs = self.train_function(iterator)
    File "C:\Users\OMEN\Anaconda3\lib\site-packages\keras\engine\training.py", line 1021, in train_function
      return step_function(self, iterator)
    File "C:\Users\OMEN\Anaconda3\lib\site-packages\keras\engine\training.py", line 1010, in step_function
      outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "C:\Users\OMEN\Anaconda3\lib\site-packages\keras\engine\training.py", line 1000, in run_step
      outputs = model.train_step(data)
    File "C:\Users\OMEN\Anaconda3\lib\site-packages\keras\engine\training.py", line 863, in train_step
      self.optimizer.minimize(loss, self.trainable_variables, tape=tape)
    File "C:\Users\OMEN\Anaconda3\lib\site-packages\keras\optimizer_v2\optimizer_v2.py", line 532, in minimize
      return self.apply_gradients(grads_and_vars, name=name)
    File "C:\Users\OMEN\Anaconda3\lib\site-packages\keras\optimizer_v2\optimizer_v2.py", line 675, in apply_gradients
      name=name)
    File "C:\Users\OMEN\Anaconda3\lib\site-packages\keras\optimizer_v2\optimizer_v2.py", line 717, in _distributed_apply
      var, apply_grad_to_update_var, args=(grad,), group=False)
    File "C:\Users\OMEN\Anaconda3\lib\site-packages\keras\optimizer_v2\optimizer_v2.py", line 695, in apply_grad_to_update_var
      grad.values, var, grad.indices, **apply_kwargs)
    File "C:\Users\OMEN\Anaconda3\lib\site-packages\keras\optimizer_v2\optimizer_v2.py", line 1281, in _resource_apply_sparse_duplicate_indices
      **kwargs)
    File "C:\Users\OMEN\Anaconda3\lib\site-packages\keras\optimizer_v2\adam.py", line 202, in _resource_apply_sparse
      use_locking=self._use_locking)
Node: 'Adam/Adam/update/AssignVariableOp'
OOM when allocating tensor with shape[1746094,50] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[{{node Adam/Adam/update/AssignVariableOp}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info. This isn't available when running in Eager mode.
 [Op:__inference_train_function_871]