In [1]:
# Mount Google Drive at /content/drive so the dataset pickles read later in this notebook are accessible.
# Data folder: https://drive.google.com/drive/folders/1z9qUAX7fQfcAOX7FH-CerkKfMCj6OxWX
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install turicreate

Collecting turicreate
[?25l  Downloading https://files.pythonhosted.org/packages/25/9f/a76acc465d873d217f05eac4846bd73d640b9db6d6f4a3c29ad92650fbbe/turicreate-6.4.1-cp37-cp37m-manylinux1_x86_64.whl (92.0MB)
[K     |████████████████████████████████| 92.0MB 52kB/s 
Collecting resampy==0.2.1
[?25l  Downloading https://files.pythonhosted.org/packages/14/b6/66a06d85474190b50aee1a6c09cdc95bb405ac47338b27e9b21409da1760/resampy-0.2.1.tar.gz (322kB)
[K     |████████████████████████████████| 327kB 56.0MB/s 
[?25hCollecting numba<0.51.0
[?25l  Downloading https://files.pythonhosted.org/packages/04/be/8c88cee3366de2a3a23a9ff1a8be34e79ad1eb1ceb0d0e33aca83655ac3c/numba-0.50.1-cp37-cp37m-manylinux2014_x86_64.whl (3.6MB)
[K     |████████████████████████████████| 3.6MB 34.7MB/s 
Collecting prettytable==0.7.2
  Downloading https://files.pythonhosted.org/packages/ef/30/4b0746848746ed5941f052479e7c23d2b56d174b82f4fd34a25e389831f5/prettytable-0.7.2.tar.bz2
Collecting coremltools==3.3
[?25l  Downloa

In [3]:
import os

import math

import pandas as pd
import numpy as np
import datetime
from sklearn.model_selection import train_test_split

import pickle

import tensorflow as tf

import matplotlib.pyplot as plt

from tqdm.notebook import tqdm

import turicreate

In [4]:
# Probe for accelerators. A TPU is tried first; TPUClusterResolver raising
# ValueError is treated as "no TPU", in which case the logical GPUs are listed.
try:
    tpu_resolver = tf.distribute.cluster_resolver.TPUClusterResolver()
except ValueError:
    tpu_resolver = None
    gpus = tf.config.experimental.list_logical_devices("GPU")

# Pick the tf.distribute strategy that matches the detected hardware.
if tpu_resolver:
    # The cluster must be connected and the TPU system initialised before
    # the strategy object is created.
    tf.config.experimental_connect_to_cluster(tpu_resolver)
    tf.tpu.experimental.initialize_tpu_system(tpu_resolver)
    strategy = tf.distribute.experimental.TPUStrategy(tpu_resolver)
    print('Running on TPU ', tpu_resolver.cluster_spec().as_dict()['worker'])
elif len(gpus) > 1:
    # Several GPUs: mirror the model across all of them.
    strategy = tf.distribute.MirroredStrategy([gpu.name for gpu in gpus])
    print('Running on multiple GPUs ', [gpu.name for gpu in gpus])
else:
    # Zero or one GPU: the default strategy works on CPU and on a single GPU.
    strategy = tf.distribute.get_strategy()
    if len(gpus) == 1:
        print('Running on single GPU ', gpus[0].name)
    else:
        print('Running on CPU')
print("Number of accelerators: ", strategy.num_replicas_in_sync)



Running on CPU
Number of accelerators:  1


In [5]:
from tensorflow.python.client import device_lib

def get_available_devices():
    """Return the names of the local devices whose type is 'GPU' or 'CPU'.

    The devices come from ``device_lib.list_local_devices()``; entries of any
    other device type are left out.
    """
    wanted_types = ('GPU', 'CPU')
    return [
        device.name
        for device in device_lib.list_local_devices()
        if device.device_type in wanted_types
    ]


In [6]:
# Show the CPU/GPU device names visible in this runtime.
get_available_devices()

['/device:CPU:0']

In [None]:
import subprocess as sp
import os

def get_gpu_memory():
    """Return the free memory of each NVIDIA GPU as reported by ``nvidia-smi``.

    Runs ``nvidia-smi --query-gpu=memory.free --format=csv``, skips the CSV
    header row and parses the leading integer of every remaining row. The
    parsed list is printed and then returned.

    Returns:
        list[int]: one value per GPU, in the order nvidia-smi lists them
        (nvidia-smi prints this column in MiB). An empty list is returned
        when nvidia-smi is not installed or exits with an error, e.g. on a
        CPU-only runtime, so the notebook can still run top to bottom.
    """
    command = "nvidia-smi --query-gpu=memory.free --format=csv"
    try:
        raw_output = sp.check_output(command.split())
    except (FileNotFoundError, sp.CalledProcessError):
        # No NVIDIA driver / GPU available: report "no GPUs" instead of aborting.
        memory_free_values = []
    else:
        # splitlines() copes with a missing trailing newline and with \r\n endings;
        # the first row is the CSV header.
        rows = raw_output.decode('ascii').splitlines()[1:]
        memory_free_values = [int(row.split()[0]) for row in rows if row.strip()]
    print(memory_free_values)
    return memory_free_values

# Report the free GPU memory for the current runtime.
get_gpu_memory()

[14839]


[14839]

In [7]:
# Folder (relative to the working directory) that holds the mean-pooled BERT review pickles loaded below.
dir_prefix = "drive/MyDrive/W266/data/individual_reviews_bert_uncased/"

In [7]:
# NOTE(review): max_sequence_length is not referenced by any cell visible here — confirm it is still needed.
max_sequence_length = 64
# Upper bound applied to userReviewsCount / itemReviewsCount when selecting the "usable" rows.
max_review_count = 50

In [9]:
# Pool of all possible reviews: one row per (asin, reviewerID) pair, with the
# 'bert_embedding' column dropped.
bert_embeddings_raw_data = (
    pd.read_pickle(os.path.join(dir_prefix, "final_dataset_bert_mean_pooled_3-11.pkl"))
    .drop_duplicates(subset=['asin', 'reviewerID'])
    .drop(columns='bert_embedding')
)
# "Usable" rows: both userReviewsCount and itemReviewsCount lie between 10 and
# max_review_count (inclusive); only the item id, user id and rating are kept.
bert_embeddings_usable = bert_embeddings_raw_data.loc[
    bert_embeddings_raw_data['userReviewsCount'].between(10, max_review_count)
    & bert_embeddings_raw_data['itemReviewsCount'].between(10, max_review_count),
    ['asin', 'reviewerID', 'overall'],
]
# Copies of the pool ordered by user id and by item id, for quick retrieval.
user_sorted_raw_data = bert_embeddings_raw_data.sort_values('reviewerID')
item_sorted_raw_data = bert_embeddings_raw_data.sort_values('asin')

In [11]:
# Convert the usable (asin, reviewerID, overall) rows into a turicreate SFrame.
sf = turicreate.SFrame(bert_embeddings_usable)

In [16]:
# Split the interactions into train/test per user with turicreate's helper.
# max_num_users=400 presumably caps how many users are sampled into the test set — confirm against the turicreate docs.
train, test = turicreate.recommender.util.random_split_by_user(sf, user_id='reviewerID', item_id='asin', max_num_users=400)

In [20]:
# Fit turicreate's factorization recommender on the training split, with the 'overall' rating as target.
m1 = turicreate.factorization_recommender.create(train, user_id='reviewerID', item_id='asin', target='overall')

In [21]:
# Evaluate m1 on the test split: prints precision/recall by cutoff and RMSE, and returns the results as a dict.
m1.evaluate(test)


Precision and recall summary statistics by cutoff
+--------+-----------------------+-----------------------+
| cutoff |     mean_precision    |      mean_recall      |
+--------+-----------------------+-----------------------+
|   1    |          0.0          |          0.0          |
|   2    | 0.0015337423312883438 | 0.0015337423312883438 |
|   3    | 0.0010224948875255623 | 0.0015337423312883438 |
|   4    | 0.0007668711656441719 | 0.0015337423312883438 |
|   5    | 0.0006134969325153374 | 0.0015337423312883438 |
|   6    | 0.0010224948875255623 |  0.002556237218813906 |
|   7    | 0.0008764241893076248 |  0.002556237218813906 |
|   8    | 0.0015337423312883438 |  0.004345603271983641 |
|   9    | 0.0013633265167007499 |  0.004345603271983641 |
|   10   | 0.0012269938650306745 |  0.004345603271983641 |
+--------+-----------------------+-----------------------+
[10 rows x 3 columns]


Overall RMSE: 1.1434048704384459

Per User RMSE (best)
+----------------+----------------------+---

{'precision_recall_by_user': Columns:
 	reviewerID	str
 	cutoff	int
 	precision	float
 	recall	float
 	count	int
 
 Rows: 5868
 
 Data:
 +----------------+--------+-----------+--------+-------+
 |   reviewerID   | cutoff | precision | recall | count |
 +----------------+--------+-----------+--------+-------+
 | A3BOIJO08BTJUD |   1    |    0.0    |  0.0   |   3   |
 | A3BOIJO08BTJUD |   2    |    0.0    |  0.0   |   3   |
 | A3BOIJO08BTJUD |   3    |    0.0    |  0.0   |   3   |
 | A3BOIJO08BTJUD |   4    |    0.0    |  0.0   |   3   |
 | A3BOIJO08BTJUD |   5    |    0.0    |  0.0   |   3   |
 | A3BOIJO08BTJUD |   6    |    0.0    |  0.0   |   3   |
 | A3BOIJO08BTJUD |   7    |    0.0    |  0.0   |   3   |
 | A3BOIJO08BTJUD |   8    |    0.0    |  0.0   |   3   |
 | A3BOIJO08BTJUD |   9    |    0.0    |  0.0   |   3   |
 | A3BOIJO08BTJUD |   10   |    0.0    |  0.0   |   3   |
 +----------------+--------+-----------+--------+-------+
 [5868 rows x 5 columns]
 Note: Only the head of the

In [22]:
# Held-out pool of reviews: one row per (asin, reviewerID) pair, with the
# 'bert_embedding' column dropped.
held_out_test_raw_data = (
    pd.read_pickle(os.path.join(dir_prefix, "final_dataset_bert_mean_pooled_heldout-test-3-11.pkl.gz"))
    .drop_duplicates(subset=['asin', 'reviewerID'])
    .drop(columns='bert_embedding')
)
# "Usable" rows: both userReviewsCount and itemReviewsCount lie between 10 and
# max_review_count (inclusive); only the item id, user id and rating are kept.
held_out_test_usable = held_out_test_raw_data.loc[
    held_out_test_raw_data['userReviewsCount'].between(10, max_review_count)
    & held_out_test_raw_data['itemReviewsCount'].between(10, max_review_count),
    ['asin', 'reviewerID', 'overall'],
]
# Copies of the held-out pool ordered by user id and by item id, for quick retrieval.
user_sorted_held_out = held_out_test_raw_data.sort_values('reviewerID')
item_sorted_held_out = held_out_test_raw_data.sort_values('asin')

In [23]:
# Row counts: usable held-out rows first, then all de-duplicated held-out rows.
print(f'Held out test size: {len(held_out_test_usable)}, {len(held_out_test_raw_data)}')

Held out test size: 5358, 70297


In [24]:
# Convert the usable held-out rows into a turicreate SFrame.
hot = turicreate.SFrame(held_out_test_usable)

In [25]:
# Evaluate m1 (fitted on the train split above) on the held-out rows.
m1.evaluate(hot)


Precision and recall summary statistics by cutoff
+--------+----------------+-------------+
| cutoff | mean_precision | mean_recall |
+--------+----------------+-------------+
|   1    |      0.0       |     0.0     |
|   2    |      0.0       |     0.0     |
|   3    |      0.0       |     0.0     |
|   4    |      0.0       |     0.0     |
|   5    |      0.0       |     0.0     |
|   6    |      0.0       |     0.0     |
|   7    |      0.0       |     0.0     |
|   8    |      0.0       |     0.0     |
|   9    |      0.0       |     0.0     |
|   10   |      0.0       |     0.0     |
+--------+----------------+-------------+
[10 rows x 3 columns]


Overall RMSE: 1.0516648726209883

Per User RMSE (best)
+----------------+----------------------+-------+
|   reviewerID   |         rmse         | count |
+----------------+----------------------+-------+
| A2N04V181QHDG1 | 0.008317371275338381 |   1   |
+----------------+----------------------+-------+
[1 rows x 3 columns]


Per User 

{'precision_recall_by_user': Columns:
 	reviewerID	str
 	cutoff	int
 	precision	float
 	recall	float
 	count	int
 
 Rows: 32328
 
 Data:
 +----------------+--------+-----------+--------+-------+
 |   reviewerID   | cutoff | precision | recall | count |
 +----------------+--------+-----------+--------+-------+
 | A1MUTGFKWD74U4 |   1    |    0.0    |  0.0   |   2   |
 | A1MUTGFKWD74U4 |   2    |    0.0    |  0.0   |   2   |
 | A1MUTGFKWD74U4 |   3    |    0.0    |  0.0   |   2   |
 | A1MUTGFKWD74U4 |   4    |    0.0    |  0.0   |   2   |
 | A1MUTGFKWD74U4 |   5    |    0.0    |  0.0   |   2   |
 | A1MUTGFKWD74U4 |   6    |    0.0    |  0.0   |   2   |
 | A1MUTGFKWD74U4 |   7    |    0.0    |  0.0   |   2   |
 | A1MUTGFKWD74U4 |   8    |    0.0    |  0.0   |   2   |
 | A1MUTGFKWD74U4 |   9    |    0.0    |  0.0   |   2   |
 | A1MUTGFKWD74U4 |   10   |    0.0    |  0.0   |   2   |
 +----------------+--------+-----------+--------+-------+
 [32328 rows x 5 columns]
 Note: Only the head of t

In [8]:
# ---- Yelp: the same factorization-recommender baseline, repeated on the Yelp dataset ----

In [9]:
# Point dir_prefix at the Yelp data folder; this reassigns the name used by the earlier cells.
dir_prefix = "drive/MyDrive/W266/data/Yelp/"

In [10]:
# Pool of all possible Yelp reviews: one row per (asin, reviewerID) pair.
bert_embeddings_raw_data = (
    pd.read_pickle(os.path.join(dir_prefix, "yelp_train_val_dataset.pkl.gz"))
    .drop_duplicates(subset=['asin', 'reviewerID'])
)
# "Usable" rows: both userReviewsCount and itemReviewsCount lie between 10 and
# max_review_count (inclusive); only the item id, user id and rating are kept.
bert_embeddings_usable = bert_embeddings_raw_data.loc[
    bert_embeddings_raw_data['userReviewsCount'].between(10, max_review_count)
    & bert_embeddings_raw_data['itemReviewsCount'].between(10, max_review_count),
    ['asin', 'reviewerID', 'overall'],
]
# Copies of the pool ordered by user id and by item id, for quick retrieval.
user_sorted_raw_data = bert_embeddings_raw_data.sort_values('reviewerID')
item_sorted_raw_data = bert_embeddings_raw_data.sort_values('asin')

In [11]:
# Convert the usable (asin, reviewerID, overall) rows into a turicreate SFrame.
sf = turicreate.SFrame(bert_embeddings_usable)

In [13]:
# Split the interactions into train/test per user with turicreate's helper.
# max_num_users=800 presumably caps how many users are sampled into the test set — confirm against the turicreate docs.
train, test = turicreate.recommender.util.random_split_by_user(sf, user_id='reviewerID', item_id='asin', max_num_users=800)

In [14]:
# Fit turicreate's factorization recommender on the training split, with the 'overall' rating as target.
m2 = turicreate.factorization_recommender.create(train, user_id='reviewerID', item_id='asin', target='overall')

In [15]:
# Evaluate m2 on the test split: prints precision/recall by cutoff and RMSE, and returns the results as a dict.
m2.evaluate(test)


Precision and recall summary statistics by cutoff
+--------+-----------------------+-----------------------+
| cutoff |     mean_precision    |      mean_recall      |
+--------+-----------------------+-----------------------+
|   1    |          0.0          |          0.0          |
|   2    |          0.0          |          0.0          |
|   3    |          0.0          |          0.0          |
|   4    |          0.0          |          0.0          |
|   5    |          0.0          |          0.0          |
|   6    |          0.0          |          0.0          |
|   7    |          0.0          |          0.0          |
|   8    |          0.0          |          0.0          |
|   9    |          0.0          |          0.0          |
|   10   | 0.0001577287066246057 | 0.0015772870662460572 |
+--------+-----------------------+-----------------------+
[10 rows x 3 columns]


Overall RMSE: 1.1873121451723343

Per User RMSE (best)
+------------------------+------------------

{'precision_recall_by_user': Columns:
 	reviewerID	str
 	cutoff	int
 	precision	float
 	recall	float
 	count	int
 
 Rows: 11412
 
 Data:
 +------------------------+--------+-----------+--------+-------+
 |       reviewerID       | cutoff | precision | recall | count |
 +------------------------+--------+-----------+--------+-------+
 | wJHy7ZJG_EvLFQDRms5rXQ |   1    |    0.0    |  0.0   |   2   |
 | wJHy7ZJG_EvLFQDRms5rXQ |   2    |    0.0    |  0.0   |   2   |
 | wJHy7ZJG_EvLFQDRms5rXQ |   3    |    0.0    |  0.0   |   2   |
 | wJHy7ZJG_EvLFQDRms5rXQ |   4    |    0.0    |  0.0   |   2   |
 | wJHy7ZJG_EvLFQDRms5rXQ |   5    |    0.0    |  0.0   |   2   |
 | wJHy7ZJG_EvLFQDRms5rXQ |   6    |    0.0    |  0.0   |   2   |
 | wJHy7ZJG_EvLFQDRms5rXQ |   7    |    0.0    |  0.0   |   2   |
 | wJHy7ZJG_EvLFQDRms5rXQ |   8    |    0.0    |  0.0   |   2   |
 | wJHy7ZJG_EvLFQDRms5rXQ |   9    |    0.0    |  0.0   |   2   |
 | wJHy7ZJG_EvLFQDRms5rXQ |   10   |    0.0    |  0.0   |   2   |
 +---

In [16]:
# Held-out pool of Yelp reviews: one row per (asin, reviewerID) pair.
held_out_test_raw_data = (
    pd.read_pickle(os.path.join(dir_prefix, "yelp_held_out_test_dataset.pkl.gz"))
    .drop_duplicates(subset=['asin', 'reviewerID'])
)
# "Usable" rows: both userReviewsCount and itemReviewsCount lie between 10 and
# max_review_count (inclusive); only the item id, user id and rating are kept.
held_out_test_usable = held_out_test_raw_data.loc[
    held_out_test_raw_data['userReviewsCount'].between(10, max_review_count)
    & held_out_test_raw_data['itemReviewsCount'].between(10, max_review_count),
    ['asin', 'reviewerID', 'overall'],
]
# Copies of the held-out pool ordered by user id and by item id, for quick retrieval.
user_sorted_held_out = held_out_test_raw_data.sort_values('reviewerID')
item_sorted_held_out = held_out_test_raw_data.sort_values('asin')

In [17]:
# Convert the usable held-out rows into a turicreate SFrame.
hot_yelp = turicreate.SFrame(held_out_test_usable)

In [18]:
# Evaluate m2 (fitted on the train split above) on the held-out rows.
m2.evaluate(hot_yelp)


Precision and recall summary statistics by cutoff
+--------+------------------------+------------------------+
| cutoff |     mean_precision     |      mean_recall       |
+--------+------------------------+------------------------+
|   1    | 0.0005277044854881267  | 0.00010554089709762527 |
|   2    | 0.00026385224274406337 | 0.00010554089709762536 |
|   3    | 0.00017590149516270872 | 0.0001055408970976253  |
|   4    | 0.00026385224274406294 | 0.00018092725216735763 |
|   5    | 0.00021108179419525064 | 0.0001809272521673577  |
|   6    | 0.00017590149516270864 | 0.00018092725216735768 |
|   7    | 0.00015077271013946476 | 0.00018092725216735782 |
|   8    | 0.00013192612137203155 | 0.00018092725216735766 |
|   9    | 0.00011726766344180573 | 0.0001809272521673576  |
|   10   | 0.0001055408970976253  | 0.00018092725216735785 |
+--------+------------------------+------------------------+
[10 rows x 3 columns]


Overall RMSE: 1.0096479291739755

Per User RMSE (best)
+---------------

{'precision_recall_by_user': Columns:
 	reviewerID	str
 	cutoff	int
 	precision	float
 	recall	float
 	count	int
 
 Rows: 34110
 
 Data:
 +------------------------+--------+-----------+--------+-------+
 |       reviewerID       | cutoff | precision | recall | count |
 +------------------------+--------+-----------+--------+-------+
 | 0VOrqc8SFndCMUYUy_rYlQ |   1    |    0.0    |  0.0   |   1   |
 | 0VOrqc8SFndCMUYUy_rYlQ |   2    |    0.0    |  0.0   |   1   |
 | 0VOrqc8SFndCMUYUy_rYlQ |   3    |    0.0    |  0.0   |   1   |
 | 0VOrqc8SFndCMUYUy_rYlQ |   4    |    0.0    |  0.0   |   1   |
 | 0VOrqc8SFndCMUYUy_rYlQ |   5    |    0.0    |  0.0   |   1   |
 | 0VOrqc8SFndCMUYUy_rYlQ |   6    |    0.0    |  0.0   |   1   |
 | 0VOrqc8SFndCMUYUy_rYlQ |   7    |    0.0    |  0.0   |   1   |
 | 0VOrqc8SFndCMUYUy_rYlQ |   8    |    0.0    |  0.0   |   1   |
 | 0VOrqc8SFndCMUYUy_rYlQ |   9    |    0.0    |  0.0   |   1   |
 | 0VOrqc8SFndCMUYUy_rYlQ |   10   |    0.0    |  0.0   |   1   |
 +---

In [20]:
# Mean 'overall' rating across the usable held-out rows (whichever dataset held_out_test_usable was last assigned from).
held_out_test_usable['overall'].mean()

3.9848831565078706