In [11]:
import json 
import torch 
from torch.utils.data import Dataset, DataLoader 
import torch.nn as nn 
import os
import json
import gzip
import pandas as pd
import numpy as np 
from urllib.request import urlopen
from tqdm import tqdm 

In [4]:
!wget http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/Video_Games_5.json.gz

--2022-05-21 14:27:54--  http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/Video_Games_5.json.gz
Resolving deepyeti.ucsd.edu (deepyeti.ucsd.edu)... 169.228.63.50
Connecting to deepyeti.ucsd.edu (deepyeti.ucsd.edu)|169.228.63.50|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 154050105 (147M) [application/octet-stream]
Saving to: ‘Video_Games_5.json.gz’


2022-05-21 14:28:02 (18.7 MB/s) - ‘Video_Games_5.json.gz’ saved [154050105/154050105]



In [7]:
# Parse the gzipped JSON-lines dump: every line of the archive is one JSON document.
with gzip.open('Video_Games_5.json.gz') as f:
    data = [json.loads(l.strip()) for l in f]

# Number of review records that were read.
print(len(data))


497577


In [14]:
# Build a DataFrame from the parsed review records (one row per element of `data`).
df = pd.DataFrame(data)

In [15]:
# The raw record list is no longer needed once `df` exists; drop the name to free it,
# then preview the first rows of the frame.
del data
df.head()

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,reviewerName,reviewText,summary,unixReviewTime,vote,style,image
0,5.0,True,"10 17, 2015",A1HP7NVNPFMA4N,700026657,Ambrosia075,"This game is a bit hard to get the hang of, bu...",but when you do it's great.,1445040000,,,
1,4.0,False,"07 27, 2015",A1JGAP0185YJI6,700026657,travis,I played it a while but it was alright. The st...,"But in spite of that it was fun, I liked it",1437955200,,,
2,3.0,True,"02 23, 2015",A1YJWEXHQBWK2B,700026657,Vincent G. Mezera,ok game.,Three Stars,1424649600,,,
3,2.0,True,"02 20, 2015",A2204E1TH211HT,700026657,Grandma KR,"found the game a bit too complicated, not what...",Two Stars,1424390400,,,
4,5.0,True,"12 25, 2014",A2RF5B5H74JLPE,700026657,jon,"great game, I love it and have played it since...",love this game,1419465600,,,


In [17]:
# Order the rows by reviewer ID so that the positional train/test split that follows
# cuts the frame along the reviewer ordering.
df = df.sort_values(by='reviewerID')

In [22]:
# Positional split of the reviewer-sorted frame: the first 315218 rows form the training
# set and the remaining rows the test set; each part is shuffled with a fixed seed.
# NOTE(review): 315218 is a hard-coded cut point — confirm it falls on a reviewer
# boundary, otherwise one reviewer's rows end up in both splits.
# reset_index() keeps the pre-shuffle row label as an 'index' column.
df_train = df.iloc[:315218].sample(frac=1., random_state=42).reset_index()
df_test = df.iloc[315218:].sample(frac=1., random_state=42).reset_index()

array(['B00005YCXO', 'B00104KCF8', 'B01ALRCD38', 'B0009SQFHA',
       'B0016BVYA2', 'B00TKLFOKQ', 'B003QOWQHC', 'B000ERVMI8',
       'B002EQFXZA', 'B00009ZVHW'], dtype=object)

In [27]:
def create_vocab(df, field):
    """Build forward and reverse lookups between a column's values and integer indices.

    Indices are assigned in order of first appearance of each distinct value in
    ``df[field]``, starting at 0.

    Returns:
        A pair ``(id2int, int2id)``: value -> index and index -> value.
    """
    distinct_values = df[field].unique()
    id2int = {raw_value: position for position, raw_value in enumerate(distinct_values)}
    int2id = dict(enumerate(distinct_values))
    return id2int, int2id

In [28]:
# Product ('asin') <-> integer index lookups, built from the full frame (train and test rows).
product2int, int2product = create_vocab(df, 'asin')

In [30]:
# Reviewer <-> integer index lookups, built from the full frame (train and test rows).
reviewer2int, int2reviewer = create_vocab(df, 'reviewerID')

In [36]:
# Inspect the shuffled training split.
df_train

Unnamed: 0,index,overall,verified,reviewTime,reviewerID,asin,reviewerName,reviewText,summary,unixReviewTime,vote,style,image
0,32302,5.0,True,"06 25, 2013",A1P36N7FVUG7Z8,B00005YCXO,Sheldon Erickson,Fun for the whole family. I bought this way ba...,Great bargain game,1372118400,,,
1,127220,4.0,False,"10 18, 2008",A1EEKKTK46WMS7,B00104KCF8,Justin Mayer,it was great for 360 and was a good choice for...,lost planet,1224288000,2,,
2,431629,5.0,True,"06 24, 2016",A32LS5IP8KG3ZP,B01ALRCD38,Steph C.,love it,love it,1466726400,,{'Edition:': ' Zootopia: Power Disc Pack'},
3,68164,4.0,False,"08 4, 2006",A26113AWK9B7Y9,B0009SQFHA,Sean,Condemned lets you take control of FBI agent E...,Condemned is what survival horror is all about,1154649600,5,{'Platform:': ' Xbox 360'},
4,160294,3.0,False,"02 16, 2010",A2UN5UY8Q6YMOF,B0016BVYA2,S. Bradford,I really liked the first Bioshock so I was exc...,More of the same-with glitches,1266278400,,{'Format:': ' Video Game'},
...,...,...,...,...,...,...,...,...,...,...,...,...,...
315213,301663,5.0,False,"11 21, 2013",A1WVA7V02PQOY6,B00CKG3H66,Dad of Divas,My daughters have had the chance to play with ...,A Truly Fun Game For Lego Friends Fans!,1384992000,13,{'Format:': ' Video Game'},
315214,432485,5.0,True,"09 20, 2016",A2ZC0CLM48MKP2,B01BBKYM3I,Carfo,I'll keep this short and simple:\n\nPros:\n+LE...,Great LED customization; logitech switches are...,1474329600,,,
315215,434071,5.0,True,"11 24, 2016",A206JAD7W5ED6H,B01C93CWSI,Jason,"Another great Pokemon game, probably my favori...",Amazing game,1479945600,,"{'Edition:': ' Moon', 'Platform:': ' Nintendo ...",
315216,285293,5.0,False,"07 29, 2013",A24AOGQH7N9G20,B00AQF30XI,S. Runyon,This is a great hidden object game. I own over...,AWESOME !,1375056000,2,{'Platform:': ' PC Disc'},


In [43]:
class RecSysDataset(Dataset):
    """Dataset yielding (rating, product index, reviewer index) for each review row.

    Args:
        df: frame with the columns 'overall' (rating), 'asin' (product ID) and
            'reviewerID'.
        product_vocab: optional mapping from product ID to integer index. Defaults to
            the notebook-level ``product2int``.
        reviewer_vocab: optional mapping from reviewer ID to integer index. Defaults to
            the notebook-level ``reviewer2int``.
    """

    def __init__(self, df, product_vocab=None, reviewer_vocab=None):
        self.df = df
        # Pull the columns out as arrays once so __getitem__ is a plain positional lookup.
        self.ratings = df['overall'].values
        self.product_ids = df['asin'].values
        self.reviewer_ids = df['reviewerID'].values
        # Fall back to the notebook globals only when no explicit vocabulary is given.
        self.product_vocab = product2int if product_vocab is None else product_vocab
        self.reviewer_vocab = reviewer2int if reviewer_vocab is None else reviewer_vocab

    def __len__(self):
        """Number of review rows (one sample per row)."""
        # The original returned len(self.data), an attribute that is never set.
        return len(self.ratings)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors.

        Keys: 'rating' (float tensor), 'product_id' and 'reviewer_id' (long tensors
        holding the vocabulary indices). Raises KeyError if an ID is missing from
        its vocabulary.
        """
        rating = torch.tensor(self.ratings[idx], dtype=torch.float)
        product_id = torch.tensor(self.product_vocab[self.product_ids[idx]], dtype=torch.long)
        reviewer_id = torch.tensor(self.reviewer_vocab[self.reviewer_ids[idx]], dtype=torch.long)
        # review_text = item['reviewText'] # Add with bert tokenizer later

        return {
            'rating': rating,
            'product_id': product_id,
            'reviewer_id': reviewer_id
        }




In [44]:
# Wrap the train and test splits in the tensor-producing dataset class.
train_dataset = RecSysDataset(df_train)
test_dataset = RecSysDataset(df_test)

In [71]:
class MFModel(nn.Module):
    """Matrix-factorisation scorer: the dot product of a reviewer and a product embedding.

    Args:
        num_reviewers: number of rows in the reviewer embedding table.
        num_products: number of rows in the product embedding table.
        emb_sz: dimensionality shared by both embedding tables.
    """

    def __init__(self, num_reviewers, num_products, emb_sz):
        super().__init__()
        self.reviewer_embeddings = nn.Embedding(num_reviewers, emb_sz)
        self.product_embeddings = nn.Embedding(num_products, emb_sz)

    def forward(self, product_id, reviewer_id):
        """Score each (product, reviewer) pair.

        Note the argument order: product indices first, reviewer indices second.
        Returns one score per pair, obtained by summing the element-wise product of
        the two embeddings over the embedding axis.
        """
        # The debugging print of reviewer_id that ran on every batch was removed.
        reviewer_vector = self.reviewer_embeddings(reviewer_id)
        product_vector = self.product_embeddings(product_id)
        # The embedding axis is the last one; for batched index vectors this is axis 1.
        return (reviewer_vector * product_vector).sum(dim=-1)



In [64]:
# One embedding row per known reviewer and per known product; 10 is the embedding size.
model = MFModel(len(reviewer2int), len(product2int), 10)

In [70]:
# Mean-squared-error objective on the predicted rating, optimised with Adam.
loss_fn = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [56]:
def train(train_loader, model, loss_fn, optimizer, device):
    """Run one training epoch and report the mean loss and the rating accuracy.

    Args:
        train_loader: iterable of batches; each batch is a dict holding the tensors
            'rating', 'reviewer_id' and 'product_id'.
        model: module called as ``model(product_id, reviewer_id)``.
        loss_fn: callable ``loss_fn(preds, rating)`` returning a scalar tensor.
        optimizer: optimizer over the model's parameters.
        device: device the batch tensors are moved to before the forward pass
            (the model itself is not moved here).

    Returns:
        ``(mean_batch_loss, accuracy)`` as Python floats. Accuracy is the fraction of
        samples whose prediction, rounded to the nearest integer, equals the rating.
        Both are NaN when the loader yields no batches.
    """
    model.train()
    total_loss = 0.0
    num_batches = 0
    num_correct = 0
    num_samples = 0
    for data in tqdm(train_loader):
        rating = data['rating'].to(device)
        reviewer_id = data['reviewer_id'].to(device)
        product_id = data['product_id'].to(device)

        optimizer.zero_grad()
        preds = model(product_id, reviewer_id)
        loss = loss_fn(preds, rating)
        # Without backward() no gradients exist and optimizer.step() is a no-op.
        loss.backward()
        optimizer.step()

        # .item() detaches the value so the autograd graph of each batch can be freed.
        total_loss += loss.item()
        num_batches += 1
        # Predictions are continuous, so compare after rounding to the nearest rating;
        # count over samples (len(data) would be the number of dict keys, not rows).
        num_correct += (preds.detach().round() == rating).sum().item()
        num_samples += rating.numel()

    mean_loss = total_loss / num_batches if num_batches else float('nan')
    accuracy = num_correct / num_samples if num_samples else float('nan')
    print(f'Training Loss: {mean_loss}, Training Accuracy: {accuracy}')
    return mean_loss, accuracy



In [None]:
def test(test_loader, model, loss_fn, device):
    accuracy = []
    with torch.no_grad():
        for idx, data in enumerate(tqdm(test_loader)):

            rating = data['rating'].to(device)
            reviewer_id = data['reviewer_id'].to(device)
            product_id = data['product_id'].to(device)

            preds = model(product_id, reviewer_id)

            loss = loss_fn(preds, rating)

            batch_accuracy = ((preds.cpu() == rating.cpu()).sum()) / len(data)
            accuracy.append(batch_accuracy)
        print(f'Training Loss: {loss}, Training Accuracy: {np.mean(np.asarray(accuracy))}')

In [59]:
# NOTE(review): `a` is not defined in any cell visible here, so on Restart & Run All
# this cell would raise NameError unless it is defined elsewhere. The recorded output
# (grad_fn=EmbeddingBackward0, 2 rows x 10 values) suggests `a` was an embedding
# layer — presumably a scratch experiment; confirm and either define it or delete the cell.
a(torch.tensor([1, 2]))

tensor([[ 0.1150, -0.0762, -0.4434, -2.2551, -2.1552, -1.9053, -1.5596, -1.4760,
          1.4537,  1.3882],
        [-0.6215,  0.7647, -1.4853,  0.1306,  0.0975,  0.0943,  2.4900,  0.4897,
          1.3510,  0.6800]], grad_fn=<EmbeddingBackward0>)