In [1]:
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl

def read_data(path='ratings.dat'):
    """Load a '::'-delimited ratings file into a DataFrame.

    Parameters
    ----------
    path : str or path-like, default 'ratings.dat'
        Location of the ratings file. Each line is expected to hold four
        '::'-separated fields and no header row.

    Returns
    -------
    pandas.DataFrame
        One row per line of the file, with columns
        ``UserId``, ``MovieId``, ``ratings`` and ``timestamp``.
    """
    column_names = ['UserId', 'MovieId', 'ratings', 'timestamp']
    # A multi-character separator is only handled by the python parser engine,
    # so it is requested explicitly rather than relying on a fallback warning.
    return pd.read_table(path, sep='::', header=None, names=column_names, engine='python')
# Load the full ratings table once; the later cells all refer to it as `rating`.
rating = read_data()
print(rating)

         UserId  MovieId  ratings  timestamp
0             1     1193        5  978300760
1             1      661        3  978302109
2             1      914        3  978301968
3             1     3408        4  978300275
4             1     2355        5  978824291
...         ...      ...      ...        ...
1000204    6040     1091        1  956716541
1000205    6040     1094        5  956704887
1000206    6040      562        5  956704746
1000207    6040     1096        4  956715648
1000208    6040     1097        4  956715569

[1000209 rows x 4 columns]


In [2]:
# Leave-one-out split: rank each user's ratings from newest (rank 1) to oldest.
# method='first' breaks timestamp ties by row order, so exactly one row per user
# gets rank 1 and becomes that user's test example.
rating['rank_latest'] = rating.groupby('UserId')['timestamp'].rank(method='first', ascending=False)

is_latest = rating['rank_latest'] == 1
split_columns = ['UserId', 'MovieId', 'ratings']

# Select rows and columns in a single .loc call and take explicit copies, so the
# two frames are independent of `rating` and can be modified later without
# triggering SettingWithCopyWarning.
train_ratings = rating.loc[~is_latest, split_columns].copy()
test_ratings = rating.loc[is_latest, split_columns].copy()
print(test_ratings)

         UserId  MovieId  ratings
25            1       48        5
66            2     1687        3
232           3     2081        4
235           4     2951        4
258           5      288        2
...         ...      ...      ...
998803     6036     2807        1
999557     6037      968        4
999731     6038     1183        5
999764     6039      912        4
1000042    6040     1221        4

[6040 rows x 3 columns]


In [3]:
# Implicit feedback: every observed rating is treated as a positive interaction,
# marked with a constant `rating` column of 1. `assign` returns a new frame
# instead of writing into one derived from a slice, which avoids
# SettingWithCopyWarning and keeps this cell safe to re-run.
train_ratings = train_ratings.assign(rating=1)
train_ratings.sample(5)

Unnamed: 0,UserId,MovieId,ratings,rating
860431,5173,589,5,1
820740,4933,453,2,1
105737,698,2012,3,1
31894,216,2352,3,1
923840,5580,919,5,1


In [4]:
# Build (user, movie, label) triples with a negative:positive ratio of 4:1.
# Each observed train pair is a positive (label 1); it is followed by `num_neg`
# randomly drawn movies that the user has no train interaction with (label 0).
allmovie = rating['MovieId'].unique()
user_movie = set(zip(train_ratings['UserId'], train_ratings['MovieId']))
num_neg = 4
user, movie, label = [], [], []

for u, i in user_movie:
    # The observed interaction itself.
    user.append(u)
    movie.append(i)
    label.append(1)

    for _ in range(num_neg):
        # Rejection sampling: keep drawing until the (user, movie) pair is unseen.
        while True:
            neg_movie = np.random.choice(allmovie)
            if (u, neg_movie) not in user_movie:
                break
        user.append(u)
        movie.append(neg_movie)
        label.append(0)


In [5]:
class Dataset(torch.utils.data.Dataset):
    """Training set of (user, movie, label) triples with sampled negatives.

    Every distinct (UserId, MovieId) pair found in ``rating`` becomes a
    positive example (label 1) and is immediately followed by
    ``num_negatives`` negative examples (label 0) for the same user, each a
    movie drawn at random from ``movieIds`` that the user has no pair with in
    ``rating``. Sampling happens in the constructor, so building a new
    instance draws a fresh set of negatives.

    The class keeps the name ``Dataset`` because other cells construct it
    under that name. That name shadows the ``Dataset`` imported from
    ``torch.utils.data``, so the base class is spelled out in full; re-running
    this cell then never subclasses its own previous definition.

    Parameters
    ----------
    rating : pandas.DataFrame
        Frame with at least ``UserId`` and ``MovieId`` columns.
    movieIds : array-like
        Pool of movie ids that negatives are drawn from.
    num_negatives : int, default 5
        Number of negatives generated per positive pair.
    """

    def __init__(self, rating, movieIds, num_negatives=5):
        self.users, self.movies, self.labels = self.get_dataset(rating, movieIds, num_negatives)

    def __len__(self):
        """Return the total number of examples (positives plus negatives)."""
        return len(self.users)

    def __getitem__(self, idx):
        """Return the ``(user, movie, label)`` tensors stored at position ``idx``."""
        return self.users[idx], self.movies[idx], self.labels[idx]

    def get_dataset(self, rating, movieIds, num_negatives=5):
        """Generate the example tensors.

        Returns
        -------
        tuple of torch.Tensor
            ``(users, movies, labels)``, three 1-D tensors of equal length.
        """
        users, movies, labels = [], [], []
        rating_set = set(zip(rating['UserId'], rating['MovieId']))
        for u, i in rating_set:
            users.append(u)
            movies.append(i)
            labels.append(1)
            for _ in range(num_negatives):
                # Draw from the `movieIds` argument (not a notebook global) and
                # redraw until the pair is one the user has not interacted with.
                negmovie = np.random.choice(movieIds)
                while (u, negmovie) in rating_set:
                    negmovie = np.random.choice(movieIds)
                users.append(u)
                movies.append(negmovie)
                labels.append(0)
        return torch.tensor(users), torch.tensor(movies), torch.tensor(labels)

In [8]:
class NCF(pl.LightningModule):
    """Neural collaborative filtering model.

    A user id and an item id are each looked up in a 16-dimensional embedding
    table; the two vectors are concatenated and passed through two ReLU
    layers (32 -> 64 -> 32) and a final sigmoid unit that outputs one score in
    (0, 1) per (user, item) pair.

    Args:
        num_users: number of rows in the user embedding table.
        num_items: number of rows in the item embedding table.
        ratings: ratings frame handed to ``Dataset`` when the training
            dataloader is built.
        all_movieIds: movie-id pool handed to ``Dataset`` for negative sampling.
    """

    def __init__(self, num_users, num_items, ratings, all_movieIds):
        super().__init__()
        embed_dim = 16
        self.user_embedding = nn.Embedding(num_embeddings=num_users, embedding_dim=embed_dim)
        self.item_embedding = nn.Embedding(num_embeddings=num_items, embedding_dim=embed_dim)
        # The first dense layer consumes the concatenated user+item embedding.
        self.fc1 = nn.Linear(in_features=2 * embed_dim, out_features=64)
        self.fc2 = nn.Linear(in_features=64, out_features=32)
        self.output = nn.Linear(in_features=32, out_features=1)
        self.ratings = ratings
        self.all_movieIds = all_movieIds

    def forward(self, user_input, item_input):
        """Score each (user, item) pair; the result has a trailing dimension of 1."""
        user_vec = self.user_embedding(user_input)
        item_vec = self.item_embedding(item_input)
        hidden = torch.cat([user_vec, item_vec], dim=-1)

        activation = nn.ReLU()
        for layer in (self.fc1, self.fc2):
            hidden = activation(layer(hidden))

        squash = nn.Sigmoid()
        return squash(self.output(hidden))

    def training_step(self, batch, batch_idx):
        """Return the binary cross-entropy between predicted scores and labels."""
        users, items, targets = batch
        scores = self(users, items)
        # Labels are reshaped to a float column so they line up with the scores.
        targets = targets.view(-1, 1).float()
        criterion = nn.BCELoss()
        return criterion(scores, targets)

    def configure_optimizers(self):
        """Optimise all parameters with Adam at its default settings."""
        optimizer = torch.optim.Adam(self.parameters())
        return optimizer

    def train_dataloader(self):
        """Build a new ``Dataset`` from the stored ratings and wrap it in a DataLoader."""
        samples = Dataset(self.ratings, self.all_movieIds)
        return DataLoader(samples, batch_size=512, num_workers=0)

In [9]:
# Pool of movie ids that negatives are sampled from.
all_movieIds = rating['MovieId'].unique()

# The embedding tables are indexed by the raw ids, so they need (max id + 1) rows.
num_users = rating['UserId'].max() + 1
num_items = rating['MovieId'].max() + 1

model = NCF(num_users, num_items, train_ratings, all_movieIds)

# reload_dataloaders_every_epoch=True makes the trainer call train_dataloader()
# again each epoch; that method builds a new Dataset, so negatives are resampled.
trainer_options = dict(
    max_epochs=20,
    reload_dataloaders_every_epoch=True,
    progress_bar_refresh_rate=50,
    logger=False,
    checkpoint_callback=False,
)
trainer = pl.Trainer(**trainer_options)

trainer.fit(model)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores

  | Name           | Type      | Params
---------------------------------------------
0 | user_embedding | Embedding | 96.7 K
1 | item_embedding | Embedding | 63.2 K
2 | fc1            | Linear    | 2.1 K 
3 | fc2            | Linear    | 2.1 K 
4 | output         | Linear    | 33    
---------------------------------------------
164 K     Trainable params
0         Non-trainable params
164 K     Total params
0.657     Total estimated model params size (MB)


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…




In [13]:
# Hit Ratio @ 10: for each user, rank the held-out test movie against up to 99
# movies the user never interacted with, and count a hit when the held-out
# movie lands in the ten highest-scored candidates.
test_user_movie = set(zip(test_ratings['UserId'], test_ratings['MovieId']))
user_interacted_items = rating.groupby('UserId')['MovieId'].apply(list).to_dict()
all_movie_set = set(allmovie)  # built once instead of once per user

hits = []
with torch.no_grad():  # inference only, no autograd graph needed
    for u, held_out in test_user_movie:
        unseen = list(all_movie_set - set(user_interacted_items[u]))
        # Sample WITHOUT replacement so the candidate list has no duplicates;
        # duplicates would shrink the effective candidate pool and bias the metric.
        num_candidates = min(99, len(unseen))
        sampled = np.random.choice(unseen, num_candidates, replace=False).tolist()
        test_movie = sampled + [held_out]

        user_batch = torch.tensor([u] * len(test_movie))
        scores = model(user_batch, torch.tensor(test_movie)).reshape(-1).numpy()

        # Positions of the ten highest scores, best first.
        top10 = [test_movie[pos] for pos in np.argsort(scores)[::-1][:10]]
        hits.append(1 if held_out in top10 else 0)

print("The hit ratio @ 10 is {:.2f}".format(np.average(hits)))

The hit ratio @ 10 is 0.56
