In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
from torch_geometric.data import download_url, extract_zip

In [3]:
# url = 'https://files.grouplens.org/datasets/movielens/ml-latest-small.zip'
url = 'https://files.grouplens.org/datasets/movielens/ml-latest.zip'
extract_zip(download_url(url, '.'), '.')

# movies_path = './ml-latest-small/movies.csv'
# ratings_path = './ml-latest-small/ratings.csv'
movies_path = './ml-latest/movies.csv'
ratings_path = './ml-latest/ratings.csv'

Downloading https://files.grouplens.org/datasets/movielens/ml-latest.zip
Extracting .\ml-latest.zip


In [4]:
import os
os.listdir('./ml-latest-small/')

['links.csv', 'movies.csv', 'ratings.csv', 'README.txt', 'tags.csv']

In [5]:
import pandas as pd
movie_df = pd.read_csv(movies_path)
ratings_df = pd.read_csv(ratings_path)

In [6]:
movie_df.head()
ratings_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [42]:
ratings_df['rating'].value_counts()

rating
4.0    26818
3.0    20047
5.0    13211
3.5    13136
4.5     8551
2.0     7551
2.5     5550
1.0     2811
1.5     1791
0.5     1370
Name: count, dtype: int64

In [7]:
movie_df.set_index('movieId', inplace=True)

In [8]:
genres = movie_df['genres'].str.get_dummies('|')

In [9]:
genres.shape
genres.head()

(9742, 20)

Unnamed: 0_level_0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,0,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0
5,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [10]:
import torch

In [11]:
movie_features = torch.from_numpy(genres.values).to(torch.float)

movie_features.shape
movie_features

torch.Size([9742, 20])

tensor([[0., 0., 1.,  ..., 0., 0., 0.],
        [0., 0., 1.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 1., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])

In [12]:
unique_user_id = ratings_df['userId'].unique()
unique_user_id = pd.DataFrame(data= {'userId' : unique_user_id, 
                                     'mappedId' : pd.RangeIndex(0, len(unique_user_id))})

unique_movie_id = pd.DataFrame(data= {'movieId' : movie_df.index, 
                                     'mappedId' : pd.RangeIndex(0, movie_df.shape[0])})

In [13]:
ratings_user_id = pd.merge(ratings_df, unique_user_id, left_on='userId', right_on='userId', how='left')
ratings_movie_id = pd.merge(ratings_df, unique_movie_id, left_on='movieId', right_on='movieId', how = 'left')

In [14]:
ratings_user_id = torch.from_numpy(ratings_user_id['mappedId'].values)
ratings_movie_id = torch.from_numpy(ratings_movie_id['mappedId'].values)

In [15]:
edge_index_user2movie = torch.stack([ratings_user_id, ratings_movie_id], dim = 0)
edge_index_user2movie
edge_index_user2movie.shape

tensor([[   0,    0,    0,  ...,  609,  609,  609],
        [   0,    2,    5,  ..., 9462, 9463, 9503]])

torch.Size([2, 100836])

In [16]:
from torch_geometric.data import HeteroData
import torch_geometric.transforms as T

In [28]:
data = HeteroData()

data['user']['node_id'] = torch.arange(len(unique_user_id))

data['movie']['node_id'] = torch.arange(len(movie_df))
data['movie']['x'] = movie_features

data['user', 'rates', 'movie']['edge_index'] = edge_index_user2movie
data = T.ToUndirected()(data)

In [30]:
data

HeteroData(
  user={ node_id=[610] },
  movie={
    node_id=[9742],
    x=[9742, 20],
  },
  (user, rates, movie)={ edge_index=[2, 100836] },
  (movie, rev_rates, user)={ edge_index=[2, 100836] }
)

In [40]:
data['user', 'rates', 'movie']
data['movie', 'rev_rates', 'user']

{'edge_index': tensor([[   0,    0,    0,  ...,  609,  609,  609],
        [   0,    2,    5,  ..., 9462, 9463, 9503]])}

{'edge_index': tensor([[   0,    2,    5,  ..., 9462, 9463, 9503],
        [   0,    0,    0,  ...,  609,  609,  609]])}

In [29]:
data['user'].num_nodes
data['user'].num_features
data['movie'].num_nodes
data['movie'].num_features
data.edge_types

610

0

9742

20

[('user', 'rates', 'movie'), ('movie', 'rev_rates', 'user')]

### Edge level train-val-test split

In [43]:
# train :: 80% ; val :: 10% ; test :: 10%
# from the train we use 70% for message_passing and 30% for supervision (disjoint_train_ratio)
# generate negative edges for evaluation 2:1 ratio (generated on the fly)

transform = T.RandomLinkSplit(num_val=0.1, 
                              num_test=0.1, 
                              disjoint_train_ratio=0.3, 
                              neg_sampling_ratio=2.0,
                              add_negative_train_samples=False,
                              edge_types= ('user', 'rates', 'movie'), 
                              rev_edge_types= ('movie', 'rev_rates', 'user'))

In [45]:
train_data, val_data, test_data = transform(data)

In [51]:
print("== train ==")
train_data
print("== val ==")
val_data
print("== test ==")
val_data

== train ==


HeteroData(
  user={ node_id=[610] },
  movie={
    node_id=[9742],
    x=[9742, 20],
  },
  edge_index={},
  (user, rates, movie)={},
  (user, rates, movie)={
    edge_index=[2, 56469],
    edge_label=[24201],
    edge_label_index=[2, 24201],
  },
  (movie, rev_rates, user)={ edge_index=[2, 56469] }
)

== val ==


HeteroData(
  user={ node_id=[610] },
  movie={
    node_id=[9742],
    x=[9742, 20],
  },
  edge_index={},
  (user, rates, movie)={},
  (user, rates, movie)={
    edge_index=[2, 80670],
    edge_label=[30249],
    edge_label_index=[2, 30249],
  },
  (movie, rev_rates, user)={ edge_index=[2, 80670] }
)

== test ==


HeteroData(
  user={ node_id=[610] },
  movie={
    node_id=[9742],
    x=[9742, 20],
  },
  edge_index={},
  (user, rates, movie)={},
  (user, rates, movie)={
    edge_index=[2, 80670],
    edge_label=[30249],
    edge_label_index=[2, 30249],
  },
  (movie, rev_rates, user)={ edge_index=[2, 80670] }
)

In [61]:
train_data[('user', 'rates', 'movie')]['edge_label'].unique() # no negative samples added
val_data[('user', 'rates', 'movie')]['edge_label'].unique() # negative samples added
val_data[('user', 'rates', 'movie')]['edge_label'].to(torch.long).bincount()

tensor([1.])

tensor([0., 1.])

tensor([20166, 10083])

In [63]:
train_data[('user', 'rates', 'movie')] # edge_label_index are used for supervision
val_data[('user', 'rates', 'movie')]

{'edge_index': tensor([[ 317,   21,   19,  ...,  291,  247,  279],
        [ 546, 5372, 1273,  ..., 7429,  911, 7416]]), 'edge_label': tensor([1., 1., 1.,  ..., 1., 1., 1.]), 'edge_label_index': tensor([[ 417,  181,  602,  ...,   62,  273,  413],
        [8245, 2354,  450,  ...,  909, 6039, 2099]])}

{'edge_index': tensor([[ 417,  181,  602,  ...,  291,  247,  279],
        [8245, 2354,  450,  ..., 7429,  911, 7416]]), 'edge_label': tensor([1., 1., 1.,  ..., 0., 0., 0.]), 'edge_label_index': tensor([[ 199,  181,  593,  ...,  504,   79,   28],
        [ 815, 4327,  138,  ..., 6257, 1239, 5869]])}

In [67]:
train_data[('movie', 'rev_rates', 'user')].num_edges
train_data[('movie', 'rev_rates', 'user')]
val_data[('movie', 'rev_rates', 'user')].num_edges
val_data[('movie', 'rev_rates', 'user')]

56469

{'edge_index': tensor([[ 546, 5372, 1273,  ..., 7429,  911, 7416],
        [ 317,   21,   19,  ...,  291,  247,  279]])}

80670

{'edge_index': tensor([[8245, 2354,  450,  ..., 7429,  911, 7416],
        [ 417,  181,  602,  ...,  291,  247,  279]])}

### Mini Batching

In [68]:
from torch_geometric.loader import LinkNeighborLoader

In [69]:
train_data[('user', 'rates', 'movie')]['edge_label_index']

tensor([[ 417,  181,  602,  ...,   62,  273,  413],
        [8245, 2354,  450,  ...,  909, 6039, 2099]])

In [70]:
train_data[('user', 'rates', 'movie')]['edge_label']

tensor([1., 1., 1.,  ..., 1., 1., 1.])

In [71]:
edge_label_index = train_data[('user', 'rates', 'movie')]['edge_label_index']
edge_label = train_data[('user', 'rates', 'movie')]['edge_label']

In [73]:
train_data.edge_types

[('user', 'rates', 'movie'), ('movie', 'rev_rates', 'user')]

In [74]:
train_loader = LinkNeighborLoader(data= train_data, num_neighbors= [20,10], 
                                  neg_sampling_ratio=2.0, 
                                  edge_label_index=(('user', 'rates', 'movie'), edge_label_index),
                                  edge_label=edge_label,
                                  batch_size = 128,
                                  shuffle = True)



In [75]:
sample_data = next(iter(train_loader))

In [76]:
sample_data

HeteroData(
  user={
    node_id=[607],
    n_id=[607],
  },
  movie={
    node_id=[2863],
    x=[2863, 20],
    n_id=[2863],
  },
  edge_index={ n_id=[0] },
  (user, rates, movie)={ n_id=[0] },
  edge_label={ n_id=[0] },
  (user, rates, movie)={
    edge_index=[2, 17834],
    edge_label=[384],
    edge_label_index=[2, 384],
    e_id=[17834],
    input_id=[128],
  },
  (movie, rev_rates, user)={
    edge_index=[2, 7980],
    e_id=[7980],
  }
)