In [1]:
from torch_geometric.data import download_url, extract_zip

url = 'https://files.grouplens.org/datasets/movielens/ml-latest-small.zip'
extract_zip(download_url(url, '.'), '.')

movies_path = './ml-latest-small/movies.csv'
ratings_path = './ml-latest-small/ratings.csv'

Using existing file ml-latest-small.zip
Extracting .\ml-latest-small.zip


In [2]:
import pandas as pd
import torch

In [3]:
  # Load the entire movie data frame into memory:
movies_df = pd.read_csv(movies_path, index_col='movieId')

# Split genres and convert into indicator variables:
genres = movies_df['genres'].str.get_dummies('|')
print(genres[["Action", "Adventure", "Drama", "Horror"]].head())
# Use genres as movie input features:
movie_feat = torch.from_numpy(genres.values).to(torch.float)
assert movie_feat.size() == (9742, 20)  # 20 genres in total.

         Action  Adventure  Drama  Horror
movieId                                  
1             0          1      0       0
2             0          1      0       0
3             0          0      0       0
4             0          0      1       0
5             0          0      0       0


In [4]:
# Load the entire ratings data frame into memory:
ratings_df = pd.read_csv(ratings_path)

# Create a mapping from unique user indices to range [0, num_user_nodes):
unique_user_id = ratings_df['userId'].unique()
unique_user_id = pd.DataFrame(data={
    'userId': unique_user_id,
    'mappedID': pd.RangeIndex(len(unique_user_id)),
})
print("Mapping of user IDs to consecutive values:")
print("==========================================")
print(unique_user_id.head())
print()
# Create a mapping from unique movie indices to range [0, num_movie_nodes):
unique_movie_id = ratings_df['movieId'].unique()
unique_movie_id = pd.DataFrame(data={
    'movieId': unique_movie_id,
    'mappedID': pd.RangeIndex(len(unique_movie_id)),
})
print("Mapping of movie IDs to consecutive values:")
print("===========================================")
print(unique_movie_id.head())
# Perform merge to obtain the edges from users and movies:
ratings_user_id = pd.merge(ratings_df['userId'], unique_user_id,
                            left_on='userId', right_on='userId', how='left')
ratings_user_id = torch.from_numpy(ratings_user_id['mappedID'].values)
ratings_movie_id = pd.merge(ratings_df['movieId'], unique_movie_id,
                            left_on='movieId', right_on='movieId', how='left')
ratings_movie_id = torch.from_numpy(ratings_movie_id['mappedID'].values)
# With this, we are ready to construct our `edge_index` in COO format
# following PyG semantics:
edge_index_user_to_movie = torch.stack([ratings_user_id, ratings_movie_id], dim=0)
assert edge_index_user_to_movie.size() == (2, 100836)
print()
print("Final edge indices pointing from users to movies:")
print("=================================================")
print(edge_index_user_to_movie)

Mapping of user IDs to consecutive values:
   userId  mappedID
0       1         0
1       2         1
2       3         2
3       4         3
4       5         4

Mapping of movie IDs to consecutive values:
   movieId  mappedID
0        1         0
1        3         1
2        6         2
3       47         3
4       50         4

Final edge indices pointing from users to movies:
tensor([[   0,    0,    0,  ...,  609,  609,  609],
        [   0,    1,    2,  ..., 3121, 1392, 2873]])


In [5]:
(ratings_df['userId'].shape, ratings_user_id.shape, unique_user_id.shape)

((100836,), torch.Size([100836]), (610, 2))

In [6]:
ratings_user_id

tensor([  0,   0,   0,  ..., 609, 609, 609])

In [7]:
from torch_geometric.data import HeteroData
import torch_geometric.transforms as T

In [None]:
data = HeteroData()
# Save node indices:
data["user"].node_id = torch.arange(len(unique_user_id))
data["movie"].node_id = torch.arange(len(movies_df))
# Add the node features and edge indices:
data["movie"].x = movie_feat
data["user", "rates", "movie"].edge_index = edge_index_user_to_movie

data["user", "rates", "movie"].edge_attr = edge_index_user_to_movie[0] / torch.max(edge_index_user_to_movie[0])

# We also need to make sure to add the reverse edges from movies to users
# in order to let a GNN be able to pass messages in both directions.
# We can leverage the `T.ToUndirected()` transform for this from PyG:
data = T.ToUndirected()(data)

In [None]:
'edge_attr' in data

In [None]:
data

In [None]:
test = torch.tensor(edge_index_user_to_movie)
test[:,0] = torch.ones((2,))
print(edge_index_user_to_movie)
print(test)

In [None]:
# For this, we first split the set of edges into
# training (80%), validation (10%), and testing edges (10%).
# Across the training edges, we use 70% of edges for message passing,
# and 30% of edges for supervision.
# We further want to generate fixed negative edges for evaluation with a ratio of 2:1.
# Negative edges during training will be generated on-the-fly.
# We can leverage the `RandomLinkSplit()` transform for this from PyG:
transform = T.RandomLinkSplit(
    num_val=0.1,
    num_test=0.1,
    disjoint_train_ratio=0.3,
    neg_sampling_ratio=2.0,
    add_negative_train_samples=False,
    edge_types=("user", "rates", "movie"),
    rev_edge_types=("movie", "rev_rates", "user"), 
)
train_data, val_data, test_data = transform(data)

In [None]:
# In the first hop, we sample at most 20 neighbors.
# In the second hop, we sample at most 10 neighbors.
# In addition, during training, we want to sample negative edges on-the-fly with
# a ratio of 2:1.
# We can make use of the `loader.LinkNeighborLoader` from PyG:
from torch_geometric.loader import LinkNeighborLoader

# Define seed edges:
edge_label_index = train_data["user", "rates", "movie"].edge_label_index
edge_label = train_data["user", "rates", "movie"].edge_label

In [None]:
train_loader = LinkNeighborLoader(
    data=train_data,
    num_neighbors=[20, 10],
    neg_sampling_ratio=2.0,
    edge_label_index=(("user", "rates", "movie"), edge_label_index),
    edge_label=edge_label,
    batch_size=128,
    shuffle=True
)

In [None]:
het: HeteroData = next(iter(train_loader))
print(het.edge_index_dict)
print(het.edge_attr_dict)

In [None]:
from torch_geometric.nn import SAGEConv, GENConv, GeneralConv, GATv2Conv, TransformerConv, to_hetero, BatchNorm
from torch import nn, Tensor
import torch.nn.functional as F

In [None]:
class HeteroGCNConv(nn.Module):
    def __init__(self, in_feature, out_feature, dropout = 0.57) -> None:
        super().__init__()
        self.conv1 = GATv2Conv(in_feature, out_feature, edge_dim=1, add_self_loops=False)
        self.conv2 = GATv2Conv(out_feature, out_feature, edge_dim=1, add_self_loops=False)
        
        self.batch_norm = BatchNorm(out_feature)
        self.dropout= nn.Dropout(dropout)
        
    def forward(self, x: Tensor, edge_index: Tensor, edge_weights: Tensor) -> Tensor:
        x = F.relu(self.conv1(x, edge_index, edge_attr=edge_weights))
        x = self.conv2(x, edge_index, edge_attr=edge_weights)
        x = self.batch_norm(x)
        x = F.relu(x)
        x = self.dropout(x)
        return x


In [None]:
emb_model = to_hetero(HeteroGCNConv(256, 128), data.metadata())

In [None]:
het.edge_attr_dict[('user', 'rates', 'movie')]

In [None]:
import torch
user_emb = torch.nn.Embedding(data['user'].num_nodes, 256)
movie_emb = torch.nn.Embedding(data['movie'].num_nodes, 256)
movie_lin = torch.nn.Linear(20, 256)
x_dict = {
    "user": user_emb(het["user"].node_id),
    "movie": movie_lin(het["movie"].x) + movie_emb(het["movie"].node_id)
}
x_dict = emb_model(x_dict, het.edge_index_dict, het.edge_attr_dict)

In [None]:
import torch
user_emb = torch.nn.Embedding(data['user'].num_nodes, 256)
movie_emb = torch.nn.Embedding(data['movie'].num_nodes, 256)
movie_lin = torch.nn.Linear(20, 256)
x_dict = {
    "user": user_emb(het["user"].node_id),
    "movie": movie_lin(het["movie"].x) + movie_emb(het["movie"].node_id)
}
x_dict = emb_model(x_dict, het.edge_index_dict, het.edge_attr_dict)

In [None]:
from torch_geometric.nn import SAGEConv, to_hetero
import torch.nn.functional as F
from torch import Tensor
import torch
import torch_geometric
from torch.nn import Linear
from torch_geometric.nn import GATv2Conv, GCNConv, GCN2Conv, DenseGCNConv, dense_diff_pool, BatchNorm, global_mean_pool, global_add_pool, global_max_pool, MemPooling, SAGEConv, to_hetero, HeteroBatchNorm
from torch_geometric.nn import Sequential as GSequential
from torch_geometric.utils import to_dense_adj
from torch import nn, Tensor
import torch.nn.functional as F

class GNN(torch.nn.Module):
    def __init__(self, 
                 input_feature: int, out_features: int,
                 metadata,
                 base_hidden_feature: int=256,
                 dropout=0.1):
        
        super(GNN, self).__init__()
        self.input_features = input_feature
        self.num_out_features = out_features
        self.bsh: int = base_hidden_feature
        bsh2: int = int(self.bsh/2)
        bsh4: int = int(self.bsh/4)
        bsh8: int = int(self.bsh/8)
        
        self.encoder = GSequential('x_dict, edge_index, edge_weights', [
            (to_hetero(HeteroG CNConv(input_feature, self.bsh), metadata), 'x_dict, edge_index, edge_weights ->x1'),
            (to_hetero(HeteroGCNConv(self.bsh, self.bsh), metadata), 'x1, edge_index, edge_weights ->x1'),
            (to_hetero(HeteroGCNConv(self.bsh, self.bsh), metadata), 'x1, edge_index, edge_weights ->x1'),
            (to_hetero(HeteroGCNConv(self.bsh, bsh2), metadata), 'x1, edge_index, edge_weights -> x2'),
            (to_hetero(HeteroGCNConv(bsh2, bsh2), metadata), 'x2, edge_index, edge_weights -> x2'),
            (to_hetero(HeteroGCNConv(bsh2, bsh2), metadata), 'x2, edge_index, edge_weights -> x2'),
            (to_hetero(HeteroGCNConv(bsh2, bsh4), metadata), 'x2, edge_index, edge_weights -> x3'),
            (to_hetero(HeteroGCNConv(bsh4, bsh4), metadata), 'x3, edge_index, edge_weights -> x3'),
            (to_hetero(HeteroGCNConv(bsh4, bsh4), metadata), 'x3, edge_index, edge_weights -> x3'),
            (to_hetero(HeteroGCNConv(bsh4, bsh8), metadata), 'x3, edge_index, edge_weights -> x4'),
            (to_hetero(HeteroGCNConv(bsh8, bsh8), metadata), 'x4, edge_index, edge_weights -> x4'),
            (to_hetero(HeteroGCNConv(bsh8, bsh8), metadata), 'x4, edge_index, edge_weights -> x4'),
            (lambda x1, x2, x3, x4: (x1, x2, x3, x4), 'x1, x2, x3, x4 -> x1, x2, x3, x4')
        ])
        
        self.attention = GSequential('x3, x4, edge_index, edge_weights', [
            (GATv2Conv(bsh8, bsh8, 2, dropout=dropout), 'x4, edge_index ->x4'),
            (BatchNorm(bsh4), 'x4->x4'),
            (nn.ReLU(), 'x4->x4'),
            
            (GCN2Conv(bsh4, 0.5, 0.1, 2), 'x4, x3, edge_index, edge_weights->x3'),
            (BatchNorm(bsh4), 'x3->x3'),
            (nn.ReLU(), 'x3->x3'),
            (GCNConv(bsh4, bsh4), 'x3, edge_index, edge_weights -> x3'),
            (BatchNorm(bsh4), 'x3->x3'),
            (nn.ReLU(), 'x3->x3'),
            
            (GATv2Conv(bsh4, bsh4, 2, dropout=dropout), 'x3, edge_index ->x3'),
            (BatchNorm(bsh2), 'x3->x3'),
            (nn.ReLU(), 'x3->x3'),
            (lambda x3, x4: (x3, x4), 'x3, x4 -> x3, x4')
        ])
        
        self.decoder = GSequential('x1, x2, x3, edge_index, edge_weights', [
            
            (GCN2Conv(bsh2, 0.5, 0.1, 2), 'x3, x2, edge_index, edge_weights->x2'),
            (BatchNorm(bsh2), 'x2->x2'),
            (nn.ReLU(), 'x2->x2'),
            (nn.Dropout(dropout), 'x2->x2'),
            (GCNConv(bsh2, bsh2), 'x2, edge_index, edge_weights -> x2'),
            (BatchNorm(bsh2), 'x2->x2'),
            (nn.ReLU(), 'x2->x2'),
            (nn.Dropout(dropout), 'x2->x2'),
            (GCNConv(bsh2, self.bsh), 'x2, edge_index->x2'),
            (BatchNorm(self.bsh), 'x2->x2'),
            (nn.ReLU(), 'x2->x2'),
            (nn.Dropout(dropout), 'x2->x2'),
            
            (GCN2Conv(self.bsh, 0.5, 0.1, 2), 'x2, x1, edge_index, edge_weights->x1'),
            (BatchNorm(self.bsh), 'x1->x1'),
            (nn.ReLU(), 'x1->x1'),
            (nn.Dropout(dropout), 'x1->x1'),
            (GCNConv(self.bsh, self.bsh), 'x1, edge_index, edge_weights ->x1'),
            (BatchNorm(self.bsh), 'x1->x1'),
            (nn.ReLU(), 'x1->x1'),
            (nn.Dropout(dropout), 'x1->x1'),
            (GCNConv(self.bsh, self.bsh), 'x1, edge_index, edge_weights ->x1'),
            (BatchNorm(self.bsh), 'x1->x1'),
            (nn.ReLU(), 'x1->x1'),
            (nn.Dropout(dropout), 'x1->x1'),
            (GCNConv(self.bsh, self.input_features), 'x1, edge_index, edge_weights -> x1'),
        ])
        
                
        self.user_emb = torch.nn.Embedding(data['user'].num_nodes, self.input_features)
        self.movie_emb = torch.nn.Embedding(data['movie'].num_nodes, self.input_features)
        self.movie_lin = torch.nn.Linear(20, self.input_features)
        
    def forward(self, x: HeteroData) -> Tensor:
        x_dict = {
          "word": self.user_emb(x["word"].node_id),
          "movie": self.movie_lin(x["movie"].x) + self.movie_emb(x["movie"].node_id)
        }

        x1_dict, x2_dict, x3_dict, x4_dict = self.encoder(x_dict, x.edge_index_dict, x.edge_attr_dict)
        print(f'x4_dict["movie"]: {x4_dict["movie"].shape}, x_dict["movie"]: {x_dict["movie"].shape}')
        x_att, x4 = self.attention(x3_dict["movie"], x4_dict["movie"], 
                                   x.edge_index_dict[('movie','rev_rates','user')],
                                   x.edge_attr_dict[('movie','rev_rates','user')])
        x_dec = self.decoder(x1_dict["movie"], x2_dict["movie"], x_att, 
                             x.edge_index_dict[('movie','rev_rates','user')],
                             x.edge_attr_dict[('movie','rev_rates','user')])
        return x_dict #x_dec

In [None]:
words = torch.zeros((25 , 300), dtype=torch.float32)

In [None]:
het.edge_index_dict
het.edge_attr_dict

In [None]:
gnn_model = GNN(20, 1, metadata=data.metadata(), base_hidden_feature=256)

In [57]:
het.edge_index_dict
het.edge_attr_dict

{('user',
  'rates',
  'movie'): tensor([0.3924, 0.9819, 0.4401,  ..., 0.7783, 0.2414, 0.8112]),
 ('movie',
  'rev_rates',
  'user'): tensor([0.0016, 0.0016, 0.0016,  ..., 0.8177, 0.8177, 0.8177])}

In [58]:
gnn_model = GNN(20, 1, metadata=data.metadata(), base_hidden_feature=256)

In [59]:
gnn_model(het)

x4_dict["movie"]: torch.Size([2808, 32]), x_dict["movie"]: torch.Size([2808, 20])


{'user': tensor([[ 1.1510,  0.1046, -0.5639,  ...,  1.0205, -0.0078,  0.0808],
         [ 0.3696, -0.9333, -2.0932,  ..., -0.6952, -0.3166,  0.9402],
         [-0.6207,  1.2826,  1.5125,  ...,  0.0499, -0.0366,  1.0635],
         ...,
         [ 0.5995, -0.3419,  1.8178,  ...,  1.1518, -1.0551,  0.7446],
         [-1.0250,  1.9003, -0.4490,  ..., -0.8544,  0.6340,  1.5318],
         [ 2.3937, -0.9057,  1.7867,  ..., -2.3540,  1.7853,  0.0841]],
        grad_fn=<EmbeddingBackward0>),
 'movie': tensor([[ 4.9276e-01,  1.5015e+00,  4.4476e-01,  ...,  1.7528e+00,
          -6.5898e-02, -1.0381e-02],
         [ 8.6721e-02, -8.2240e-01, -4.4348e-01,  ...,  2.3993e-02,
          -3.3888e+00, -1.2052e-02],
         [ 7.0787e-01, -3.8669e-01,  2.7619e-01,  ...,  3.6967e-01,
           7.7473e-01,  3.2005e-01],
         ...,
         [ 2.0895e-01,  1.3283e+00, -1.0325e+00,  ..., -8.2338e-01,
          -2.4185e-02,  4.1061e-01],
         [ 4.0624e-01, -5.2136e-02,  4.3045e-01,  ...,  3.4253e-01,
 

In [26]:
sample = (torch.zeros((3,3)), torch.zeros((4, 4)))

In [27]:
sample.size(0)

AttributeError: 'tuple' object has no attribute 'size'

In [None]:
train_loader

LinkNeighborLoader()

In [None]:
type(het)

torch_geometric.data.hetero_data.HeteroData

In [None]:
# het.edge_index_dict

In [None]:
data["movie"].x.shape

torch.Size([9742, 20])

In [None]:
# Our final classifier applies the dot-product between source and destination
# node embeddings to derive edge-level predictions:
class Classifier(torch.nn.Module):
    def forward(self, x_user: Tensor, x_movie: Tensor, edge_label_index: Tensor) -> Tensor:
        # Convert node embeddings to edge-level representations:
        edge_feat_user = x_user[edge_label_index[0]]
        edge_feat_movie = x_movie[edge_label_index[1]]
        # Apply dot-product to get a prediction per supervision edge:
        return (edge_feat_user * edge_feat_movie).sum(dim=-1)

In [None]:

class Model(torch.nn.Module):
    def __init__(self, hidden_channels):
        super().__init__()
        # Since the dataset does not come with rich features, we also learn two
        # embedding matrices for users and movies:
        self.movie_lin = torch.nn.Linear(20, hidden_channels)
        self.user_emb = torch.nn.Embedding(data["user"].num_nodes, hidden_channels)
        self.movie_emb = torch.nn.Embedding(data["movie"].num_nodes, hidden_channels)
        # Instantiate homogeneous GNN:
        self.gnn = GNN(hidden_channels)
        # Convert GNN model into a heterogeneous variant:
        self.gnn = to_hetero(self.gnn, metadata=data.metadata())
        self.classifier = Classifier()
    def forward(self, data: HeteroData) -> Tensor:
        print(f'data["user"].node_id.shape: {data["user"].node_id.shape}')
        print(f'self.user_emb(data["user"].node_id).shape: {self.user_emb(data["user"].node_id).shape}')
        
        
        x_dict = {
          "user": self.user_emb(data["user"].node_id),
          "movie": self.movie_lin(data["movie"].x) + self.movie_emb(data["movie"].node_id),
        } 
        # `x_dict` holds feature matrices of all node types
        # `edge_index_dict` holds all edge indices of all edge types
        print(f'x_dict 1: {x_dict}')
        x_dict = self.gnn(x_dict, data.edge_index_dict)
        print(f'x_dict 2: {x_dict}')
        pred = self.classifier(
            x_dict["user"],
            x_dict["movie"],
            data["user", "rates", "movie"].edge_label_index,
        )
        return pred

In [None]:
data["user"].num_nodes

610

In [None]:
het = next(iter(train_loader))

In [None]:
hetero_gnn = Model(hidden_channels=64)
# hetero_gnn(x_dict, het.edge_index_dict)
hetero_gnn(het)

data["user"].node_id.shape: torch.Size([610])
self.user_emb(data["user"].node_id).shape: torch.Size([610, 64])
x_dict 1: {'user': tensor([[-0.1731, -0.0329, -1.0646,  ..., -0.1266, -1.2288, -1.2651],
        [ 2.0936, -0.9719, -0.0572,  ..., -0.1425, -0.5372, -0.1820],
        [ 1.2944, -0.1054,  0.4820,  ..., -0.6956, -0.4736,  0.8452],
        ...,
        [-0.0977, -0.4481,  0.1680,  ..., -0.6580,  0.1051, -2.0542],
        [-0.0750,  2.1841,  1.6005,  ...,  0.7096,  0.5372, -0.0957],
        [ 0.7012,  1.5016,  0.0513,  ..., -1.3722,  0.3077, -0.5693]],
       grad_fn=<EmbeddingBackward0>), 'movie': tensor([[ 0.1400,  1.0546,  1.0634,  ..., -1.1045, -1.3633,  0.3525],
        [ 1.0973,  1.1088,  0.3908,  ..., -0.7576,  1.0406,  0.3367],
        [ 0.4720,  0.9814,  0.4514,  ...,  0.7214, -0.7344,  0.1905],
        ...,
        [-0.3643,  1.0382, -1.4448,  ..., -0.5761, -1.3072,  1.5188],
        [-0.1337, -0.0369,  0.0053,  ..., -1.0227,  0.4914,  1.1152],
        [-1.4048,  0.2001,

tensor([-2.8000e-02, -3.2009e-01, -8.7974e-01, -7.8132e-01,  1.9838e-01,
         1.4694e+00, -6.8347e-01, -6.7994e-01, -3.5775e-02, -1.3441e+00,
         2.6486e-01, -6.2920e-01,  6.8263e-02,  3.6945e-01, -6.5919e-02,
        -6.7867e-01, -1.9387e-01, -4.9250e-01,  1.1928e-01, -1.3715e-01,
        -1.1608e-01, -4.1062e-01, -1.8796e+00, -3.1967e-01, -3.3600e-01,
        -5.0233e-01, -3.1731e-01, -1.2445e+00,  9.4310e-01,  7.4915e-01,
        -1.5191e-01,  2.8177e-01, -1.0342e+00,  1.3886e+00, -1.6083e+00,
         8.7437e-01, -3.4305e-02, -7.9155e-01,  3.6961e-01,  7.0661e-01,
        -9.2525e-02, -3.9848e-01, -5.3912e-01, -6.9309e-01, -3.1650e-01,
         9.3210e-01, -1.2362e+00, -3.2783e-01,  1.8021e-01,  1.1343e-01,
        -1.2535e+00, -1.2975e+00, -1.0150e+00, -8.0170e-01, -5.3207e-01,
         6.2693e-01, -7.4344e-01, -2.1599e+00,  8.6293e-01, -1.1305e+00,
        -1.6756e+00,  7.1698e-01, -8.0521e-01, -3.6644e-01,  4.7580e-02,
         3.9132e-01, -5.4051e-01, -1.8751e-01,  3.4