In [1]:
import pandas as pd

In [2]:
triples = pd.read_csv('data/triplets_file.csv')
me = pd.read_csv('data/me.csv')
# add me to triples
triples = pd.concat([me, triples], ignore_index=True)

# use first 500000 rows
triples = triples.head(500000)


In [3]:
# turn triples into a graph
import networkx as nx
G = nx.from_pandas_edgelist(triples, source='user_id', target='song_id', edge_attr='listen_count', create_using=nx.DiGraph())


In [4]:
# show number of nodes and edges and edge weights
print('Number of nodes: ', G.number_of_nodes())
print('Number of edges: ', G.number_of_edges())
print('Number of edges with weight > 1: ', len([e for e in G.edges(data=True) if e[2]['listen_count'] > 1]))
print('Number of edges with weight > 10: ', len([e for e in G.edges(data=True) if e[2]['listen_count'] > 10]))
print('Number of edges with weight > 100: ', len([e for e in G.edges(data=True) if e[2]['listen_count'] > 100]))
print('Number of edges with weight > 1000: ', len([e for e in G.edges(data=True) if e[2]['listen_count'] > 1000]))



Number of nodes:  29193
Number of edges:  500000
Number of edges with weight > 1:  217556
Number of edges with weight > 10:  24236
Number of edges with weight > 100:  228
Number of edges with weight > 1000:  0


In [5]:
# create pytorch geometric graph from networkx graph
import torch
from torch_geometric.data import Data
from torch_geometric.utils import from_networkx

data = from_networkx(G)

# add listen counts as edge weights
data.edge_attr = torch.tensor([e[2]['listen_count'] for e in G.edges(data=True)], dtype=torch.float)

# link prediction task for bipartite graph, no node features.
# use edge weights as edge features
data.x = torch.zeros((data.num_nodes, 1), dtype=torch.float)
data.y = torch.zeros((data.num_edges, 1), dtype=torch.float)

data

Data(edge_index=[2, 500000], listen_count=[500000], num_nodes=29193, edge_attr=[500000], x=[29193, 1], y=[500000, 1])

In [6]:
# train test mask
import numpy as np
from sklearn.model_selection import train_test_split

# split into train and test
train_mask, test_mask = train_test_split(np.arange(data.num_edges), test_size=0.2, random_state=42)

# split train into train and validation
train_mask, val_mask = train_test_split(train_mask, test_size=0.2, random_state=42)

# create masks
data.train_mask = torch.zeros(data.num_edges, dtype=torch.bool)
data.train_mask[train_mask] = True

data.val_mask = torch.zeros(data.num_edges, dtype=torch.bool)
data.val_mask[val_mask] = True

data.test_mask = torch.zeros(data.num_edges, dtype=torch.bool)
data.test_mask[test_mask] = True



In [7]:
data.num_node_features

1

In [8]:
# model for link prediction
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, DeepGraphInfomax

class GCN(torch.nn.Module):
    def __init__(self, dim_in, dim_h, dim_out):
        super().__init__()
        self.gcn1 = GCNConv(dim_in, dim_h)
        self.gcn2 = GCNConv(dim_h, dim_out)
        self.optimizer = torch.optim.Adam(self.parameters(),
                                          lr=0.01,
                                          weight_decay=5e-4)

    def forward(self, x, edge_index):
        #h = F.dropout(x, p=0.5, training=self.training)
        h = self.gcn1(x, edge_index)
        h = torch.relu(h)
        h = F.dropout(h, p=0.5, training=self.training)

        h = self.gcn2(h, edge_index)
        return h, F.log_softmax(h, dim=1)


In [9]:
def accuracy(pred_y, y):
    return ((pred_y == y).sum() / len(y)).item()


def train(model, data):
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = model.optimizer
    epochs = 200
    result = None
    model.train()
    for epoch in range(epochs + 1):
        # Training
        optimizer.zero_grad()

        result, out = model(data.x, data.edge_index)
        loss = criterion(out[data.train_mask], data.y[data.train_mask])
        acc = accuracy(out[data.train_mask].argmax(dim=1), data.y[data.train_mask])
        loss.backward()
        optimizer.step()

        # Validation
        val_loss = criterion(out[data.val_mask], data.y[data.val_mask])
        val_acc = accuracy(out[data.val_mask].argmax(dim=1), data.y[data.val_mask])

        if epoch % 10 == 0:
            print(f'Epoch {epoch:>3} | Train Loss: {loss:.3f} | Train Acc: '
                  f'{acc * 100:>6.2f}% | Val Loss: {val_loss:.2f} | '
                  f'Val Acc: {val_acc * 100:.2f}%')

    return result


def test(model, data):
    """Evaluate the model on test set and print the accuracy score."""
    model.eval()
    _, out = model(data.x, data.edge_index)
    acc = accuracy(out.argmax(dim=1)[data.test_mask], data.y[data.test_mask])
    return acc

In [10]:


lame = GCN(1, 16, 1)
print(lame)
lame_acc = test(lame, data)
print(f'No training GCN test accuracy: {lame_acc*100:.2f}%\n')
lame_res = train(lame, data)

#lame_acc = test(lame, data)
print(f'GCN test accuracy: {lame_acc*100:.2f}%\n')



GCN(
  (gcn1): GCNConv(1, 16)
  (gcn2): GCNConv(16, 1)
)


IndexError: The shape of the mask [500000] at index 0 does not match the shape of the indexed tensor [29193] at index 0

In [11]:
data.edge_index.shape

torch.Size([2, 500000])