In [1]:
import os
import numpy as np
import torch
import torch.nn as nn
from torch.nn import init
import random
from collections import defaultdict
from scipy import sparse as sp
import time
from sklearn.metrics import f1_score
import torch.nn.functional as F
import time
import re
import networkx as nx
from torch.nn.functional import normalize
from sklearn.metrics import roc_auc_score
import itertools

In [2]:
import dgl
from dgl.nn import SAGEConv
import dgl.function as fn

Using backend: pytorch


In [3]:
os.chdir('..')
os.chdir('./teams/DSC180A_FA21_A00/a13group1')
os.getcwd()

'/home/jiw033/teams/DSC180A_FA21_A00/a13group1'

In [4]:
feat_dir = './features/merged_features.csv'

In [5]:
def load_data(G, feat_dir, normalize=True):
    data = np.genfromtxt(feat_dir, delimiter=',', skip_header=True, dtype=str)
    features = np.array(np.delete(data[:,2:], -3, 1), dtype=float)
    if normalize:
        features = F.normalize(torch.Tensor(features), dim=0)
    uris = data[:, 1]
    uris = [re.sub('spotify:track:', '', uri) for uri in uris]
    uri_map = {n: i for i,n in enumerate(uris)}

    src, dest = [], [] 
    adj_list = defaultdict(set)    
    for e in G.edges:
        u,v = uri_map[e[0]], uri_map[e[1]]
        adj_list[u].add(v)
        adj_list[v].add(u)
        src.append(u)
        dest.append(v)
    
    dgl_G = dgl.graph((src, dest), num_nodes=len(G.nodes))
    
    return features, adj_list, dgl_G

def adj_matrix(adj_list):
    row_idx = torch.LongTensor([k for k in range(len(adj_list.keys())) for v in range(len(adj_list[k]))])
    col_idx = torch.LongTensor([v for k in range(len(adj_list.keys())) for v in adj_list[k]]) 

    idx = torch.vstack((row_idx, col_idx))
    
    return torch.sparse_coo_tensor(indices = idx, values = torch.ones(len(row_idx)), 
                                   size=[len(adj_list.keys()), len(adj_list.keys())])

def make_label(batch_nodes):
    batch_map = {n:i for i,n in enumerate(batch_nodes)}
    neigh_list = [adj_list[n].intersection(batch_nodes) for n in batch_nodes]
    #unique_nodes_list = list(set.union(*neigh_list))
    #unique_nodes = {n:i for i,n in enumerate(unique_nodes_list)}
    mask = torch.zeros(len(neigh_list), len(neigh_list)) 
    column_indices = [batch_map[n] for neigh in neigh_list for n in neigh]   
    row_indices = [i for i in range(len(neigh_list)) for j in range(len(neigh_list[i]))]
    mask[row_indices, column_indices] = 1
    
    return mask

def compute_loss(pos_score, neg_score):
    scores = torch.cat([pos_score, neg_score])
    labels = torch.cat([torch.ones(pos_score.shape[0]), torch.zeros(neg_score.shape[0])])
    return F.binary_cross_entropy_with_logits(scores, labels)

def compute_auc(pos_score, neg_score):
    scores = torch.cat([pos_score, neg_score]).numpy()
    labels = torch.cat(
        [torch.ones(pos_score.shape[0]), torch.zeros(neg_score.shape[0])]).numpy()
    return roc_auc_score(labels, scores)

def edge_coordinate(batch_nodes, neg=False):
    if not neg:
        neigh_dict = {n:adj_list[n] for n in batch_nodes}
    else:
        neigh_dict = {n:adj_list[n]^set(batch_nodes) for n in batch_nodes}
    src = [k for k in neigh_dict.keys() for n in neigh_dict[k]]
    dest = [n for v in neigh_dict.values() for n in v]
    
    return src, dest

In [6]:
G = nx.read_gpickle('graph_170k.gpickle')

In [7]:
feat_data, adj_list, dgl_G = load_data(G, feat_dir)
#feat_data = normalize(torch.Tensor(feat_data), dim=0)

In [8]:
class GraphSAGE(nn.Module):
    def __init__(self, in_feats, h_feats):
        super(GraphSAGE, self).__init__()
        self.conv1 = SAGEConv(in_feats, h_feats, 'mean')
        self.conv2 = SAGEConv(h_feats, h_feats, 'mean')

    def forward(self, g, in_feat):
        h = self.conv1(g, in_feat)
        h = F.relu(h)
        h = self.conv2(g, h)
        return h

In [9]:
class DotPredictor(nn.Module):
    def forward(self, g, h):
        with g.local_scope():
            g.ndata['h'] = h
            # Compute a new edge feature named 'score' by a dot-product between the
            # source node feature 'h' and destination node feature 'h'.
            g.apply_edges(fn.u_dot_v('h', 'h', 'score'))
            # u_dot_v returns a 1-element vector for each edge so you need to squeeze it.
            return g.edata['score'][:, 0]
        
class MLPPredictor(nn.Module):
    def __init__(self, h_feats):
        super().__init__()
        self.W1 = nn.Linear(h_feats * 2, h_feats)
        self.W2 = nn.Linear(h_feats, 1)

    def apply_edges(self, edges):
        """
        Computes a scalar score for each edge of the given graph.

        Parameters
        ----------
        edges :
            Has three members ``src``, ``dst`` and ``data``, each of
            which is a dictionary representing the features of the
            source nodes, the destination nodes, and the edges
            themselves.

        Returns
        -------
        dict
            A dictionary of new edge features.
        """
        h = torch.cat([edges.src['h'], edges.dst['h']], 1)
        return {'score': self.W2(F.relu(self.W1(h))).squeeze(1)}
    
    def get_scores(self, h):
        return self.W2(F.relu(self.W1(h))).squeeze(1)

    def forward(self, g, h):
        with g.local_scope():
            g.ndata['h'] = h
            g.apply_edges(self.apply_edges)
            return g.edata['score']

In [10]:
def train(feat_dim, emb_dim, G, features, adj_list):
    np.random.seed(1)
    random.seed(1)
    num_nodes = G.number_of_nodes()

    model = GraphSAGE(feat_dim, emb_dim)
    pred = DotPredictor()
#   model.cuda()

    rand_indices = np.random.permutation(num_nodes)
    test = list(rand_indices[:34000])
    val = list(rand_indices[34000:51000])
    train = list(rand_indices[51000:])
    
    train_g = dgl.remove_edges(G, val+test)
    val_pos_g = dgl.graph(edge_coordinate(val), num_nodes=train_g.number_of_nodes())
    val_neg_g = dgl.graph(edge_coordinate(val,neg=True), num_nodes=train_g.number_of_nodes())
    print('Training starts:')

    optimizer = torch.optim.SGD(itertools.chain(model.parameters(), pred.parameters()), lr=0.01)                   
    losses = []
    for batch in range(100):
        batch_nodes = train[:3000]
        random.shuffle(train)  
        start_time = time.time()
        embed = model(train_g, features)
        
        train_pos_g = dgl.graph(edge_coordinate(batch_nodes), num_nodes=train_g.number_of_nodes())
        train_neg_g = dgl.graph(edge_coordinate(batch_nodes,neg=True), num_nodes=train_g.number_of_nodes())
        pos_score = pred(train_pos_g, embed)
        neg_score = pred(train_neg_g, embed)
        loss = compute_loss(pos_score, neg_score)
        losses.append(loss)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch % 5 == 0:
            print('In epoch {}, loss: {}'.format(batch, loss))

            with torch.no_grad():
                pos = pred(val_pos_g, embed)
                neg = pred(val_neg_g, embed)
                print('AUC', compute_auc(pos, neg))

In [10]:
np.random.seed(1)
random.seed(1)

def train(feat_dim, emb_dim, G, features, adj_list):
    np.random.seed(1)
    random.seed(1)
    num_nodes = G.number_of_nodes()
    num_edges = G.number_of_edges()

    model = GraphSAGE(feat_dim, emb_dim)
    pred = MLPPredictor(emb_dim)
#   model.cuda()

    rand_indices = np.random.permutation(num_nodes)
    test = list(rand_indices[:34000])
    val = list(rand_indices[34000:51000])
    train = list(rand_indices[51000:])
    
    train_g = dgl.remove_edges(G, val+test)
    #val_pos_g = dgl.graph(edge_coordinate(val), num_nodes=train_g.number_of_nodes())
    #val_neg_g = dgl.graph(edge_coordinate(val,neg=True), num_nodes=train_g.number_of_nodes())
    print('Training starts:')

    optimizer = torch.optim.SGD(itertools.chain(model.parameters(), pred.parameters()), lr=0.01)                   
    losses = []
    for batch in range(100):
        batch_nodes = train[:3000]
        random.shuffle(train)  
        start_time = time.time()
        embed = model(train_g, features)
        
        train_pos_g = dgl.graph(edge_coordinate(batch_nodes), num_nodes=train_g.number_of_nodes())
        train_neg_g = dgl.graph(edge_coordinate(batch_nodes,neg=True), num_nodes=train_g.number_of_nodes())
        pos_score = pred(train_pos_g, embed)
        neg_score = pred(train_neg_g, embed)
        loss = compute_loss(pos_score, neg_score)
        losses.append(loss)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        end_time = time.time()

        if batch % 5 == 0:
            print('In epoch {}, loss: {}'.format(batch, loss))

            with torch.no_grad():
                embed = model(val_g, features)
                pos = pred(val_pos_g, embed)
                neg = pred(val_neg_g, embed)
                print('AUC', compute_auc(pos, neg))

    return model, pred

In [12]:
model, pred = train(14, 10, dgl_G, feat_data, adj_list)

Training starts:
In epoch 0, loss: 0.5980200171470642
In epoch 5, loss: 0.5898678302764893
In epoch 10, loss: 0.5833808183670044
In epoch 15, loss: 0.5780704021453857
In epoch 20, loss: 0.5720557570457458
In epoch 25, loss: 0.565000593662262
In epoch 30, loss: 0.5577007532119751
In epoch 35, loss: 0.553368330001831
In epoch 40, loss: 0.550187349319458
In epoch 45, loss: 0.5419141054153442
In epoch 50, loss: 0.5378782749176025
In epoch 55, loss: 0.5321746468544006
In epoch 60, loss: 0.5262676477432251
In epoch 65, loss: 0.5221739411354065
In epoch 70, loss: 0.5163643956184387
In epoch 75, loss: 0.5152520537376404
In epoch 80, loss: 0.5065482258796692
In epoch 85, loss: 0.5052351951599121
In epoch 90, loss: 0.49707314372062683
In epoch 95, loss: 0.4960591197013855
