In [66]:
import torch
import random
import networkx as nx
from torch_geometric.utils import to_dense_adj
from graspologic.embed import AdjacencySpectralEmbed  

import sys
sys.path.append("../")
from models.onu_fuctions import load_un_dataset, create_un_graphs, process_un_graph, process_un_graph_2
from models.link_prediction import train_link_prediction, eval_link_prediction, train_link_prediction_GAT, train_link_prediction_Transformer, train_link_prediction_GraphTransformer
from models.glase_e2e_link_prediction import train_link_prediction_e2e, eval_link_prediction_e2e, train_link_prediction_GAT_e2e, train_link_prediction_Transformer_e2e, train_link_prediction_GraphTransformer_e2e
# from models.link_prediction import Net2
from models.RDPG_GD import GRDPG_GD_Armijo
from models.GLASE_unshared_normalized import gLASE 
from training.generate_embeddings import generate_embeddings
from torch_geometric.data import Data
import torch_geometric.transforms as T


import copy
from typing import List
votes_df = load_un_dataset('data/UNVotes-1.csv', unknown_votes=True)

all_graphs = create_un_graphs(votes_df[votes_df.year==1946])

unknown_countries = ['ISR', 'GRB', 'NDL', 'CUB', 'TUR', 'VNM']
adj_matrix, country_indexes, res_indexes, unknown_edges, features, mask_nodes, mask, selected_resolutions, inverted_mask_matrix, mask_unknown = process_un_graph_2(all_graphs, mask_countries=unknown_countries, mask_threshold=0.1) 

# print(selected_resolutions)

num_nodes = adj_matrix.shape[0]
edge_index = torch.tensor(adj_matrix).nonzero().t().contiguous()

# x_ase, x_grdpg, x_glase, masked_edge_index, edge_index_2, Q = generate_embeddings(adj_matrix, mask, d)
# Q = Q.to('cuda')
# edge_index_2 = edge_index_2.to('cuda')
# mask = mask.to('cuda')


## Calculate Embeddings
d = 4
## ASE 
adj_matrix = to_dense_adj(edge_index.to('cpu'), max_num_nodes=num_nodes).squeeze(0)
ase = AdjacencySpectralEmbed(n_components=d, diag_aug=True, algorithm='full')
masked_adj = adj_matrix*mask
x_ase = ase.fit_transform(masked_adj.numpy())
x_ase = torch.from_numpy(x_ase)

masked_edge_index = masked_adj.nonzero().t().contiguous()

# data = Data(x=features.float(), x_init=x_ase, x_ase=x_ase, x_glase=x_ase, x_grdpg=x_ase, edge_index=masked_edge_index)
torch.manual_seed(42)
random_features=torch.rand([num_nodes, 12])
data = Data(x=random_features.float(), x_init=x_ase, x_ase=x_ase, x_glase=x_ase, x_grdpg=x_ase, edge_index=masked_edge_index)
num_nodes = mask.shape[0]
adj_matrix = to_dense_adj(edge_index.to('cpu'), max_num_nodes=num_nodes).squeeze(0)

## Split Train, Val, Test
device = 'cuda'
transform = T.Compose([
    T.ToDevice(device),
    T.RandomLinkSplit(num_val=0.05, num_test=0.01, is_undirected=True,
                    add_negative_train_samples=False),
])

train_data, val_data, test_data = transform(data)
data = data.to('cuda')
print(data)


Data(x=[98, 12], edge_index=[2, 2416], x_init=[98, 4], x_ase=[98, 4], x_glase=[98, 4], x_grdpg=[98, 4])




In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F

import dgl
import dgl.function as fn
import numpy as np

"""
    Graph Transformer Layer
    
"""

"""
    Util functions
"""
def src_dot_dst(src_field, dst_field, out_field):
    def func(edges):
        return {out_field: (edges.src[src_field] * edges.dst[dst_field]).sum(-1, keepdim=True)}
    return func

def scaled_exp(field, scale_constant):
    def func(edges):
        # clamp for softmax numerical stability
        return {field: torch.exp((edges.data[field] / scale_constant).clamp(-5, 5))}

    return func


"""
    Single Attention Head
"""

class MultiHeadAttentionLayer(nn.Module):
    def __init__(self, in_dim, out_dim, num_heads, use_bias):
        super().__init__()
        
        self.out_dim = out_dim
        self.num_heads = num_heads
        
        if use_bias:
            self.Q = nn.Linear(in_dim, out_dim * num_heads, bias=True)
            self.K = nn.Linear(in_dim, out_dim * num_heads, bias=True)
            self.V = nn.Linear(in_dim, out_dim * num_heads, bias=True)
        else:
            self.Q = nn.Linear(in_dim, out_dim * num_heads, bias=False)
            self.K = nn.Linear(in_dim, out_dim * num_heads, bias=False)
            self.V = nn.Linear(in_dim, out_dim * num_heads, bias=False)
        
    
    def propagate_attention(self, g):
        # Compute attention score
        g.apply_edges(src_dot_dst('K_h', 'Q_h', 'score')) #, edges)
        g.apply_edges(scaled_exp('score', np.sqrt(self.out_dim)))

        # Send weighted values to target nodes
        eids = g.edges()
        g.send_and_recv(eids, fn.src_mul_edge('V_h', 'score', 'V_h'), fn.sum('V_h', 'wV'))
        g.send_and_recv(eids, fn.copy_edge('score', 'score'), fn.sum('score', 'z'))
    
    def forward(self, g, h):
        
        Q_h = self.Q(h)
        K_h = self.K(h)
        V_h = self.V(h)
        
        # Reshaping into [num_nodes, num_heads, feat_dim] to 
        # get projections for multi-head attention
        g.ndata['Q_h'] = Q_h.view(-1, self.num_heads, self.out_dim)
        g.ndata['K_h'] = K_h.view(-1, self.num_heads, self.out_dim)
        g.ndata['V_h'] = V_h.view(-1, self.num_heads, self.out_dim)
        
        self.propagate_attention(g)
        
        head_out = g.ndata['wV']/g.ndata['z']
        
        return head_out
    

class GraphTransformerLayer(nn.Module):
    """
        Param: 
    """
    def __init__(self, in_dim, out_dim, num_heads, dropout=0.0, layer_norm=False, batch_norm=True, residual=True, use_bias=False):
        super().__init__()

        self.in_channels = in_dim
        self.out_channels = out_dim
        self.num_heads = num_heads
        self.dropout = dropout
        self.residual = residual
        self.layer_norm = layer_norm        
        self.batch_norm = batch_norm
        
        self.attention = MultiHeadAttentionLayer(in_dim, out_dim//num_heads, num_heads, use_bias)
        
        self.O = nn.Linear(out_dim, out_dim)

        if self.layer_norm:
            self.layer_norm1 = nn.LayerNorm(out_dim)
            
        if self.batch_norm:
            self.batch_norm1 = nn.BatchNorm1d(out_dim)
        
        # FFN
        self.FFN_layer1 = nn.Linear(out_dim, out_dim*2)
        self.FFN_layer2 = nn.Linear(out_dim*2, out_dim)

        if self.layer_norm:
            self.layer_norm2 = nn.LayerNorm(out_dim)
            
        if self.batch_norm:
            self.batch_norm2 = nn.BatchNorm1d(out_dim)
        
    def forward(self, g, h):
        h_in1 = h # for first residual connection
        
        # multi-head attention out
        attn_out = self.attention(g, h)
        h = attn_out.view(-1, self.out_channels)
        
        h = F.dropout(h, self.dropout, training=self.training)
        
        h = self.O(h)
        
        if self.residual:
            h = h_in1 + h # residual connection
        
        if self.layer_norm:
            h = self.layer_norm1(h)
            
        if self.batch_norm:
            h = self.batch_norm1(h)
        
        h_in2 = h # for second residual connection
        
        # FFN
        h = self.FFN_layer1(h)
        h = F.relu(h)
        h = F.dropout(h, self.dropout, training=self.training)
        h = self.FFN_layer2(h)

        if self.residual:
            h = h_in2 + h # residual connection
        
        if self.layer_norm:
            h = self.layer_norm2(h)
            
        if self.batch_norm:
            h = self.batch_norm2(h)       

        return h
        
    def __repr__(self):
        return '{}(in_channels={}, out_channels={}, heads={}, residual={})'.format(self.__class__.__name__,
                                             self.in_channels,
                                             self.out_channels, self.num_heads, self.residual)

In [61]:
from torch import nn
# from models.GraphTransformerLayer import GraphTransformerLayer
from torch_geometric.utils import to_dgl
from torch_geometric.data import Data

import torch 
from torch import nn
from torch_geometric.nn import GCNConv
from torch_geometric.nn import TransformerConv
from models.GraphTransformerLayer import GraphTransformerLayer
from models.GAT import GATv2
from sklearn.metrics import roc_auc_score
from torch_geometric.utils import negative_sampling
from torch_geometric.utils import to_dense_adj
from torch_geometric.utils import to_dgl
import copy
from torch_geometric.data import Data

class GraphTransformerLinkPrediction(torch.nn.Module):
    def __init__(self, in_channels, out_channels, pe_dim_in, pe_dim_out, n_layers, dropout, num_heads, batch_norm: int = True):
        super().__init__()
        
        self.feat_lin = nn.Linear(in_channels, out_channels, bias=True)
        self.pe_lin = nn.Linear(pe_dim_in, pe_dim_out, bias=True)
        self.layers = nn.ModuleList([GraphTransformerLayer(out_channels+pe_dim_out, out_channels+pe_dim_out, num_heads, dropout, batch_norm=batch_norm, layer_norm=True, residual=True) for _ in range(n_layers)])
        

    def encode(self, x_in, edge_index):
        x_feat, x_pe = x_in
        x_pe = self.pe_lin(x_pe)
        x_feat=self.feat_lin(x_feat)
        x = torch.concatenate((x_feat, x_pe), axis=1)
        # print(x)
        data = Data(x=x, edge_index=edge_index)
        # Transform to DGL
        g = to_dgl(data) 
        
        # GraphTransformer Layers
        for conv in self.layers:
            x = conv(g, x)
            # print(x)
        return x
    
    def decode(self, z, edge_label_index):
        return (z[edge_label_index[0]] * z[edge_label_index[1]]).sum(dim=-1)

    def decode_all(self, z):
        prob_adj = z @ z.t()
        return (prob_adj > 0).nonzero(as_tuple=False).t()
    
def train(x_input, train_data, model, optimizer, criterion):
    model.train()
    optimizer.zero_grad()
    z = model.encode(x_input, train_data.edge_index)

    # We perform a new round of negative sampling for every training epoch:
    neg_edge_index = negative_sampling(
        edge_index=train_data.edge_index, num_nodes=train_data.num_nodes,
        num_neg_samples=train_data.edge_label_index.size(1), method='sparse')

    edge_label_index = torch.cat(
        [train_data.edge_label_index, neg_edge_index],
        dim=-1,
    )
    edge_label = torch.cat([
        train_data.edge_label,
        train_data.edge_label.new_zeros(neg_edge_index.size(1))
    ], dim=0)

    out = model.decode(z, edge_label_index).view(-1)
    print(torch.sort(out))
    print(edge_label[:5])
    loss = criterion(out[:5], edge_label[:5])
    print('Hola',loss)
    loss.backward()
    optimizer.step()
    return loss

In [62]:
x_train = train_data.x, train_data.x_ase
x_val = val_data.x, val_data.x_ase
x_test = test_data.x, test_data.x_ase


model = GraphTransformerLinkPrediction(12, 12, 4, 4, n_layers=1, dropout=0.5, num_heads=1, batch_norm=False).to(device)


model

# model_2 = train_link_prediction_GraphTransformer(x_train, x_val, x_test, train_data, val_data, test_data, input_dim=12, pe_dim=d, epochs = 101, 
#                                                     output_dim= 12, pe_out_dim= 4, n_layers= 1, dropout=0.5, num_heads=1)


model.encode(x_train, train_data.edge_index)

tensor([[-1.1016,  2.3111, -0.3753,  ..., -0.4465,  0.8163,  0.6778],
        [-2.1781,  1.7504,  0.5823,  ...,  0.7038, -0.2071, -0.2766],
        [-0.4197,  0.1038, -0.1684,  ..., -0.5292,  0.5420,  0.3487],
        ...,
        [-1.5097,  1.6042,  0.3253,  ...,  0.7505, -0.2385, -0.4173],
        [-1.5945,  1.9640,  0.1765,  ...,  0.4531, -0.1168, -0.0580],
        [-0.6371,  1.6956, -0.0449,  ...,  0.7677, -0.5222,  0.6367]],
       device='cuda:0', grad_fn=<NativeLayerNormBackward0>)

In [63]:
from models.link_prediction import test

def train_link_prediction_GraphTransformer(x_train, x_val, x_test, train_data, val_data, test_data, input_dim, pe_dim, epochs = 301, 
                                           output_dim: int = 32, pe_out_dim: int = 8, n_layers: int = 3, dropout:int =0.5, 
                                           num_heads: int =4, batch_norm: bool = True, lr: int = 0.01): 
    device = "cuda"
    model = GraphTransformerLinkPrediction(input_dim, output_dim, pe_dim, pe_out_dim, n_layers=n_layers, dropout=dropout, num_heads=num_heads, batch_norm=batch_norm).to(device)
    print(model)
    optimizer = torch.optim.Adam(params=model.parameters(), lr=lr)
    criterion = torch.nn.BCEWithLogitsLoss()
    best_val_auc = final_test_auc = 0
    for epoch in range(1, epochs):
        loss = train(x_train, train_data, model, optimizer, criterion)
        print(loss)
        val_auc = test(x_val, val_data, model)
        test_auc = test(x_test, test_data, model)
        if val_auc > best_val_auc:
            best_val_auc = val_auc
            final_test_auc = test_auc
    return model


train_link_prediction_GraphTransformer(x_train, x_val, x_test, train_data, val_data, test_data, input_dim=12, pe_dim=d, epochs = 101, 
                                                     output_dim= 12, pe_out_dim= 4, n_layers= 1, dropout=0.5, num_heads=1, batch_norm=False, lr=0.000000000001)


TypeError: empty(): argument 'size' must be tuple of ints, but found element of type Tensor at pos 2

In [64]:

z = model.encode(x_train, train_data.edge_index)


# We perform a new round of negative sampling for every training epoch:
neg_edge_index = negative_sampling(
    edge_index=train_data.edge_index, num_nodes=train_data.num_nodes,
    num_neg_samples=train_data.edge_label_index.size(1), method='sparse')

edge_label_index = torch.cat(
    [train_data.edge_label_index, neg_edge_index],
    dim=-1,
)
edge_label = torch.cat([
    train_data.edge_label,
    train_data.edge_label.new_zeros(neg_edge_index.size(1))
], dim=0)


a = z[edge_label_index[0]]
b = z[edge_label_index[1]]

# print(torch.sort(z[edge_label_index[0]]))
# print(z[edge_label_index[1]])

print(a.shape)
print(b.shape)
c = a*b
print(c.shape)

d = (a*b).sum(dim=-1)
print(d.shape)
torch.sort((a*b).sum(dim=-1))
# edge_label_index

# torch.sort(out),;

torch.Size([2276, 16])
torch.Size([2276, 16])
torch.Size([2276, 16])
torch.Size([2276])


torch.return_types.sort(
values=tensor([2.2996, 3.2255, 3.2734,  ...,    nan,    nan,    nan], device='cuda:0',
       grad_fn=<SortBackward0>),
indices=tensor([ 578,  879,  654,  ..., 2071, 2174, 2274], device='cuda:0'))

In [65]:
import torch

# Example tensor with NaN values

# Create a boolean mask tensor indicating where the entries are NaN

z = model.encode(x_train, train_data.edge_index)
nan_mask = torch.isnan(z)

print("Boolean Mask:", nan_mask)

# Get the indices where NaN values occur
nan_indices = torch.nonzero(nan_mask).squeeze()

print("Indices of NaN Values:", nan_indices)


Boolean Mask: tensor([[False, False, False,  ..., False, False, False],
        [False, False, False,  ..., False, False, False],
        [False, False, False,  ..., False, False, False],
        ...,
        [False, False, False,  ..., False, False, False],
        [False, False, False,  ..., False, False, False],
        [False, False, False,  ..., False, False, False]], device='cuda:0')
Indices of NaN Values: tensor([[72,  0],
        [72,  1],
        [72,  2],
        [72,  3],
        [72,  4],
        [72,  5],
        [72,  6],
        [72,  7],
        [72,  8],
        [72,  9],
        [72, 10],
        [72, 11],
        [72, 12],
        [72, 13],
        [72, 14],
        [72, 15]], device='cuda:0')


In [23]:
out = torch.tensor([0.1421, 0.1502, 0.1686, 0.1014, 0.1211])
label = torch.tensor([1., 1., 1., 1., 1.])
criterion = torch.nn.BCEWithLogitsLoss()
criterion(out, label)

tensor(0.6272)