### Import libraries

In [40]:
import numpy as np
from torch_geometric.data import Dataset, Data
import pandas as pd
import networkx as nx
import torch
import os
import matplotlib.pyplot as plt
from torch import mps 
from torch_geometric.datasets import CitationFull
from torch_geometric.transforms import RandomNodeSplit
from sklearn.metrics import accuracy_score , hamming_loss, multilabel_confusion_matrix
import copy
import node2vec

from src.models.model_architecture import GCN

# from deepsnap.dataset import GraphDataset
# from ogb.graphproppred import Evaluator
# from torch_geometric.utils import to_torch_coo_tensor
# from sklearn.metrics import classification_report

In [68]:
if torch.backends.mps.is_available():
    device = torch.device("mps")
    print("mps device")
else:
    device = "cpu"
    print ("MPS device not found.")

mps device


### pytorch_geometric class to make a custom dataset

In [37]:
class DBLP_dataset(Dataset):
    def __init__(self, root, filenames, test=False, transform=None, pre_transform=None):
        """
        root = Where the dataset should be stored. This folder is split
        into raw_dir (downloaded dataset) and processed_dir (processed data). 
        """
        self.test = test
        self.filenames = filenames

        super(DBLP_dataset , self).__init__(root, transform, pre_transform)

        self.data = None
        

    @property
    def raw_file_names(self):
        """ If this file exists in raw_dir, the download is not triggered.
            (The download func. is not implemented here)  

            Filenames will be:
            "graph_edges.txt"
            "5000_communities.txt"/"communities.txt"
        """
        return self.filenames

    @property
    def processed_file_names(self):
        """ If these files are found in raw_dir, processing is skipped"""
        
        return "data_0.pt"

    def download(self):
        pass

    def process(self):
        
        num_feats = 5
        node_list , edge_list = self.node_edge_list_from_txt_file(self.raw_paths[0])
        labels = self.one_hot_encoded_communities_from_txt_file(self.raw_paths[1] , node_list)

        f = self.get_frequencies()
        modified_node_list = self.modified_ohe()
        labels = torch.tensor(labels[modified_node_list , :] , dtype=torch.float32)


        mapping = dict(zip(modified_node_list , np.arange(0 , modified_node_list.shape[0] , 1)))

        g = nx.Graph()
        g.add_nodes_from(node_list)
        g.add_edges_from(edge_list)

        g = nx.Graph(g.subgraph(modified_node_list).copy())
        nx.relabel_nodes(g , mapping = mapping , copy=False)

        feature_vector = np.zeros((len(g.nodes) , num_feats))

        feature_vector , g = self.get_feature_vector(g)
        feature_vector = torch.nn.functional.normalize(torch.tensor(feature_vector , dtype=torch.float32) , dim=0)

        edge_index = np.array([e for e in g.edges])
        edge_index = torch.tensor(edge_index.T , dtype=torch.int32)
    
        data = Data(x=feature_vector , edge_index=edge_index , y=labels , dtype=torch.float32 , g=g)
        add_masks = RandomNodeSplit(split = "train_rest" , num_val=100 , num_test=100)
        
        self.data = add_masks(data)
        idx=0
        torch.save(self.data, os.path.join(self.processed_dir, f"data_{idx}.pt") , pickle_protocol=4)
            
    def node_edge_list_from_txt_file(self , file_path):

        txt_file = open(file_path)
        edge_list = []
        node_set = set([])

        for line in txt_file:
            line_elements = line.split()
            if(line_elements[0] == "#"):
                pass
            else:
                edge = [int(x) for x in line_elements]
                nodes = [int(x) for x in line_elements]
                edge_list.append(edge)
                node_set.update(nodes)
        
        txt_file.close()
        
        edge_list = np.array(edge_list)
        node_list = np.sort(np.array(list(node_set)))
        
        return node_list , edge_list
    
    
    def one_hot_encoded_communities_from_txt_file(self , file_path , node_list , community_count = 1000):

        txt_file = open(file_path)
        community_count = 0

        for line in txt_file:
            community_count += 1

        community_ohe = np.zeros((max(node_list)+1 , community_count))

        txt_file.close()
        txt_file = open(file_path)

        community = 0
        for line in txt_file:
            line_elements = [int(x) for x in line.split()]
            community_ohe[line_elements , community] = 1
            community+=1


        txt_file.close()
        
        return community_ohe
    
    

    def get_feature_vector(self , g):
        """
        input: (g: Networkx graph)
        output: (feature_vector: Pytorch tensor of shape (len(g.nodes) , n_features)) , g: nx graph with added node features)
        description: 
        takes in nx graph g, 
        calculates "n_features" graph statistics for each node,
        updates the feature values in g,

        
        node characteristics and returns it as a Pytorch tensor.
        """

        # using set_node_attributes function given with examples at: https://networkx.org/documentation/stable/reference/generated/networkx.classes.function.set_node_attributes.html

        feature_vector = np.ones((len(g.nodes) , 1))

        #Adding degree as a feature
        degree = g.degree()     # compute node degrees
        nx.set_node_attributes(g , dict(degree) , "degree")     # Add the new features using nx.set_node_attributes(graph , feature: dict({node:value}) , name_of_feature: str)
        degree_ = np.reshape(np.array(degree)[: , 1] , newshape=(-1,1))
        feature_vector = np.append(feature_vector , degree_ , axis=1)


        # Eigenvector Centrality
        e_centrality = nx.eigenvector_centrality(g)     # compute Eigenvector centralities
        nx.set_node_attributes(g, e_centrality, "centrality")    # Add feature to graph
        e_centrality_ = np.array(list((e_centrality.values()))).reshape((-1,1))        
        feature_vector = np.append(feature_vector , e_centrality_ , axis=1)     # Adding feature to feature vector

        # Clustering Coefficient
        cc = nx.clustering(g)    # compute clustering coefficients
        nx.set_node_attributes(g, cc, "clustering_coef")    # Add feature to graph
        cc_ = np.array(list((e_centrality.values()))).reshape((-1,1))        
        feature_vector = np.append(feature_vector , cc_ , axis=1)     # Add feature to feature vector

        # Square clustering
        scc = nx.square_clustering(g)    # compute sqaure clustering coefficient
        nx.set_node_attributes(g, scc, "square_clustering_coef")    # Add feature to graph
        scc_ = np.array(list((scc.values()))).reshape((-1,1))        
        feature_vector = np.append(feature_vector , scc_ , axis=1)     # Add feature to feature vector


        return feature_vector , g
    
    def train_test_split(self , splits = [0.8 , 0.1 , 0.1]):

        d = self.get(0)
        
        m = np.random.rand(1 , d.x.shape[0])

        m_train = (m<(splits[0]))
        m_val = ((m>splits[0])*(m<splits[0]+splits[1]))
        m_test = ((m>splits[0]+splits[1]))

        return (torch.tensor(m_train) , torch.tensor(m_val) , torch.tensor(m_test))
        
    def _get_labels(self, label):
        label = np.asarray([label])
        return torch.tensor(label, dtype=torch.int64)

    def len(self):
        return len(self.processed_file_names)

    def get(self, idx):
        data = torch.load(os.path.join(self.processed_dir, f'data_{idx}.pt'))
        return data
    
    def mini_batching(self , m_train , batch_size = 128):

        input_shape = m_train.shape[1]
        train_idx = np.where(m_train==True)
        train_idx = train_idx[1]
        np.random.shuffle(train_idx)
        batches = np.array_split(train_idx , m_train.sum()//batch_size)

        batch_vector = np.ones((input_shape , ))

        for i, batch in enumerate(batches):
            batch_vector[batch] = i
        
        return batch_vector
    
    def get_frequencies(self , data_file_path="./data_files/5000_communities.txt"):

        # Sampling communities based on the number of members and saving to a .csv file
        # Function to get the value counts of number of members in a community
        
        # list of lengths of communities

        list_of_frequencies = []
        distribution_list = []

        data_file = open(data_file_path)

        for line in data_file:

            line_elements = [int(x) for x in line.split()]
            list_of_frequencies.append(len(line_elements))
            distribution_list.append(len(line_elements))


        unique, counts = np.unique(list_of_frequencies, return_counts=True)
        combined = np.stack([unique , counts] , axis=0)
        print(combined.shape)

        # creating a dataframe of frequencies to load.
        frequencies = pd.DataFrame(combined.T , columns=["value" , "count"])
        
        # Saving frequencies as csv file
        frequencies.to_csv("./data_files/frequencies.csv")

        data_file.close()
        plt.hist(distribution_list , range=[0,101] , bins=100)
        plt.show()

        return frequencies
    
    def modified_ohe(self , data_file_path = "./data_files/5000_communities.txt" , frequencies_file_path = "./data_files/frequencies.csv" , total_communities = 50):

        frequencies = pd.read_csv(frequencies_file_path)

        data_file = open(data_file_path)
        write_file = open("./data_folder/raw/modified_communities.txt", "w")

        k = 5

        modified_node_set = set([])
        distribution_list = []
        count = 0

        for line in data_file:

            line_elements = [int(x) for x in line.split()]

            random = np.random.rand()

            if(count == total_communities):
                break

            elif(random <= (k/(frequencies[frequencies["value"] == len(line_elements)]["count"].iloc[0]))):

                write_file.write(line)
                modified_node_set.update(line_elements)
                distribution_list.append(len(line_elements))
                count+=1

            else:
                pass

        data_file.close()
        write_file.close()

        
        modified_node_list = np.sort(np.array(list(modified_node_set)))
        
        plt.hist(distribution_list , range=[0,100] , bins=100)
        plt.show()

        return modified_node_list

In [38]:
d = DBLP_dataset(root="/Users/sbhardwaj/documents/project_2/data" , filenames=["graph_edges.txt" , "modified_communities.txt"] , test=False)

In [39]:
data=d[0]

### Visualising the output variable distribution

### Loading data in appropriate formats

In [88]:
def node_edge_list_from_txt_file(file_path):

    txt_file = open(file_path)
    edge_list = []
    node_set = set([])

    for line in txt_file:
        line_elements = line.split()
        if(line_elements[0] == "#"):
            pass
        else:
            edge = [int(x) for x in line_elements]
            nodes = [int(x) for x in line_elements]
            edge_list.append(edge)
            node_set.update(nodes)
    
    txt_file.close()
    
    edge_list = np.array(edge_list)
    node_list = np.sort(np.array(list(node_set)))
    
    return node_list , edge_list

In [89]:
def one_hot_encoded_communities_from_txt_file(file_path , node_list):

    txt_file = open(file_path)
    community_count = 0

    for line in txt_file:
        community_count+=1

    community_ohe = np.zeros((np.max(node_list)+1 , community_count))

    txt_file.close()
    txt_file = open(file_path)
    count = 0

    community = 0
    for line in txt_file:

        line_elements = [int(x) for x in line.split()]
        print((line_elements))
        community_ohe[line_elements , community] = 1.0
        # count+=len(line.split())
        community+=1

    print(count)


    txt_file.close()
    
    return community_ohe

In [90]:
node_list , edge_list = node_edge_list_from_txt_file("./data_files/graph_edges.txt")
print(edge_list.shape , node_list.shape)
community_ohe = one_hot_encoded_communities_from_txt_file("./data_files/5000_communities.txt" , node_list)
print(community_ohe.shape)

(1049866, 2) (317080,)
[105653, 105654, 210737, 210738, 210739, 211813, 211814]
[53599, 69449, 115973, 135447, 198973, 231501, 258663, 258664, 258878, 258879, 258880, 258881, 258882, 258883, 268226, 273534, 278033, 278127, 284825]
[14320, 24722, 26240, 44769, 45169, 57357, 75878, 87447, 100480, 107928, 128723, 130891, 130893, 145212, 145213, 146090, 147152, 147153, 147154]
[12083, 21358, 21719, 21916, 21917, 21918]
[6880, 6881, 6882, 6883, 6884, 6885]
[66911, 66913, 66915, 102533, 178107, 332128, 339068, 339069]
[77916, 147731, 150907, 315669, 339888, 370497, 370498]
[114450, 114451, 115429, 115430, 141941, 148107]
[52201, 67186, 128284, 141315, 144846, 167358, 231852, 234813, 237994, 239688, 248515, 265386, 299756, 299757, 311753, 314063, 323567, 332199, 364621, 378225, 378226, 392700]
[43723, 60780, 79541, 103633, 266040, 381201, 381202, 400902]
[227237, 239566, 274803, 287800, 287801, 287802]
[117391, 154582, 163222, 166893, 259008, 267400, 267401, 285228]
[60246, 259720, 259721, 26

In [91]:
print(node_list)

[     0      1      2 ... 425954 425955 425956]


In [92]:
community_ohe[node_list , :].shape

(317080, 5000)

In [6]:
trial = (community_ohe[node_list , :])
trial.shape

(317080, 5000)

In [62]:
trial[105653]

array([0., 0., 0., ..., 0., 0., 0.])

In [58]:
pos = np.where(community_ohe == 1)

for row , col in zip(*pos):
    print(row , col)

4 3407
10 3407
11 3407
27 3407
35 3407
39 3407
53 3407
61 3407
61 4790
61 4972
69 4733
74 3407
83 2631
83 3407
94 3407
96 3407
99 3407
102 3407
105 3407
118 3407
120 3407
122 3407
135 3407
138 3407
139 2679
152 3407
153 2679
154 3407
156 3407
189 1297
189 3407
200 3407
206 3407
217 2596
217 4527
221 3407
223 3407
232 3407
238 1780
251 3407
258 3407
264 3407
269 3407
270 3943
270 4496
273 2405
304 3407
305 3407
336 3407
339 3407
359 3407
360 792
360 3407
368 3407
369 3407
407 3407
426 3407
437 3407
438 443
461 4342
482 4496
483 4496
484 4496
485 3943
485 4496
486 443
488 3407
541 827
545 4495
546 4495
547 4495
584 2136
585 2136
585 3087
605 4496
606 4496
647 41
654 4496
672 3407
676 3407
710 3984
710 4972
711 4658
728 3784
730 3407
750 2438
750 3407
760 3407
796 2788
798 3407
834 3407
835 3407
880 4790
885 4867
913 4494
918 3943
919 3943
920 3943
921 3943
976 2470
976 4430
976 4658
1034 1218
1049 4430
1111 3539
1121 2631
1126 2015
1157 4494
1168 4356
1232 4496
1233 3407
1254 3880
1259 3

## Create the graph using the node and edge list

In [13]:
import networkx as nx

In [14]:
# Creating a NetworkX graph from the obtained node_list and edge_list

    # Creating the graph
g = nx.Graph()
    # Adding the nodes
g.add_nodes_from(node_list)
    # Adding the edges
g.add_edges_from((edge_list))

### Adding relevant features to the nodes

In [35]:
# using set_node_attributes function given with examples at: https://networkx.org/documentation/stable/reference/generated/networkx.classes.function.set_node_attributes.html

# Adding degree as a feature
    # get the degree of each node using built-in networkx functions
degree = g.degree()
    # convert to a dictionary of the format {node:value}
degree = dict(list(degree))
    # Adding the new features using nx.set_node_attributes(graph , feature: dict({node:value}) , name_of_feature: str)
nx.set_node_attributes(g, degree, "degree")

# Adding a constant 1 feature
const = zip(g.nodes() , np.ones((len(g.nodes) , )))
const = dict(const)
nx.set_node_attributes(g , const , "constant")

# Verify the features for a node
node = 0
g.nodes[node]

{'degree': 16, 'constant': 1.0}

In [32]:
(np.array(g.degree())[: , 1]).shape

(317080,)

### Visualising the Graph

In [None]:
import matplotlib.pyplot as plt

## Building the model class

In [75]:
import torch
from torch_geometric.nn import GCNConv, GraphSAGE
import torch.nn.functional as F

In [264]:
class GCN(torch.nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, num_layers,
                 dropout, return_embeds=False):

        super(GCN, self).__init__()

        # Initialise GCNConv layers
        self.convs = torch.nn.ModuleList([GCNConv(input_dim , hidden_dim)])
        for i in range(num_layers-2):
            self.convs.append(GCNConv(hidden_dim , hidden_dim))
        self.convs.append(GCNConv(hidden_dim , output_dim))

        # Initialise batch normalization layers
        self.bns = torch.nn.ModuleList([torch.nn.BatchNorm1d(hidden_dim) for i in range(num_layers-1)])

        # The final layer layer
        self.output = torch.nn.Sigmoid()

        # Probability of an element getting zeroed
        self.dropout = dropout

        # Skip classification layer and return node embeddings
        self.return_embeds = return_embeds

        self.dropout = dropout

        self.reset_parameters()

    def reset_parameters(self):
        
        for conv in self.convs:
            conv.reset_parameters()

        for bn in self.bns:
            bn.reset_parameters()

    def forward(self, x , edge_index):

        out = None

        for conv, batch_norm in zip(self.convs[:-1], self.bns):
            
            x = conv(x , edge_index[: , :])
            x = batch_norm(x)
            x = F.relu(x)
            x = F.dropout(x, p=self.dropout, training=self.training)

        x = self.convs[-1](x , edge_index[: , :5])
        
        if(self.return_embeds):
            out = x
        else:
            out = self.output(x)

        return out

In [313]:
class MLP(torch.nn.Module):

    def __init__(self, input_dim, hidden_dim, output_dim, num_layers,
                 dropout):

        super(MLP, self).__init__()

        self.mlp = torch.nn.Sequential(torch.nn.Linear(input_dim, hidden_dim))

        for i in range(num_layers-2):
            self.mlp.append(torch.nn.Linear(hidden_dim, hidden_dim))
            self.mlp.append(torch.nn.Dropout(dropout))
        self.mlp.append(torch.nn.Linear(hidden_dim , output_dim))

        self.output = torch.nn.Sigmoid()
        

    def forward(self, x , edge_index):
        out = self.mlp(x)
        return out

In [69]:
class n2vnet(torch.nn.Module):

    def __init__(self , g , embedding_dim , walk_length , num_walks , min_count , batch_words , window ,
                 hidden_dim , output_dim , num_layers , dropout , device):

        super(n2vnet, self).__init__()

        sorted_node_list = (list(g.nodes))
        sorted_node_list.sort()
        sorted_node_list = [str(node) for node in sorted_node_list]

        self.n2v = node2vec.Node2Vec(g , dimensions=embedding_dim , walk_length=walk_length , num_walks=num_walks) 
        self.embeddings = torch.tensor((self.n2v.fit(window=window , min_count=min_count , batch_words=batch_words).wv)[sorted_node_list] , dtype=torch.float32 , device=device)
        

        self.dense_layers = torch.nn.ModuleList([torch.nn.Linear(embedding_dim, hidden_dim)])
        for i in range(num_layers-2):
            self.dense_layers.append(torch.nn.Linear(hidden_dim, hidden_dim))
        self.dense_layers.append(torch.nn.Linear(hidden_dim , output_dim))

        self.output = torch.nn.Sigmoid()

        self.dropout = dropout

    def reset_parameters(self):

        for layer in self.dense_layers:
            layer.reset_parameters()

    def forward(self, x , edge_index):

        out = None

        x = self.embeddings

        for layer in self.dense_layers[:-1]:
            x = layer(x)
            x = F.relu(x)
            x = F.dropout(x , p=self.dropout , training=self.training)

        x = self.dense_layers[-1](x)
        out = self.output(x)

        return out

In [70]:
n2v_model = n2vnet(data.g , embedding_dim=32 , walk_length=10 , num_walks=100 , min_count=1 , batch_words=4 , window=10 ,
                 hidden_dim=16 , output_dim=data.y.shape[1] , num_layers=3 , dropout=0.2 , device=device)

Computing transition probabilities: 100%|██████████| 6285/6285 [00:00<00:00, 8105.73it/s]
Generating walks (CPU: 1): 100%|██████████| 100/100 [00:13<00:00,  7.53it/s]


In [41]:
data.y.shape

torch.Size([6285, 100])

In [71]:
total_params = sum(
	param.numel() for param in n2v_model.parameters()
)
total_params

2500

In [72]:
def train(model, data , optimizer , loss_fn , device):

    print(f"Training... batch")
    model.train()
    loss = 0

    optimizer.zero_grad()
    print(f"    starting model")
    out = model(data.x , data.edge_index)
    print(f"   forward pass done")

    out_slice = (out[data.train_mask])
    out_labels = (data.y[data.train_mask])
    loss = loss_fn(out_slice , out_labels)
    print(f"     loss: {loss.item()}")

    loss.backward()
    optimizer.step()
    print(f"     Weights updated")

    return loss.item()

In [53]:
@torch.no_grad()
def test(model, data , device , save_model_results=False):
    
    model.eval()

    print(f"Testing...")

    out = model( (data.x) , (data.edge_index))
    print(f"   Test forward pass done")

    c_train = np.sum(multilabel_confusion_matrix((data.y[data.train_mask].int()).cpu() , ((out>0.5)[data.train_mask].int()).cpu()) , axis=0)
    train_acc = (c_train[0][0] + c_train[1][1])/(c_train.sum())

    c_val = np.sum(multilabel_confusion_matrix((data.y[data.val_mask].int()).cpu() , ((out>0.5)[data.val_mask].int()).cpu()) , axis=0)
    val_acc = (c_val[0][0] + c_val[1][1])/(c_val.sum())
    
    c_test = np.sum(multilabel_confusion_matrix((data.y[data.test_mask].int()).cpu() , ((out>0.5)[data.test_mask].int()).cpu()) , axis=0)
    test_acc = (c_test[0][0] + c_test[1][1])/(c_test.sum())

    if save_model_results:
      print ("Saving Model Predictions")

      data = {}
      data['y_pred'] = y_pred.view(-1).cpu().detach().numpy()

      df = pd.DataFrame(data=data)
      # Save locally as csv
      df.to_csv('ogbn-arxiv_node.csv', sep=',', index=False)


    return train_acc , val_acc , test_acc

In [54]:
args = {
    'device': device,
    'num_layers': 3,
    'hidden_dim': 16,
    'dropout': 0.2,
    'lr': 0.01,
    'epochs': 1000,
}
args

mps device


{'device': device(type='mps'),
 'num_layers': 3,
 'hidden_dim': 16,
 'dropout': 0.2,
 'lr': 0.01,
 'epochs': 1000}

In [268]:
data = d[0].to(device)
model = GCN(input_dim = data.x.shape[1], hidden_dim = args["hidden_dim"], output_dim = data.y.shape[1], num_layers = args["num_layers"], dropout = args['dropout'] , return_embeds=False).to(device)

In [314]:
data = d[0].to(device)
model = MLP(input_dim = data.x.shape[1], hidden_dim = args["hidden_dim"], output_dim = data.y.shape[1], num_layers = args["num_layers"], dropout = args['dropout']).to(device)

In [73]:
model = n2v_model.to(device)
data = data.to(device)

In [63]:
n2v_model

n2vnet(
  (dense_layers): ModuleList(
    (0): Linear(in_features=32, out_features=16, bias=True)
    (1): Linear(in_features=16, out_features=16, bias=True)
    (2): Linear(in_features=16, out_features=100, bias=True)
  )
  (output): Sigmoid()
)

In [64]:
total_params = sum(
	param.numel() for param in model.parameters()
)
print(total_params)

total_params = sum(
	param.numel() for param in mlp_model.parameters()
)
print(total_params)

2500


NameError: name 'mlp_model' is not defined

In [76]:
# reset the parameters to initial random value
model.reset_parameters()
print("params reset")

optimizer = torch.optim.Adam(model.parameters(), lr=args['lr'])
loss_fn = torch.nn.CrossEntropyLoss(weight=None , reduction="sum")

best_model = None
best_valid_acc = 0

for epoch in range(1, 1 + args["epochs"]):
  
  loss = train(model , data , optimizer, loss_fn , device)
  result = test(model, data , device)

  train_acc , val_acc , test_acc = result

  if val_acc > best_valid_acc:
      best_valid_acc = val_acc
      best_model = copy.deepcopy(model)
        
  print(f'Epoch: {epoch:02d}, '
        f'Loss: {loss:.4f}, '
        f'Train: {100 * train_acc:.2f}%, '
        f'Valid: {100 * val_acc:.2f}% '
        f'Test: {100 * test_acc:.2f}%')

params reset
Training... batch
    starting model
   forward pass done
     loss: 19924.45703125
     Weights updated
Testing...
   Test forward pass done
Epoch: 01, Loss: 19924.4570, Train: 55.10%, Valid: 55.02% Test: 54.97%
Training... batch
    starting model
   forward pass done
     loss: 19858.9453125
     Weights updated
Testing...
   Test forward pass done
Epoch: 02, Loss: 19858.9453, Train: 58.12%, Valid: 58.00% Test: 58.15%
Training... batch
    starting model
   forward pass done
     loss: 19799.59765625
     Weights updated
Testing...
   Test forward pass done
Epoch: 03, Loss: 19799.5977, Train: 61.84%, Valid: 61.76% Test: 61.80%
Training... batch
    starting model
   forward pass done
     loss: 19725.15625
     Weights updated
Testing...
   Test forward pass done
Epoch: 04, Loss: 19725.1562, Train: 65.90%, Valid: 65.86% Test: 65.87%
Training... batch
    starting model
   forward pass done
     loss: 19641.5546875
     Weights updated
Testing...
   Test forward pass don

Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x105e66a70>>
Traceback (most recent call last):
  File "/Users/sbhardwaj/Documents/project_2/p2_env/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 770, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(
KeyboardInterrupt: 


Epoch: 387, Loss: 17197.8086, Train: 91.23%, Valid: 91.09% Test: 91.22%
Training... batch
    starting model
   forward pass done
     loss: 17191.73828125
     Weights updated
Testing...
   Test forward pass done
Epoch: 388, Loss: 17191.7383, Train: 91.25%, Valid: 91.12% Test: 91.24%
Training... batch
    starting model
   forward pass done
     loss: 17204.607421875
     Weights updated
Testing...
   Test forward pass done
Epoch: 389, Loss: 17204.6074, Train: 91.25%, Valid: 91.13% Test: 91.25%
Training... batch
    starting model
   forward pass done
     loss: 17189.5234375
     Weights updated
Testing...
   Test forward pass done


Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x105e66a70>>
Traceback (most recent call last):
  File "/Users/sbhardwaj/Documents/project_2/p2_env/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 796, in _clean_thread_parent_frames
    for identity in list(thread_to_parent.keys()):
KeyboardInterrupt: 


Epoch: 390, Loss: 17189.5234, Train: 91.26%, Valid: 91.14% Test: 91.27%
Training... batch
    starting model
   forward pass done
     loss: 17183.072265625
     Weights updated
Testing...
   Test forward pass done
Epoch: 391, Loss: 17183.0723, Train: 91.27%, Valid: 91.14% Test: 91.27%
Training... batch
    starting model
   forward pass done
     loss: 17190.73046875
     Weights updated
Testing...
   Test forward pass done
Epoch: 392, Loss: 17190.7305, Train: 91.28%, Valid: 91.14% Test: 91.28%
Training... batch
    starting model
   forward pass done
     loss: 17189.1875
     Weights updated
Testing...
   Test forward pass done
Epoch: 393, Loss: 17189.1875, Train: 91.29%, Valid: 91.15% Test: 91.28%
Training... batch
    starting model
   forward pass done
     loss: 17195.0078125
     Weights updated
Testing...
   Test forward pass done
Epoch: 394, Loss: 17195.0078, Train: 91.29%, Valid: 91.16% Test: 91.28%
Training... batch
    starting model
   forward pass done
     loss: 17195.2

Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x105e66a70>>
Traceback (most recent call last):
  File "/Users/sbhardwaj/Documents/project_2/p2_env/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 770, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(
KeyboardInterrupt: 


Epoch: 423, Loss: 17171.7500, Train: 91.39%, Valid: 91.27% Test: 91.36%
Training... batch
    starting model
   forward pass done
     loss: 17173.46875
     Weights updated
Testing...
   Test forward pass done
Epoch: 424, Loss: 17173.4688, Train: 91.39%, Valid: 91.28% Test: 91.36%
Training... batch
    starting model
   forward pass done
     loss: 17170.51953125
     Weights updated
Testing...
   Test forward pass done
Epoch: 425, Loss: 17170.5195, Train: 91.40%, Valid: 91.29% Test: 91.38%
Training... batch
    starting model
   forward pass done
     loss: 17167.64453125
     Weights updated
Testing...
   Test forward pass done
Epoch: 426, Loss: 17167.6445, Train: 91.41%, Valid: 91.29% Test: 91.39%
Training... batch
    starting model
   forward pass done
     loss: 17175.05078125
     Weights updated
Testing...
   Test forward pass done
Epoch: 427, Loss: 17175.0508, Train: 91.41%, Valid: 91.29% Test: 91.40%
Training... batch
    starting model
   forward pass done
     loss: 17168.

KeyboardInterrupt: 

In [304]:
best_valid_acc*100

93.02

### Trial 

In [282]:
data = d[0].to(device)
model = GCN(data.x.shape[1] , args['hidden_dim'] , data.y.shape[1] , args['num_layers'] , args['dropout']).to(device)
out = model( (data.x) , (data.edge_index) )

optimizer = torch.optim.Adam(model.parameters(), lr=args['lr'])
loss_fn = torch.nn.BCELoss(weight=None , reduction="mean")


torch.Size([2, 231889])


In [272]:
data

Data(x=[1764, 5], edge_index=[2, 6393], y=[1764, 50], dtype=torch.float32, g=Graph with 1764 nodes and 6393 edges, train_mask=[1764], val_mask=[1764], test_mask=[1764])

In [288]:
loss = train(model , data , optimizer , loss_fn , device)

Training... batch
    starting model
torch.Size([2, 231889])
   forward pass done
     loss: 0.5323440432548523
     Weights updated


In [192]:
result = test(model, data , device)
result

Testing...
torch.Size([2, 231889])
   Test forward pass done
(473, 2, 2)


(0.9976004273941966, 0.9975986610288936, 0.9976085271317829)

In [None]:
conf_matrix = np.sum(multilabel_confusion_matrix((data.y[data.train_mask].int()).cpu() , ((out>0.5)[data.train_mask].int()).cpu()) , axis=0)
train_acc = (conf_matrix[0][0] + conf_matrix[1][1])/(conf_matrix.sum())


In [1]:
import node2vec

  from .autonotebook import tqdm as notebook_tqdm


In [34]:
class n2vnet(torch.nn.Module):

    def __init__(self , g , embedding_dim , walk_length , num_walks , min_count , batch_words , window ,
                 hidden_dim , output_dim , num_layers , dropout):

        super(n2vnet, self).__init__()

        sorted_node_list = (list(g.nodes))
        sorted_node_list.sort()
        sorted_node_list = [str(node) for node in sorted_node_list]

        self.n2v = node2vec(g , dimensions=embedding_dim , walk_length=walk_length , num_walks=num_walks) 
        self.embeddings = (self.n2v.fit(window=window , min_count=min_count , batch_words=batch_words))[sorted_node_list]
        

        self.dense_layers = torch.nn.ModuleList([torch.nn.Linear(embedding_dim, hidden_dim)])
        for i in range(num_layers-2):
            self.dense_layers.append(torch.nn.Linear(hidden_dim, hidden_dim))
        self.dense_layers.append(torch.nn.Linear(hidden_dim , output_dim))

        self.output = torch.nn.Sigmoid()

        self.dropout = dropout

    def reset_parameters(self):

        for layer in self.dense_layers:
            layer.reset_parameters()

    def forward(self, x , edge_index):

        out = None

        for layer in self.dense_layers[:-1]:
            x = layer(x)
            x = F.relu(x)
            x = F.dropout(x , p=self.dropout , training=self.training)

        x = self.dense_layers[-1](x)
        out = self.output(x)

        return out

In [None]:
n2v_model = 

In [8]:
data = d[0]

In [9]:
data

Data(x=[6285, 5], edge_index=[2, 46207], y=[6285, 100], dtype=torch.float32, g=Graph with 6285 nodes and 19961 edges, train_mask=[6285], val_mask=[6285], test_mask=[6285])

In [11]:
n2v = node2vec.Node2Vec(data.g , dimensions=32 , walk_length=10 , num_walks=100 , workers=4)

Computing transition probabilities: 100%|██████████| 6285/6285 [00:00<00:00, 8322.11it/s]
Generating walks (CPU: 1): 100%|██████████| 100/100 [00:12<00:00,  8.13it/s]


In [12]:
model = n2v.fit(window=10, min_count=1)

In [33]:
model.wv[sorted_node_list].shape


(6285, 32)

In [29]:
node_list = [str(node) for node in data.g.nodes]
node_list

['26',
 '25',
 '24',
 '23',
 '22',
 '21',
 '20',
 '19',
 '18',
 '17',
 '16',
 '15',
 '14',
 '13',
 '12',
 '11',
 '10',
 '9',
 '8',
 '7',
 '6',
 '5',
 '4',
 '3',
 '2',
 '1',
 '0',
 '6284',
 '6283',
 '6282',
 '6281',
 '6280',
 '6279',
 '6278',
 '6277',
 '6276',
 '6275',
 '6274',
 '6273',
 '6272',
 '6271',
 '6270',
 '6269',
 '6268',
 '6267',
 '6266',
 '6265',
 '6264',
 '6263',
 '6262',
 '6261',
 '6260',
 '6259',
 '6258',
 '6257',
 '6256',
 '6255',
 '6254',
 '6253',
 '6252',
 '6251',
 '6250',
 '6249',
 '6248',
 '6247',
 '6246',
 '6245',
 '6244',
 '6243',
 '6242',
 '6241',
 '6240',
 '6239',
 '6238',
 '6237',
 '6236',
 '6235',
 '6234',
 '6233',
 '6232',
 '6231',
 '6230',
 '6229',
 '6228',
 '6227',
 '6226',
 '6225',
 '6224',
 '6223',
 '6222',
 '6221',
 '6220',
 '6219',
 '6218',
 '6217',
 '6216',
 '6215',
 '6214',
 '6213',
 '6212',
 '6211',
 '6210',
 '6209',
 '6208',
 '6207',
 '6206',
 '6205',
 '6204',
 '6203',
 '6202',
 '6201',
 '6200',
 '6199',
 '6198',
 '6197',
 '6196',
 '6195',
 '6194',
 '

In [32]:
sorted_node_list = (list(data.g.nodes))
sorted_node_list.sort()
sorted_node_list = [str(node) for node in sorted_node_list]
sorted_node_list

['0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 '10',
 '11',
 '12',
 '13',
 '14',
 '15',
 '16',
 '17',
 '18',
 '19',
 '20',
 '21',
 '22',
 '23',
 '24',
 '25',
 '26',
 '27',
 '28',
 '29',
 '30',
 '31',
 '32',
 '33',
 '34',
 '35',
 '36',
 '37',
 '38',
 '39',
 '40',
 '41',
 '42',
 '43',
 '44',
 '45',
 '46',
 '47',
 '48',
 '49',
 '50',
 '51',
 '52',
 '53',
 '54',
 '55',
 '56',
 '57',
 '58',
 '59',
 '60',
 '61',
 '62',
 '63',
 '64',
 '65',
 '66',
 '67',
 '68',
 '69',
 '70',
 '71',
 '72',
 '73',
 '74',
 '75',
 '76',
 '77',
 '78',
 '79',
 '80',
 '81',
 '82',
 '83',
 '84',
 '85',
 '86',
 '87',
 '88',
 '89',
 '90',
 '91',
 '92',
 '93',
 '94',
 '95',
 '96',
 '97',
 '98',
 '99',
 '100',
 '101',
 '102',
 '103',
 '104',
 '105',
 '106',
 '107',
 '108',
 '109',
 '110',
 '111',
 '112',
 '113',
 '114',
 '115',
 '116',
 '117',
 '118',
 '119',
 '120',
 '121',
 '122',
 '123',
 '124',
 '125',
 '126',
 '127',
 '128',
 '129',
 '130',
 '131',
 '132',
 '133',
 '134',
 '135',
 '136',
 '137',
 '138'

In [80]:
len(list(data.g.edges))

19961

In [97]:
edge_sampler = np.random.rand(1 , len(list(data.g.edges)))


In [101]:
len(data.g.edges)

19961

In [100]:
np.array(list(data.g.edges))[(edge_sampler<0.75)[0,:]].shape

(14931, 2)

In [96]:
np.array(data.g.edges)

array([[  26, 6117],
       [  26, 6030],
       [  26, 3733],
       ...,
       [  44,   43],
       [  37,   36],
       [  34,   33]])