### Class GraphDataset
    - Reads a collection of graphs stored in the ".gml" file format.
    - Collects the sorted list of distinct node labels.
    - Computes the maximum node degree, the maximum number of nodes, and the maximum number of cycles over the dataset.
    - Computes the sorted list of distinct pairs of (unweighted) degrees of the two nodes joined by each edge.

In [2]:
import glob
import os

import igraph as ig
import numpy as np

class GraphDataset:
    """Dataset-level statistics over the ".gml" graphs stored in data/<name>/.

    Every ``get_*`` method re-reads the graph files from disk with igraph;
    nothing is cached between calls.
    """

    def __init__(self,name):
        """Locate the graph files of dataset *name* and reset all statistics.

        Args:
            name: dataset folder name; files are globbed from data/<name>/*.gml.
        """
        self.name = name
        self.dataset = sorted(glob.glob("data/"+name+"/*.gml"))
        self.label_list = []
        self.max_degree = 0
        self.max_vertex = 0
        self.max_cycle = 0
        # Per-column statistics of the numeric node attributes. They stay None
        # until get_node_label() finds at least one such attribute, so readers
        # never hit a missing attribute.
        self.label_min = None
        self.label_max = None
        self.label_mean = None
        self.label_std = None

    def _graphs(self):
        """Yield every graph of the dataset, read from disk in file order."""
        for path in self.dataset:
            yield ig.read(path)

    def get_node_label(self):
        """Compute ``label_list`` and the numeric attribute statistics.

        ``label_list`` receives the distinct values of the "label" vertex
        attribute, ordered by their integer value. When the graphs carry
        vertex attributes other than "id" and "label", the per-column
        min / max / mean / std over all nodes of all graphs are stored in
        ``label_min``, ``label_max``, ``label_mean`` and ``label_std``.
        """
        label_set = set()
        last_graph = None
        for g in self._graphs():
            label_set.update(g.vs["label"])
            last_graph = g

        # The numeric columns are taken from the last graph read (assumes all
        # graphs share the same vertex attributes). An empty dataset simply
        # has no columns instead of raising on an unbound graph variable.
        check_column = []
        if last_graph is not None:
            check_column = [
                attr for attr in last_graph.vs.attributes()
                if attr != 'id' and attr != 'label'
            ]

        if check_column:
            width = len(check_column)
            total_mat = []
            for g in self._graphs():
                rows = [[float(node[column]) for column in check_column] for node in g.vs]
                # reshape keeps zero-node graphs 2-D so the concatenation works
                total_mat.append(np.array(rows, dtype=float).reshape(len(rows), width))
            mm_check = np.concatenate(total_mat, 0)
            self.label_min = np.min(mm_check, axis=0)
            self.label_max = np.max(mm_check, axis=0)
            self.label_mean = np.mean(mm_check, axis=0)
            self.label_std = np.std(mm_check, axis=0)

        # Ordering by int(label) yields the same order as the former
        # chr(int(label) + 32) key without its code-point range restriction.
        self.label_list = sorted(label_set, key=lambda entry: int(entry))

    def get_max_degree(self):
        """Store the largest vertex degree over all graphs in ``max_degree``."""
        self.max_degree = max((g.maxdegree() for g in self._graphs()), default=0)

    def get_max_vertex(self):
        """Store the largest vertex count over all graphs in ``max_vertex``."""
        self.max_vertex = max((g.vcount() for g in self._graphs()), default=0)

    def get_max_cycle(self):
        """Store the largest value of E - V + C over all graphs in ``max_cycle``.

        E, V and C are the edge, vertex and connected-component counts of a
        graph, i.e. the quantity is its number of independent cycles.
        """
        self.max_cycle = max(
            (g.ecount() - g.vcount() + len(g.components()) for g in self._graphs()),
            default=0,
        )

    def get_edge_degree_list(self):
        """Return the sorted distinct degree pairs of all edge endpoints.

        Returns:
            A sorted list of two-element lists ``[d_low, d_high]`` holding the
            degrees of the two end vertices of an edge, one entry per distinct
            pair found in the dataset.
        """
        # A set of tuples gives O(1) de-duplication; callers look entries up
        # with list.index() on two-element lists, hence the final conversion.
        seen = set()
        for g in self._graphs():
            for e in g.es:
                seen.add(tuple(sorted((g.vs[e.source].degree(), g.vs[e.target].degree()))))
        return [list(pair) for pair in sorted(seen)]

### Class PWLR
    - Embeds a dataset of graphs into R^n using random walks (Markov chains) and persistent homology
    - Produces both the PWLR-H_i and the PWLR-OPT-H_i representations
    - Exports the embedded vectors in ".npy" format to the "embed" folder
    - Dimensions:
        + 1st dimension : graph dataset number
        + 2nd dimension : WL iteration steps
        + 3rd dimension : RW iteration steps
        + 4th dimension : real embedded vector components
        Ex) A[0][2][5]: A real embedding of graph no.0 obtained from applying WL 2 times & RW 5 times.

In [38]:
import torch
import igraph as ig
import glob
from sklearn import preprocessing
import pandas as pd
import numpy as np
import re

class PWLR:
    def __init__(self, name, label_type, output_type='single'):
        """Collect dataset-level statistics and reset the per-graph state.

        Args:
            name: dataset folder name; graphs are globbed from data/<name>/*.gml.
            label_type: 'Discrete' or 'Continuous' to read the node labels of
                the dataset; any other value uses the single dummy label '1'.
            output_type: stored as ``self.output_type`` and consulted when the
                initial node label matrix is built ('single' or 'both').
        """
        self.name = name
        self.dataset = sorted(glob.glob("data/"+name+"/*.gml"))
        self.dataset_type = label_type
        self.output_type=output_type
        graph_data = GraphDataset(self.name)
        graph_data.get_max_degree()
        graph_data.get_max_vertex()
        graph_data.get_max_cycle()

        if self.dataset_type in ('Discrete', 'Continuous'):
            graph_data.get_node_label()
            self.label_dict = graph_data.label_list
            if self.dataset_type == 'Continuous':
                # The statistics only exist when the graphs carry numeric
                # vertex attributes; fall back to None instead of failing
                # with AttributeError when they were never computed.
                self.label_min = getattr(graph_data, "label_min", None)
                self.label_max = getattr(graph_data, "label_max", None)
                self.label_mean = getattr(graph_data, "label_mean", None)
                self.label_std = getattr(graph_data, "label_std", None)
        else:
            self.label_dict = ['1']

        self.max_degree = graph_data.max_degree #maximum degree of nodes of graphs in the dataset
        self.max_vertex = graph_data.max_vertex #maximum number of nodes of graphs in the dataset
        self.max_cycle = graph_data.max_cycle # maximum number of cycles of graphs in the dataset
        self.data_edge_list = graph_data.get_edge_degree_list() #list of edge degrees

        self.homology_zeroth = pd.DataFrame([]) #H0 (component) PWLR embedding
        self.homology_first = pd.DataFrame([]) #H1 (cycle) PWLR embedding

        #Variables for the graph currently being processed
        self.graph = 0 #igraph graph; 0 is the "no graph compiled yet" sentinel checked by the methods
        self.graph_node_num = 0 #number of nodes of a graph
        self.graph_label_matrix = torch.tensor([]) #node label matrix of a graph
        self.graph_label_matrix_init = torch.tensor([]) #initial node label matrix of a graph
        self.graph_adj_matrix = torch.tensor([]) #adjacency matrix of a graph
        self.graph_norm_adj_matrix = torch.tensor([]) #normalized adjacency matrix of a graph with added self-loops
        self.graph_weight_matrix = torch.tensor([]) #adjacency matrix with edge weights obtained from the L_p distance between adjacent node labels
        self.graph_weight_list = torch.tensor([]) #a list of edge weights obtained from L_p distances between any two adjacent node labels
        self.graph_homology_zeroth = torch.tensor([]) #H0 (component) PWLR embedding
        self.graph_homology_first = torch.tensor([]) #H1 (cycle) PWLR embedding
    
    def read_edge_degree_list(self):
        """Recompute ``data_edge_list`` (edge degree pairs) from the dataset files."""
        self.data_edge_list = GraphDataset(self.name).get_edge_degree_list()
    
    #Compile a single graph from a dataset
    def read_graph(self, graph_name):
        """Load the graph file *graph_name* with igraph and make it the current graph.

        Also records its vertex count and clears the current label matrix.
        """
        loaded = ig.read(graph_name)
        self.graph = loaded
        self.graph_node_num = loaded.vcount()
        self.graph_label_matrix = []
    
    #Given a compiled graph, construct the normalized adjacency matrix
    def graph_get_norm_adjacency(self, weight=True):
        if self.graph == 0:
            return "Graph Compilation Required"
        
        if weight:
            weight_attr = "weight"
        elif weight == "attribute":
            weight_attr = "attribute0"
        else:
            weight_attr = None
            
        adj_matrix = list((self.graph).get_adjacency(attribute = weight_attr))
        self.graph_adj_matrix = torch.tensor(adj_matrix)

        adj_matrix_self_loop = self.graph_adj_matrix + torch.eye(self.graph_node_num)
        self.graph_norm_adj_matrix = torch.nn.functional.normalize(adj_matrix_self_loop,p=1.0)
    
    #Given a compiled graph, construct a matrix of node labels. (Discrete Labels)
    def graph_get_init_node_label(self):
        if self.graph == 0:
            return "Graph Compilation Required"
        
        label_mat1 = []
        label_num = []
        
        for label_ref in self.label_dict:
            label_vector = []
            label_sub = 0
            for node in (self.graph).vs:
                if self.dataset_type == 'Discrete' or self.output_type == 'both':
                    if node["label"] == label_ref:
                        label_vector += [1]
                        label_sub += 1
                    else:
                        label_vector += [0]
                elif self.dataset_type == 'No Label':
                    label_vector += [1]
                    label_sub += 1
                else:
                    label_vector += [0]
                    label_sub += 1
            label_mat1 += [label_vector]
            label_num += [label_sub]
        label_mat_a = np.array(label_mat1)
        
        label_mat2=[]
        label_mat_b=[]
        check_column=[]
        
        if self.output_type =='single':
            for check in (self.graph).vs.attributes():
                if check != 'id' and check != 'label':
                    check_column+=[check] 
            if check_column: 
                for node in (self.graph).vs:
                    labels=[]
                    for j, column in enumerate(check_column):
                        labels+=[float(node[column])]
                    label_mat2.append(labels)           
                label_mat_b=np.transpose(label_mat2,(1,0))
        else : 
            for check in (self.graph).vs.attributes():
                if check != 'id' and check != 'label':
                    check_column+=[check] 
            if check_column: 
                for node in (self.graph).vs:
                    labels=[]
                    for j, column in enumerate(check_column):
                        labels+=[float(node[column])]
                    label_mat2.append(labels)           
                label_mat_b=np.transpose(label_mat2,(1,0))
        
        if self.output_type == 'single':
            if self.dataset_type == 'Discrete':
                label_mat=label_mat_a
            elif self.dataset_type == 'Continuous':
                label_mat=label_mat_b
        elif self.output_type == 'both':
            label_mat=np.concatenate((label_mat_a, label_mat_b), axis=0)
        
        label_mat=torch.FloatTensor(label_mat)
        self.graph_label_matrix = label_mat
        self.graph_label_matrix_init = label_mat
    
    #Update node labels once using Markov chains (Random Walks)
    def graph_label_update(self):
        if self.graph == 0:
            return "Graph Compilation Required"
        
        rw_mat = self.graph_norm_adj_matrix
        self.graph_label_matrix = torch.matmul(self.graph_label_matrix, rw_mat)    
                
    def graph_label_update_WL(self):
        if self.graph == 0:
            return "Graph Compilation Required"
        
        rw_mat = self.graph_norm_adj_matrix
        self.graph_label_matrix = torch.transpose(torch.matmul(rw_mat, torch.transpose(self.graph_label_matrix,0,1)),0,1)
    
    #Compute the adjacency matrix with weights on edges obtained from the L_p distance between adjacent node labels
    def graph_edge_weight_matrix(self, p=1):
        if self.graph == 0:
            return "Graph Compilation Required"
        
        edge_weight = []
        edge_list = torch.nonzero((self.graph_adj_matrix).gt(0))
        graph_label = torch.transpose(self.graph_label_matrix,0,1)
        graph_node = self.graph_node_num
        for edge in edge_list:
            node_label_diff = graph_label[edge[0]] - graph_label[edge[1]]
            weight = torch.norm(node_label_diff,p) + 1 ##Add a Bias Term +1
            edge_weight += [weight]
    
        edge_list_and_weight = torch.sparse_coo_tensor(edge_list.t(), edge_weight, (graph_node, graph_node))
        self.graph_weight_matrix = edge_list_and_weight.to_dense()
        self.graph_weight_list = torch.unique(torch.tensor(sorted(edge_weight)))

    def graph_unsorted_weight_list(self,p=1):
        if self.graph == 0:
            return "Graph Compilation Required"
        edge_weight = []
        old_edge_list = torch.nonzero((self.graph_adj_matrix).gt(0))
        graph_label = torch.transpose(self.graph_label_matrix,0,1)
        graph_node = self.graph_node_num
        
        edge_list = []
        for edge in old_edge_list:
            edge = sorted(edge.tolist())
            if edge not in edge_list:
                edge_list += [edge]
        
        for edge in edge_list:
            node_label_diff = graph_label[edge[0]] - graph_label[edge[1]]
            weight = torch.norm(node_label_diff,p) + 1 ##Add a Bias Term +1
            edge_weight += [weight]
        return torch.tensor(edge_weight), torch.tensor(edge_list)
    
    #Compute PWLR-H0 and PWLR-H1 for each graph.
    def graph_label_persistence(self, error=1e-12,p=1):
        if self.graph == 0:
            return "Graph Compilation Required"
        
        edge_degree_list = self.data_edge_list
        unsorted_edge_weight, unsorted_edge_list = self.graph_unsorted_weight_list(p)
        homology_zeroth = []
        homology_first = []
        
        component = self.graph_node_num
        cycle = 0
        
        #initialize graph
        h = ig.Graph()
        h.add_vertices((self.graph).vcount())
        for weight in self.graph_weight_list:
            #Obtain the corresponding edge with the exact weight from the original graph
            masked_exact_edge_index = (unsorted_edge_weight == weight).nonzero(as_tuple=False)
            masked_exact_edge_index = masked_exact_edge_index.tolist()
            
            masked_exact_edge = []
            for local_ind in masked_exact_edge_index:
                new_local_ind = (unsorted_edge_list[local_ind]).tolist()
                new_local_ind = [int(x) for x in new_local_ind[0]]
                masked_exact_edge += [new_local_ind]
            
            #Add Edges
            h.add_edges(masked_exact_edge)
            h_component = len(h.components())
            h_node = h.vcount()
            h_edge = h.ecount()
            h_cycle = h_edge - h_node + h_component
                
            component_diff = component - h_component
            cycle_diff = h_cycle - cycle
            new_weight = weight #May subtract bias terms if needed
                
            if component_diff > 0:
                component = h_component
                homology_zeroth += [float(new_weight)]*component_diff 
            if cycle_diff > 0:
                cycle = h_cycle
                homology_first += [float(new_weight)]*cycle_diff
                
        self.graph_homology_zeroth = torch.tensor(homology_zeroth)
        self.graph_homology_first = torch.tensor(homology_first)
    
    #pad the embedded graphs
    def graph_embed_pad(self, bias=float("nan")):
        h0_dim = len(self.graph_homology_zeroth)
        h1_dim = len(self.graph_homology_first)
        h0_max = self.max_vertex
        h1_max = self.max_cycle

        self.graph_homology_zeroth = torch.cat((self.graph_homology_zeroth, torch.tensor([bias]*(h0_max - h0_dim))),0)
        self.graph_homology_first = torch.cat((self.graph_homology_first, torch.tensor([bias]*(h1_max - h1_dim))),0)
    
    def edge_list_degree(self,input_edge_list):
        output_edge_list = []
        for input_edge in input_edge_list:
            input_source = (self.graph).vs[input_edge[0]]
            input_target = (self.graph).vs[input_edge[1]]
            output_deg = sorted([input_source.degree(), input_target.degree()])
            output_edge_list += [output_deg]
        return output_edge_list
    
    #Compute PWLR-OPT-H0 and PWLR-OPT-H1 for each graph.
    def graph_label_persistence_node_degree(self,error=1e-12,p=1):
        if self.graph == 0:
            return "Graph Compilation Required"
        
        edge_degree_list = self.data_edge_list
        unsorted_edge_weight, unsorted_edge_list = self.graph_unsorted_weight_list(p)
        homology_zeroth = [0]*len(edge_degree_list)
        homology_first = [0]*len(edge_degree_list)
        
        component = self.graph_node_num
        cycle = 0
        
        #initialize graph
        h = ig.Graph()
        h.add_vertices((self.graph).vcount())
        for weight in self.graph_weight_list:
            #Obtain the corresponding edge with the exact weight from the original graph
            masked_exact_edge_index = (unsorted_edge_weight == weight).nonzero(as_tuple=False)
            masked_exact_edge_index = masked_exact_edge_index.tolist()
            
            masked_exact_edge = []
            for local_ind in masked_exact_edge_index:
                new_local_ind = (unsorted_edge_list[local_ind]).tolist()
                new_local_ind = [int(x) for x in new_local_ind[0]]
                masked_exact_edge += [new_local_ind]
            
            #obtain edge degrees
            masked_exact_edge_degree = self.edge_list_degree(masked_exact_edge)
            
            #Add Edges
            for i in range(0,len(masked_exact_edge)):
                edge_exact = masked_exact_edge[i]
                edge_exact_deg = masked_exact_edge_degree[i]
                edge_exact_ind = edge_degree_list.index(edge_exact_deg)
                
                h.add_edges([edge_exact])
                h_component = len(h.components())
                h_node = h.vcount()
                h_edge = h.ecount()
                h_cycle = h_edge - h_node + h_component
                
                component_diff = component - h_component
                cycle_diff = h_cycle - cycle
                new_weight = weight #May subtract bias terms if needed
                
                if component_diff > 0:
                    component = h_component
                    homology_zeroth[edge_exact_ind] += new_weight**p
                if cycle_diff > 0:
                    cycle = h_cycle
                    homology_first[edge_exact_ind] += new_weight**p
        
        self.graph_homology_zeroth = torch.tensor(homology_zeroth)
        self.graph_homology_first = torch.tensor(homology_first)

    def embed_graph_exact_dataset(self, weight=True, p=1, error=1e-12, bias=-1.0, markov_step=1, wl_step=1, embed_type=1):
        total_h0_data = []
        total_h1_data = []
        graph_idx=0
        for files in self.dataset:
            h0_data = []
            h1_data = []
            wl_h0_data = []
            wl_h1_data = []

            self.read_graph(files)
            self.graph_get_norm_adjacency(weight)
            self.graph_get_init_node_label()
            self.graph_label_matrix = self.graph_label_matrix_init
            for k_index in range(0,wl_step):
                self.graph_label_update_WL()
            for k_index in range(0,markov_step):
                self.graph_label_update()

            if embed_type == 1:
                self.graph_edge_weight_matrix(p)
                self.graph_label_persistence(error)
                self.graph_embed_pad(bias)
            else:
                embed_type = 2
                self.graph_edge_weight_matrix(p)
                self.graph_label_persistence_node_degree(error,p)

            h0_graph = list(self.graph_homology_zeroth)
            h1_graph = list(self.graph_homology_first)
            wl_h0_data += [h0_graph]
            wl_h1_data += [h1_graph]
            h0_data += [wl_h0_data]
            h1_data += [wl_h1_data]
            total_h0_data += [h0_data]
            total_h1_data += [h1_data]

        total_h0_data = np.array(total_h0_data)
        total_h1_data = np.array(total_h1_data)
        return total_h0_data, total_h1_data
    
    #embed_type = 1: Export embedded vectors of form PWLR-H_i for all graphs in a given dataset as .npy file
    #embed_type = 2: Export embedded vectors of form PWLR-OPT-H_i for all graphs in a given dataset as .npy file
    def embed_graph_dataset(self, weight=True, p=1, error=1e-12, bias=-1.0, markov_step=1, wl_step=1, embed_type=1):
        total_h0_data = []
        total_h1_data = []
        graph_idx=0
        for files in self.dataset: 
            graph_idx += 1
            if graph_idx % 100 == 0:
                print(graph_idx)
            h0_data = []
            h1_data = []
            
            #Embed the graph
            self.read_graph(files)
            self.graph_get_norm_adjacency(weight)
            self.graph_get_init_node_label()
            graph_label_only_wl=[]
            self.graph_label_matrix = self.graph_label_matrix_init
            graph_label_only_wl.append(self.graph_label_matrix)            
            
            for _ in range(1,wl_step):
                self.graph_label_update_WL()
                graph_label_only_wl.append(self.graph_label_matrix)

            for step_wl in range(0, wl_step):  
                wl_h0_data = []
                wl_h1_data = []

                self.graph_label_matrix=graph_label_only_wl[step_wl] 
                for step in range(0,markov_step):
                    if embed_type == 1:
                        self.graph_edge_weight_matrix(p)
                        self.graph_label_persistence(error)
                        self.graph_embed_pad(bias)
                    else:
                        embed_type = 2
                        self.graph_edge_weight_matrix(p)
                        self.graph_label_persistence_node_degree(error,p)

                    h0_graph = list(self.graph_homology_zeroth)
                    h1_graph = list(self.graph_homology_first)
                    wl_h0_data += [h0_graph]
                    wl_h1_data += [h1_graph]
                    self.graph_label_update()

                h0_data += [wl_h0_data]
                h1_data += [wl_h1_data]  
                
            total_h0_data += [h0_data]
            total_h1_data += [h1_data]                       


        total_h0_data = np.array(total_h0_data)
        total_h1_data = np.array(total_h1_data)                
       
        if self.dataset_type == 'Discrete':
            np.save("embed/"+self.name+f"_discrete_component_seq_{wl_step}_{markov_step}_ver{embed_type}", total_h0_data)
            np.save("embed/"+self.name+f"_discrete_cycle_seq_{wl_step}_{markov_step}_ver{embed_type}", total_h1_data)
        elif self.dataset_type == 'Continuous':
            np.save("embed/"+self.name+f"_component_seq_{wl_step}_{markov_step}_ver{embed_type}", total_h0_data)
            np.save("embed/"+self.name+f"_cycle_seq_{wl_step}_{markov_step}_ver{embed_type}", total_h1_data)
        else:
            np.save("embed/"+self.name+f"_no_label_component_seq_{wl_step}_{markov_step}_ver{embed_type}", total_h0_data)
            np.save("embed/"+self.name+f"_no_label_cycle_seq_{wl_step}_{markov_step}_ver{embed_type}", total_h1_data)
    
    #embed_type=1: Export embedded vectors of form PWLR-H_i for a single graph as .npy file
    #embed_type=2: Export embedded vectors of form PWLR-OPT-H_i for a single graph as .npy file
    #Allows parallel Computing
    def embed_graph(self, file_name, weight=True, p=1, error=1e-12, bias=-1.0, markov_step=1, wl_step=1, embed_type=1):
        h0_data = []
        h1_data = []
        
        #Embed the graph
        self.read_graph(file_name)
        self.graph_get_norm_adjacency(weight)
        self.graph_get_init_node_label()
        graph_label_only_wl=[]
        self.graph_label_matrix = self.graph_label_matrix_init
        graph_label_only_wl.append(self.graph_label_matrix)            
                       
        for _ in range(1,wl_step):
            self.graph_label_update_WL()
            graph_label_only_wl.append(self.graph_label_matrix)

        for step_wl in range(0, wl_step):  
            wl_h0_data = []
            wl_h1_data = []

            self.graph_label_matrix=graph_label_only_wl[step_wl] 
            for step in range(0,markov_step):
                if embed_type == 1:
                    self.graph_edge_weight_matrix(p)
                    self.graph_label_persistence(error)
                    self.graph_embed_pad(bias)
                else:
                    embed_type=2
                    self.graph_edge_weight_matrix(p)
                    self.graph_label_persistence_node_degree(error,p)

                h0_graph = list(self.graph_homology_zeroth)
                h1_graph = list(self.graph_homology_first)
                wl_h0_data += [h0_graph]
                wl_h1_data += [h1_graph]
                self.graph_label_update()

            h0_data += [wl_h0_data]
            h1_data += [wl_h1_data]  
        
        number_length = len(list(filter(str.isdigit, file_name)))
        if self.dataset_type == 'Discrete':
            np.save("embed/"+self.name+"/"+file_name[-4-number_length:-4]+"_discrete_component_seq_"+str(embed_type), h0_data)
            np.save("embed/"+self.name+"/"+file_name[-4-number_length:-4]+"_discrete_cycle_seq_"+str(embed_type), h1_data)
        elif self.dataset_type == 'Continuous':
            np.save("embed/"+self.name+"/"+file_name[-4-number_length:-4]+"_component_seq_"+str(embed_type), h0_data)
            np.save("embed/"+self.name+"/"+file_name[-4-number_length:-4]+"_cycle_seq_"+str(embed_type), h1_data)
        else:
            np.save("embed/"+self.name+"/"+file_name[-4-number_length:-4]+"_no_label_component_seq_"+str(embed_type), h0_data)
            np.save("embed/"+self.name+"/"+file_name[-4-number_length:-4]+"_no_label_cycle_seq_"+str(embed_type), h1_data)
        
    def concatenator(self, embed_type=1):
        if embed_type != 1:
            embed_type = 2
        
        #load each graph representation
        if self.dataset_type == 'Discrete':
            h0_dataset_file = sorted(glob.glob("embed/"+self.name+"/*_discrete_component_seq_"+str(embed_type)+".npy"))
            h1_dataset_file = sorted(glob.glob("embed/"+self.name+"/*_discrete_cycle_seq_"+str(embed_type)+".npy"))
        elif self.dataset_type == 'Continuous':
            h0_dataset_file = sorted(glob.glob("embed/"+self.name+"/*_component_seq_"+str(embed_type)+".npy"))
            h1_dataset_file = sorted(glob.glob("embed/"+self.name+"/*_cycle_seq_"+str(embed_type)+".npy"))
            h0_dataset_file = list(filter(lambda x: not re.search('discrete', x), h0_dataset_file))
            h1_dataset_file = list(filter(lambda x: not re.search('discrete', x), h1_dataset_file))
        else:
            h0_dataset_file = sorted(glob.glob("embed/"+self.name+"/*_no_label_component_seq_"+str(embed_type)+".npy"))
            h1_dataset_file = sorted(glob.glob("embed/"+self.name+"/*_no_label_cycle_seq_"+str(embed_type)+".npy"))
        sample_file_0 = np.load(h0_dataset_file[0])
        sample_file_1 = np.load(h1_dataset_file[0])

        sample_file_0_size = sample_file_0.shape
        sample_file_1_size = sample_file_1.shape
        h0_dataset_array = np.empty(shape=(0,*sample_file_0_size))
        h1_dataset_array = np.empty(shape=(0,*sample_file_1_size))
        
        #Concatenate Files
        i = 0
        for files in h0_dataset_file:
            i += 1
            if i % 100 == 0:
                print(i)
            h0_dataset_array = np.append(h0_dataset_array, [np.load(files)], axis=0)
        i = 0
        for files in h1_dataset_file:
            i += 1
            if i % 100 == 0:
                print(i)
            h1_dataset_array = np.append(h1_dataset_array, [np.load(files)], axis=0)
        
        #Export Files
        if self.dataset_type == 'Discrete':
            np.save("embed/"+self.name+f"_discrete_component_seq_{sample_file_0_size[0]}_{sample_file_0_size[1]}_ver{embed_type}", h0_dataset_array)
            np.save("embed/"+self.name+f"_discrete_cycle_seq_{sample_file_1_size[0]}_{sample_file_1_size[1]}_ver{embed_type}", h1_dataset_array)
        elif self.dataset_type == 'Continuous':
            np.save("embed/"+self.name+f"_component_seq_{sample_file_0_size[0]}_{sample_file_0_size[1]}_ver{embed_type}", h0_dataset_array)
            np.save("embed/"+self.name+f"_cycle_seq_{sample_file_1_size[0]}_{sample_file_1_size[1]}_ver{embed_type}", h1_dataset_array)
        else:
            np.save("embed/"+self.name+f"_no_label_component_seq_{sample_file_0_size[0]}_{sample_file_0_size[1]}_ver{embed_type}", h0_dataset_array)
            np.save("embed/"+self.name+f"_no_label_cycle_seq_{sample_file_1_size[0]}_{sample_file_1_size[1]}_ver{embed_type}", h1_dataset_array)

# Embedding

MUTAG & PTC_FR Datasets

In [5]:
import timeit
from joblib import Parallel, delayed
import itertools

# Embed MUTAG and PTC_FR using their discrete node labels: every graph is
# embedded in parallel, then the per-graph files are merged per dataset.
dataset_name_list = ["MUTAG", "PTC_FR"]
label_dataset_type = "Discrete"
OUTPUT_TYPE = 'single'
n_jobs = 8

for dataset_name in dataset_name_list:
    print(dataset_name)
    data = PWLR(dataset_name, label_dataset_type, output_type=OUTPUT_TYPE)
    file_dataset = sorted(glob.glob("data/"+dataset_name+"/*.gml"))

    # embed_type 1: PWLR-H0 / PWLR-H1, embed_type 2: PWLR-OPT-H0 / PWLR-OPT-H1
    for embed_type in (1, 2):
        Parallel(n_jobs,verbose=5)(
            delayed(data.embed_graph)(file_name, weight=True, p=1, error=1e-12, bias=0, markov_step=30, wl_step=30, embed_type=embed_type)
            for file_name in file_dataset
        )
        data.concatenator(embed_type=embed_type)

MUTAG


[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   2 tasks      | elapsed:    3.6s
[Parallel(n_jobs=8)]: Done  56 tasks      | elapsed:   20.3s
[Parallel(n_jobs=8)]: Done 146 tasks      | elapsed:   48.8s
[Parallel(n_jobs=8)]: Done 188 out of 188 | elapsed:  1.0min finished


100
100


[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   2 tasks      | elapsed:    1.9s
[Parallel(n_jobs=8)]: Done  56 tasks      | elapsed:   20.5s
[Parallel(n_jobs=8)]: Done 146 tasks      | elapsed:   50.7s
[Parallel(n_jobs=8)]: Done 188 out of 188 | elapsed:  1.1min finished


100
100
PTC_FR


[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   2 tasks      | elapsed:    0.8s
[Parallel(n_jobs=8)]: Done  56 tasks      | elapsed:   15.7s
[Parallel(n_jobs=8)]: Done 146 tasks      | elapsed:   43.1s
[Parallel(n_jobs=8)]: Done 272 tasks      | elapsed:  1.3min
[Parallel(n_jobs=8)]: Done 351 out of 351 | elapsed:  1.7min finished


100
200
300
100
200
300


[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   2 tasks      | elapsed:    0.9s
[Parallel(n_jobs=8)]: Done  56 tasks      | elapsed:   27.0s
[Parallel(n_jobs=8)]: Done 146 tasks      | elapsed:  1.1min
[Parallel(n_jobs=8)]: Done 272 tasks      | elapsed:  2.2min
[Parallel(n_jobs=8)]: Done 351 out of 351 | elapsed:  2.8min finished


100
200
300
100
200
300


BZR & BZR-MD datasets (Discrete labels)

In [4]:
import timeit
from joblib import Parallel, delayed
import itertools

# Embed BZR and BZR_MD using their discrete node labels: every graph is
# embedded in parallel, then the per-graph files are merged per dataset.
dataset_name_list = ["BZR", "BZR_MD"]
label_dataset_type = "Discrete"
OUTPUT_TYPE = 'single'
n_jobs = 8

for dataset_name in dataset_name_list:
    print(dataset_name)
    data = PWLR(dataset_name, label_dataset_type, output_type=OUTPUT_TYPE)
    file_dataset = sorted(glob.glob("data/"+dataset_name+"/*.gml"))

    # embed_type 1: PWLR-H0 / PWLR-H1, embed_type 2: PWLR-OPT-H0 / PWLR-OPT-H1
    for embed_type in (1, 2):
        Parallel(n_jobs,verbose=5)(
            delayed(data.embed_graph)(file_name, weight="attribute", p=1, error=1e-12, bias=0, markov_step=30, wl_step=30, embed_type=embed_type)
            for file_name in file_dataset
        )
        data.concatenator(embed_type=embed_type)

BZR


[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   2 tasks      | elapsed:    5.1s
[Parallel(n_jobs=8)]: Done  56 tasks      | elapsed:   29.9s
[Parallel(n_jobs=8)]: Done 146 tasks      | elapsed:  1.3min
[Parallel(n_jobs=8)]: Done 272 tasks      | elapsed:  2.5min
[Parallel(n_jobs=8)]: Done 405 out of 405 | elapsed:  3.6min finished


100
200
300
400
100
200
300
400


[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   2 tasks      | elapsed:    3.6s
[Parallel(n_jobs=8)]: Done  56 tasks      | elapsed:   29.4s
[Parallel(n_jobs=8)]: Done 146 tasks      | elapsed:  1.3min
[Parallel(n_jobs=8)]: Done 272 tasks      | elapsed:  2.6min
[Parallel(n_jobs=8)]: Done 405 out of 405 | elapsed:  3.9min finished


100
200
300
400
100
200
300
400
BZR_MD


[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   2 tasks      | elapsed:    2.4s
[Parallel(n_jobs=8)]: Done  56 tasks      | elapsed:   18.7s
[Parallel(n_jobs=8)]: Done 146 tasks      | elapsed:   52.7s
[Parallel(n_jobs=8)]: Done 272 tasks      | elapsed:  1.5min
[Parallel(n_jobs=8)]: Done 306 out of 306 | elapsed:  1.7min finished


100
200
300
100
200
300


[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   2 tasks      | elapsed:    2.5s
[Parallel(n_jobs=8)]: Done  56 tasks      | elapsed:   19.7s
[Parallel(n_jobs=8)]: Done 146 tasks      | elapsed:   56.4s
[Parallel(n_jobs=8)]: Done 272 tasks      | elapsed:  1.7min
[Parallel(n_jobs=8)]: Done 306 out of 306 | elapsed:  1.9min finished


100
200
300
100
200
300


BZR dataset (Discrete & Continuous labels)

In [6]:
import timeit
from joblib import Parallel, delayed
import itertools

# Embed BZR using both its discrete labels and its continuous attributes:
# every graph is embedded in parallel, then the per-graph files are merged.
dataset_name_list = ["BZR"]
label_dataset_type = "Continuous"
OUTPUT_TYPE = 'both'
n_jobs = 8

for dataset_name in dataset_name_list:
    print(dataset_name)
    data = PWLR(dataset_name, label_dataset_type, output_type=OUTPUT_TYPE)
    file_dataset = sorted(glob.glob("data/"+dataset_name+"/*.gml"))

    # embed_type 1: PWLR-H0 / PWLR-H1, embed_type 2: PWLR-OPT-H0 / PWLR-OPT-H1
    for embed_type in (1, 2):
        Parallel(n_jobs,verbose=5)(
            delayed(data.embed_graph)(file_name, weight="attribute", p=1, error=1e-12, bias=0, markov_step=30, wl_step=30, embed_type=embed_type)
            for file_name in file_dataset
        )
        data.concatenator(embed_type=embed_type)

BZR


[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   2 tasks      | elapsed:    7.5s
[Parallel(n_jobs=8)]: Done  56 tasks      | elapsed:  1.0min
[Parallel(n_jobs=8)]: Done 146 tasks      | elapsed:  2.7min
[Parallel(n_jobs=8)]: Done 272 tasks      | elapsed:  5.4min
[Parallel(n_jobs=8)]: Done 405 out of 405 | elapsed:  6.8min finished


100
200
300
400
100
200
300
400


[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   2 tasks      | elapsed:    4.3s
[Parallel(n_jobs=8)]: Done  56 tasks      | elapsed:   35.2s
[Parallel(n_jobs=8)]: Done 146 tasks      | elapsed:  1.6min
[Parallel(n_jobs=8)]: Done 272 tasks      | elapsed:  3.1min
[Parallel(n_jobs=8)]: Done 405 out of 405 | elapsed:  4.4min finished


100
200
300
400
100
200
300
400
