In [2]:
import pandas as pd
import networkx as nx
import os
import numpy as np
import torch

In [12]:
def generate_train_test_mask(num_nodes):
    n = num_nodes
    train_mask = np.zeros(n,dtype = bool)
    random_indices = np.random.permutation(range(n))
    train_indices = random_indices[:int(0.6*n)]
    train_mask[train_indices] = True
    test_mask = np.zeros(n,dtype = bool)
    test_indices = random_indices[int(0.6*n):]
    print(test_indices)
    test_mask[test_indices]= True
    return train_mask, test_mask

def get_vicker_chan_dataset(multiplex_folder_path, size_x = 5):
    vicker_data_folder = os.path.join(multiplex_folder_path, "Vickers-Chan Dataset" , "Dataset")
    edges_file_path = os.path.join(vicker_data_folder,"Vickers-Chan-7thGraders_multiplex.edges" )
    edges_df = pd.read_csv(edges_file_path, sep = " ", header = None,  names = ["layerId", "src", "dst", "weight"],dtype=int)
    edges_df['src'] = edges_df['src'] - 1 # index IDs from 0
    edges_df['dst'] = edges_df['dst'] - 1 # index IDs from 0
    layers = [1, 2, 3]
    graphs = []
    adj_mats = []
    sum_ = 0
    for layer in layers : 
        df = edges_df[edges_df['layerId'] == layer]
        G= nx.from_pandas_edgelist(df, source='src', target='dst',create_using = nx.DiGraph)
        graphs.append(G)
        adj_mat = nx.adjacency_matrix(G).todense()
        
        adj_mats.append(np.array(adj_mat,dtype=int))
        
        sum_ += adj_mat.sum()
        print("# edges in layer {} are {}".format( layer, adj_mat.sum()))
    print("# edges are {}".format( sum_))
    
    n = max(edges_df["src"]) + 1
    print("# nodes are {}".format( n ))
    train_mask, test_mask = generate_train_test_mask(n)
    random_X = np.random.normal(size = [n, size_x])
    final_random_X = np.stack([random_X]* len(layers),axis = 2)
    adj = np.stack(adj_mats, axis = 2)
    labels = np.zeros(n,dtype = int) 
    labels[12:] = 1 # 0 for boy from index 0 - 11 , 12 - 28 is for girl
    return graphs, final_random_X , torch.from_numpy(labels),  torch.from_numpy(train_mask), torch.from_numpy(test_mask), torch.from_numpy(test_mask), adj

get_vicker_chan_dataset("/home/keshav/courses/master_thesis/multiplex_datasets")

# edges in layer 1 are 361
# edges in layer 2 are 181
# edges in layer 3 are 198
# edges are 740
# nodes are 29
[22 21 19 11 25  0 12 15  2  8  5 28]


([<networkx.classes.digraph.DiGraph at 0x7f344a4a6ac8>,
  <networkx.classes.digraph.DiGraph at 0x7f344a4a6b38>,
  <networkx.classes.digraph.DiGraph at 0x7f344a4a6ba8>],
 array([[[ 1.30589645e+00,  1.30589645e+00,  1.30589645e+00],
         [-7.21451938e-01, -7.21451938e-01, -7.21451938e-01],
         [ 1.63448359e+00,  1.63448359e+00,  1.63448359e+00],
         [ 4.02993004e-01,  4.02993004e-01,  4.02993004e-01],
         [-2.51839561e-01, -2.51839561e-01, -2.51839561e-01]],
 
        [[ 3.61209064e-02,  3.61209064e-02,  3.61209064e-02],
         [ 1.45226963e+00,  1.45226963e+00,  1.45226963e+00],
         [-1.00387374e+00, -1.00387374e+00, -1.00387374e+00],
         [ 9.87601125e-02,  9.87601125e-02,  9.87601125e-02],
         [-1.68861688e-01, -1.68861688e-01, -1.68861688e-01]],
 
        [[-2.54913694e-01, -2.54913694e-01, -2.54913694e-01],
         [ 1.85182592e+00,  1.85182592e+00,  1.85182592e+00],
         [ 2.99382121e-01,  2.99382121e-01,  2.99382121e-01],
         [ 2.337192

In [47]:
def add_edges_for_index(df, index_this, layer_id, G, col_prefix = "vote"):
    index_vote = df.iloc[index_this].loc["{}{}".format(col_prefix, layer_id)]
    if(index_vote == "?"):
        print(index_vote)
        return []
        
    other_votes = [(index_this, val ) for val in list((df.loc[df["{}{}".format(col_prefix, layer_id)] == index_vote]).index)]
    #print(other_votes)
    G.add_edges_from(other_votes)
    return other_votes

In [36]:


def get_congress_dataset(multiplex_folder_path, size_x = 5):
    vicker_data_folder = os.path.join(multiplex_folder_path, "Congress Dataset" )
    edges_file_path = os.path.join(vicker_data_folder,"house-votes-84.data")
    layer_ids = list(range(0,16))
    edges_df = pd.read_csv(edges_file_path, sep = ",", header = None,  names = ["layerId"] + ["vote{}".format(i) for i in layer_ids])
    edges_df['labels'] = 0
    edges_df.loc[edges_df['layerId'] == "republican",'labels'] = 1 
    ids = np.array(list(range(len(edges_df))))
    graphs_list = []
    adj_mats = []
    sum_ = 0
    for layer in layer_ids:
        G = nx.DiGraph()
        G.add_nodes_from(ids)
        for i in ids:
            add_edges_for_index(edges_df, i, layer, G)
            break
        adj_mat = nx.adjacency_matrix(G).todense()
        graphs_list.append(G)
        adj_mats.append(np.array(adj_mat,dtype=int))
        
        sum_ += adj_mat.sum()
        print("# edges in layer {} are {}".format( layer, adj_mat.sum()))
    
    print("# edges are {}".format( sum_))
    
    n = len(edges_df)
    print("# nodes are {}".format( n ))
    train_mask, test_mask = generate_train_test_mask(n)
    random_X = np.random.normal(size = [n, size_x])
    final_random_X = np.stack([random_X]* len(layer_ids),axis = 2)
    adj = np.stack(adj_mats, axis = 2)
    labels = np.array(list(edges_df['labels']))
    return graphs_list, final_random_X , torch.from_numpy(labels),  torch.from_numpy(train_mask), torch.from_numpy(test_mask), torch.from_numpy(test_mask), adj
    

res = get_congress_dataset("/home/keshav/courses/master_thesis/multiplex_datasets")

# edges in layer 0 are 236
# edges in layer 1 are 195
# edges in layer 2 are 171
# edges in layer 3 are 177
# edges in layer 4 are 212
# edges in layer 5 are 272
# edges in layer 6 are 182
# edges in layer 7 are 178
# edges in layer 8 are 206
# edges in layer 9 are 216
# edges in layer 10 are 21
# edges in layer 11 are 171
# edges in layer 12 are 209
# edges in layer 13 are 248
# edges in layer 14 are 233
# edges in layer 15 are 269
# edges are 3196
# nodes are 435
[378   0 400 428 289  13 172  86 102 189 281  87 210  81 201 303 223  61
 418  12 105 337 358  58 135 317 226  32 366 424 168  51  20 376  65 390
 360 233 197 205 421 241 397 120  97 332  66 191 106 293 110 187 245 291
  91 145 152  27  29 346   6 175 257 365 217 344  92 348 151 407 372 137
 323  63 305 185 313 148 211 422 331  18 158  43 309  84 335 426 423 213
   1  16 364 125 188  74 165 232 385 265  33  82 236 382 208 302 420 133
 329 256 321  21 359  30  53 254 139  57 288 215 206 399  76  60 324 255
 192 178  77 109 31

In [40]:
from sklearn import preprocessing

In [43]:
def get_mammo_dataset(multiplex_folder_path, size_x = 5):
    mammo_data_folder = os.path.join(multiplex_folder_path, "Mammogram Dataset" )
    edges_file_path = os.path.join(mammo_data_folder,"mammographic_masses.data")
    layer_ids = list(range(0,5))
    layer_names= ["layer{}".format(i) for i in layer_ids]
    edges_df = pd.read_csv(edges_file_path, sep = ",", header = None, names =  layer_names + ["labels"]  )
    
    ids = np.array(list(range(len(edges_df))))
    graphs_list = []
    adj_mats = []
    sum_ = 0
    for layer in layer_ids:
        G = nx.DiGraph()
        G.add_nodes_from(ids)
        for i in ids:
            add_edges_for_index(edges_df, i, layer, G, col_prefix="layer")
            break
        adj_mat = nx.adjacency_matrix(G).todense()
        graphs_list.append(G)
        adj_mats.append(np.array(adj_mat,dtype=int))
        
        sum_ += adj_mat.sum()
        print("# edges in layer {} are {}".format( layer, adj_mat.sum()))
    
    print("# edges are {}".format( sum_))
    
    n = len(edges_df)
    print("# nodes are {}".format( n ))
    train_mask, test_mask = generate_train_test_mask(n)
    X = edges_df.iloc[ids].loc[:,layer_names].replace("?", -1).to_numpy().astype(float)
    X = preprocessing.scale(X)
    #random_X = np.random.normal(size = [n, size_x])
    #final_random_X = np.stack([random_X]* len(layer_ids),axis = 2)
    adj = np.stack(adj_mats, axis = 2)
    labels = np.array(list(edges_df.iloc[ids]['labels'])).astype(int)
    return graphs_list, X , torch.from_numpy(labels),  torch.from_numpy(train_mask), torch.from_numpy(test_mask), torch.from_numpy(test_mask), adj
    

res = get_mammo_dataset("/home/keshav/courses/master_thesis/multiplex_datasets")

# edges in layer 0 are 345
# edges in layer 1 are 32
# edges in layer 2 are 95
# edges in layer 3 are 136
# edges in layer 4 are 798
# edges are 1406
# nodes are 961
[621 688 781 141 918 373 277 890 664 439 560 311   9 281 496 237 815 845
 620  78  79 299  44 803 471 113  22 539 928 156 341 627  63 301 398 241
 544 132 668 384 703 558 484 715 432 742 774 929 457 733 419 324 456 748
 493 624 111 618 478 399 771 906 463 170 436 896  42 334 672 804 718 140
 234 948 726 388 208 258 907 159 892 636 925 653 332 369 955 103  90 114
 841 511 801 469  89 546 893 589  21 107 578 913 529  29  27 216 327 756
 650 531   1 743 247 339 657 936 939 798 723 438 452 886 368 870 606 775
 240 422 273 379 853 705 195 864 593 525 713 566 406 178 307  16 135 651
 116 564 378 749 877  96 480 391 791  43 402  33 356 574 244 874 451 157
 201 326 808 412 349 212 608  32 623 611 846 246 194  95 414 567 945 110
 649 934 652 351 655 700 895 171 746 303 404 283 336 772 375 317 181 935
 521 686  93 435 338 767 403 37

In [None]:
def get_mammo_dataset(multiplex_folder_path, size_x = 5):
    mammo_data_folder = os.path.join(multiplex_folder_path, "Mammogram Dataset" )
    edges_file_path = os.path.join(mammo_data_folder,"mammographic_masses.data")
    layer_ids = list(range(0,5))
    layer_names= ["layer{}".format(i) for i in layer_ids]
    edges_df = pd.read_csv(edges_file_path, sep = ",", header = None, names =  layer_names + ["labels"]  )
    
    ids = np.array(list(range(len(edges_df))))
    graphs_list = []
    adj_mats = []
    sum_ = 0
    for layer in layer_ids:
        G = nx.DiGraph()
        G.add_nodes_from(ids)
        for i in ids:
            add_edges_for_index(edges_df, i, layer, G, col_prefix="layer")
            break
        adj_mat = nx.adjacency_matrix(G).todense()
        graphs_list.append(G)
        adj_mats.append(np.array(adj_mat,dtype=int))
        
        sum_ += adj_mat.sum()
        print("# edges in layer {} are {}".format( layer, adj_mat.sum()))
    
    print("# edges are {}".format( sum_))
    
    n = len(edges_df)
    print("# nodes are {}".format( n ))
    train_mask, test_mask = generate_train_test_mask(n)
    X = edges_df.iloc[ids].loc[:,layer_names].replace("?", -1).to_numpy().astype(float)
    X = preprocessing.scale(X)
    #random_X = np.random.normal(size = [n, size_x])
    #final_random_X = np.stack([random_X]* len(layer_ids),axis = 2)
    adj = np.stack(adj_mats, axis = 2)
    labels = np.array(list(edges_df.iloc[ids]['labels'])).astype(int)
    return graphs_list, X , torch.from_numpy(labels),  torch.from_numpy(train_mask), torch.from_numpy(test_mask), torch.from_numpy(test_mask), adj
    

res = get_mammo_dataset("/home/keshav/courses/master_thesis/multiplex_datasets")

In [52]:
def get_balance_dataset(multiplex_folder_path, size_x = 5):
    mammo_data_folder = os.path.join(multiplex_folder_path, "Balance-Scale Dataset" )
    edges_file_path = os.path.join(mammo_data_folder,"balance-scale.data")
    layer_ids = list(range(0,4))
    layer_names= ["layer{}".format(i) for i in layer_ids]
    edges_df = pd.read_csv(edges_file_path, sep = ",", header = None, names = ["labels"]+ layer_names   )
    print(edges_df.head())
    ids = np.array(list(range(len(edges_df))))
    graphs_list = []
    adj_mats = []
    sum_ = 0
    for layer in layer_ids:
        G = nx.DiGraph()
        G.add_nodes_from(ids)
        for i in ids:
            add_edges_for_index(edges_df, i, layer, G, col_prefix="layer")
            break
        adj_mat = nx.adjacency_matrix(G).todense()
        graphs_list.append(G)
        adj_mats.append(np.array(adj_mat,dtype=int))
        
        sum_ += adj_mat.sum()
        print("# edges in layer {} are {}".format( layer, adj_mat.sum()))
    
    print("# edges are {}".format( sum_))
    
    n = len(edges_df)
    print("# nodes are {}".format( n ))
    train_mask, test_mask = generate_train_test_mask(n)
    X = edges_df.iloc[ids].loc[:,layer_names].replace("?", -1).to_numpy().astype(float)
    X = preprocessing.scale(X)
    #random_X = np.random.normal(size = [n, size_x])
    #final_random_X = np.stack([random_X]* len(layer_ids),axis = 2)
    adj = np.stack(adj_mats, axis = 2)
    edges_df["labels_style"] = edges_df["labels"].astype('category')
    labels = np.array(list(edges_df.iloc[ids]['labels_style'].cat.codes))
    return graphs_list, X , torch.from_numpy(labels),  torch.from_numpy(train_mask), torch.from_numpy(test_mask), torch.from_numpy(test_mask), adj
    

res = get_balance_dataset("/home/keshav/courses/master_thesis/multiplex_datasets")

  labels  layer0  layer1  layer2  layer3
0      B       1       1       1       1
1      R       1       1       1       2
2      R       1       1       1       3
3      R       1       1       1       4
4      R       1       1       1       5
# edges in layer 0 are 125
# edges in layer 1 are 125
# edges in layer 2 are 125
# edges in layer 3 are 125
# edges are 500
# nodes are 625
[248 291 572  85 590 108 182 562 529 136 351 494 526 403 303 281 588 295
 258 138 507  15 538 202 286 427  76  92 139 368 419 442 345 113 199 210
 441 358 459 595 586 577 404 398 354 127 198 578 150 435  22 125  84 305
 143 251 456 158 156 353 159 176   0 464 416 216 564 428 541 478 120 592
 620 567 271 576 304  80 514 162 482 331 276  83 379 234 471  58 275 205
 371 554 423 297 155  26  12 417  59 493 270 289 175 392 561 596 511 247
 341 383 605 366 530 594 260   2 609 357 254 173 377 114 233 129 179  78
 284 224 306 178 161 350  86 488  36 546 374 201 491 421 591 446 460 429
 325 333 280 505 509 105 388 4

In [96]:
def get_leskovec_dataset(multiplex_folder_path, size_x = 5):
    les_data_folder = os.path.join(multiplex_folder_path, "Leskovec-Ng Dataset" )
    edges_file_path = os.path.join(les_data_folder,"Leskovec-Ng.multilayer.edges")
    labels = np.loadtxt(os.path.join(les_data_folder,'Leskovec-Ng.multilayer.labels')).astype(np.int32)
    
    data = np.loadtxt(fname=edges_file_path).astype(np.int32)
    layers = [0, 1, 2, 3]
    graphs = []
    adj_mats = []
    sum_ = 0
    edges_df = pd.read_csv(edges_file_path, sep = " ", header = None,  names = ["layerId", "src", "dst"],dtype=int)
    print(edges_df['src'].min())
    
    for layer in layers : 
        df = edges_df[edges_df['layerId'] == layer]
        G= nx.from_pandas_edgelist(df, source='src', target='dst',create_using = nx.DiGraph)
        graphs.append(G)
        adj_mat = nx.adjacency_matrix(G).todense()
        
        adj_mats.append(np.array(adj_mat,dtype=int))
        
        sum_ += adj_mat.sum()
        print("# edges in layer {} are {}".format( layer, adj_mat.sum()))
    print("# edges are {}".format( sum_))
    
    n = max(edges_df["src"].max(), edges_df["dst"].max())  + 1
    print("# nodes are {}".format( n ))
    train_mask, test_mask = generate_train_test_mask(n)
    random_X = np.random.normal(size = [n, size_x])
    final_random_X = np.stack([random_X]* len(layers),axis = 2)
    adj = np.stack(adj_mats, axis = 2)
    
    
    return graphs, final_random_X , torch.from_numpy(labels),  torch.from_numpy(train_mask), torch.from_numpy(test_mask), torch.from_numpy(test_mask), adj
    

get_les_dataset("/home/keshav/courses/master_thesis/multiplex_datasets")

0
# edges in layer 0 are 225
# edges in layer 1 are 227
# edges in layer 2 are 761
# edges in layer 3 are 623
# edges are 1836
# nodes are 191
[ 88 132 155  52  87 113 101 161  22  68  41  82  44 135  76  39  92 138
  25 164  45  34  77 144   5 145 187 105  13  21 167 185 112 128  14 151
  37 173 133 177 100  91  98 169  73  83  24  15   0 141 139  50 180 114
  59  43   6 118 189  17  90 102  49 190 111 175 115  58   1 108 122 152
  42  47  96 129  19]


([<networkx.classes.digraph.DiGraph at 0x7f3444c07a58>,
  <networkx.classes.digraph.DiGraph at 0x7f3444c07f98>,
  <networkx.classes.digraph.DiGraph at 0x7f344a3cb9e8>,
  <networkx.classes.digraph.DiGraph at 0x7f344a3cb4e0>],
 array([[[-1.13913014, -1.13913014, -1.13913014, -1.13913014],
         [-0.19130125, -0.19130125, -0.19130125, -0.19130125],
         [-0.84218474, -0.84218474, -0.84218474, -0.84218474],
         [ 0.54362255,  0.54362255,  0.54362255,  0.54362255],
         [ 1.94488969,  1.94488969,  1.94488969,  1.94488969]],
 
        [[-1.29630258, -1.29630258, -1.29630258, -1.29630258],
         [-0.04711979, -0.04711979, -0.04711979, -0.04711979],
         [-0.74389284, -0.74389284, -0.74389284, -0.74389284],
         [-1.72291031, -1.72291031, -1.72291031, -1.72291031],
         [-2.23690371, -2.23690371, -2.23690371, -2.23690371]],
 
        [[-1.20319467, -1.20319467, -1.20319467, -1.20319467],
         [-1.34063564, -1.34063564, -1.34063564, -1.34063564],
         [ 0.