In [31]:
import pandas as pd
import networkx as nx
import os
import numpy as np
import torch

In [2]:
def generate_train_test_mask(num_nodes):
    n = num_nodes
    train_mask = np.zeros(n,dtype = bool)
    random_indices = np.random.permutation(range(n))
    train_indices = random_indices[:int(0.6*n)]
    train_mask[train_indices] = True
    test_mask = np.zeros(n,dtype = bool)
    test_indices = random_indices[int(0.6*n):]
    print(test_indices)
    test_mask[test_indices]= True
    return train_mask, test_mask

def get_vicker_chan_dataset(multiplex_folder_path, size_x = 5):
    vicker_data_folder = os.path.join(multiplex_folder_path, "Vickers-Chan Dataset" , "Dataset")
    edges_file_path = os.path.join(vicker_data_folder,"Vickers-Chan-7thGraders_multiplex.edges" )
    edges_df = pd.read_csv(edges_file_path, sep = " ", header = None,  names = ["layerId", "src", "dst", "weight"],dtype=int)
    edges_df['src'] = edges_df['src'] - 1 # index IDs from 0
    edges_df['dst'] = edges_df['dst'] - 1 # index IDs from 0
    layers = [1, 2, 3]
    graphs = []
    adj_mats = []
    sum_ = 0
    for layer in layers : 
        df = edges_df[edges_df['layerId'] == layer]
        G= nx.from_pandas_edgelist(df, source='src', target='dst',create_using = nx.DiGraph)
        graphs.append(G)
        adj_mat = nx.adjacency_matrix(G).todense()
        
        adj_mats.append(np.array(adj_mat,dtype=int))
        
        sum_ += adj_mat.sum()
        print("# edges in layer {} are {}".format( layer, adj_mat.sum()))
    print("# edges are {}".format( sum_))
    
    n = max(edges_df["src"]) + 1
    print("# nodes are {}".format( n ))
    train_mask, test_mask = generate_train_test_mask(n)
    random_X = np.random.normal(size = [n, size_x])
    final_random_X = np.stack([random_X]* len(layers),axis = 2)
    adj = np.stack(adj_mats, axis = 2)
    labels = np.zeros(n,dtype = int) 
    labels[12:] = 1 # 0 for boy from index 0 - 11 , 12 - 28 is for girl
    return graphs, final_random_X , torch.from_numpy(labels),  torch.from_numpy(train_mask), torch.from_numpy(test_mask), torch.from_numpy(test_mask), adj

get_vicker_chan_dataset("/home/keshav/courses/master_thesis/multiplex_datasets")

# edges in layer 1 are 361
# edges in layer 2 are 181
# edges in layer 3 are 198
# edges are 740
# nodes are 29
[26 28  3 23  8 18 27 20  0 19 21  9]


([<networkx.classes.digraph.DiGraph at 0x7f4180877c88>,
  <networkx.classes.digraph.DiGraph at 0x7f4180877cf8>,
  <networkx.classes.digraph.DiGraph at 0x7f4180877d30>],
 array([[[ 0.84463349,  0.84463349,  0.84463349],
         [-0.16880075, -0.16880075, -0.16880075],
         [ 0.3103442 ,  0.3103442 ,  0.3103442 ],
         [ 1.26937487,  1.26937487,  1.26937487],
         [ 0.14938443,  0.14938443,  0.14938443]],
 
        [[-0.03802018, -0.03802018, -0.03802018],
         [-0.15277118, -0.15277118, -0.15277118],
         [-1.43360586, -1.43360586, -1.43360586],
         [ 0.19639727,  0.19639727,  0.19639727],
         [ 1.1196925 ,  1.1196925 ,  1.1196925 ]],
 
        [[-1.13090704, -1.13090704, -1.13090704],
         [ 0.41809181,  0.41809181,  0.41809181],
         [ 0.55268843,  0.55268843,  0.55268843],
         [-1.18695125, -1.18695125, -1.18695125],
         [-0.50097347, -0.50097347, -0.50097347]],
 
        [[-0.26768899, -0.26768899, -0.26768899],
         [-0.36287807,

In [47]:
def add_edges_for_index(df, index_this, layer_id, G, col_prefix = "vote"):
    index_vote = df.iloc[index_this].loc["{}{}".format(col_prefix, layer_id)]
    if(index_vote == "?"):
        print(index_vote)
        return []
        
    other_votes = [(index_this, val ) for val in list((df.loc[df["{}{}".format(col_prefix, layer_id)] == index_vote]).index)]
    #print(other_votes)
    G.add_edges_from(other_votes)
    return other_votes

In [36]:


def get_congress_dataset(multiplex_folder_path, size_x = 5):
    vicker_data_folder = os.path.join(multiplex_folder_path, "Congress Dataset" )
    edges_file_path = os.path.join(vicker_data_folder,"house-votes-84.data")
    layer_ids = list(range(0,16))
    edges_df = pd.read_csv(edges_file_path, sep = ",", header = None,  names = ["layerId"] + ["vote{}".format(i) for i in layer_ids])
    edges_df['labels'] = 0
    edges_df.loc[edges_df['layerId'] == "republican",'labels'] = 1 
    ids = np.array(list(range(len(edges_df))))
    graphs_list = []
    adj_mats = []
    sum_ = 0
    for layer in layer_ids:
        G = nx.DiGraph()
        G.add_nodes_from(ids)
        for i in ids:
            add_edges_for_index(edges_df, i, layer, G)
            break
        adj_mat = nx.adjacency_matrix(G).todense()
        graphs_list.append(G)
        adj_mats.append(np.array(adj_mat,dtype=int))
        
        sum_ += adj_mat.sum()
        print("# edges in layer {} are {}".format( layer, adj_mat.sum()))
    
    print("# edges are {}".format( sum_))
    
    n = len(edges_df)
    print("# nodes are {}".format( n ))
    train_mask, test_mask = generate_train_test_mask(n)
    random_X = np.random.normal(size = [n, size_x])
    final_random_X = np.stack([random_X]* len(layer_ids),axis = 2)
    adj = np.stack(adj_mats, axis = 2)
    labels = np.array(list(edges_df['labels']))
    return graphs_list, final_random_X , torch.from_numpy(labels),  torch.from_numpy(train_mask), torch.from_numpy(test_mask), torch.from_numpy(test_mask), adj
    

res = get_congress_dataset("/home/keshav/courses/master_thesis/multiplex_datasets")

# edges in layer 0 are 236
# edges in layer 1 are 195
# edges in layer 2 are 171
# edges in layer 3 are 177
# edges in layer 4 are 212
# edges in layer 5 are 272
# edges in layer 6 are 182
# edges in layer 7 are 178
# edges in layer 8 are 206
# edges in layer 9 are 216
# edges in layer 10 are 21
# edges in layer 11 are 171
# edges in layer 12 are 209
# edges in layer 13 are 248
# edges in layer 14 are 233
# edges in layer 15 are 269
# edges are 3196
# nodes are 435
[378   0 400 428 289  13 172  86 102 189 281  87 210  81 201 303 223  61
 418  12 105 337 358  58 135 317 226  32 366 424 168  51  20 376  65 390
 360 233 197 205 421 241 397 120  97 332  66 191 106 293 110 187 245 291
  91 145 152  27  29 346   6 175 257 365 217 344  92 348 151 407 372 137
 323  63 305 185 313 148 211 422 331  18 158  43 309  84 335 426 423 213
   1  16 364 125 188  74 165 232 385 265  33  82 236 382 208 302 420 133
 329 256 321  21 359  30  53 254 139  57 288 215 206 399  76  60 324 255
 192 178  77 109 31

In [40]:
from sklearn import preprocessing

In [43]:
def get_mammo_dataset(multiplex_folder_path, size_x = 5):
    mammo_data_folder = os.path.join(multiplex_folder_path, "Mammogram Dataset" )
    edges_file_path = os.path.join(mammo_data_folder,"mammographic_masses.data")
    layer_ids = list(range(0,5))
    layer_names= ["layer{}".format(i) for i in layer_ids]
    edges_df = pd.read_csv(edges_file_path, sep = ",", header = None, names =  layer_names + ["labels"]  )
    
    ids = np.array(list(range(len(edges_df))))
    graphs_list = []
    adj_mats = []
    sum_ = 0
    for layer in layer_ids:
        G = nx.DiGraph()
        G.add_nodes_from(ids)
        for i in ids:
            add_edges_for_index(edges_df, i, layer, G, col_prefix="layer")
            break
        adj_mat = nx.adjacency_matrix(G).todense()
        graphs_list.append(G)
        adj_mats.append(np.array(adj_mat,dtype=int))
        
        sum_ += adj_mat.sum()
        print("# edges in layer {} are {}".format( layer, adj_mat.sum()))
    
    print("# edges are {}".format( sum_))
    
    n = len(edges_df)
    print("# nodes are {}".format( n ))
    train_mask, test_mask = generate_train_test_mask(n)
    X = edges_df.iloc[ids].loc[:,layer_names].replace("?", -1).to_numpy().astype(float)
    X = preprocessing.scale(X)
    #random_X = np.random.normal(size = [n, size_x])
    #final_random_X = np.stack([random_X]* len(layer_ids),axis = 2)
    adj = np.stack(adj_mats, axis = 2)
    labels = np.array(list(edges_df.iloc[ids]['labels'])).astype(int)
    return graphs_list, X , torch.from_numpy(labels),  torch.from_numpy(train_mask), torch.from_numpy(test_mask), torch.from_numpy(test_mask), adj
    

res = get_mammo_dataset("/home/keshav/courses/master_thesis/multiplex_datasets")

# edges in layer 0 are 345
# edges in layer 1 are 32
# edges in layer 2 are 95
# edges in layer 3 are 136
# edges in layer 4 are 798
# edges are 1406
# nodes are 961
[621 688 781 141 918 373 277 890 664 439 560 311   9 281 496 237 815 845
 620  78  79 299  44 803 471 113  22 539 928 156 341 627  63 301 398 241
 544 132 668 384 703 558 484 715 432 742 774 929 457 733 419 324 456 748
 493 624 111 618 478 399 771 906 463 170 436 896  42 334 672 804 718 140
 234 948 726 388 208 258 907 159 892 636 925 653 332 369 955 103  90 114
 841 511 801 469  89 546 893 589  21 107 578 913 529  29  27 216 327 756
 650 531   1 743 247 339 657 936 939 798 723 438 452 886 368 870 606 775
 240 422 273 379 853 705 195 864 593 525 713 566 406 178 307  16 135 651
 116 564 378 749 877  96 480 391 791  43 402  33 356 574 244 874 451 157
 201 326 808 412 349 212 608  32 623 611 846 246 194  95 414 567 945 110
 649 934 652 351 655 700 895 171 746 303 404 283 336 772 375 317 181 935
 521 686  93 435 338 767 403 37

In [None]:
def get_mammo_dataset(multiplex_folder_path, size_x = 5):
    mammo_data_folder = os.path.join(multiplex_folder_path, "Mammogram Dataset" )
    edges_file_path = os.path.join(mammo_data_folder,"mammographic_masses.data")
    layer_ids = list(range(0,5))
    layer_names= ["layer{}".format(i) for i in layer_ids]
    edges_df = pd.read_csv(edges_file_path, sep = ",", header = None, names =  layer_names + ["labels"]  )
    
    ids = np.array(list(range(len(edges_df))))
    graphs_list = []
    adj_mats = []
    sum_ = 0
    for layer in layer_ids:
        G = nx.DiGraph()
        G.add_nodes_from(ids)
        for i in ids:
            add_edges_for_index(edges_df, i, layer, G, col_prefix="layer")
            break
        adj_mat = nx.adjacency_matrix(G).todense()
        graphs_list.append(G)
        adj_mats.append(np.array(adj_mat,dtype=int))
        
        sum_ += adj_mat.sum()
        print("# edges in layer {} are {}".format( layer, adj_mat.sum()))
    
    print("# edges are {}".format( sum_))
    
    n = len(edges_df)
    print("# nodes are {}".format( n ))
    train_mask, test_mask = generate_train_test_mask(n)
    X = edges_df.iloc[ids].loc[:,layer_names].replace("?", -1).to_numpy().astype(float)
    X = preprocessing.scale(X)
    #random_X = np.random.normal(size = [n, size_x])
    #final_random_X = np.stack([random_X]* len(layer_ids),axis = 2)
    adj = np.stack(adj_mats, axis = 2)
    labels = np.array(list(edges_df.iloc[ids]['labels'])).astype(int)
    return graphs_list, X , torch.from_numpy(labels),  torch.from_numpy(train_mask), torch.from_numpy(test_mask), torch.from_numpy(test_mask), adj
    

res = get_mammo_dataset("/home/keshav/courses/master_thesis/multiplex_datasets")

In [52]:
def get_balance_dataset(multiplex_folder_path, size_x = 5):
    mammo_data_folder = os.path.join(multiplex_folder_path, "Balance-Scale Dataset" )
    edges_file_path = os.path.join(mammo_data_folder,"balance-scale.data")
    layer_ids = list(range(0,4))
    layer_names= ["layer{}".format(i) for i in layer_ids]
    edges_df = pd.read_csv(edges_file_path, sep = ",", header = None, names = ["labels"]+ layer_names   )
    print(edges_df.head())
    ids = np.array(list(range(len(edges_df))))
    graphs_list = []
    adj_mats = []
    sum_ = 0
    for layer in layer_ids:
        G = nx.DiGraph()
        G.add_nodes_from(ids)
        for i in ids:
            add_edges_for_index(edges_df, i, layer, G, col_prefix="layer")
            break
        adj_mat = nx.adjacency_matrix(G).todense()
        graphs_list.append(G)
        adj_mats.append(np.array(adj_mat,dtype=int))
        
        sum_ += adj_mat.sum()
        print("# edges in layer {} are {}".format( layer, adj_mat.sum()))
    
    print("# edges are {}".format( sum_))
    
    n = len(edges_df)
    print("# nodes are {}".format( n ))
    train_mask, test_mask = generate_train_test_mask(n)
    X = edges_df.iloc[ids].loc[:,layer_names].replace("?", -1).to_numpy().astype(float)
    X = preprocessing.scale(X)
    #random_X = np.random.normal(size = [n, size_x])
    #final_random_X = np.stack([random_X]* len(layer_ids),axis = 2)
    adj = np.stack(adj_mats, axis = 2)
    edges_df["labels_style"] = edges_df["labels"].astype('category')
    labels = np.array(list(edges_df.iloc[ids]['labels_style'].cat.codes))
    return graphs_list, X , torch.from_numpy(labels),  torch.from_numpy(train_mask), torch.from_numpy(test_mask), torch.from_numpy(test_mask), adj
    

res = get_balance_dataset("/home/keshav/courses/master_thesis/multiplex_datasets")

  labels  layer0  layer1  layer2  layer3
0      B       1       1       1       1
1      R       1       1       1       2
2      R       1       1       1       3
3      R       1       1       1       4
4      R       1       1       1       5
# edges in layer 0 are 125
# edges in layer 1 are 125
# edges in layer 2 are 125
# edges in layer 3 are 125
# edges are 500
# nodes are 625
[248 291 572  85 590 108 182 562 529 136 351 494 526 403 303 281 588 295
 258 138 507  15 538 202 286 427  76  92 139 368 419 442 345 113 199 210
 441 358 459 595 586 577 404 398 354 127 198 578 150 435  22 125  84 305
 143 251 456 158 156 353 159 176   0 464 416 216 564 428 541 478 120 592
 620 567 271 576 304  80 514 162 482 331 276  83 379 234 471  58 275 205
 371 554 423 297 155  26  12 417  59 493 270 289 175 392 561 596 511 247
 341 383 605 366 530 594 260   2 609 357 254 173 377 114 233 129 179  78
 284 224 306 178 161 350  86 488  36 546 374 201 491 421 591 446 460 429
 325 333 280 505 509 105 388 4

In [97]:
def get_leskovec_dataset(multiplex_folder_path, size_x = 5):
    les_data_folder = os.path.join(multiplex_folder_path, "Leskovec-Ng Dataset" )
    edges_file_path = os.path.join(les_data_folder,"Leskovec-Ng.multilayer.edges")
    labels = np.loadtxt(os.path.join(les_data_folder,'Leskovec-Ng.multilayer.labels')).astype(np.int32)
    
    data = np.loadtxt(fname=edges_file_path).astype(np.int32)
    layers = [0, 1, 2, 3]
    graphs = []
    adj_mats = []
    sum_ = 0
    edges_df = pd.read_csv(edges_file_path, sep = " ", header = None,  names = ["layerId", "src", "dst"],dtype=int)
    print(edges_df['src'].min())
    
    for layer in layers : 
        df = edges_df[edges_df['layerId'] == layer]
        G= nx.from_pandas_edgelist(df, source='src', target='dst',create_using = nx.DiGraph)
        graphs.append(G)
        adj_mat = nx.adjacency_matrix(G).todense()
        
        adj_mats.append(np.array(adj_mat,dtype=int))
        
        sum_ += adj_mat.sum()
        print("# edges in layer {} are {}".format( layer, adj_mat.sum()))
    print("# edges are {}".format( sum_))
    
    n = max(edges_df["src"].max(), edges_df["dst"].max())  + 1
    print("# nodes are {}".format( n ))
    train_mask, test_mask = generate_train_test_mask(n)
    random_X = np.random.normal(size = [n, size_x])
    final_random_X = np.stack([random_X]* len(layers),axis = 2)
    adj = np.stack(adj_mats, axis = 2)
    
    
    return graphs, final_random_X, torch.from_numpy(labels),  torch.from_numpy(train_mask), torch.from_numpy(test_mask), torch.from_numpy(test_mask), adj
    

get_les_dataset("/home/keshav/courses/master_thesis/multiplex_datasets")

0
# edges in layer 0 are 225
# edges in layer 1 are 227
# edges in layer 2 are 761
# edges in layer 3 are 623
# edges are 1836
# nodes are 191
[130  73 108 111   7   6  38 177  99 127  60 182  62 138 152  20 155  76
 172  27   2 169  87  58 147  49  82 171  86 174 144 139  12 116  96   1
 151  23  22 175 112 159 154  34  46 187 168  54  36  26 167  37  29 156
 178 109   8 170 135 110  65  40 129  56  13 153  93 137  14  70 184 122
 101 120  41 160  64]


([<networkx.classes.digraph.DiGraph at 0x7f3447b5d400>,
  <networkx.classes.digraph.DiGraph at 0x7f3447b5d080>,
  <networkx.classes.digraph.DiGraph at 0x7f3447b5d5f8>,
  <networkx.classes.digraph.DiGraph at 0x7f3447b5d630>],
 array([[[-0.04222158, -0.04222158, -0.04222158, -0.04222158],
         [ 0.25202653,  0.25202653,  0.25202653,  0.25202653],
         [ 0.58025645,  0.58025645,  0.58025645,  0.58025645],
         [ 0.74788162,  0.74788162,  0.74788162,  0.74788162],
         [ 0.31653638,  0.31653638,  0.31653638,  0.31653638]],
 
        [[ 1.31203142,  1.31203142,  1.31203142,  1.31203142],
         [ 0.0081169 ,  0.0081169 ,  0.0081169 ,  0.0081169 ],
         [ 0.98317348,  0.98317348,  0.98317348,  0.98317348],
         [ 2.35307381,  2.35307381,  2.35307381,  2.35307381],
         [-0.039751  , -0.039751  , -0.039751  , -0.039751  ]],
 
        [[-1.15457347, -1.15457347, -1.15457347, -1.15457347],
         [ 0.05305854,  0.05305854,  0.05305854,  0.05305854],
         [ 1.

In [3]:
import scipy.io
import os
def process_adj_mat(A):
    A[A>0] = 1
    return A.astype(int)

def get_leskovec_true_dataset(multiplex_folder_path, size_x = 5):
    data_folder = os.path.join(multiplex_folder_path, "Leskovec-Ng Dataset" )
    file_names = ["LN_2000_2004.mat", "LN_2005_2009.mat" , "LN_2010_2014.mat"]
    adj_mats = []
    G = []
    for i, file  in enumerate(file_names):
        
        mat1 = scipy.io.loadmat( os.path.join(data_folder, file))
        
        adj = process_adj_mat(mat1["A{}".format(i+2)])
        adj_mats.append(adj)
        G.append(nx.convert_matrix.from_numpy_array(adj, create_using = nx.DiGraph))
    labels_mat = scipy.io.loadmat( os.path.join(data_folder, "LN_true.mat"))
    labels= np.array(labels_mat["s_LNG"].flatten(), dtype = int)
    n = adj_mats[0].shape[0]
    train_mask, test_mask = generate_train_test_mask(n, args.train_fraction)
    random_X = np.random.normal(size = [n, size_x])
    final_random_X = np.stack([random_X]* len(file_names),axis = 2)
    adj = np.stack(adj_mats, axis = 2)
    
    return G, final_random_X, torch.from_numpy(labels),  torch.from_numpy(train_mask), torch.from_numpy(test_mask), torch.from_numpy(test_mask), adj
    

In [4]:
get_leskovec_true_dataset("/home/keshav/courses/master_thesis/multiplex_datasets")

NameError: name 'nx' is not defined

In [9]:
data_path = os.path.join("/home/keshav/courses/master_thesis/PM/Datasets/WikipediaArticles.mat")
mat1 = scipy.io.loadmat( data_path)

In [10]:
mat1['data'].shape

(1, 2)

In [21]:
np.array(mat1['data'][0,3].todense())

array([[1., 0., 0., ..., 0., 0., 0.],
       [4., 1., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [61]:
f.keys()

<KeysViewHDF5 ['#refs#', 'W', 'data', 'truelabel']>

In [62]:
vars(f['W'])

{'_id': <h5py.h5g.GroupID at 0x7f758a297f68>}

In [63]:
f[f['data'][0][0]].value.shape

  """Entry point for launching an IPython kernel.


(187, 1703)

In [64]:
f[f['#refs#'][0][0]].value.shape

AttributeError: 'int' object has no attribute 'encode'

In [65]:
f[f['truelabel'][0][0]].shape

(1, 187)

In [66]:
f_W = f['W']

In [67]:
f_W['data'].shape

(578,)

In [68]:
f_W['ir'].shape

(578,)

In [69]:
f_W['jc'].shape

(188,)

In [None]:
def generate_nx_adj_lap():
    nx_g = data[0].to_networkx()
    adj = np.array(nx.convert_matrix.to_numpy_matrix(nx_g))
    adj_list = [adj]
    graphs_list = [nx_g]
    Ls = [sgwt_raw_laplacian(adj)]
    features = torch.tensor(PCA(n_components=args.size_x).fit_transform(g.ndata['feat'].numpy()),dtype=torch.float).to(args.device)
    features_list = [features]
    if(args.create_similarity_layer):
        adj_2 = np.array(kneighbors_graph(g.ndata['feat'].numpy(),n_neighbors = args.num_similarity_neighbors, metric = "cosine",include_self = True).todense())
        nx_g2 = nx.convert_matrix.from_numpy_array(adj_2, create_using = nx.DiGraph)
        adj_list.append(adj_2)
        graphs_list.append(nx_g2)
        features_list.append(features)
        Ls.append(sgwt_raw_laplacian(adj_2))


    adj_final = np.stack(adj_list,axis = 2)
    L = np.stack(Ls, axis = 2)
    features = torch.stack(features_list, axis = 2 )
    process_adj_mat(adj_final, args)
    args.update(graph_obj =graphs_list)
    args.update(laplacian=L

In [None]:
def load_ml_clustering_mat_dataset(args):
    data_folder = args.ml_cluster_mat_folder
    mat_file_path = os.path.join(data_folder, "{}.mat".format(args.dataset))
    adj, feats , labels = mat_file_load_all(mat_file_path)
    train_mask, test_mask = generate_train_test_mask(n, args.train_fraction)
    nx_g = nx.convert_matrix.from_numpy_array(adj, create_using = nx.DiGraph)
    nx_list = [nx_g]
    adj_list = [adj]
    Ls = [sgwt_raw_laplacian(adj)]
    if args.size_x < feats.shape[1] :
        feats = torch.tensor(PCA(n_components=args.size_x).fit_transform(feats),dtype=torch.float).to(args.device)
    
    features_list = [feats]
    if(args.create_similarity_layer):
        adj_2 = np.array(kneighbors_graph(feats ,n_neighbors = args.num_similarity_neighbors, metric = "cosine",include_self = True).todense())
        nx_g2 = nx.convert_matrix.from_numpy_array(adj_2, create_using = nx.DiGraph)
        adj_list.append(adj_2)
        graphs_list.append(nx_g2)
        features_list.append(feats)
        Ls.append(sgwt_raw_laplacian(adj_2))
    
    adj_final = np.stack(adj_list,axis = 2)
    L = np.stack(Ls, axis = 2)
    features = torch.stack(features_list, axis = 2 ).to(args.device)

    
    return nx_list, features , torch.from_numpy(labels),  torch.from_numpy(train_mask), torch.from_numpy(test_mask), torch.from_numpy(test_mask), L ,adj
        

In [6]:
from scipy import sparse
import numpy as np
import h5py
def mat_file_load_all(fname) :
    f = h5py.File(fname)
    print(f.keys())
    #f_W = f['W']
    #M = np.array(sparse.csc_matrix( (f_W['data'], f_W['ir'], f_W['jc']) ).todense())
    print(['#refs#'])
    features = f[f['data'][0][0]].value
    labels = f[f["truelabel"][0][0]].value.squeeze()
    f.close()
    
    return features.shape, labels

mat_file_load_all("/home/keshav/courses/master_thesis/PM/Datasets/UCI_mfeat.mat")

<KeysViewHDF5 ['#refs#', 'data', 'truelabel']>
['#refs#']


  """
  # Remove the CWD from sys.path while we load stuff.
  # This is added back by InteractiveShellApp.init_path()


((2000, 216), array([0., 0., 0., ..., 9., 9., 9.]))

In [63]:
mat1['data'].squeeze().shape

(3,)

In [None]:
def load_ml_clustering_scipymat_dataset(args):
    data_folder = args.ml_cluster_mat_folder
    mat_file_path = os.path.join(data_folder, "{}.mat".format(args.dataset))
    mat1 = scipy.io.loadmat( mat_file_path)
    num_layers = mat1['data'].shape[1]
    print("# num layers {}".format(num_layers))
    labels = mat1['truelabel'][0,0]
    print("# num nodes {}".format( len(labels)))
    n = len(labels)
    feats_list = []
    nx_list = []
    adj_list = []
    Ls = []
    for i in num_layers:
        print("# current layer {}".format(i))
        feats = mat1['data'][0,i].T
        
        print(feats.shape)
        adj = np.array(kneighbors_graph(feats ,n_neighbors = args.num_similarity_neighbors, metric = "cosine",include_self = True).todense())
        if(args.scale_features):
            feats_scaled = sklearn.preprocessing.scale(feats)
        else:
            feats_scaled = feats
        if args.size_x < feats.shape[1] :
            features = torch.tensor(PCA(n_components=args.size_x).fit_transform(feats_scaled),dtype=torch.float).to(args.device)
        else:
            features = torch.tensor(feats_scaled,dtype=torch.float).to(args.device)
        feats_list.append(features)
        nx_list.append(nx.convert_matrix.from_numpy_array(adj, create_using = nx.DiGraph))
        adj_list.append(adj)
        Ls.append(sgwt_raw_laplacian(adj))
        
    
    train_mask, test_mask = generate_train_test_mask(n, args.train_fraction)
    adj_final = np.stack(adj_list,axis = 2)
    L = np.stack(Ls, axis = 2)
    features = torch.stack(features_list, axis = 2 ).to(args.device)
    

    
    return nx_list, features , torch.from_numpy(labels),  torch.from_numpy(train_mask), torch.from_numpy(test_mask), torch.from_numpy(test_mask), L ,adj_final
        

In [52]:
def get_uci_true_dataset(args):
    multiplex_folder_path= args.multiplex_folder_path
    #size_x = args.size_x
    data_folder = os.path.join(multiplex_folder_path, "UCI",  "mfeat")
    file_names = ["mfeat-fac" , "mfeat-fou", "mfeat-kar", "mfeat-mor" , "mfeat-pix" , "mfeat-zer"]
    adj_mats = []
    edges = []
    G = []
    Ls = []
    sum_ = 0
    labels = [[i] * 200 for i in range ( 10 )]
    np_labels = np.array(labels).flatten()
    feats_list = []
    nx_list = []
    adj_list = []
    Ls = []
    for i, file  in enumerate(file_names):
        print(os.path.join(data_folder, file))
        print("# current layer {}".format(i))
        with open(os.path.join(data_folder, file),'r') as f:
            mat = f.readlines()
        #print(mat)
        mat_2d = [ l.split() for l in mat]
        np_ary = np.array(mat_2d, dtype = np.float)
        feats = np_ary
        adj = np.array(kneighbors_graph(feats ,n_neighbors = args.num_similarity_neighbors, metric = "cosine",include_self = True).todense())
        if(args.scale_features):
            feats_scaled = sklearn.preprocessing.scale(feats)
        else:
            feats_scaled = feats
        if args.size_x < feats.shape[1] :
            features = torch.tensor(PCA(n_components=args.size_x).fit_transform(feats_scaled),dtype=torch.float).to(args.device)
        else:
            features = torch.tensor(feats_scaled,dtype=torch.float).to(args.device)
        feats_list.append(features)
        nx_list.append(nx.convert_matrix.from_numpy_array(adj, create_using = nx.DiGraph))
        adj_list.append(adj)
        Ls.append(sgwt_raw_laplacian(adj))
        
    
    train_mask, test_mask = generate_train_test_mask(n, args.train_fraction)
    adj_final = np.stack(adj_list,axis = 2)
    L = np.stack(Ls, axis = 2)
    features = torch.stack(features_list, axis = 2 ).to(args.device)
    
        #mat1 = pd.read_csv( os.path.join(data_folder, file), sep = " ", header = None, ).to_numpy()
        #print(mat1.shape)
        
    return None
'''
        adj = process_adj_mat(mat1["A{}".format(i+1)])
        Ls.append(sgwt_raw_laplacian(adj))
        adj_mats.append(adj)
        idx_nonzeros = np.nonzero(adj)
        for (src,dst) in zip(idx_nonzeros[0],idx_nonzeros[1]):
            edges.append([i,src,dst])
        print("# edges in layer {} are {}".format( i + 1, adj.sum()))
        sum_ += adj.sum()
        G.append(nx.convert_matrix.from_numpy_array(adj, create_using = nx.DiGraph))
    labels_mat = scipy.io.loadmat( os.path.join(data_folder, "LN_true.mat"))
    labels= np.array(labels_mat["s_LNG"].flatten(), dtype = int) - 1
    print("# edges are {}".format( sum_))
    n = adj_mats[0].shape[0]
    L = np.stack(Ls,axis = 2)
    train_mask, test_mask = generate_train_test_mask(n, args.train_fraction)
    random_X = np.random.normal(size = [n, size_x])
    final_random_X = np.stack([random_X]* len(file_names),axis = 2)
    adj = np.stack(adj_mats, axis = 2)
    final_random_X = torch.from_numpy(final_random_X).float()
    print("# nodes are {}".format( n ))
    print("# train samples are {}".format(train_mask.sum()))
    print("# test samples are {}".format(test_mask.sum()))
    if(args.save_input_list):
        edges_np = np.array(edges,dtype=int)    
        np.savetxt(os.path.join(data_folder,"leskovec_multiple_edges.txt"),edges_np,fmt='%i')
        np.savetxt(os.path.join(data_folder,"leskovec_labels.txt"),labels,fmt='%i')
        print("saved to {}".format(data_folder))
    return G, final_random_X, torch.from_numpy(labels),  torch.from_numpy(train_mask), torch.from_numpy(test_mask), torch.from_numpy(test_mask),L, adj
    '''

'\n        adj = process_adj_mat(mat1["A{}".format(i+1)])\n        Ls.append(sgwt_raw_laplacian(adj))\n        adj_mats.append(adj)\n        idx_nonzeros = np.nonzero(adj)\n        for (src,dst) in zip(idx_nonzeros[0],idx_nonzeros[1]):\n            edges.append([i,src,dst])\n        print("# edges in layer {} are {}".format( i + 1, adj.sum()))\n        sum_ += adj.sum()\n        G.append(nx.convert_matrix.from_numpy_array(adj, create_using = nx.DiGraph))\n    labels_mat = scipy.io.loadmat( os.path.join(data_folder, "LN_true.mat"))\n    labels= np.array(labels_mat["s_LNG"].flatten(), dtype = int) - 1\n    print("# edges are {}".format( sum_))\n    n = adj_mats[0].shape[0]\n    L = np.stack(Ls,axis = 2)\n    train_mask, test_mask = generate_train_test_mask(n, args.train_fraction)\n    random_X = np.random.normal(size = [n, size_x])\n    final_random_X = np.stack([random_X]* len(file_names),axis = 2)\n    adj = np.stack(adj_mats, axis = 2)\n    final_random_X = torch.from_numpy(final_rand

In [53]:
from U2GNN_pytorch import util
import numpy as np
import pandas as pd
import os

args = {}
args['multiplex_folder_path'] = "/home/keshav/courses/master_thesis/multiplex_datasets"
args = util.Namespace(**args)

In [54]:
get_uci_true_dataset(args)

(2000,)
/home/keshav/courses/master_thesis/multiplex_datasets/UCI/mfeat/mfeat-fac
(2000, 216)
/home/keshav/courses/master_thesis/multiplex_datasets/UCI/mfeat/mfeat-fou
(2000, 76)
/home/keshav/courses/master_thesis/multiplex_datasets/UCI/mfeat/mfeat-kar
(2000, 64)
/home/keshav/courses/master_thesis/multiplex_datasets/UCI/mfeat/mfeat-mor
(2000, 6)
/home/keshav/courses/master_thesis/multiplex_datasets/UCI/mfeat/mfeat-pix
(2000, 240)
/home/keshav/courses/master_thesis/multiplex_datasets/UCI/mfeat/mfeat-zer
(2000, 47)
