In [6]:
import pandas as pd
import networkx as nx
import os
import numpy as np
import torch

In [18]:
def generate_train_test_mask(num_nodes):
    n = num_nodes
    train_mask = np.zeros(n,dtype = bool)
    random_indices = np.random.permutation(range(n))
    train_indices = random_indices[:int(0.6*n)]
    train_mask[train_indices] = True
    test_mask = np.zeros(n,dtype = bool)
    test_indices = random_indices[int(0.6*n):]
    print(test_indices)
    test_mask[test_indices]= True
    return train_mask, test_mask

def get_vicker_chan_dataset(multiplex_folder_path, size_x = 5):
    vicker_data_folder = os.path.join(multiplex_folder_path, "Vickers-Chan Dataset" , "Dataset")
    edges_file_path = os.path.join(vicker_data_folder,"Vickers-Chan-7thGraders_multiplex.edges" )
    edges_df = pd.read_csv(edges_file_path, sep = " ", header = None,  names = ["layerId", "src", "dst", "weight"],dtype=int)
    edges_df['src'] = edges_df['src'] - 1 # index IDs from 0
    edges_df['dst'] = edges_df['dst'] - 1 # index IDs from 0
    layers = [1, 2, 3]
    graphs = []
    adj_mats = []
    sum_ = 0
    for layer in layers : 
        df = edges_df[edges_df['layerId'] == layer]
        G= nx.from_pandas_edgelist(df, source='src', target='dst',create_using = nx.DiGraph)
        graphs.append(G)
        adj_mat = nx.adjacency_matrix(G).todense()
        
        adj_mats.append(np.array(adj_mat,dtype=int))
        
        sum_ += adj_mat.sum()
        print("# edges in layer {} are {}".format( layer, adj_mat.sum()))
    print("# edges are {}".format( sum_))
    
    n = max(edges_df["src"]) + 1
    print("# nodes are {}".format( n ))
    train_mask, test_mask = generate_train_test_mask(n)
    random_X = np.random.normal(size = [n, size_x])
    final_random_X = np.stack([random_X]* len(layers),axis = 2)
    adj = np.stack(adj_mats, axis = 2)
    labels = np.zeros(n,dtype = int) 
    labels[12:] = 1 # 0 for boy from index 0 - 11 , 12 - 28 is for girl
    return graphs, final_random_X , torch.from_numpy(labels),  torch.from_numpy(train_mask), torch.from_numpy(test_mask), torch.from_numpy(test_mask), adj

get_vicker_chan_dataset("/home/keshav/courses/master_thesis/multiplex_datasets")

# edges in layer 1 are 361
# edges in layer 2 are 181
# edges in layer 3 are 198
# edges are 740
# nodes are 29
[24 20  0  3  8  6  1 16  2 18  9 12]


([<networkx.classes.digraph.DiGraph at 0x7f47e2890b38>,
  <networkx.classes.digraph.DiGraph at 0x7f47e2890cc0>,
  <networkx.classes.digraph.DiGraph at 0x7f47e2890828>],
 array([[[-7.31592751e-01, -7.31592751e-01, -7.31592751e-01],
         [ 1.03216956e+00,  1.03216956e+00,  1.03216956e+00],
         [ 1.16553338e+00,  1.16553338e+00,  1.16553338e+00],
         [-2.65299307e-01, -2.65299307e-01, -2.65299307e-01],
         [ 5.08648116e-01,  5.08648116e-01,  5.08648116e-01]],
 
        [[-9.32532714e-02, -9.32532714e-02, -9.32532714e-02],
         [-2.32039621e-01, -2.32039621e-01, -2.32039621e-01],
         [ 2.77618787e-01,  2.77618787e-01,  2.77618787e-01],
         [-2.49051006e-01, -2.49051006e-01, -2.49051006e-01],
         [-1.33724646e+00, -1.33724646e+00, -1.33724646e+00]],
 
        [[ 1.14362859e-01,  1.14362859e-01,  1.14362859e-01],
         [-4.23810882e-01, -4.23810882e-01, -4.23810882e-01],
         [-7.41991911e-01, -7.41991911e-01, -7.41991911e-01],
         [ 6.509966

In [63]:
def add_edges_for_index(df, index_this, layer_id, G):
    index_vote = df.iloc[index_this].loc["vote{}".format(layer_id)]
    if(index_vote == "?"):
        #print(index_vote)
        return []
        
    other_votes = [(index_this, val ) for val in list((df.loc[df["vote{}".format(layer_id)] == index_vote]).index)]
    #print(other_votes)
    G.add_edges_from(other_votes)
    return other_votes

def get_congress_dataset(multiplex_folder_path, size_x = 5):
    vicker_data_folder = os.path.join(multiplex_folder_path, "Congress Dataset" )
    edges_file_path = os.path.join(vicker_data_folder,"house-votes-84.data")
    layer_ids = list(range(0,16))
    edges_df = pd.read_csv(edges_file_path, sep = ",", header = None,  names = ["layerId"] + ["vote{}".format(i) for i in layer_ids])
    edges_df['labels'] = 0
    edges_df.loc[edges_df['layerId'] == "republican",'labels'] = 1 
    ids = np.array(list(range(len(edges_df))))
    graphs_list = []
    adj_mats = []
    sum_ = 0
    for layer in layer_ids:
        G = nx.DiGraph()
        G.add_nodes_from(ids)
        for i in ids:
            add_edges_for_index(edges_df, i, layer, G)
            break
        adj_mat = nx.adjacency_matrix(G).todense()
        graphs_list.append(G)
        adj_mats.append(np.array(adj_mat,dtype=int))
        
        sum_ += adj_mat.sum()
        print("# edges in layer {} are {}".format( layer, adj_mat.sum()))
    
    print("# edges are {}".format( sum_))
    
    n = len(edges_df)
    print("# nodes are {}".format( n ))
    train_mask, test_mask = generate_train_test_mask(n)
    random_X = np.random.normal(size = [n, size_x])
    final_random_X = np.stack([random_X]* len(layer_ids),axis = 2)
    adj = np.stack(adj_mats, axis = 2)
    labels = np.array(list(edges_df['labels']))
    return edges_df, graphs_list, final_random_X , torch.from_numpy(labels),  torch.from_numpy(train_mask), torch.from_numpy(test_mask), torch.from_numpy(test_mask), adj
    

res = get_congress_dataset("/home/keshav/courses/master_thesis/multiplex_datasets")

# edges in layer 0 are 236
# edges in layer 1 are 195
# edges in layer 2 are 171
# edges in layer 3 are 177
# edges in layer 4 are 212
# edges in layer 5 are 272
# edges in layer 6 are 182
# edges in layer 7 are 178
# edges in layer 8 are 206
# edges in layer 9 are 216
# edges in layer 10 are 0.0
# edges in layer 11 are 171
# edges in layer 12 are 209
# edges in layer 13 are 248
# edges in layer 14 are 233
# edges in layer 15 are 269
# edges are 3175.0
# nodes are 435
[396 291 157 225 124 237 155  88  78 406 354 333 183 378 280 179 292  54
 110 351  50   6 389 238  67 307 131 340  61  36 210 385  16 140 227  81
  44 338  53 310 348 217 358 132 360 265 294 414  40 264 232 122  85 342
 127 429 359 226 250  51 205 370 272 199 182  52 407 169 259 388  91 364
 301 284 412 168   3  42 109 355 149 282  48 117   0 261  98 384  26 343
 222 195  64 279  80 105 176 303 111   5 241 287  76 409  33 150 145 121
 297 181 228 102 299  18 180 197  45 374 304 136  25  39  70   8 335 190
 148 318  55 425