### Necessary Set-up

In [1]:
import numpy as np
import networkx as nx
import pandas as pd
import tensorflow as tf
import pickle

from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

from tensorflow.keras import Sequential, Model
from tensorflow.keras.layers import Dense, Dropout, Flatten, Input, Embedding

from sklearn.cluster import KMeans
from tensorflow.python.client import device_lib

tsne_model = TSNE(learning_rate=100)

In [2]:
tf.__version__
print(device_lib.list_local_devices())
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))


[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 8161728494386800531
]


AttributeError: module 'tensorflow' has no attribute 'config'

### The Embedder Model

#### Model Architecture, Losses and Accuracy

##### The Encoder Model

In [3]:
def build_encoder(embedding_size):
    model = Sequential()

    # The first encoder layer
    model.add(Dense(embedding_size * 4, activation='relu'))

    # The second encoder layer
    model.add(Dense(embedding_size * 2, activation='relu'))

    # The output layer
    model.add(Dense(embedding_size, activation='relu'))

    return model


##### The Decoder Model

In [4]:
def build_decoder(embedding_size, output_size):
    model = Sequential()

    # The first decoder layer
    model.add(Dense(embedding_size * 2, activation='relu'))

    # The secod decoder layer
    model.add(Dense(embedding_size * 4, activation='relu'))

    # The third decoder layer
    model.add(Dense(output_size, activation='sigmoid'))

    return model


##### Putting together the Ecnoder and Decoder into an AutoEncoder

In [5]:
def build_ae(encoder, decoder, output_size):
    input_tensor = Input(shape=(output_size,))
    embeddings = encoder(input_tensor)
    reconstructions = decoder(embeddings)

    auto_encoder = Model(input_tensor, reconstructions)

    return auto_encoder


##### The Auto-encoder Loss and Acuraccy

In [6]:
def recon_loss(x, x_hat):
    return tf.reduce_sum(tf.keras.losses.binary_crossentropy(x, x_hat))


In [7]:
def first_order_loss(X, Z):
    X = tf.cast(X, tf.float32)
    Z = tf.cast(Z, tf.float32)

    D = tf.linalg.diag(tf.reduce_sum(X, 1))
    L = D - X  ## L is laplation-matriX

    return 2 * tf.linalg.trace(tf.matmul(tf.matmul(tf.transpose(Z), L), Z))


In [8]:
def ae_adversarial_loss(x1, x1_hat, d_z10, d_z11, x2, x2_hat, d_z20, d_z21):
    # Recon loss
    reccon_loss_1 = recon_loss(x1, x1_hat)
    reccon_loss_2 = recon_loss(x2, x2_hat)
    
    reccon_loss = reccon_loss_1 + reccon_loss_2
    
    ### Loss 2 -> Same as the loss of the generator
    adversarial_loss_1 = tf.reduce_sum(tf.keras.losses.binary_crossentropy(tf.ones_like(d_z10), d_z10)) + \
                       tf.reduce_sum(tf.keras.losses.binary_crossentropy(tf.zeros_like(d_z11), d_z11))
    
    adversarial_loss_2 = tf.reduce_sum(tf.keras.losses.binary_crossentropy(tf.ones_like(d_z20), d_z20)) + \
                   tf.reduce_sum(tf.keras.losses.binary_crossentropy(tf.zeros_like(d_z21), d_z21))
    
    adversarial_loss = adversarial_loss_1 + adversarial_loss_2

    return reccon_loss + 2 * adversarial_loss


In [9]:
def ae_accuracy(x, x_hat):
    round_x_hat = tf.round(x_hat)
    return tf.reduce_mean(tf.cast(tf.equal(x, round_x_hat), tf.float32))


#### Pretraining the Embedder

In [10]:
def pretrain_step_embd(X, x, encoder, decoder, auto_encoder, pre_optimizer, first_order, alpha):
    with tf.GradientTape() as pre_tape:
        z = encoder(x, training=True)
        x_hat = decoder(z, training=True)

        Z = encoder(X, training=True)

        pre_loss = recon_loss(x, x_hat)

        if first_order == 'with_f1':
            pre_loss += alpha * first_order_loss(X, Z)

    pre_gradients = pre_tape.gradient(pre_loss, auto_encoder.trainable_variables)
    pre_optimizer.apply_gradients(zip(pre_gradients, auto_encoder.trainable_variables))

    pre_acc = ae_accuracy(x, x_hat)

    return tf.reduce_mean(pre_loss), pre_acc


In [11]:
def pretrain_embd(X, idxs, encoder, decoder, auto_encoder, pre_optimizer, first_order, alpha):
    np.random.shuffle(idxs)
    PRE_EPOCHS = 300
    Batch_size = 50

    for epoch in range(PRE_EPOCHS):

        epoch_losses = []
        epoch_acc = []

        for batch_idx in range(0, len(idxs), Batch_size):
            selected_idxs = idxs[batch_idx: batch_idx + Batch_size]
            adjacency_batch = X[selected_idxs, :]

            loss, accuracy = pretrain_step_embd(X, tf.cast(adjacency_batch, tf.float32), encoder, decoder, auto_encoder,
                                                pre_optimizer, first_order, alpha)

            epoch_losses.append(loss)
            epoch_acc.append(accuracy)


#     if epoch % 50 == 0:
#        print(f"Loss is {np.array(epoch_losses).mean()} and accuracy is {np.array(epoch_acc).mean()}")

### The Discriminator Model

#### Model Architecture

In [12]:
def build_discriminator(embedding_size):
    model = Sequential()

    # The input layer
    model.add(Input(shape=(embedding_size,)))

    # The first hidden layer
    model.add(Dense(25, activation='relu'))
    model.add(Dropout(0.25))

    # The second layer
    model.add(Dense(15, activation='relu'))
    model.add(Dropout(0.20))

    # The third layer
    model.add(Dense(6, activation='relu'))
    model.add(Dropout(0.20))

    model.add(Dense(1, activation='sigmoid'))

    return model


In [13]:
def disc_loss_function(d_z0, d_z1):
    loss_zero = tf.keras.losses.binary_crossentropy(tf.zeros_like(d_z0), d_z0)
    loss_one = tf.keras.losses.binary_crossentropy(tf.ones_like(d_z1), d_z1)

    return tf.cast(loss_zero, tf.float32) + tf.cast(loss_one, tf.float32)


#### Joint Training in a step

In [14]:
def train_step(x10, x11, x20, x21, encoder, decoder, auto_encoder, discriminator1, discriminator2, ae_optimizer, 
               disc1_optimizer, disc2_optimizer):
    
    with tf.GradientTape() as ae_tape, tf.GradientTape() as disc1_tape, tf.GradientTape() as disc2_tape:
        z10 = encoder(x10, training=True)
        z11 = encoder(x11, training=True)
        
        z20 = encoder(x20, training=True)
        z21 = encoder(x21, training=True)

        d_z10 = discriminator1(z10, training=True)
        d_z11 = discriminator1(z11, training=True)
        
        d_z20 = discriminator2(z20, training=True)
        d_z21 = discriminator2(z21, training=True)

        x10_hat = decoder(z10, training=True)
        x11_hat = decoder(z11, training=True)   
        
        x20_hat = decoder(z20, training=True)
        x21_hat = decoder(z21, training=True)

        ae_loss = ae_adversarial_loss(tf.concat([x10, x11], 0), tf.concat([x10_hat, x11_hat], 0), d_z10, d_z11,
                                      tf.concat([x20, x21], 0), tf.concat([x20_hat, x21_hat], 0), d_z20, d_z21,)
        disc_loss_1 = disc_loss_function(d_z10, d_z11)
        disc_loss_2 = disc_loss_function(d_z20, d_z21)

    gradients_ae = ae_tape.gradient(ae_loss, auto_encoder.trainable_variables)
    gradients_disc_1 = disc1_tape.gradient(disc_loss_1, discriminator1.trainable_variables)
    gradients_disc_2 = disc2_tape.gradient(disc_loss_2, discriminator2.trainable_variables)

    ae_optimizer.apply_gradients(zip(gradients_ae, auto_encoder.trainable_variables))
    disc1_optimizer.apply_gradients(zip(gradients_disc_1, discriminator1.trainable_variables))
    disc2_optimizer.apply_gradients(zip(gradients_disc_2, discriminator2.trainable_variables))

    ae_acc = ae_accuracy(tf.concat([x10, x11], 0), tf.concat([x10_hat, x11_hat], 0))

    return tf.reduce_mean(ae_loss), ae_acc, tf.reduce_mean(disc_loss_1) + tf.reduce_mean(disc_loss_2)


#### Pretrain Step for Discriminator

In [15]:
def pretrain_step_disc(x0, x1, encoder, discriminator, disc_pre_optimizer):
    z0 = encoder(x0)
    z1 = encoder(x1)

    with tf.GradientTape() as disc_tape_sep:
        d_z0 = discriminator(z0, training=True)
        d_z1 = discriminator(z1, training=True)

        disc_loss = disc_loss_function(d_z0, d_z1)

    gradients_disc = disc_tape_sep.gradient(disc_loss, discriminator.trainable_variables)
    disc_pre_optimizer.apply_gradients(zip(gradients_disc, discriminator.trainable_variables))

    return tf.reduce_mean(disc_loss)


#### Pretraining the discriminator

In [16]:
def pretrain_disc(X, idxs_zeros, idxs_ones, encoder, discriminator, disc_pre_optimizer):
    EPOCHS = 40

    np.random.shuffle(idxs_zeros)
    np.random.shuffle(idxs_ones)
    Batch_size = 50

    for epoch in range(EPOCHS):
        for batch_idx in range(0, len(idxs_ones), Batch_size):
            selected_zeros = idxs_zeros[batch_idx: batch_idx + Batch_size]
            selected_ones = idxs_ones[batch_idx: batch_idx + Batch_size]

            x0 = X[selected_zeros]
            x1 = X[selected_ones]

            pretrain_step_disc(x0, x1, encoder, discriminator, disc_pre_optimizer)


#### The Train Loop

In [17]:
def adversarial_train(idxs1_zeros, idxs1_ones, idxs2_zeros, idxs2_ones, encoder, decoder, auto_encoder, discriminator1, 
                      discriminator2, ae_optimizer, disc1_optimizer, disc2_optimizer):
    EPOCHS = 500

    np.random.shuffle(idxs1_zeros)
    np.random.shuffle(idxs1_ones)
    np.random.shuffle(idxs2_zeros)
    np.random.shuffle(idxs2_ones)

    Batch_size = 50

    for epoch in range(EPOCHS):
        for batch_idx in range(0, len(idxs1_ones), Batch_size):
            selected_zeros1 = idxs1_zeros[batch_idx: batch_idx + Batch_size]
            selected_ones1 = idxs1_ones[batch_idx: batch_idx + Batch_size]
            selected_zeros2 = idxs2_zeros[batch_idx: batch_idx + Batch_size]
            selected_ones2 = idxs2_ones[batch_idx: batch_idx + Batch_size]

            x10 = X[selected_zeros1]
            x11 = X[selected_ones1]
            x20 = X[selected_zeros2]
            x21 = X[selected_ones2]

            ### Joint Training
            train_step(tf.cast(x10, tf.float32), tf.cast(x11, tf.float32), tf.cast(x20, tf.float32), tf.cast(x21, tf.float32),
                       encoder, decoder, auto_encoder, discriminator1, discriminator2, ae_optimizer, disc1_optimizer, 
                       disc2_optimizer)


### Piciking the Seeds Using Embedding

In [18]:
def get_seeds(N_CLUS, embedding, nodes, labels, nodes_zero, nodes_one, strategy, n_seeds):
    '''
    stratgey can be random, nearest, fair, re-cluster, fair_re-cluster
    '''

    model = KMeans(n_clusters=N_CLUS)
    model.fit(embedding)

    cluster_number = model.labels_
    centers = model.cluster_centers_

    seed_ids = [[] for i in range(N_CLUS)]

    for i in range(N_CLUS):

        if strategy == 'nearest':
            sorted_distance = np.array(sorted(
                [[np.sqrt(np.sum(np.power(centers[i] - embedding[j], 2))), j] for j in range(len(embedding)) if
                 i == cluster_number[j]]))
            seed_ids[i].extend(list(sorted_distance[:n_seeds, 1]))


        elif strategy == 're-cluster':
            temp = []
            sorted_distance = np.array(sorted(
                [[np.sqrt(np.sum(np.power(centers[i] - embedding[j], 2))), j] for j in range(len(embedding)) if
                 i == cluster_number[j]]))
            temp.extend(list(sorted_distance[:n_seeds, 1]))

            portion_zero = 0
            portion_one = 0

            for num in temp:
                if num in nodes_zero:
                    portion_zero += 1
                elif num in nodes_one:
                    portion_one += 1

            zero_in_clus = embedding[np.logical_and(cluster_number == i, labels == 0)]
            zero_inds = nodes[np.logical_and(cluster_number == i, labels == 0)]

            one_in_clus = embedding[np.logical_and(cluster_number == i, labels == 1)]
            one_inds = nodes[np.logical_and(cluster_number == i, labels == 1)]

            added_to_zero = 0
            if len(zero_in_clus) != 0:
                model_on_zero = KMeans(n_clusters=1)
                model_on_zero.fit(zero_in_clus)
                center_zero = model_on_zero.cluster_centers_

                sorted_distance_zero = np.array(sorted(
                    [[np.sqrt(np.sum(np.power(center_zero - zero_in_clus[j], 2))), j] for j in
                     range(len(zero_in_clus))]))
                seed_ids[i].extend([zero_inds[int(i)] for i in sorted_distance_zero[:portion_zero, 1]])

                added_to_zero = len(seed_ids[i])
                assert added_to_zero == portion_zero

            added_to_one = 0
            if len(one_in_clus) != 0:
                model_on_one = KMeans(n_clusters=1)
                model_on_one.fit(one_in_clus)
                center_one = model_on_one.cluster_centers_

                sorted_distance_one = np.array(sorted(
                    [[np.sqrt(np.sum(np.power(center_one - one_in_clus[j], 2))), j] for j in range(len(one_in_clus))]))
                seed_ids[i].extend([one_inds[int(i)] for i in sorted_distance_one[:portion_one, 1]])

                added_to_one = len(seed_ids[i]) - added_to_zero
                assert added_to_one == portion_one

            assert n_seeds == added_to_zero + added_to_one
            assert len(seed_ids[i]) == n_seeds

    return np.reshape(seed_ids, newshape=(-1,))


### The IC algorithm

In [19]:
def IC(G, seeds, imp_prob, recover_prob=0, remove=0):
    impressed = []
    removed = []
    front = list(seeds[:])

    while front:
        impressed.extend(front)
        impressed = np.array(impressed)

        if recover_prob != 0:

            random_draws = np.random.uniform(size=len(impressed))

            if remove:
                removed.extend(impressed[random_draws < recover_prob])
                removed = list(set(removed))

            impressed = impressed[random_draws >= recover_prob]

        impressed = list(impressed)
        new_front = []

        for node in front:

            neighbours = list(G.neighbors(node))

            for neigh in neighbours:

                expr_prob = np.random.uniform(size=1)[0]
                if expr_prob < imp_prob and not (neigh in impressed) and not (neigh in new_front) and not (
                        neigh in removed):
                    new_front.append(neigh)

        front = new_front[:]

    impressed = np.reshape(np.array(impressed), newshape=(-1,))

    return impressed


#### Repeated IC

In [20]:
def repeated_IC(G, seeds, seed_type, n_expr, imp_prob):
    zeros_count1 = []
    ones_count1 = []
    zeros_count2 = []
    ones_count2 = []
    total_count = []

    for i in range(n_expr):
        impressed = IC(G, seeds, imp_prob)
        total_count.append(len(impressed))

        count_zeros1 = 0
        count_ones1 = 0
        count_zeros2 = 0
        count_ones2 = 0

        for imp in impressed:
            if imp in attr1_zero:
                count_zeros1 += 1
            elif imp in attr1_one:
                count_ones1 += 1
            if imp in attr2_zero:
                count_zeros2 += 1
            elif imp in attr2_one:
                count_ones2 += 1

        zeros_count1.append(count_zeros1)
        ones_count1.append(count_ones1)
        zeros_count2.append(count_zeros2)
        ones_count2.append(count_ones2)

    total_imp = np.round(np.mean(total_count), 2)
    total_fraction = np.round(total_imp / len(G.nodes()), 3)

    fraction_zero1 = np.round(np.mean(zeros_count1) / len(attr1_zero), 3)
    fraction_one1 = np.round(np.mean(ones_count1) / len(attr1_one), 3)
    
    fraction_zero2 = np.round(np.mean(zeros_count2) / len(attr2_zero), 3)
    fraction_one2 = np.round(np.mean(ones_count2) / len(attr2_one), 3)

    return total_imp, total_fraction, fraction_zero1, fraction_one1, fraction_zero2, fraction_one2


In [21]:
def get_IC_influenced(G, seeds, n_expr, imp_prob):
    zeros_count1 = []
    ones_count1 = []
    zeros_count2 = []
    ones_count2 = []
    total_count = []

    for i in range(n_expr):
        impressed = IC(G, seeds, imp_prob)
        total_count.append(len(impressed))

        count_zeros1 = 0
        count_ones1 = 0
        count_zeros2 = 0
        count_ones2 = 0

        for imp in impressed:
            if imp in attr1_zero:
                count_zeros1 += 1
            elif imp in attr1_one:
                count_ones1 += 1
            if imp in attr2_zero:
                count_zeros2 += 1
            elif imp in attr2_one:
                count_ones2 += 1

        zeros_count1.append(count_zeros1)
        ones_count1.append(count_ones1)
        zeros_count2.append(count_zeros2)
        ones_count2.append(count_ones2)

    return np.array(total_count, count_zeros1, count_ones1)


### Loading the real graph

In [21]:
def get_graph_real():
    graph_df = pd.read_csv('edges.txt', sep="\t", header=None)
    graph_df.columns = ['s', 't']

    attr_df = pd.read_csv('attr.txt', sep="\t", header=None)
    attr_df.columns = ['id', 'College', 'Age', 'Major']

    edges = []

    for index, row in graph_df.iterrows():
        edge_cur = (row.s, row.t)

        edges.append(edge_cur)

    input_G = nx.from_edgelist(edges)

    extra_nodes = []
    for index, row in attr_df.iterrows():
        if row.Age > 20:
            extra_nodes.append(row.id)
    
    unfrozen_G = nx.Graph(input_G)
    
    for node in input_G.nodes():
        if node in extra_nodes:
            unfrozen_G.remove_node(node)   
    
    X = nx.to_numpy_matrix(unfrozen_G)
    G = nx.from_numpy_matrix(X)

    return G, X, unfrozen_G


In [22]:
def get_nodes_labels_real():
    
    # 1. Building the graph.
    graph_df = pd.read_csv('edges.txt', sep="\t", header=None)
    graph_df.columns = ['s', 't']
    
    edges = []
    for index, row in graph_df.iterrows():
        edge_cur = (row.s, row.t)

        edges.append(edge_cur)
        
    input_G = nx.from_edgelist(edges)
    unfrozen_G = nx.Graph(input_G)
    nodes_G = list(input_G.nodes())

    # 2. Fetching the attributes
    data = pd.read_csv('attr.txt', sep="\t", header=None)
    
    # Major takes a value between 1-60
    data.columns = ['id', 'College', 'Age', 'Major']
    
    extra_nodes = []
    attr1_labels = {}
    attr2_labels = {}

    count_l20 = 0
    count_e20 = 0
    count_g20 = 0

    for index, row in data.iterrows():
        # Grouping based on the first attribute
        # Ages take a value between 18-24
        if row.Age < 20 and row.id in nodes_G:
            attr1_labels[row.id] = 0
            count_l20 += 1
        elif row.Age == 20 and row.id in nodes_G:
            attr1_labels[row.id] = 1
            count_e20 += 1
        elif row.Age > 20 and row.id in nodes_G:
            attr1_labels[row.id] = 2
            extra_nodes.append(row.id)
            count_g20 += 1
        
        # Grouping based on the second attribute
        # College takes a value between 1-9
        if row.College < 7 and row.id in nodes_G:
            attr2_labels[row.id] = 0
        elif row.College >= 7 and row.id in nodes_G:
            attr2_labels[row.id] = 1
    
    # 3. Pruning the data based on the age labels (the get the same dataset as before)
    for node in input_G.nodes():
        if node in extra_nodes:
            unfrozen_G.remove_node(node)
    # Asserting the size
    # print(len(list(unfrozen_G.nodes())))
    # print(count_l20 + count_e20)
    
    labels_attr1 = []
    labels_attr2 = []
    
    # 4. creating the list of attribute labels
    for node in unfrozen_G.nodes():
        labels_attr1.append(attr1_labels[node])
        labels_attr2.append(attr2_labels[node])
    assert len(labels_attr1) == len(labels_attr2)
    
    # Remember that whenever you want do logical operation on a sequence, that sequence should be numpy array
    labels_attr1 = np.array(labels_attr1)
    labels_attr2 = np.array(labels_attr2)

    nodes = np.arange(len(unfrozen_G.nodes()))
    
    attr1_zero = nodes[labels_attr1 == 0]
    attr1_one = nodes[labels_attr1 == 1]
    
    attr2_zero = nodes[labels_attr2 == 0]
    attr2_one = nodes[labels_attr2 == 1]
    
    # Size of zero and one community of each attribute should be different
    # print(f'For attribute 1, zero: {len(attr1_zero)}, one: {len(attr1_one)}')
    # print(f'for attribute 2, zero: {len(attr2_zero)}, one: {len(attr2_one)}')
    # 
    # # Counting the number of both zero and both one
    # print(f'The number of both zeros are: {len(nodes[np.logical_and(labels_attr1 == 0, labels_attr2 == 0)])}')
    # print(f'The number of both ones are: {len(nodes[np.logical_and(labels_attr1 == 1, labels_attr2 == 1)])}')

    return attr1_zero, attr1_one, labels_attr1, attr2_zero, attr2_one, labels_attr2


In [23]:
def get_idxs(n, nodes_zero, nodes_one, idxs_size):

    idxs_zeros = nodes_zero[:]
    idxs_ones = nodes_one[:]
    
    rep_policy = False
    if len(nodes_zero) < idxs_size:
        if idxs_size  > 2 * len(nodes_zero):
            rep_policy = True
        zero_draws = np.random.choice(nodes_zero, size=idxs_size - len(nodes_zero), replace=rep_policy)
        idxs_zeros = np.concatenate((idxs_zeros, zero_draws))
    
    rep_policy = False
    if len(nodes_one) < idxs_size:
        if idxs_size > 2 * len(nodes_one):
            rep_policy = True
        one_draws = np.random.choice(nodes_one, size=idxs_size - len(nodes_one), replace=rep_policy)
        idxs_ones = np.concatenate((idxs_ones, one_draws))

    assert len(idxs_zeros) == len(idxs_ones)

    return np.arange(n), idxs_zeros, idxs_ones


In [24]:
def print_edges(G, nodes_zero, nodes_one):
    zero_edges = 0
    one_edges = 0
    accross_edges = 0

    for (v1, v2) in G.edges():
        if v1 in nodes_zero and v2 in nodes_zero:
            zero_edges += 1
        elif v1 in nodes_one and v2 in nodes_one:
            one_edges += 1
        elif v1 in nodes_one and v2 in nodes_zero:
            accross_edges += 1
        elif v1 in nodes_zero and v2 in nodes_one:
            accross_edges += 1

    print(f" edges in zero community: {zero_edges}")
    print(f" edges in one community: {one_edges}")
    print(f" edges across communities: {accross_edges}")


### Running the experiments

In [25]:
import time


embedding_size = 30

# Can get `with_f1` or `without_f1`
first_order_imp = 'no_f1'
alpha = 0.05

# 1. Creating the Graph and Getting the Adj Matrix
G, X, input_G = get_graph_real()
n = len(G.nodes())

# 2. Getting seperate lists for seperate communities and the label for each community
attr1_zero, attr1_one, labels_attr1, attr2_zero, attr2_one, labels_attr2 = get_nodes_labels_real()

idxs_size = np.max([len(attr1_zero), len(attr1_one), len(attr2_zero), len(attr2_one)])

print_edges(G, attr1_zero, attr1_one)

# 3. Getting the idxs suitable for training.
idxs, idxs1_zeros, idxs1_ones = get_idxs(n, attr1_zero, attr1_one, idxs_size)
_, idxs2_zeros, idxs2_ones = get_idxs(n, attr2_zero, attr2_one, idxs_size)

# 4. Creating the Embedder
encoder = build_encoder(embedding_size)
decoder = build_decoder(embedding_size, n)
auto_encoder = build_ae(encoder, decoder, n)

# 5. Creating the Discriminator
discriminator1 = build_discriminator(embedding_size)
discriminator2 = build_discriminator(embedding_size)

# 6. Pretraining the Embedder and the Discriminator
pre_optimizer_embd = tf.keras.optimizers.Adam()
pre_optimizer_disc1 = tf.keras.optimizers.Adam()
pre_optimizer_disc2 = tf.keras.optimizers.Adam()

time1= time.time()
pretrain_embd(X, idxs, encoder, decoder, auto_encoder, pre_optimizer_embd, first_order_imp, alpha)
pretrain_disc(X, idxs1_zeros, idxs1_ones, encoder, discriminator1, pre_optimizer_disc1)
pretrain_disc(X, idxs2_zeros, idxs2_ones, encoder, discriminator2, pre_optimizer_disc2)
# print('6')

# # 6-1. Get the pretrain-embeddings
pre_embds = encoder(X)
print('pre-training done.')

 edges in zero community: 513
 edges in one community: 7441
 edges across communities: 1706


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

pre-training done.


In [26]:
# 7. Adversarial Training
ae_optimizer = tf.keras.optimizers.Adam()
disc1_optimizer = tf.keras.optimizers.Adam()
disc2_optimizer = tf.keras.optimizers.Adam()

adversarial_train(idxs1_zeros, idxs1_ones, idxs2_zeros, idxs2_ones, encoder, decoder, auto_encoder, discriminator1, 
                  discriminator2, ae_optimizer, disc1_optimizer, disc2_optimizer)

# # 6-1. Get the pretrain-embeddings
fair_embds = encoder(X)
print('adversarial training done.')
time_spent = np.round(time.time() - time1, 2)

print(time_spent)

adversarial training done.
724.38


#### Experiments settings

In [27]:
N_CLUSs = [4]

n_seedss = [1, 2, 3, 4, 5, 6, 7, 8, 10]
# n_seedss = [8]

# Methods for getting the seeds can be nearest or re-cluster
strategy = 're-cluster'

In [28]:
# Loading from somewhere else
# 
# with open('saved_models/embds_2_1.pickle', 'rb') as f:
# #     _, _, fair_embds, pre_embds, _, _, _, _ = pickle.load(f)
#     G, embedding_size, fair_embds, pre_embds, idxs, labels_attr1, attr1_zero, attr1_one = pickle.load(f)

In [29]:
# Loading the greedy seeds

with open('saved_models/greedy_seeds.pickle', 'rb') as f:
#     _, _, fair_embds, pre_embds, _, _, _, _ = pickle.load(f)
    greedy_seeds = pickle.load(f)

greedy_seeds = np.array(greedy_seeds[0])
# print(greedy_seeds)

In [30]:
rows = '['
first = True
seeds_cur = 0
for N_CLUS in N_CLUSs:
    for n_seeds in n_seedss:
        # if first:
        #     first = False
        # else:
        #     rows += ',\n'
        # # 8. Getting the seeds for the embeddings and baselines
        fair_seeds = get_seeds(N_CLUS, fair_embds, idxs, labels_attr1, attr1_zero, attr1_one, strategy, n_seeds)
        # pre_seeds = get_seeds(N_CLUS, pre_embds, idxs, labels_attr1, attr1_zero, attr1_one, strategy, n_seeds)
        # # 
        # # # 9. Getting the final results
        total_fair, fair_frac, zero_fair1, one_fair1, zero_fair2, one_fair2 = repeated_IC(G, fair_seeds, 'fair', 2000, 0.01)
        # total_pre, pre_frac, zero_pre1, one_pre1, zero_pre2, one_pre2 = repeated_IC(G, pre_seeds, 'pre', 2000, 0.01)
        # # 
        # # 10. Building the current row and adding it to the rows.
        # row_fair = [embedding_size, N_CLUS, n_seeds, total_fair, fair_frac, zero_fair1, one_fair1, zero_fair2, one_fair2, '\'' + strategy + '\'']
        # row_pre = [embedding_size, N_CLUS, n_seeds, total_pre, pre_frac, zero_pre1, one_pre1, zero_pre2, one_pre2, '\'' + strategy + '\'']
        # print(row_fair)
        # print(row_pre)
        # print('')
        
         # Results of the greedy
        total_greedy, greedy_frac, greedy_zero_1, greedy_one_1, greedy_zero_2, greedy_one_2 = repeated_IC(G, np.reshape(greedy_seeds[seeds_cur], newshape=(-1,)), 'greedy', 2000, 0.01)
        row_greedy = [n_seeds, total_greedy, greedy_frac, greedy_zero_1, greedy_one_1, greedy_zero_2, greedy_one_2]
        print(row_greedy)
        seeds_cur += 1

        # rows += '[' + ', '.join(map(str, row)) + ']'

# rows += ']'
# 
# print(rows)

[1, 8.74, 0.02, 0.004, 0.024, 0.026, 0.008]
[2, 16.94, 0.038, 0.009, 0.047, 0.051, 0.016]
[3, 25.62, 0.058, 0.014, 0.07, 0.073, 0.032]
[4, 34.06, 0.077, 0.022, 0.093, 0.093, 0.05]
[5, 40.56, 0.092, 0.025, 0.111, 0.108, 0.063]
[6, 46.68, 0.106, 0.028, 0.128, 0.124, 0.074]


KeyboardInterrupt: 

In [None]:
import pickle

with open('embds_2_5.pickle', 'wb') as f:
    pickle.dump([G, embedding_size, fair_embds, pre_embds, idxs, labels_attr1, attr1_zero, attr1_one], f)

print('Saved')

# %%
# G, X, input_G = get_graph_real()
# n = len(G.nodes())

# 2. Getting seperate lists for seperate communities and the label for each community
# nodes_zero, nodes_one, labels = get_nodes_labels_real()

In [32]:
with open('labels_nodes.pickle', 'wb') as f:
    pickle.dump([G, idxs, labels_attr1, attr1_zero, attr1_one, labels_attr2, attr2_zero, attr2_one], f)

print('saved_new')



saved_new


In [23]:
# Doing the t-test

# with open('saved_models/greedy_seeds.pickle', 'rb') as f:
# #     _, _, fair_embds, pre_embds, _, _, _, _ = pickle.load(f)
#     greedy_seeds = pickle.load(f)
# print(greedy_seeds)


[[[0, 1, 2, 3], [0, 1, 2, 3, 4, 5, 6, 7], [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19], [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23], [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27], [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31], [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39]]]


In [31]:
import scipy as sc

In [49]:
with open('saved_models/embds_2_1.pickle', 'rb') as f:
#     _, _, fair_embds, pre_embds, _, _, _, _ = pickle.load(f)
    [G, embedding_size, fair_embds, pre_embds, idxs, labels_attr1, attr1_zero, attr1_one] = pickle.load(f)

In [50]:
p_values = []
N_CLUSs = [4]
n_seedss = [1, 2, 3, 4, 5, 6, 7, 8, 10]
strategy = 're-cluster'

for N_CLUS in N_CLUSs:
    for n_seeds in n_seedss:
        model = KMeans(n_clusters=N_CLUS)
        model.fit(fair_embds)
        # fair_seeds = get_seeds(N_CLUS, fair_embds, idxs, labels_attr1, attr1_zero, attr1_one, strategy, n_seeds)
        # fair_records = get_IC_influenced(G, fair_seeds, 2000, 0.01)
        print(n_seeds)
        
        
        pre_seeds = get_seeds(N_CLUS, pre_embds, idxs, labels_attr1, attr1_zero, attr1_one, strategy, n_seeds)
        pre_records = get_IC_influenced(G, pre_seeds, 2000, 0.1)
        
        t_stat, p_value = sc.stats.ttest_ind(fair_seeds, pre_seeds)
        p_values.append(p_value)

print(p_values)


ValueError: setting an array element with a sequence.