In [1]:
import pandas as pd
import numpy as np
import random
import networkx as nx
from tqdm import tqdm
import re
import matplotlib.pyplot as plt
import itertools
import csv

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from time import time

from node2vec import Node2Vec

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dropout, Dense
import talos
from talos.utils import lr_normalizer
from keras.callbacks import ModelCheckpoint, EarlyStopping

Using TensorFlow backend.


# Counting all nodes

In [2]:
nodes = dict()

with open("train-mod.txt") as file :
    end = file.seek(0, 2)
    file.seek(0)
    while file.tell() != end:
        line = file.readline().split()
        edges = list(itertools.combinations(line,2))
        for i in edges:
            if nodes.get(i) == None:
                node1 = i[0]
                node2 = i[1]
                if nodes.get((node2,node1)) == None:
                    nodes[i] = 1
                else:
                    nodes[(node2,node1)] += 1
            else:
                nodes[i] +=1

print(len(nodes))

16087


In [3]:
with open("weighted_graph.csv", "w", newline="") as a_file:

    writer = csv.writer(a_file)
    for key, value in nodes.items():
        writer.writerow([key[0], key[1], value])

    a_file.close()

In [4]:
g = nx.read_weighted_edgelist('weighted_graph.csv', delimiter=',', nodetype=int)

In [5]:
print(nx.info(g))

Name: 
Type: Graph
Number of nodes: 3816
Number of edges: 16087
Average degree:   8.4313
None


# Positive edges

In [7]:
edges_pos = list(nodes.keys())

In [8]:
with open("edges_pos_all.csv","w",newline="") as csvfile:
    writer=csv.writer(csvfile)
    writer.writerow(["Source","Target", "Label"])
    for edge in edges_pos:
        writer.writerow([edge[0], edge[1], 1])

# Generating negative edges (random sampling)

In [9]:
i = 0
num_test_edges = 16087
edges_neg = []
while i < num_test_edges:
    edge = random.sample(g.nodes(), 2)
    try:
        edge_exists = g.has_edge(edge[0],edge[1])
        if edge_exists == False:
            edges_neg.append([edge[0],edge[1]])
            i = i+1
    except Exception as e:
        pass

In [10]:
with open("edges_neg_16k.csv","w",newline="") as csvfile:
    writer=csv.writer(csvfile)
    writer.writerow(["Source","Target", "Label"])
    for edge in edges_neg:
        writer.writerow([edge[0], edge[1], 0])

# Train/Test split

In [12]:
edges_positive = pd.read_csv('edges_pos_all.csv').to_numpy()
edges_negative = pd.read_csv('edges_neg_16k.csv').to_numpy()

In [15]:
#reading total data df
df_pos = pd.DataFrame(edges_positive, columns=['source_node', 'destination_node', 'label'])
df_neg = pd.DataFrame(edges_negative, columns=['source_node', 'destination_node', 'label'])

In [18]:
df_pos_n2v =  df_pos[['source_node', 'destination_node']].copy()

In [20]:
# Generate walks
node2vec = Node2Vec(g, dimensions=128, walk_length=80, num_walks=30)

# train node2vec model
n2w_model = node2vec.fit(window=10, min_count=1)

Computing transition probabilities: 100%|██████████| 3816/3816 [00:02<00:00, 1650.16it/s]
Generating walks (CPU: 1): 100%|██████████| 30/30 [03:47<00:00,  7.58s/it]


In [21]:
data = pd.concat([df_pos, df_neg], ignore_index=True)

In [22]:
x_train_n2v = [(n2w_model[str(i)]+n2w_model[str(j)]) for i,j in zip(data['source_node'], data['destination_node'])]

In [24]:
def generate_features(sample_list, test = False):
    features = []
    i = 0
    for sample in sample_list:
        #print(sample)
        source = sample[0]
        target = sample[1]
        if test == False:
            label = sample[2]
        else:
            label = -1
        
        feature = []
        try:
            i = i+1
            #print(i)
            
            #p = nx.common_neighbors(g, source, target)
            #feature.append(len(p))
            
            #p = nx.simrank_similarity(g, source, target)
            #feature.append(p)
            
            #preds = nx.resource_allocation_index(g, [(source, target)])
            #for u, v, p in preds:
            #    feature.append(p)

            preds = nx.jaccard_coefficient(g, [(source, target)])
            for u, v, p in preds:
                feature.append(p)

            preds = nx.adamic_adar_index(g, [(source, target)])
            for u, v, p in preds:
                feature.append(p)

            #preds = nx.preferential_attachment(g, [(source, target)])
            #for u, v, p in preds:
            #    feature.append(p)
            
            feature.append(label)  # append label
            
        except Exception as e:
            #print(e)
            pass
        features.append(feature)
    print("features: "+str(len(features)))
    return features

In [25]:
features_pos = generate_features(edges_positive)
features_neg = generate_features(edges_negative)

features: 16087
features: 16087


In [26]:
features = features_pos + features_neg

In [28]:
def write_train_to_csv(features):
    with open("train_16k_sim.csv","w",newline="") as csvfile:
        writer=csv.writer(csvfile)
        writer.writerow(["JC","AA","Label"])
        writer.writerows(features)
        
write_train_to_csv(features)

In [29]:
dataset_sim = pd.read_csv('train_16k_sim.csv')
FEATURE_SIZE=2

X_sim = dataset_sim.iloc[:,:FEATURE_SIZE].values
y_sim = dataset_sim.iloc[:,FEATURE_SIZE].values

In [32]:
x_train_n2v_1 = np.array(x_train_n2v)

In [35]:
# Feature Scaling
sc = StandardScaler()
X_sim_1 = sc.fit_transform(X_sim)

In [37]:
all_feats = np.concatenate((X_sim_1,x_train_n2v_1),axis=1)

In [41]:
labels = data['label'].to_numpy()

In [44]:
# first we have to make sure to input data and params into the function
def sml_model(x_train, y_train, x_val, y_val, params):

    model = Sequential()
    model.add(Dense(params['first_neuron'], input_dim=x_train.shape[1],
                    activation=params['activation'],
                    kernel_initializer=params['kernel_initializer']))
    
    model.add(Dropout(params['dropout']))

    model.add(Dense(1, activation=params['last_activation'],
                    kernel_initializer=params['kernel_initializer']))
    
    model.compile(loss=params['losses'],
                  optimizer=params['optimizer'],
                  metrics=['acc', talos.utils.metrics.f1score])
    
    history = model.fit(x_train, y_train, 
                        validation_data=[x_val, y_val],
                        batch_size=params['batch_size'],
                        epochs=params['epochs'],
                        verbose=0,
                        callbacks=[talos.utils.early_stopper(params['epochs'])])

    return history, model

In [45]:
# then we can go ahead and set the parameter space
p = {'first_neuron':[128, 512],
     'hidden_layers':[2, 3, 4],
     'batch_size': [128],
     'epochs': [100],
     'dropout': (0, 0.2, 0.40, 10),
     'kernel_initializer': ['uniform','normal'],
     'optimizer': ['Adam', 'SGD'],
     'losses': ['binary_crossentropy'],
     'activation':['relu'],
     'last_activation': ['sigmoid']}

In [46]:
scan_object = talos.Scan(x=all_feats,
                         y=labels, 
                         params=p,
                         model=sml_model,
                         experiment_name='smlproj_2'
                         )

100%|██████████| 24/24 [14:23<00:00, 35.96s/it]


In [47]:
# use Scan object as input
analyze_object = talos.Analyze(scan_object)

In [48]:
# get the best paramaters
analyze_object.best_params('val_acc', ['acc', 'loss', 'val_loss'])

array([['binary_crossentropy', 'SGD', 0.0, 512, '04/13/21-112430',
        'relu', 2, 128, 77, 75.20409274101257, '04/13/21-112545',
        0.9549400806427002, 100, 0.9840306639671326, 'uniform',
        'sigmoid', 0],
       ['binary_crossentropy', 'SGD', 0.0, 128, '04/13/21-112027',
        'relu', 3, 128, 77, 47.13001203536987, '04/13/21-112114',
        0.9540058970451355, 100, 0.97808438539505, 'uniform', 'sigmoid',
        1],
       ['binary_crossentropy', 'SGD', 0.0, 512, '04/13/21-112720',
        'relu', 3, 128, 73, 69.19339895248413, '04/13/21-112829',
        0.9531311988830566, 100, 0.984320878982544, 'uniform', 'sigmoid',
        2],
       ['binary_crossentropy', 'SGD', 0.0, 128, '04/13/21-112326',
        'relu', 4, 128, 68, 42.83750820159912, '04/13/21-112409',
        0.9527631998062134, 100, 0.9783999919891357, 'normal', 'sigmoid',
        3],
       ['binary_crossentropy', 'SGD', 0.0, 128, '04/13/21-111836',
        'relu', 2, 128, 74, 46.18578100204468, '04/13/21-

In [49]:
evaluate_object = talos.Evaluate(scan_object)
evaluate_object.evaluate(x=all_feats,
                         y=labels, folds=10, metric='val_acc', task='binary')

[0.9759900218272529,
 0.9754562617998742,
 0.9779022720199191,
 0.9784263959390863,
 0.9788161993769471,
 0.9717868338557993,
 0.9743268628678773,
 0.9750394944707741,
 0.9746001881467544,
 0.9780743565300286]

In [50]:
talos.Deploy(scan_object=scan_object, model_name='smlproj_deploy_2', metric='val_acc')

Deploy package smlproj_deploy_2 have been saved.


<talos.commands.deploy.Deploy at 0x7fe5d062e590>

In [51]:
smlproj_model = talos.Restore('smlproj_deploy_2.zip')

In [52]:
testCols=['Id', 'source_node', 'destination_node'] 
df_test_public = pd.read_csv('test-public.csv')
ids = df_test_public['Id'].values
df_test_public.columns = testCols
df_test_public = df_test_public.drop('Id', axis = 1)
df_test_public

Unnamed: 0,source_node,destination_node
0,0,2917
1,0,2956
2,1,4038
3,2,1848
4,3,513
...,...,...
1995,3865,3924
1996,3917,4025
1997,3922,3947
1998,3955,3987


In [53]:
x_testing_n2v = [(n2w_model[str(i)]+n2w_model[str(j)]) for i,j in zip(df_test_public['source_node'], df_test_public['destination_node'])]

In [55]:
x_testing_n2v_1 = np.array(x_testing_n2v)

In [57]:
test_edges = df_test_public.to_numpy()

In [58]:
features_test = generate_features(test_edges, test=True)

features: 2000


In [59]:
def write_test_to_csv(features):
    with open("test.csv","w",newline="") as csvfile:
        writer=csv.writer(csvfile)
        writer.writerow(["JC","AA","Label"])
        writer.writerows(features)

In [60]:
write_test_to_csv(features_test)

In [61]:
dataset_test_sim = pd.read_csv('test.csv')
FEATURE_SIZE=2

X_test_sim = dataset_test_sim.iloc[:,:FEATURE_SIZE].values
y_test_sim = dataset_test_sim.iloc[:,FEATURE_SIZE].values

In [62]:
x_testing_sim = sc.transform(X_test_sim)

In [64]:
all_feats_testing = np.concatenate((x_testing_sim,x_testing_n2v_1),axis=1)

In [66]:
y = smlproj_model.model.predict(all_feats_testing)

In [68]:
with open("ANN_results.csv","w",newline="") as csvfile:
    writer=csv.writer(csvfile)
    writer.writerow(["Id","Predicted"])
    test_id=1
    for prediction in y:
        writer.writerow([test_id,prediction[0]])
        test_id+=1