In [1]:
import pickle
import time

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
from node2vec import Node2Vec
from sklearn import metrics
from sklearn.model_selection import KFold, train_test_split
from sklearn.neighbors import KNeighborsClassifier

## Load Data

In [2]:
# Read the tab-separated node table (no header row) and give readable names
# to the two columns used below: column 0 -> 'id', column 1434 -> 'class'.
nodes = (
    pd.read_csv('data/cora.content', sep='\t', header=None)
    .rename(columns={0: 'id', 1434: 'class'})
)

# Class labels as a flat array, ordered by ascending node id.
y = nodes.sort_values(by=['id'])['class'].to_numpy()
y

array(['Genetic_Algorithms', 'Genetic_Algorithms',
       'Reinforcement_Learning', ..., 'Rule_Learning', 'Rule_Learning',
       'Rule_Learning'], dtype=object)

In [3]:
# networkx graph
# The file was previously read with nx.read_gpickle, so despite its .gml
# extension it is handled as a pickle. read_gpickle was deprecated in
# networkx 2.6 and removed in 3.0; loading with the pickle module directly
# is the documented replacement and works on old and new versions alike.
# NOTE: unpickling can execute arbitrary code -- only load a graph file
# that comes from a trusted source.
with open('data/cora.gml', 'rb') as graph_file:
    g = pickle.load(graph_file)
print(g)

Graph named 'cora' with 2708 nodes and 5278 edges


## Define pipeline for grid search

In [4]:
# Pipeline Search for best parameters
def pipeline(transformer, classifier, EMBEDDING_FILENAME, n_splits=3, random_state=0, labels=None):
    """Fit embeddings, save them to disk, and cross-validate a classifier on them.

    Parameters
    ----------
    transformer : object
        Must provide ``fit(window=..., min_count=..., batch_words=...)``
        returning a model whose ``wv.save_word2vec_format(path)`` writes the
        embeddings to a text file (e.g. a ``Node2Vec`` instance).
    classifier : object
        Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is refitted
        on every fold.
    EMBEDDING_FILENAME : str
        Path the embeddings are written to and then read back from.
    n_splits : int, default 3
        Number of cross-validation folds.
    random_state : int or None, default 0
        Seed for the shuffled fold assignment. A fixed default means repeated
        calls score every configuration on the same folds, so accuracies are
        comparable and reproducible. Pass ``None`` for a fresh random split.
    labels : array-like or None, default None
        Target labels, one per embedding row, in the same order as the
        embedding rows after sorting by their first column. When ``None``
        the module-level ``y`` is used.

    Returns
    -------
    float
        Accuracy averaged over the folds.

    Raises
    ------
    ValueError
        If the number of labels differs from the number of embedding rows.
    """
    # Generate embeddings
    model = transformer.fit(window=10, min_count=1, batch_words=4)
    model.wv.save_word2vec_format(EMBEDDING_FILENAME)

    # Read embeddings: the first line of the file is skipped, rows are ordered
    # by the first column, and that column is then dropped so only the vector
    # components remain as features.
    emb_df = pd.read_csv(EMBEDDING_FILENAME, sep=' ', skiprows=[0], header=None)
    emb_df = emb_df.sort_values(by=[0])
    emb_df = emb_df.drop(emb_df.columns[0], axis=1)

    # Labels are matched to embedding rows purely by position, so a length
    # mismatch would silently misalign (or crash mid-fold); fail early instead.
    targets = np.asarray(y if labels is None else labels)
    if len(targets) != len(emb_df):
        raise ValueError(
            "Number of labels (%d) does not match number of embedding rows (%d)"
            % (len(targets), len(emb_df))
        )

    # Run classifier on embeddings with k-fold validation
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    fold_scores = []

    for i, (train, test) in enumerate(kf.split(emb_df), start=1):
        print("Running classification on split ", i)
        X_train, X_test = emb_df.iloc[train], emb_df.iloc[test]
        y_train, y_test = targets[train], targets[test]

        classifier.fit(X_train, y_train)
        y_pred = classifier.predict(X_test)

        fold_scores.append(metrics.accuracy_score(y_test, y_pred))

    # Average over the folds actually run rather than a hard-coded count.
    return sum(fold_scores) / len(fold_scores)

In [5]:
# Downstream classifier that scores every embedding in the search
knn = KNeighborsClassifier(n_neighbors=4, weights='distance', metric='euclidean')

# node2vec settings held constant across the whole grid
dimensions, walk_length, num_walks = 128, 20, 100

# One entry per configuration: mean accuracy, wall-clock seconds, parameters
acc_hist, time_hist, parameter_hist = [], [], []

# The same candidate values are tried for both p and q
search_values = [0.25, 0.50, 1, 2, 4]
pq_pairs = [(p_val, q_val) for p_val in search_values for q_val in search_values]

for p, q in pq_pairs:
    # Build the node2vec model for this (p, q) pair
    node2vec = Node2Vec(
        graph=g,
        dimensions=dimensions,
        walk_length=walk_length,
        num_walks=num_walks,
        p=p,
        q=q,
        seed=0,
    )

    # Record the configuration before running it
    parameters = [dimensions, walk_length, num_walks, p, q]
    print(parameters)
    parameter_hist.append(parameters)

    # Embedding file is named after the parameter list
    EMBEDDING_FILENAME = f"embedding/{parameters}.txt"

    # Time only the pipeline call (model construction above is excluded)
    start = time.time()
    acc_model = pipeline(
        transformer=node2vec,
        classifier=knn,
        EMBEDDING_FILENAME=EMBEDDING_FILENAME,
    )
    end = time.time()

    acc_hist.append(acc_model)
    time_hist.append(end - start)

Computing transition probabilities:   0%|          | 0/2708 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████████| 100/100 [00:19<00:00,  5.23it/s]


[128, 20, 100, 0.25, 0.25]
Running classification on split  1
Running classification on split  2
Running classification on split  3


Computing transition probabilities:   0%|          | 0/2708 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████████| 100/100 [00:19<00:00,  5.14it/s]


[128, 20, 100, 0.25, 0.5]
Running classification on split  1
Running classification on split  2
Running classification on split  3


Computing transition probabilities:   0%|          | 0/2708 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████████| 100/100 [00:19<00:00,  5.06it/s]


[128, 20, 100, 0.25, 1]
Running classification on split  1
Running classification on split  2
Running classification on split  3


Computing transition probabilities:   0%|          | 0/2708 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████████| 100/100 [00:19<00:00,  5.07it/s]


[128, 20, 100, 0.25, 2]
Running classification on split  1
Running classification on split  2
Running classification on split  3


Computing transition probabilities:   0%|          | 0/2708 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████████| 100/100 [00:20<00:00,  4.89it/s]


[128, 20, 100, 0.25, 4]
Running classification on split  1
Running classification on split  2
Running classification on split  3


Computing transition probabilities:   0%|          | 0/2708 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████████| 100/100 [00:19<00:00,  5.05it/s]


[128, 20, 100, 0.5, 0.25]
Running classification on split  1
Running classification on split  2
Running classification on split  3


Computing transition probabilities:   0%|          | 0/2708 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████████| 100/100 [00:19<00:00,  5.07it/s]


[128, 20, 100, 0.5, 0.5]
Running classification on split  1
Running classification on split  2
Running classification on split  3


Computing transition probabilities:   0%|          | 0/2708 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████████| 100/100 [00:19<00:00,  5.13it/s]


[128, 20, 100, 0.5, 1]
Running classification on split  1
Running classification on split  2
Running classification on split  3


Computing transition probabilities:   0%|          | 0/2708 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████████| 100/100 [00:19<00:00,  5.08it/s]


[128, 20, 100, 0.5, 2]
Running classification on split  1
Running classification on split  2
Running classification on split  3


Computing transition probabilities:   0%|          | 0/2708 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████████| 100/100 [00:19<00:00,  5.11it/s]


[128, 20, 100, 0.5, 4]
Running classification on split  1
Running classification on split  2
Running classification on split  3


Computing transition probabilities:   0%|          | 0/2708 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████████| 100/100 [00:19<00:00,  5.12it/s]


[128, 20, 100, 1, 0.25]
Running classification on split  1
Running classification on split  2
Running classification on split  3


Computing transition probabilities:   0%|          | 0/2708 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████████| 100/100 [00:19<00:00,  5.09it/s]


[128, 20, 100, 1, 0.5]
Running classification on split  1
Running classification on split  2
Running classification on split  3


Computing transition probabilities:   0%|          | 0/2708 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████████| 100/100 [00:19<00:00,  5.10it/s]


[128, 20, 100, 1, 1]
Running classification on split  1
Running classification on split  2
Running classification on split  3


Computing transition probabilities:   0%|          | 0/2708 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████████| 100/100 [00:19<00:00,  5.09it/s]


[128, 20, 100, 1, 2]
Running classification on split  1
Running classification on split  2
Running classification on split  3


Computing transition probabilities:   0%|          | 0/2708 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████████| 100/100 [00:19<00:00,  5.09it/s]


[128, 20, 100, 1, 4]
Running classification on split  1
Running classification on split  2
Running classification on split  3


Computing transition probabilities:   0%|          | 0/2708 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████████| 100/100 [00:19<00:00,  5.12it/s]


[128, 20, 100, 2, 0.25]
Running classification on split  1
Running classification on split  2
Running classification on split  3


Computing transition probabilities:   0%|          | 0/2708 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████████| 100/100 [00:19<00:00,  5.16it/s]


[128, 20, 100, 2, 0.5]
Running classification on split  1
Running classification on split  2
Running classification on split  3


Computing transition probabilities:   0%|          | 0/2708 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████████| 100/100 [00:20<00:00,  4.99it/s]


[128, 20, 100, 2, 1]
Running classification on split  1
Running classification on split  2
Running classification on split  3


Computing transition probabilities:   0%|          | 0/2708 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████████| 100/100 [00:19<00:00,  5.04it/s]


[128, 20, 100, 2, 2]
Running classification on split  1
Running classification on split  2
Running classification on split  3


Computing transition probabilities:   0%|          | 0/2708 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████████| 100/100 [00:20<00:00,  4.99it/s]


[128, 20, 100, 2, 4]
Running classification on split  1
Running classification on split  2
Running classification on split  3


Computing transition probabilities:   0%|          | 0/2708 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████████| 100/100 [00:19<00:00,  5.09it/s]


[128, 20, 100, 4, 0.25]
Running classification on split  1
Running classification on split  2
Running classification on split  3


Computing transition probabilities:   0%|          | 0/2708 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████████| 100/100 [00:19<00:00,  5.04it/s]


[128, 20, 100, 4, 0.5]
Running classification on split  1
Running classification on split  2
Running classification on split  3


Computing transition probabilities:   0%|          | 0/2708 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████████| 100/100 [00:19<00:00,  5.07it/s]


[128, 20, 100, 4, 1]
Running classification on split  1
Running classification on split  2
Running classification on split  3


Computing transition probabilities:   0%|          | 0/2708 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████████| 100/100 [00:19<00:00,  5.01it/s]


[128, 20, 100, 4, 2]
Running classification on split  1
Running classification on split  2
Running classification on split  3


Computing transition probabilities:   0%|          | 0/2708 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████████| 100/100 [00:19<00:00,  5.00it/s]


[128, 20, 100, 4, 4]
Running classification on split  1
Running classification on split  2
Running classification on split  3


## Grid search results

In [6]:
# One row per grid-search run: the parameters tried, the mean accuracy
# expressed as a percentage, and the elapsed seconds, all rounded to 2 places.
para_df = (
    pd.DataFrame(
        parameter_hist,
        columns=['dimensions', 'walk_length', 'num_walks', 'p', 'q'],
    )
    .assign(accuracy=acc_hist, time=time_hist)
    .assign(accuracy=lambda frame: frame['accuracy'] * 100)
    .round(2)
)
para_df

Unnamed: 0,dimensions,walk_length,num_walks,p,q,accuracy,time
0,128,20,100,0.25,0.25,84.6,182.88
1,128,20,100,0.25,0.5,83.94,178.27
2,128,20,100,0.25,1.0,83.31,171.44
3,128,20,100,0.25,2.0,84.31,164.92
4,128,20,100,0.25,4.0,83.27,162.17
5,128,20,100,0.5,0.25,84.08,195.68
6,128,20,100,0.5,0.5,84.12,182.95
7,128,20,100,0.5,1.0,84.16,177.17
8,128,20,100,0.5,2.0,84.23,171.84
9,128,20,100,0.5,4.0,83.72,162.97


In [7]:
# dataframe_image is only needed for this export step.
import dataframe_image as dfi
# Apply a grey background gradient to the grid-search results table.
df_styled = para_df.style.background_gradient(cmap='Greys')
# Save the styled table as an image file.
dfi.export(df_styled,"images/grid_search_2.png")

[1206/230934.222310:ERROR:xattr.cc(63)] setxattr org.chromium.crashpad.database.initialized on file /var/folders/76/p5884tp57fj2wy8qn17tpc6c0000gn/T/: Operation not permitted (1)
[1206/230934.225604:ERROR:file_io.cc(94)] ReadExactly: expected 8, observed 0
[1206/230934.226518:ERROR:xattr.cc(63)] setxattr org.chromium.crashpad.database.initialized on file /var/folders/76/p5884tp57fj2wy8qn17tpc6c0000gn/T/: Operation not permitted (1)
[1206/230936.254103:INFO:headless_shell.cc(653)] Written to file /var/folders/76/p5884tp57fj2wy8qn17tpc6c0000gn/T/tmpdl2a6t5b/temp.png.


In [8]:
# Position of the highest mean accuracy; the earliest run wins on ties.
max_index = max(range(len(acc_hist)), key=acc_hist.__getitem__)

# Parameter list recorded for that run.
para = parameter_hist[max_index]
para

[128, 20, 100, 4, 0.25]

## Define model with best parameters

In [9]:
# No longer used in this cell; the import is kept so the name stays
# available to any other cell that may rely on it.
from numpy.random import randint

# Rebuild node2vec with the best parameters found by the grid search
# (para = [dimensions, walk_length, num_walks, p, q]).
node2vec = Node2Vec(graph=g, dimensions=para[0], walk_length=para[1], num_walks=para[2], p=para[3], q=para[4], seed=0)
knn = KNeighborsClassifier(metric='euclidean', n_neighbors=4, weights='distance')

# Generate embeddings
model = node2vec.fit(window=10, min_count=1, batch_words=4)
model.wv.save_word2vec_format('embedding/best_embedding.txt')

# Read embeddings: skip the first line of the file, order rows by the first
# column, then drop that column so only the vector components remain.
emb_df = pd.read_csv('embedding/best_embedding.txt', sep=' ', skiprows=[0], header=None)
emb_df = emb_df.sort_values(by=[0])
emb_df = emb_df.drop(emb_df.columns[0], axis=1)

# Labels are matched to embedding rows by position only, so the two must
# have the same length for the split below to be meaningful.
assert len(emb_df) == len(y), "embedding rows and labels are misaligned"

# Run classifier on embeddings
# A fixed split seed makes the reported accuracy reproducible on
# Restart & Run All; previously the seed was drawn at random on every run.
# Kept as a one-element array so `seed[0]` keeps working as before.
seed = np.array([0])
X_train, X_test, y_train, y_test = train_test_split(emb_df, y, test_size=0.20, random_state=seed[0])
y_train = np.ravel(y_train)
y_test = np.ravel(y_test)

knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
# Class probabilities are not used for the accuracy below; kept in case
# a later cell consumes them.
y_prob = knn.predict_proba(X_test)

acc = metrics.accuracy_score(y_test, y_pred)
acc

Computing transition probabilities:   0%|          | 0/2708 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████████| 100/100 [00:20<00:00,  5.00it/s]


0.8413284132841329