In [1]:
import numpy as np
import pandas as pd 
import networkx as nx
import time
from node2vec import Node2Vec
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, KFold
from sklearn import metrics 

## Load Data

In [2]:
nodes = pd.read_csv('data/cora.content', sep='\t', header=None)
nodes = nodes.rename(columns={0:'id', 1434:'class'})
y = nodes[['id','class']]
y = y.sort_values(by=['id'])
y = np.ravel(np.array(y[['class']]))
y

array(['Genetic_Algorithms', 'Genetic_Algorithms',
       'Reinforcement_Learning', ..., 'Rule_Learning', 'Rule_Learning',
       'Rule_Learning'], dtype=object)

In [3]:
#networkx graph
g = nx.read_gpickle('data/cora.gml')
print(g)

Graph named 'cora' with 2708 nodes and 5278 edges


## Define pipeline for grid search

In [4]:
# Pipeline Search for best parameters
def pipeline(transformer, classifier, EMBEDDING_FILENAME):
    # Generate embeddings
    model = transformer.fit(window=10, min_count=1, batch_words=4)
    model.wv.save_word2vec_format(EMBEDDING_FILENAME)
        
    # Read embeddings
    emb_df = pd.read_csv(EMBEDDING_FILENAME, sep=' ', skiprows=[0], header=None)
    emb_df = emb_df.sort_values(by=[0])
    emb_df = emb_df.drop(emb_df.columns[0], axis=1)
    
    # Run classifier on embeddings
    # k-fold validation
    i = 1
    acc = 0
    kf = KFold(n_splits=3, shuffle=True)
    
    for train, test in kf.split(emb_df):
        print("Running classification on split ", i)
        X_train, X_test = emb_df.iloc[train], emb_df.iloc[test]
        y_train, y_test = y[train], y[test]
        
        classifier.fit(X_train, y_train)
        y_pred = classifier.predict(X_test)
        y_prob = classifier.predict_proba(X_test)
        
        acc_fold = metrics.accuracy_score(y_test, y_pred)
        
        acc += acc_fold
        i += 1
        
    acc = acc/3
    
    return acc

In [5]:
# Define downstream classification model
knn = KNeighborsClassifier(metric='euclidean', n_neighbors=4, weights='distance')

# parameters dict for node2vec
dimensions = 128
walk_length = 80
num_walks = 50

acc_hist = []
time_hist = []
parameter_hist = []

for p in [0.25,0.5,1,2,4]:
    for q in [0.25,0.5,1,2,4]:
        # create node2vec model
        node2vec = Node2Vec(graph=g, dimensions=dimensions, walk_length=walk_length, num_walks=num_walks,
                            p=p, q=q, seed=0)
        parameters=[dimensions,walk_length,num_walks,p,q]
        print(parameters)
        parameter_hist.append(parameters)
        
        # filename
        EMBEDDING_FILENAME = "embedding/" + str(parameters) + ".txt"
        
        # run pipeline
        start = time.time()
        acc_model = pipeline(transformer=node2vec, classifier=knn, EMBEDDING_FILENAME=EMBEDDING_FILENAME)
        end = time.time()
        
        acc_hist.append(acc_model)
        time_hist.append(end-start)     

Computing transition probabilities:   0%|          | 0/2708 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|████████████████| 50/50 [00:39<00:00,  1.26it/s]


[128, 80, 50, 0.25, 0.25]
Running classification on split  1
Running classification on split  2
Running classification on split  3


Computing transition probabilities:   0%|          | 0/2708 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|████████████████| 50/50 [00:40<00:00,  1.25it/s]


[128, 80, 50, 0.25, 0.5]
Running classification on split  1
Running classification on split  2
Running classification on split  3


Computing transition probabilities:   0%|          | 0/2708 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|████████████████| 50/50 [00:40<00:00,  1.23it/s]


[128, 80, 50, 0.25, 1]
Running classification on split  1
Running classification on split  2
Running classification on split  3


Computing transition probabilities:   0%|          | 0/2708 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|████████████████| 50/50 [00:40<00:00,  1.25it/s]


[128, 80, 50, 0.25, 2]
Running classification on split  1
Running classification on split  2
Running classification on split  3


Computing transition probabilities:   0%|          | 0/2708 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|████████████████| 50/50 [00:40<00:00,  1.23it/s]


[128, 80, 50, 0.25, 4]
Running classification on split  1
Running classification on split  2
Running classification on split  3


Computing transition probabilities:   0%|          | 0/2708 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|████████████████| 50/50 [00:40<00:00,  1.24it/s]


[128, 80, 50, 0.5, 0.25]
Running classification on split  1
Running classification on split  2
Running classification on split  3


Computing transition probabilities:   0%|          | 0/2708 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|████████████████| 50/50 [00:40<00:00,  1.24it/s]


[128, 80, 50, 0.5, 0.5]
Running classification on split  1
Running classification on split  2
Running classification on split  3


Computing transition probabilities:   0%|          | 0/2708 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|████████████████| 50/50 [00:40<00:00,  1.22it/s]


[128, 80, 50, 0.5, 1]
Running classification on split  1
Running classification on split  2
Running classification on split  3


Computing transition probabilities:   0%|          | 0/2708 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|████████████████| 50/50 [00:40<00:00,  1.23it/s]


[128, 80, 50, 0.5, 2]
Running classification on split  1
Running classification on split  2
Running classification on split  3


Computing transition probabilities:   0%|          | 0/2708 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|████████████████| 50/50 [00:41<00:00,  1.21it/s]


[128, 80, 50, 0.5, 4]
Running classification on split  1
Running classification on split  2
Running classification on split  3


Computing transition probabilities:   0%|          | 0/2708 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|████████████████| 50/50 [00:40<00:00,  1.24it/s]


[128, 80, 50, 1, 0.25]
Running classification on split  1
Running classification on split  2
Running classification on split  3


Computing transition probabilities:   0%|          | 0/2708 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|████████████████| 50/50 [00:40<00:00,  1.22it/s]


[128, 80, 50, 1, 0.5]
Running classification on split  1
Running classification on split  2
Running classification on split  3


Computing transition probabilities:   0%|          | 0/2708 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|████████████████| 50/50 [00:40<00:00,  1.23it/s]


[128, 80, 50, 1, 1]
Running classification on split  1
Running classification on split  2
Running classification on split  3


Computing transition probabilities:   0%|          | 0/2708 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|████████████████| 50/50 [00:41<00:00,  1.22it/s]


[128, 80, 50, 1, 2]
Running classification on split  1
Running classification on split  2
Running classification on split  3


Computing transition probabilities:   0%|          | 0/2708 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|████████████████| 50/50 [00:41<00:00,  1.21it/s]


[128, 80, 50, 1, 4]
Running classification on split  1
Running classification on split  2
Running classification on split  3


Computing transition probabilities:   0%|          | 0/2708 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|████████████████| 50/50 [00:40<00:00,  1.24it/s]


[128, 80, 50, 2, 0.25]
Running classification on split  1
Running classification on split  2
Running classification on split  3


Computing transition probabilities:   0%|          | 0/2708 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|████████████████| 50/50 [00:40<00:00,  1.23it/s]


[128, 80, 50, 2, 0.5]
Running classification on split  1
Running classification on split  2
Running classification on split  3


Computing transition probabilities:   0%|          | 0/2708 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|████████████████| 50/50 [00:40<00:00,  1.23it/s]


[128, 80, 50, 2, 1]
Running classification on split  1
Running classification on split  2
Running classification on split  3


Computing transition probabilities:   0%|          | 0/2708 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|████████████████| 50/50 [00:40<00:00,  1.22it/s]


[128, 80, 50, 2, 2]
Running classification on split  1
Running classification on split  2
Running classification on split  3


Computing transition probabilities:   0%|          | 0/2708 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|████████████████| 50/50 [00:41<00:00,  1.21it/s]


[128, 80, 50, 2, 4]
Running classification on split  1
Running classification on split  2
Running classification on split  3


Computing transition probabilities:   0%|          | 0/2708 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|████████████████| 50/50 [00:40<00:00,  1.25it/s]


[128, 80, 50, 4, 0.25]
Running classification on split  1
Running classification on split  2
Running classification on split  3


Computing transition probabilities:   0%|          | 0/2708 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|████████████████| 50/50 [00:41<00:00,  1.22it/s]


[128, 80, 50, 4, 0.5]
Running classification on split  1
Running classification on split  2
Running classification on split  3


Computing transition probabilities:   0%|          | 0/2708 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|████████████████| 50/50 [00:40<00:00,  1.22it/s]


[128, 80, 50, 4, 1]
Running classification on split  1
Running classification on split  2
Running classification on split  3


Computing transition probabilities:   0%|          | 0/2708 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|████████████████| 50/50 [00:41<00:00,  1.22it/s]


[128, 80, 50, 4, 2]
Running classification on split  1
Running classification on split  2
Running classification on split  3


Computing transition probabilities:   0%|          | 0/2708 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|████████████████| 50/50 [00:41<00:00,  1.20it/s]


[128, 80, 50, 4, 4]
Running classification on split  1
Running classification on split  2
Running classification on split  3


In [6]:
max_index = acc_hist.index(max(acc_hist))
para = parameter_hist[max_index]
para

[128, 80, 50, 2, 1]

## Grid search results

In [7]:
para_df = pd.DataFrame(parameter_hist, columns=['dimensions','walk_length','num_walks','p','q'])
para_df['accuracy'] = acc_hist
para_df['time'] = time_hist
para_df

Unnamed: 0,dimensions,walk_length,num_walks,p,q,accuracy,time
0,128,80,50,0.25,0.25,0.841587,365.462466
1,128,80,50,0.25,0.5,0.837526,354.827043
2,128,80,50,0.25,1.0,0.831251,338.969633
3,128,80,50,0.25,2.0,0.833096,320.228759
4,128,80,50,0.25,4.0,0.827187,304.844336
5,128,80,50,0.5,0.25,0.843064,374.866015
6,128,80,50,0.5,0.5,0.839743,369.816785
7,128,80,50,0.5,1.0,0.837528,358.433106
8,128,80,50,0.5,2.0,0.831249,342.237712
9,128,80,50,0.5,4.0,0.832359,322.873894


In [11]:
import dataframe_image as dfi
df_styled = para_df.style.background_gradient(cmap='Greys')
dfi.export(df_styled,'images/grid_search_scores.png')

[1209/141520.590472:ERROR:xattr.cc(63)] setxattr org.chromium.crashpad.database.initialized on file /var/folders/76/p5884tp57fj2wy8qn17tpc6c0000gn/T/: Operation not permitted (1)
[1209/141520.594254:ERROR:file_io.cc(94)] ReadExactly: expected 8, observed 0
[1209/141520.596056:ERROR:xattr.cc(63)] setxattr org.chromium.crashpad.database.initialized on file /var/folders/76/p5884tp57fj2wy8qn17tpc6c0000gn/T/: Operation not permitted (1)
[1209/141521.684620:INFO:headless_shell.cc(653)] Written to file /var/folders/76/p5884tp57fj2wy8qn17tpc6c0000gn/T/tmp1lmx_0em/temp.png.
