In [191]:
import os
import numpy as np
import matplotlib.pyplot as plt
from gensim.models.word2vec import PathLineSentences, LineSentence, Word2Vec
from pathlib import Path
from sklearn import model_selection, linear_model, preprocessing, svm
from sklearn.metrics import accuracy_score, f1_score
from sklearn.pipeline import make_pipeline
%matplotlib inline

**Load random walks from output files**

The word2vec model requires sentences of strings, so we convert node number to a string.

In [96]:
rw_location = Path("./karate/path")

In [227]:
random_walks_int = np.concatenate([
    np.loadtxt(f.open(), delimiter='\t', dtype=int)
    for f in rw_location.glob("part-*")
])
random_walks_str = [list(map(str, v)) for v in random_walks_int]

Train word2vec embeddings using gensim

In [237]:
model = Word2Vec(random_walks_str, sg=1, size=20, window=5, min_count=0, 
                 workers=4, seed=1321, iter=10, compute_loss=True)

In [238]:
model.get_latest_training_loss()

6837.341796875

Save embeddings

In [228]:
model.wv.save_word2vec_format("embeddings.out")

Extract embeddings as matrix of size `n_nodes` × `d`

In [233]:
V_w2v = model.wv.syn0
n_nodes, d_emb = V_w2v.shape

Node labels:

In [234]:
node_labels = np.zeros(n_nodes)
node_labels[:10] = 1

**Classification using single train/test split**

In [235]:
# Split using sklearn ShuffleSplit
ss = model_selection.ShuffleSplit(n_splits=1,
                                  train_size=0.6,
                                  test_size=0.4)
train_index, test_index = next(ss.split(V_w2v))

train_data = V_w2v[train_index]
test_data = V_w2v[test_index]
train_labels = node_labels[train_index]
test_labels  = node_labels[test_index]

# Classifier choice
#classifier = linear_model.LogisticRegression(C=10)
classifier = svm.SVC(C=1)

clf = make_pipeline(preprocessing.StandardScaler(), classifier)
clf.fit(train_data, train_labels)

train_pred = clf.predict(train_data)
test_pred = clf.predict(test_data)

print("Train acc:", clf.score(train_data, train_labels))
print("Test acc:", clf.score(test_data, test_labels))
print("Train f1:", f1_score(train_labels, train_pred, average='micro'))
print("Test f1:", f1_score(test_labels, test_pred, average='micro'))

Train acc: 0.95
Test acc: 0.714285714286
Train f1: 0.95
Test f1: 0.714285714286


**K-fold cross validation**

In [236]:
kf = model_selection.KFold(n_splits=4, shuffle=True)

acc_test_scores = []
f1_test_scores = []
for train_index, test_index in kf.split(V_w2v):
    train_data = V_w2v[train_index]
    test_data = V_w2v[test_index]
    train_labels = node_labels[train_index]
    test_labels  = node_labels[test_index]
    
    # Classifier choice
    #classifier = linear_model.LogisticRegression(C=10)
    classifier = svm.SVC(C=1)

    clf = make_pipeline(preprocessing.StandardScaler(), classifier)
    clf.fit(train_data, train_labels)

    train_pred = clf.predict(train_data)
    test_pred = clf.predict(test_data)

    acc_train = clf.score(train_data, train_labels)
    acc_test = clf.score(test_data, test_labels)
    f1_train = f1_score(train_labels, train_pred, average='micro')
    f1_test = f1_score(test_labels, test_pred, average='micro')
    
    acc_test_scores.append(acc_test)
    f1_test_scores.append(f1_test)
    
    print("Train acc={:4f},  Test acc={:4f},  Train f1={:4f},  Test f1={:4f}".format(
        acc_train, acc_test, f1_train, f1_test
    ))
    
print("Average scores:")
print("Avg test acc={:4f} [±{:2f}],  Avg test f1={:4f} [±{:2f}]".format(
    np.mean(acc_test_scores), np.std(acc_test_scores), 
    np.mean(f1_test_scores), np.std(f1_test_scores)
))

Train acc=0.880000,  Test acc=0.444444,  Train f1=0.880000,  Test f1=0.444444
Train acc=0.960000,  Test acc=0.777778,  Train f1=0.960000,  Test f1=0.777778
Train acc=0.961538,  Test acc=0.750000,  Train f1=0.961538,  Test f1=0.750000
Train acc=0.923077,  Test acc=0.875000,  Train f1=0.923077,  Test f1=0.875000
Average scores:
Avg test acc=0.711806 [±0.161187],  Avg test f1=0.711806 [±0.161187]
