In [12]:
import pandas as pd
import numpy as np
import collections
import networkx as nx
import matplotlib.pyplot as plt
from seaborn import color_palette, set_style, palplot
import random
from ge import DeepWalk, LINE, Node2Vec, SDNE, Struc2Vec
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score
import collections
import time

In [13]:
# Read the "communities" file. Each non-blank line is expected to look like
# "<community>:<id>,<id>,...". The context manager guarantees the handle is
# closed once the lines are in memory (the original never closed it).
with open("communities", "r") as f:
    f1 = f.readlines()

#construct community dictionary, each user -> community
community_dict = {}
communities = set()
for line in f1:
    if not line.strip():
        # Tolerate blank lines (e.g. a trailing empty line) instead of
        # failing on the tuple unpacking below.
        continue
    community, ids = line.split(':')
    communities.add(community)
    ids = ids.strip('\n')
    # Every comma-separated id on the line belongs to this community.
    for id_ in (int(x) for x in ids.split(',')):
        community_dict[id_] = community

print("There are in total {} communities".format(len(communities)))

There are in total 28 communities


In [14]:
# Read "list.features" with a context manager so the handle is closed
# deterministically (the original left it open).
with open("list.features", "r") as f:
    f1 = f.readlines()

#construct list.feature dictionary list_id -> feature
# list_id is the 0-based line index; each feature string is stored exactly as
# read, i.e. including its trailing newline.
list_dict = dict(enumerate(f1))

In [15]:
# Load the edge list as an undirected graph. nodetype=None applies no
# conversion to node ids; any edge data column is parsed as an int 'weight'.
G = nx.read_edgelist(
    'lists-to-community.txt',
    create_using=nx.Graph(),
    nodetype=None,
    data=[('weight', int)],
)  #read graph

# Overwrite every edge weight with 1, whatever value was read from the file.
for _src, _dst, edge_attrs in G.edges(data=True):
    edge_attrs['weight'] = 1
#print(G.edges.data())

In [16]:
#The embeddings are dictionary nodeid -> numpy arrays (embeddings)

# Integer class id for each community name; used as the classification target.
label = {"archery":0, "athletics":1, "badminton":2, "basketball":3, "beach-volleyball":4,
         "boxing":5, "canoeing":6, "cycling":7, "diving":8, "equestrianism":9,
         "fencing":10, "gymnastics":11, "handball":12, "hockey":13, "judo":14,
         "pentathlon":15, "rowing":16, "sailing":17, "shooting":18, "swimming":19,
         "swimming-sync":20, "tabletennis":21, "taekwondo":22, "tennis":23, "triathlon":24,
         "waterpolo":25, "weightlifting":26, "wrestling":27}


#Deepwalk
# Number of repetitions per walk length; scores and timings are averaged over it.
exp_time = 1

file_fscore = "Olympics_report.csv"

# Training fractions evaluated for every walk length (0.1, 0.2, ..., 0.9).
training_percentages = np.linspace(0.1,0.9,9)

time_dict = collections.defaultdict(float)

# Open the report exactly once. The original re-opened the file in append mode
# for every record and never closed any handle, relying on garbage collection
# to flush the header and rows in the right order.
with open(file_fscore, "w") as F_score:
    # NOTE(review): the second column is labelled 'testing_percentage' but the
    # value written below is the *training* fraction. The header is kept
    # byte-identical so existing consumers of the CSV keep working.
    F_score.write('Walk_length, testing_percentage, Micro-f1, Macro-f1\n')

    for walk_len in [3, 8, 10, 20]:
        # Per-training-fraction F1 sums, accumulated over the repetitions.
        Micro_dict = collections.defaultdict(float)
        Macro_dict = collections.defaultdict(float)
        for i in range(exp_time):
            # The timed span covers both model construction and train().
            start = time.time()
            DeepWalk_model = DeepWalk(G, walk_length=walk_len,num_walks=10,workers=1)#init model
            DeepWalk_model.train(embed_size = 128, window_size=2,iter=3)# train model
            time_dict[walk_len]+= (time.time()-start)
            DeepWalk_embeddings = DeepWalk_model.get_embeddings()# get embedding vectors

            X = []
            y = []

            for user in community_dict.keys():
                # Resolve the embedding AND the label before appending either,
                # so a missing label can no longer leave X one row longer than
                # y. Only KeyError (user without embedding / unknown community)
                # is skipped; any other error now surfaces instead of being
                # silently swallowed by a bare except.
                try:
                    embedding = DeepWalk_embeddings[str(user)]
                    class_id = label[community_dict[user]]
                except KeyError:
                    continue
                X.append(embedding)
                y.append(class_id)
            X = np.asarray(X)
            y = np.asarray(y)

            #train_test_split
            # NOTE(review): the split is unseeded, so scores vary between runs.
            for training_percentage in training_percentages:
                X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= (1-training_percentage))
                clf = LinearSVC(random_state=0, tol=1e-5)
                clf.fit(X_train, y_train)
                y_pred = clf.predict(X_test)

                Micro_dict[training_percentage] += f1_score(y_test, y_pred, average='micro')
                Macro_dict[training_percentage] += f1_score(y_test, y_pred, average='macro')

        for training_percentage in training_percentages:
            # Turn the accumulated sums into means over the repetitions.
            Micro_dict[training_percentage] /= exp_time
            Macro_dict[training_percentage] /= exp_time

            record = "{}, {}, {}, {}\n".format(walk_len, training_percentage,round(Micro_dict[training_percentage],3),
                                               round(Macro_dict[training_percentage],3))
            F_score.write(record)

        # Push this walk length's rows to disk so partial results survive a
        # failure in a later (longer) run.
        F_score.flush()

        print("The average training time is {} secs for walk length as {}".format(time_dict[walk_len]/exp_time, walk_len))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.0s finished


Learning embedding vectors...
Learning embedding vectors done!
The average training time is 3.115568161010742 secs for walk length as 3


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.0s finished


Learning embedding vectors...
Learning embedding vectors done!
The average training time is 7.139678001403809 secs for walk length as 8


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.7s finished


Learning embedding vectors...
Learning embedding vectors done!
The average training time is 8.836431741714478 secs for walk length as 10


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    5.2s finished


Learning embedding vectors...
Learning embedding vectors done!
The average training time is 17.935011863708496 secs for walk length as 20


In [17]:
#LINE
# LINE_model = LINE(G,embedding_size=128,order='second') #init model,order can be ['first','second','all']
# LINE_model.train(batch_size=1024,epochs=50,verbose=2)# train model
# LINE_embeddings = LINE_model.get_embeddings()# get embedding vectors

In [18]:
#Node2Vec

# Node2Vec_model = Node2Vec(G, walk_length = 3, num_walks = 10,p = 0.25, q = 4, workers = 1)#init model
# Node2Vec_model.train(window_size = 5, iter = 3)# train model
# Node2Vec_embeddings = Node2Vec_model.get_embeddings()# get embedding vectors

In [19]:
#SDNE

# SDNE_model = SDNE(G,hidden_size=[256,128]) #init model
# SDNE_model.train(batch_size=3000,epochs=40,verbose=2)# train model
# SDNE_embeddings = SDNE_model.get_embeddings()# get embedding vectors

In [20]:
#Struc2Vec

# Struc2Vec_model = model = Struc2Vec(G, 10, 80, workers=4, verbose=40, ) #init model
# Struc2Vec_model.train(window_size = 5, iter = 3)# train model
# Struc2Vec_embeddings = Struc2Vec_model.get_embeddings()# get embedding vectors

In [24]:
# Measure DeepWalk training time for several walk lengths and write the
# per-walk-length average to "time.csv".
time_dict = collections.defaultdict(float)
# Number of repetitions averaged per walk length.
exp_time = 2
file_score = "time.csv"

# The context manager closes (and therefore flushes) the report; the original
# never closed the handle, so rows could remain in the write buffer while the
# kernel stayed alive.
with open(file_score, "w") as time_score:
    time_score.write('Walk_length, Training_time\n')
    for walk_len in [1,5,25,125]:
        for i in range(exp_time):
            DeepWalk_model = DeepWalk(G, walk_length=walk_len,num_walks=10,workers=1)#init model
            # Only train() is timed here; model construction is excluded.
            start = time.time()
            DeepWalk_model.train(embed_size = 128, window_size=2,iter=3)# train model
            time_dict[walk_len]+= (time.time()-start)
            DeepWalk_embeddings = DeepWalk_model.get_embeddings()# get embedding vectors

        time_score.write("{}, {}\n".format(walk_len, time_dict[walk_len]/exp_time))
        # Persist each row immediately; the longest walk lengths take minutes.
        time_score.flush()

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.1s finished


Learning embedding vectors...
Learning embedding vectors done!


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Learning embedding vectors...
Learning embedding vectors done!


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.3s finished


Learning embedding vectors...
Learning embedding vectors done!


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.6s finished


Learning embedding vectors...
Learning embedding vectors done!


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    7.4s finished


Learning embedding vectors...
Learning embedding vectors done!


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   10.5s finished


Learning embedding vectors...
Learning embedding vectors done!


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.3min finished


Learning embedding vectors...
Learning embedding vectors done!


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.1min finished


Learning embedding vectors...
Learning embedding vectors done!
