# Edge Embedding ( with Karate Dataset )

In [1]:
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt

# 1. Import Dataset

### (1) Original Dataset

In [2]:
edge = pd.read_csv('karate_club.edgelist', sep=' ', names=['x','y','w'])

In [3]:
connected_edges = edge[['x','y']].values

In [4]:
graph = nx.Graph()
for i in range(edge.shape[0]):
    graph.add_node(node_for_adding = edge['x'][i])
    graph.add_node(node_for_adding = edge['y'][i])
    graph.add_edge(edge['x'][i], edge['y'][i])

In [5]:
A = nx.to_numpy_matrix(graph, nodelist=sorted(graph.nodes()))

### (2) Embedded Dataset

In [6]:
karate = pd.read_csv('Karate_embedded_node2vec.csv',index_col=0)

### (3) Full Edges ( Positive & Negative )

In [7]:
from itertools import combinations

In [8]:
total_edges = list(combinations(np.arange(0,34), 2))

In [10]:
def connected(i,j):
    return A[i,j]

In [11]:
edges_df = pd.DataFrame(total_edges)
edges_df.columns = ['node1','node2']
edges_df['y'] = edges_df.apply(lambda x: connected(x['node1'], x['node2']), axis=1)

In [12]:
neg_edges_df= edges_df[edges_df['y']==0]
pos_edges_df= edges_df[edges_df['y']==1]

# 2. Define Functions

### (1) edge embedding

In [13]:
def edge_embedding(edges,type=1):
    n1 = edges[0]
    n2 = edges[1]
    node1 = karate.iloc[n1].values
    node2 = karate.iloc[n2].values
    if type==0: # Average
        embed = (node1+node2)/2
    elif type==1: # Hadamard
        embed = node1*node2
    elif type==2: # Weighted-L1
        embed = np.abs(node1-node2)
    elif type==3 : # Weighte-L2
        ebmed = np.square(node1-node2)
    return embed  
    

### (2) negative sampling

In [14]:
def sample_50(neg_edges):
    num = pos_edges_df.shape[0]//2
    neg_sample = neg_edges.sample(num,)
    return neg_sample

# 3. Edge sampling

### (1) sample negative edges

In [16]:
neg = sample_50(neg_edges_df)
neg_for_test = neg_edges_df[~neg_edges_df.isin(neg)].dropna().sample(39) 

### (2) masking postive edges

In [17]:
pos_masked = sample_50(pos_edges_df)
pos_notmasked = pos_edges_df[~pos_edges_df.isin(pos_masked)].dropna()

### (3) make training & test dataset

#### train

In [18]:
train = pd.concat([neg,pos_notmasked]).sample(frac=1).astype('int')

In [19]:
train_embedded = train[['node1','node2']].apply(lambda x: edge_embedding(x[['node1','node2']],1),axis=1)
train_embedded.columns = ['emb_x1','emb_x2']

In [20]:
final_train = pd.concat([train,train_embedded],axis=1)[['emb_x1','emb_x2','y']]

#### test

In [25]:
test = pd.concat([neg_for_test,pos_masked]).sample(frac=1).astype('int')

In [26]:
test_embedded = test[['node1','node2']].apply(lambda x: edge_embedding(x[['node1','node2']],1),axis=1)
test_embedded.columns = ['emb_x1','emb_x2']

In [27]:
final_test = pd.concat([test,test_embedded],axis=1)[['emb_x1','emb_x2','y']]

# 4. Classification

In [22]:
import xgboost as xgb

### (1) modeling with TRAIN data

In [23]:
train_x = final_train[['emb_x1','emb_x2']]
train_y = final_train[['y']]

In [24]:
xgb_clf = xgb.XGBClassifier()
xgb_clf.fit(train_x,train_y)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
              max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
              n_jobs=1, nthread=None, objective='binary:logistic',
              random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
              seed=None, silent=True, subsample=1)

### (2) evaluate with TEST data

In [28]:
test_x = final_test[['emb_x1','emb_x2']]
test_y = final_test[['y']]

In [29]:
test_pred = xgb_clf.predict(test_x)
test_actual = test_y.values.flatten()

# 5. Result

In [30]:
from sklearn.metrics import accuracy_score,f1_score,roc_auc_score

In [31]:
accuracy_score(test_pred,test_actual)

0.6025641025641025

In [32]:
f1_score(test_pred,test_actual)

0.537313432835821