# Edge Embedding ( with Football Dataset )

In [2]:
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt

# 1. Import Dataset

### (1) Original Dataset

In [3]:
H = nx.read_gml('football.gml')

In [4]:
A = nx.to_numpy_matrix(H,nodelist=H.nodes())

In [5]:
A

matrix([[0., 1., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]])

### (2) Embedded Dataset

In [6]:
football = pd.read_csv('Football_embedded_node2vec.csv',index_col=0)

In [8]:
football = football.iloc[:,0:2]

In [9]:
football

Unnamed: 0,0,1
0,0.174312,0.076093
1,1.181329,0.641597
2,-0.858219,-0.382604
3,-0.833804,0.630451
4,-0.128206,-0.176692
...,...,...
110,-0.332728,-0.545576
111,-0.961298,-0.506655
112,0.708785,-0.036194
113,0.798002,0.909555


### (3) Full Edges ( Positive & Negative )

In [10]:
from itertools import combinations

In [11]:
total_edges = list(combinations(np.arange(0,115), 2))

In [12]:
total_edges

[(0, 1),
 (0, 2),
 (0, 3),
 (0, 4),
 (0, 5),
 (0, 6),
 (0, 7),
 (0, 8),
 (0, 9),
 (0, 10),
 (0, 11),
 (0, 12),
 (0, 13),
 (0, 14),
 (0, 15),
 (0, 16),
 (0, 17),
 (0, 18),
 (0, 19),
 (0, 20),
 (0, 21),
 (0, 22),
 (0, 23),
 (0, 24),
 (0, 25),
 (0, 26),
 (0, 27),
 (0, 28),
 (0, 29),
 (0, 30),
 (0, 31),
 (0, 32),
 (0, 33),
 (0, 34),
 (0, 35),
 (0, 36),
 (0, 37),
 (0, 38),
 (0, 39),
 (0, 40),
 (0, 41),
 (0, 42),
 (0, 43),
 (0, 44),
 (0, 45),
 (0, 46),
 (0, 47),
 (0, 48),
 (0, 49),
 (0, 50),
 (0, 51),
 (0, 52),
 (0, 53),
 (0, 54),
 (0, 55),
 (0, 56),
 (0, 57),
 (0, 58),
 (0, 59),
 (0, 60),
 (0, 61),
 (0, 62),
 (0, 63),
 (0, 64),
 (0, 65),
 (0, 66),
 (0, 67),
 (0, 68),
 (0, 69),
 (0, 70),
 (0, 71),
 (0, 72),
 (0, 73),
 (0, 74),
 (0, 75),
 (0, 76),
 (0, 77),
 (0, 78),
 (0, 79),
 (0, 80),
 (0, 81),
 (0, 82),
 (0, 83),
 (0, 84),
 (0, 85),
 (0, 86),
 (0, 87),
 (0, 88),
 (0, 89),
 (0, 90),
 (0, 91),
 (0, 92),
 (0, 93),
 (0, 94),
 (0, 95),
 (0, 96),
 (0, 97),
 (0, 98),
 (0, 99),
 (0, 100),
 (0, 101

In [13]:
def connected(i,j):
    return A[i,j]

In [14]:
edges_df = pd.DataFrame(total_edges)
edges_df.columns = ['node1','node2']
edges_df['y'] = edges_df.apply(lambda x: connected(x['node1'], x['node2']), axis=1)

In [16]:
neg_edges_df= edges_df[edges_df['y']==0]
pos_edges_df= edges_df[edges_df['y']==1]

# 2. Define Functions

### (1) edge embedding

In [18]:
def edge_embedding(edges,type=1):
    n1 = edges[0]
    n2 = edges[1]
    node1 = football.iloc[n1].values
    node2 = football.iloc[n2].values
    if type==0: # Average
        embed = (node1+node2)/2
    elif type==1: # Hadamard
        embed = node1*node2
    elif type==2: # Weighted-L1
        embed = np.abs(node1-node2)
    elif type==3 : # Weighte-L2
        ebmed = np.square(node1-node2)
    return embed  
    

### (2) negative sampling

In [19]:
def sample_50(neg_edges):
    num = pos_edges_df.shape[0]//2
    neg_sample = neg_edges.sample(num,)
    return neg_sample

# 3. Edge sampling

### (1) sample negative edges

In [20]:
neg = sample_50(neg_edges_df)
neg_for_test = neg_edges_df[~neg_edges_df.isin(neg)].dropna().sample(39) 

### (2) masking postive edges

In [21]:
pos_masked = sample_50(pos_edges_df)
pos_notmasked = pos_edges_df[~pos_edges_df.isin(pos_masked)].dropna()

### (3) make training & test dataset

#### train

In [22]:
train = pd.concat([neg,pos_notmasked]).sample(frac=1).astype('int')

In [23]:
train_embedded = train[['node1','node2']].apply(lambda x: edge_embedding(x[['node1','node2']],1),axis=1)
train_embedded.columns = ['emb_x1','emb_x2']

In [24]:
final_train = pd.concat([train,train_embedded],axis=1)[['emb_x1','emb_x2','y']]

#### test

In [25]:
test = pd.concat([neg_for_test,pos_masked]).sample(frac=1).astype('int')

In [26]:
test_embedded = test[['node1','node2']].apply(lambda x: edge_embedding(x[['node1','node2']],1),axis=1)
test_embedded.columns = ['emb_x1','emb_x2']

In [27]:
final_test = pd.concat([test,test_embedded],axis=1)[['emb_x1','emb_x2','y']]

# 4. Classification

In [28]:
import xgboost as xgb

### (1) modeling with TRAIN data

In [29]:
train_x = final_train[['emb_x1','emb_x2']]
train_y = final_train[['y']]

In [30]:
xgb_clf = xgb.XGBClassifier()
xgb_clf.fit(train_x,train_y)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
              max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
              n_jobs=1, nthread=None, objective='binary:logistic',
              random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
              seed=None, silent=True, subsample=1)

### (2) evaluate with TEST data

In [31]:
test_x = final_test[['emb_x1','emb_x2']]
test_y = final_test[['y']]

In [32]:
test_pred = xgb_clf.predict(test_x)
test_actual = test_y.values.flatten()

# 5. Result

In [33]:
from sklearn.metrics import accuracy_score,f1_score,roc_auc_score

In [34]:
accuracy_score(test_pred,test_actual)

0.7855072463768116

In [35]:
f1_score(test_pred,test_actual)

0.8724137931034482