# [ First Order Proximity with Negative Sampling ]

## 1. Import Dataset & Libraries

In [1]:
import networkx as nx
import matplotlib.pyplot as plt
import numpy as np
import random
import pandas as pd
from random import shuffle
from copy import copy

%matplotlib inline

#### 1) adjacent matrix & one-hot encoded matrix

In [2]:
H = nx.read_gml('football.gml')

In [3]:
A = nx.to_numpy_matrix(H,nodelist=H.nodes())

In [4]:
OH = np.identity(A.shape[0])

## 2. Define Functions

#### 1) first_prox 
- 2개 node의 vector를 input하면 first-order proximity 계산

In [6]:
def first_prox(node_vec1,node_vec2):
    mul = np.dot(node_vec1.T,node_vec2)
    return 1/(1+np.exp(-mul))

## 3. First-order Proximity with Negative Sampling

In [7]:
def FirstOrder_neg_sampling(dim,num_neg,epoch,lr):
    low_dim = np.random.uniform(-2,2,(OH.shape[0],dim))
    #low_dim = np.random.rand(OH.shape[0],dim)
    
    for ep in range(epoch+1):
        embed_vec =np.matmul(OH,low_dim)        
        
        for node in range(embed_vec.shape[0]):
            ######################### find Neighbours & Non-Neighbours ##########################
            
            neighbours_index = np.nonzero(A[node])[1]
            not_neighbours_index = np.setdiff1d(np.arange(embed_vec.shape[0]), np.nonzero(A[node])[1])
            sampled_not_neighbours_index = np.random.choice(not_neighbours_index,num_neg,replace=False)
            
            ########## find vectors of (1) main node, (2) neighbours, (3) non-neighbours ##########
            
            node_vec = embed_vec[node]  
            
            for positive in neighbours_index:
                pos_vec = embed_vec[positive]
                neg_vecs = embed_vec[sampled_not_neighbours_index]
                
            ############################# UPDATE ###############################################
                embed_vec[node] -= lr*(first_prox(node_vec,pos_vec)-1)*pos_vec
        
                for neg_vec in neg_vecs:
                    embed_vec[node] -= lr*(first_prox(node_vec,neg_vec))*neg_vec
                    
        if ep>0 and ep%20==0:
            print('epoch ',ep,'is done.')
    print('training finished')                
    return embed_vec

In [9]:
Emb = FirstOrder_neg_sampling(dim=10,num_neg=5,epoch=80,lr=0.025)

epoch  20 is done.
epoch  40 is done.
epoch  60 is done.
epoch  80 is done.
training finished


In [10]:
Emb_df = pd.DataFrame(Emb)
Emb_df['Label'] = dict(H.node('value')).values()

In [11]:
Emb_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,Label
0,0.865126,0.732464,0.654681,-0.280288,-0.416516,0.77929,1.989182,0.944528,0.75891,0.924716,7
1,-0.315168,-1.665299,-0.98481,1.077798,0.511267,0.939566,1.635527,-0.366913,-0.451699,1.780345,0
2,-0.569846,-0.199044,1.78497,0.186517,2.154936,-0.550533,-0.93743,0.107572,1.074133,0.32642,2
3,0.832763,0.221549,-0.575225,-0.686977,-1.096524,0.453152,-0.012188,0.983878,0.942373,0.57072,3
4,0.117742,0.502898,0.749028,0.396632,-0.188808,0.286299,1.366271,-0.073079,-0.786144,-0.255506,7


## 4. Visualization

### in case of "dimension 2"

### in case of "dimension 3"

In [12]:
Emb_df.to_csv('[Football]Embedded_with_FirstOrder.csv')