### Loading the libraries

In [1]:
import random
import numpy as np
import igraph
from sklearn import svm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.decomposition import TruncatedSVD, NMF
from sklearn import preprocessing
import nltk
nltk.download('punkt')
nltk.download('stopwords')
import csv
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
from igraph.clustering import *
from time import time
from sklearn.metrics.pairwise import cosine_similarity

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rkroc\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rkroc\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Loading the data

In [3]:
nltk.download('punkt') # for tokenization
nltk.download('stopwords')
stpwds = set(nltk.corpus.stopwords.words("english"))
stemmer = nltk.stem.PorterStemmer()

with open("data/testing_set.txt", "r") as f:
    reader = csv.reader(f)
    testing_set  = list(reader)

testing_set = [element[0].split(" ") for element in testing_set]

with open("data/training_set.txt", "r") as f:
    reader = csv.reader(f)
    training_set  = list(reader)

training_set = [element[0].split(" ") for element in training_set]

with open("data/node_information.csv", "r") as f:
    reader = csv.reader(f)
    node_info  = list(reader)
IDs = [i[0] for i in node_info]
year=[int(i[1]) for i in node_info]
title=[i[2] for i in node_info]
authors=[i[3] for i in node_info]
name_journal=[i[4] for i in node_info]
abstract=[i[5] for i in node_info]

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rkroc\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rkroc\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [288]:
node_information = pd.read_csv('data/node_information.csv', header=None, names=['ID', 'Year', 'Title', 'Authors', 'Journal', 'Abstract'])
train = pd.read_csv('data/training_set.txt', header=None, names=['Source', 'Target', 'Edge'], delim_whitespace=True)
test = pd.read_csv('data/testing_set.txt', header=None, names=['Source', 'Target'], delim_whitespace=True)

In [289]:
node_information.isnull().sum()

ID             0
Year           0
Title          0
Authors     4033
Journal     7472
Abstract       0
dtype: int64

<a id="2"></a>
## 2. Feature engineering

<a id="2a"></a>
### A1 - Semantic features train

#### Creation of the features

In [7]:
%%time
# we will use three basic features:

# number of overlapping words in title
overlap_title = []

# number of overlapping words in abstract
overlap_abstract = []

# temporal distance between the papers
temp_diff = []

# number of common authors
comm_auth = []

# is in the same journal
comm_journal = []

# Cosine similarity between abstracts
cos_similarity = []

# Sum of authors in abstract
author_abstract = []

# LSA distance
lsa_distance_euc = []

counter = 0

#preparation phase : 
for i in range(len(training_set)):
    #print("skfjdzx")
    source = training_set[i][0]
    target = training_set[i][1]

    
    index_source = IDs.index(source)
    index_target = IDs.index(target)
    
    #print(index_source,index_target)
    source_info = [element for element in node_info if int(element[0])==source][0]
    target_info = [element for element in node_info if int(element[0])==target][0]

    # convert to lowercase and tokenize
    source_title = source_info[2].lower().split(" ")
    source_title = [token for token in source_title if token not in stpwds]
    source_title = [stemmer.stem(token) for token in source_title]
    
    target_title = target_info[2].lower().split(" ")
    target_title = [token for token in target_title if token not in stpwds]
    target_title = [stemmer.stem(token) for token in target_title]
    
    source_abstract = source_info[5].lower().split(" ")
    source_abstract = [token for token in source_abstract if token not in stpwds]
    source_abstract = [stemmer.stem(token) for token in source_abstract]
    
    target_abstract = target_info[5].lower().split(" ")
    target_abstract = [token for token in target_abstract if token not in stpwds]
    target_abstract = [stemmer.stem(token) for token in target_abstract]
    
    source_auth = source_info[3].split(",")
    target_auth = target_info[3].split(",")
      
    source_auths = set(source_auth)
    target_auths = set(target_auth)
    
    source_journal = source_info[4].split(".")
    target_journal = target_info[4].split(".")
    
    #tfidf cosine similarity
    tfidf_source = tfidf_matrix[index_source]# in case tfidf mat is so large that it's stored as a sparse matrix
    tfidf_target = tfidf_matrix[index_target]# in case tfidf mat is so largs that it's stared as a sparse matrix
    tfidf_sim    = cosine_similarity(tfidf_source, tfidf_source)[0][0]

    author_abstract_count =0
    author_abstract_count += len(source_auths.intersection(target_abstract))
    author_abstract_count += len(target_auths.intersection(source_abstract))
    
    overlap_title.append(len(set(source_title).intersection(set(target_title))))
    overlap_abstract.append(len(set(source_abstract).intersection(set(target_abstract))))
    temp_diff.append(int(source_info[1]) - int(target_info[1]))
    comm_auth.append(len(set(source_auth).intersection(set(target_auth))))
    comm_journal.append(int(source_journal == target_journal))
    cos_similarity.append(tfidf_sim)
    author_abstract.append(author_abstract_count)
    lsa_distance_euc.append(np.linalg.norm(LSA[index_source]-LSA[index_target] ))
   
    counter += 1
    if counter % 1000 == True:
        print(counter, "training examples processsed")

1 training examples processsed
1001 training examples processsed
2001 training examples processsed
3001 training examples processsed
4001 training examples processsed
5001 training examples processsed
6001 training examples processsed
7001 training examples processsed
8001 training examples processsed
9001 training examples processsed
10001 training examples processsed
11001 training examples processsed
12001 training examples processsed
13001 training examples processsed
14001 training examples processsed
15001 training examples processsed
16001 training examples processsed
17001 training examples processsed
18001 training examples processsed
19001 training examples processsed
20001 training examples processsed
21001 training examples processsed
22001 training examples processsed
23001 training examples processsed
24001 training examples processsed
25001 training examples processsed
26001 training examples processsed
27001 training examples processsed
28001 training examples processse

231001 training examples processsed
232001 training examples processsed
233001 training examples processsed
234001 training examples processsed
235001 training examples processsed
236001 training examples processsed
237001 training examples processsed
238001 training examples processsed
239001 training examples processsed
240001 training examples processsed
241001 training examples processsed
242001 training examples processsed
243001 training examples processsed
244001 training examples processsed
245001 training examples processsed
246001 training examples processsed
247001 training examples processsed
248001 training examples processsed
249001 training examples processsed
250001 training examples processsed
251001 training examples processsed
252001 training examples processsed
253001 training examples processsed
254001 training examples processsed
255001 training examples processsed
256001 training examples processsed
257001 training examples processsed
258001 training examples pro

459001 training examples processsed
460001 training examples processsed
461001 training examples processsed
462001 training examples processsed
463001 training examples processsed
464001 training examples processsed
465001 training examples processsed
466001 training examples processsed
467001 training examples processsed
468001 training examples processsed
469001 training examples processsed
470001 training examples processsed
471001 training examples processsed
472001 training examples processsed
473001 training examples processsed
474001 training examples processsed
475001 training examples processsed
476001 training examples processsed
477001 training examples processsed
478001 training examples processsed
479001 training examples processsed
480001 training examples processsed
481001 training examples processsed
482001 training examples processsed
483001 training examples processsed
484001 training examples processsed
485001 training examples processsed
486001 training examples pro

In [15]:
train['Title overlap'] = overlap_title
train['Abstract overlap'] = overlap_abstract
train['Temporal difference'] = temp_diff
train['Common authors'] = comm_auth
train['Common journal'] = comm_journal
train['Cosine similarity'] = cos_similarity
train['Authors in abstract'] = author_abstract
train['LSA distance'] = lsa_distance_euc

### A2 - Semantic features test

In [19]:
%%time
# we will use these basic features:

# number of overlapping words in title
overlap_title_test = []

# number of overlapping words in abstract
overlap_abstract_test = []

# temporal distance between the papers
temp_diff_test = []

# number of common authors
comm_auth_test = []

# is in the same journal
comm_journal_test = []

# Cosine similarity between abstracts
cos_similarity_test = []

# Sum of authors in abstract
author_abstract_test = []

# LSA distance
lsa_distance_euc_test = []

counter = 0

#preparation phase : 
for i in range(len(testing_set)):
#for i in xrange(len(testing_set_reduced)):
    source = testing_set[i][0]
    target = testing_set[i][1]
    #source = testing_set_reduced[i][0]
    #target = testing_set_reduced[i][1]

    index_source = IDs.index(source)
    index_target = IDs.index(target)
    
    source_info = [element for element in node_info if int(element[0])==source][0]
    target_info = [element for element in node_info if int(element[0])==target][0]
    
    # convert to lowercase and tokenize
    source_title = source_info[2].lower().split(" ")
    # remove stopwords
    source_title = [token for token in source_title if token not in stpwds]
    source_title = [stemmer.stem(token) for token in source_title]
    
    target_title = target_info[2].lower().split(" ")
    target_title = [token for token in target_title if token not in stpwds]
    target_title = [stemmer.stem(token) for token in target_title]
    
    source_abstract = source_info[5].lower().split(" ")
    source_abstract = [token for token in source_abstract if token not in stpwds]
    source_abstract = [stemmer.stem(token) for token in source_abstract]
    
    target_abstract = target_info[5].lower().split(" ")
    target_abstract = [token for token in target_abstract if token not in stpwds]
    target_abstract = [stemmer.stem(token) for token in target_abstract]
    
    source_auth = source_info[3].split(",")
    target_auth = target_info[3].split(",")
    source_auths = set(source_auth)
    target_auths = set(target_auth)
    
    source_journal = source_info[4].split(".")
    target_journal = target_info[4].split(".")
    
    tfidf_source = tfidf_matrix[index_source]
    tfidf_target = tfidf_matrix[index_target]
    tfidf_sim    = cosine_similarity(tfidf_source, tfidf_target)[0][0]

    author_abstract_count =0
    author_abstract_count += len(source_auths.intersection(target_abstract))
    author_abstract_count += len(target_auths.intersection(source_abstract))
    
    overlap_title_test.append(len(set(source_title).intersection(set(target_title))))
    overlap_abstract_test.append(len(set(source_abstract).intersection(set(target_abstract))))
    temp_diff_test.append(int(source_info[1]) - int(target_info[1]))
    comm_auth_test.append(len(set(source_auth).intersection(set(target_auth))))
    comm_journal_test.append(int(source_journal == target_journal))
    cos_similarity_test.append(tfidf_sim)
    author_abstract_test.append(author_abstract_count)
    lsa_distance_euc_test.append(np.linalg.norm(LSA[index_source]-LSA[index_target] ))
   
    counter += 1
    if counter % 1000 == True:
        print(counter, "testing examples processsed")

1 testing examples processsed
1001 testing examples processsed
2001 testing examples processsed
3001 testing examples processsed
4001 testing examples processsed
5001 testing examples processsed
6001 testing examples processsed
7001 testing examples processsed
8001 testing examples processsed
9001 testing examples processsed
10001 testing examples processsed
11001 testing examples processsed
12001 testing examples processsed
13001 testing examples processsed
14001 testing examples processsed
15001 testing examples processsed
16001 testing examples processsed
17001 testing examples processsed
18001 testing examples processsed
19001 testing examples processsed
20001 testing examples processsed
21001 testing examples processsed
22001 testing examples processsed
23001 testing examples processsed
24001 testing examples processsed
25001 testing examples processsed
26001 testing examples processsed
27001 testing examples processsed
28001 testing examples processsed
29001 testing examples proc

In [20]:
test['Title overlap'] = overlap_title_test
test['Abstract overlap'] = overlap_abstract_test
test['Temporal difference'] = temp_diff_test
test['Common authors'] = comm_auth_test
test['Common journal'] = comm_journal_test
test['Cosine similarity'] = cos_similarity_test
test['Authors in abstract'] = author_abstract_test
test['LSA distance'] = lsa_distance_euc_test

In [145]:
train.to_csv('train_semantic.csv',index=False)
test.to_csv('test_semantic.csv',index=False)
# train = pd.read_csv('train_semantic.csv')
# test =  pd.read_csv('test_semantic.csv')

In [146]:
train.head()

Unnamed: 0,Target,Source,Edge,Title overlap,Abstract overlap,Temporal difference,Common authors,Common journal,Cosine similarity,Authors in abstract,LSA distance
0,9510123,9502114,1,2,4,0,0,1,0.039132,0,0.176704
1,9707075,9604178,1,1,7,1,0,0,0.015247,0,0.178874
2,9312155,9506142,0,0,6,-2,0,0,0.008888,0,0.27899
3,9911255,302165,0,0,8,-4,0,0,0.00474,0,0.325877
4,9701033,209076,0,0,8,-5,0,0,0.027379,0,0.230838


In [147]:
test.head()

Unnamed: 0,Target,Source,Title overlap,Abstract overlap,Temporal difference,Common authors,Common journal,Cosine similarity,Authors in abstract,LSA distance
0,9807076,9807139,0,7,0,0,0,0.055452,0,0.149897
1,109162,1182,2,6,1,0,1,0.11067,0,0.246126
2,9702187,9510135,1,4,2,0,1,0.043831,0,0.280983
3,111048,110115,1,13,0,0,1,0.054856,0,0.246031
4,9910176,9410073,0,4,5,0,0,0.147222,0,0.20347


<a id="2b"></a>
### B - Topological features 

#### Using basic igraph library

In [45]:
## the following shows how to construct a graph with igraph
## even though in this baseline we don't use it
## look at http://igraph.org/python/doc/igraph.Graph-class.html for feature ideas
edges = [(element[0],element[1]) for element in training_set if element[2]=="1"]

## some nodes may not be connected to any other node
## hence the need to create the nodes of the graph from node_info.csv,
## not just from the edge list
nodes = IDs

#create empty directed graph
g = igraph.Graph(directed=True)
 
## add vertices
g.add_vertices(nodes)
 
## add edges
g.add_edges(edges)

In [46]:
%%time
betweenness_info = g.betweenness()

Wall time: 27.7 s


In [47]:
%%time
communities = g.community_leading_eigenvector()
cluster_info = communities.membership

Wall time: 14 s


  membership, _, q = GraphBase.community_leading_eigenvector(


In [48]:
pageranks=g.personalized_pagerank(damping=0.5)

### TOPOLOGICAL Feature train

In [49]:
%%time

# Betweenness centrality
bet_centrality = []

# Is same cluster
is_same_cluster = []


#Page rank
page_rank=[]

counter = 0

#preparation phase : 
for i in range(len(training_set)):
#for i in xrange(len(training_set_reduced)):
    source = training_set[i][0]
    target = training_set[i][1]
    #source = training_set_reduced[i][0]
    #target = training_set_reduced[i][1]

    index_source = IDs.index(source)
    index_target = IDs.index(target)
    
    betweenness_source = betweenness_info[index_source]
    betweenness_target = betweenness_info[index_target]
    
    source_page_rank=pageranks[index_source]
    target_page_rank=pageranks[index_target]
    
    bet_centrality.append(betweenness_source - betweenness_target)
    is_same_cluster.append(int(cluster_info[index_source] == cluster_info[index_target]))
    page_rank.append(source_page_rank+target_page_rank)
   
    counter += 1
    if counter % 10000 == True:
        print(counter, "training examples processsed")

1 training examples processsed
10001 training examples processsed
20001 training examples processsed
30001 training examples processsed
40001 training examples processsed
50001 training examples processsed
60001 training examples processsed
70001 training examples processsed
80001 training examples processsed
90001 training examples processsed
100001 training examples processsed
110001 training examples processsed
120001 training examples processsed
130001 training examples processsed
140001 training examples processsed
150001 training examples processsed
160001 training examples processsed
170001 training examples processsed
180001 training examples processsed
190001 training examples processsed
200001 training examples processsed
210001 training examples processsed
220001 training examples processsed
230001 training examples processsed
240001 training examples processsed
250001 training examples processsed
260001 training examples processsed
270001 training examples processsed
280001

In [58]:
train['Betweenness centrality'] = bet_centrality
train['Same cluster'] = is_same_cluster
train['Page rank'] = page_rank

### TOPOLOGICAL Feature test

In [51]:
%%time

# Betweenness centrality
bet_centrality_test = []

# Is same cluster
is_same_cluster_test = []


#Page rank
page_rank_test=[]

counter = 0

#preparation phase : 
for i in range(len(testing_set)):
#for i in xrange(len(testing_set_reduced)):
    source = testing_set[i][0]
    target = testing_set[i][1]
    #source = testing_set_reduced[i][0]
    #target = testing_set_reduced[i][1]

    index_source = IDs.index(source)
    index_target = IDs.index(target)
    
    betweenness_source = betweenness_info[index_source]
    betweenness_target = betweenness_info[index_target]
    
    source_page_rank=pageranks[index_source]
    target_page_rank=pageranks[index_target]
    
    bet_centrality_test.append(betweenness_source - betweenness_target)
    is_same_cluster_test.append(int(cluster_info[index_source] == cluster_info[index_target]))
    page_rank_test.append(source_page_rank+target_page_rank)
   
    counter += 1
    if counter % 1000 == True:
        print(counter, "testing examples processsed")

1 testing examples processsed
1001 testing examples processsed
2001 testing examples processsed
3001 testing examples processsed
4001 testing examples processsed
5001 testing examples processsed
6001 testing examples processsed
7001 testing examples processsed
8001 testing examples processsed
9001 testing examples processsed
10001 testing examples processsed
11001 testing examples processsed
12001 testing examples processsed
13001 testing examples processsed
14001 testing examples processsed
15001 testing examples processsed
16001 testing examples processsed
17001 testing examples processsed
18001 testing examples processsed
19001 testing examples processsed
20001 testing examples processsed
21001 testing examples processsed
22001 testing examples processsed
23001 testing examples processsed
24001 testing examples processsed
25001 testing examples processsed
26001 testing examples processsed
27001 testing examples processsed
28001 testing examples processsed
29001 testing examples proc

In [59]:
test['Betweenness centrality'] = bet_centrality_test
test['Same cluster'] = is_same_cluster_test
test['Page rank'] = page_rank_test

In [62]:
train.to_csv('train_semantic_topo1.csv',index=False)
test.to_csv('test_semantic_topo1.csv',index=False)

In [63]:
train.head()

Unnamed: 0,Target,Source,Edge,Title overlap,Abstract overlap,Temporal difference,Common authors,Common journal,Cosine similarity,Authors in abstract,LSA distance,Betweenness centrality,Same cluster,Page rank
0,9510123,9502114,1,2,4,0,0,1,0.039132,0,0.176704,8166.884091,0,5.8e-05
1,9707075,9604178,1,1,7,1,0,0,0.015247,0,0.178874,31162.082411,1,0.000144
2,9312155,9506142,0,0,6,-2,0,0,0.008888,0,0.27899,-10559.734281,1,4.9e-05
3,9911255,302165,0,0,8,-4,0,0,0.00474,0,0.325877,611.223395,0,4.4e-05
4,9701033,209076,0,0,8,-5,0,0,0.027379,0,0.230838,-501.379284,1,0.000157


In [64]:
test.head()

Unnamed: 0,Target,Source,Title overlap,Abstract overlap,Temporal difference,Common authors,Common journal,Cosine similarity,Authors in abstract,LSA distance,Betweenness centrality,Same cluster,Page rank
0,9807076,9807139,0,7,0,0,0,0.055452,0,0.149897,213620.5,0,8.6e-05
1,109162,1182,2,6,1,0,1,0.11067,0,0.246126,1121793.0,1,0.00016
2,9702187,9510135,1,4,2,0,1,0.043831,0,0.280983,230572.7,0,0.001309
3,111048,110115,1,13,0,0,1,0.054856,0,0.246031,698948.6,1,5.2e-05
4,9910176,9410073,0,4,5,0,0,0.147222,0,0.20347,-26005.92,0,0.000345


#### Using the more advanced library networkx

In [65]:
def create_graph(X, y):
    graph = nx.Graph()
    edges=[]
    nodes=set()
    for i in range(len(X)):
        source = X[i][0]
        target = X[i][1]
        nodes.add(source)
        nodes.add(target)
        if y[i]==1:
            edges.append((source, target))
    graph.add_nodes_from(nodes)
    graph.add_edges_from(edges)
    return graph

def create_directed_graph(X, y):
    graph = nx.DiGraph()
    edges=[]
    nodes=set()
    for i in range(len(X)):
        source = X[i][0]
        target = X[i][1]
        nodes.add(source)
        nodes.add(target)
        if y[i]==1:
            edges.append((source, target))
    graph.add_nodes_from(nodes)
    graph.add_edges_from(edges)
    return graph

### topological feature train part 2

In [66]:
%%time
X = train[['Target', 'Source']].values
y = train[['Edge']].values
graph = create_graph(X,y) 

res_alloc_index = np.asarray(list(nx.resource_allocation_index(graph, X)))[:,2]
jac_coef = np.asarray(list(nx.jaccard_coefficient(graph, X)))[:,2]
ad_adar_idx = np.asarray(list(nx.adamic_adar_index(graph, X)))[:,2]
pref_att = np.asarray(list(nx.preferential_attachment(graph, X)))[:,2]

train['Ressource allocation'] = list(res_alloc_index)
train['Jaccard coefficient'] = list(jac_coef)
train['Adamic Adar'] = list(ad_adar_idx)
train['Preferential attachment'] = list(pref_att)

Wall time: 7min 59s


### topological feature test part 2

In [67]:
%%time
X_test = test[['Target', 'Source']].values

res_alloc_index_test = np.asarray(list(nx.resource_allocation_index(graph, X_test)))[:,2]
jac_coef_test = np.asarray(list(nx.jaccard_coefficient(graph, X_test)))[:,2]
ad_adar_idx_test = np.asarray(list(nx.adamic_adar_index(graph, X_test)))[:,2]
pref_att_test = np.asarray(list(nx.preferential_attachment(graph, X_test)))[:,2]

test['Ressource allocation'] = list(res_alloc_index_test)
test['Jaccard coefficient'] = list(jac_coef_test)
test['Adamic Adar'] = list(ad_adar_idx_test)
test['Preferential attachment'] = list(pref_att_test)

Wall time: 24.8 s


In [68]:
train.to_csv('train_semantic_topo2.csv',index=False)
test.to_csv('test_semantic_topo2.csv',index=False)

In [69]:
train.head()

Unnamed: 0,Target,Source,Edge,Title overlap,Abstract overlap,Temporal difference,Common authors,Common journal,Cosine similarity,Authors in abstract,LSA distance,Betweenness centrality,Same cluster,Page rank,Ressource allocation,Jaccard coefficient,Adamic Adar,Preferential attachment
0,9510123,9502114,1,2,4,0,0,1,0.039132,0,0.176704,8166.884091,0,5.8e-05,0.142857,0.058824,0.513898,72
1,9707075,9604178,1,1,7,1,0,0,0.015247,0,0.178874,31162.082411,1,0.000144,0.226401,0.097087,4.320366,11613
2,9312155,9506142,0,0,6,-2,0,0,0.008888,0,0.27899,-10559.734281,1,4.9e-05,0.0,0.0,0.0,5
3,9911255,302165,0,0,8,-4,0,0,0.00474,0,0.325877,611.223395,0,4.4e-05,0.0,0.0,0.0,280
4,9701033,209076,0,0,8,-5,0,0,0.027379,0,0.230838,-501.379284,1,0.000157,0.0,0.0,0.0,168


### topological feature train part 3

In [70]:
def neighbor_calc(graph, v):
    neighbors_in = graph.predecessors(v)
    neighbors_out = graph.successors(v)
    neighbors = list(set(neighbors_in).union(neighbors_out))
   
    return graph.in_degree(v), graph.out_degree(v), neighbors_in, neighbors_out, neighbors
        
            
X = train[['Target', 'Source']].values
y = train[['Edge']].values
target_feats=np.empty((train.shape[0], 2))
source_feats=np.empty((train.shape[0], 2))
edge_feats=np.empty((train.shape[0], 2))
print("Creating graph")
graph = create_directed_graph(X, y)
print("Generating vertex features")
l = X.shape[0]
t1 = time()
for i, x in enumerate(X):
    t=x[0]
    s=x[1]
    in_d_t, out_d_t, n_in_t, n_out_t, n_t = neighbor_calc(graph, t)
    in_d_s, out_d_s, n_in_s, n_out_s, n_s = neighbor_calc(graph, s)
    com_in = len(set(n_in_t).intersection(n_in_s))
    com_on = len(set(n_out_t).intersection(n_out_s))

    target_feats[i]=[in_d_t, out_d_t]
    source_feats[i]=[in_d_s, out_d_s]
    edge_feats[i]=[com_in, com_on]
    if i%10000==0:
        print(i, l)
        t2=time()
        print(t2-t1)
        t1=t2

train['Target_indegree'] = target_feats[:,0]
train['Target_outdegree'] = target_feats[:,1]

train['Source_indegree'] = source_feats[:,0]
train['Source_outdegree'] = source_feats[:,1]

train['Common_in'] = edge_feats[:,0]
train['Common_out'] = edge_feats[:,1]

Creating graph
Generating vertex features
0 615512
0.0
10000 615512
1.0940577983856201
20000 615512
1.088376522064209
30000 615512
1.0729472637176514
40000 615512
1.0806050300598145
50000 615512
1.0903539657592773
60000 615512
1.086927890777588
70000 615512
1.0824799537658691
80000 615512
1.0766847133636475
90000 615512
1.125166416168213
100000 615512
1.0938587188720703
110000 615512
1.168952465057373
120000 615512
1.0457518100738525
130000 615512
1.0962486267089844
140000 615512
1.0635712146759033
150000 615512
1.076563835144043
160000 615512
1.0373296737670898
170000 615512
1.0600595474243164
180000 615512
1.0586533546447754
190000 615512
1.0809354782104492
200000 615512
1.0812690258026123
210000 615512
1.0504188537597656
220000 615512
1.0547397136688232
230000 615512
1.0824577808380127
240000 615512
1.0564625263214111
250000 615512
1.0427577495574951
260000 615512
1.0693409442901611
270000 615512
1.062920093536377
280000 615512
1.0592999458312988
290000 615512
1.0886385440826416
300

### topological feature test part 3

In [71]:
%%time
X_test = test[['Target', 'Source']].values
target_feats=np.empty((test.shape[0], 2))
source_feats=np.empty((test.shape[0], 2))
edge_feats=np.empty((test.shape[0], 2))
t1 = time()
print("Creating graph")
graph = create_directed_graph(X, y)
print("Generating vertex features")
t1 = time()
for i, x in enumerate(X_test):
    t=x[0]
    s=x[1]
    in_d_t, out_d_t, n_in_t, n_out_t, n_t = neighbor_calc(graph, t)
    in_d_s, out_d_s, n_in_s, n_out_s, n_s = neighbor_calc(graph, s)
    com_in = len(set(n_in_t).intersection(n_in_s))
    com_on = len(set(n_out_t).intersection(n_out_s))

    target_feats[i]=[in_d_t, out_d_t]
    source_feats[i]=[in_d_s, out_d_s]
    edge_feats[i]=[com_in, com_on]
    if i%10000==0:
        print(i, l)
        t2=time()
        print(t2-t1)
        t1=t2
        
test['Target_indegree'] = target_feats[:,0]
test['Target_outdegree'] = target_feats[:,1]

test['Source_indegree'] = source_feats[:,0]
test['Source_outdegree'] = source_feats[:,1]

test['Common_in'] = edge_feats[:,0]
test['Common_out'] = edge_feats[:,1]

Creating graph
Generating vertex features
0 615512
0.0
10000 615512
1.226555585861206
20000 615512
1.365011215209961
30000 615512
679.5116457939148
Wall time: 11min 28s


In [152]:
train.head()

Unnamed: 0,Target,Source,Edge,Title overlap,Abstract overlap,Temporal difference,Common authors,Common journal,Cosine similarity,Authors in abstract,...,Ressource allocation,Jaccard coefficient,Adamic Adar,Preferential attachment,Target_indegree,Target_outdegree,Source_indegree,Source_outdegree,Common_in,Common_out
0,9510123,9502114,1,2,4,0,0,1,0.039132,0,...,0.142857,0.058824,0.513898,72,3.0,3.0,8.0,4.0,0.0,0.0
1,9707075,9604178,1,1,7,1,0,0,0.015247,0,...,0.226401,0.097087,4.320366,11613,11.0,68.0,124.0,23.0,0.0,0.0
2,9312155,9506142,0,0,6,-2,0,0,0.008888,0,...,0.0,0.0,0.0,5,1.0,0.0,2.0,3.0,0.0,0.0
3,9911255,302165,0,0,8,-4,0,0,0.00474,0,...,0.0,0.0,0.0,280,4.0,16.0,2.0,12.0,0.0,0.0
4,9701033,209076,0,0,8,-5,0,0,0.027379,0,...,0.0,0.0,0.0,168,7.0,0.0,2.0,22.0,0.0,0.0


In [153]:
test.head()

Unnamed: 0,Target,Source,Title overlap,Abstract overlap,Temporal difference,Common authors,Common journal,Cosine similarity,Authors in abstract,LSA distance,...,Ressource allocation,Jaccard coefficient,Adamic Adar,Preferential attachment,Target_indegree,Target_outdegree,Source_indegree,Source_outdegree,Common_in,Common_out
0,9807076,9807139,0,7,0,0,0,0.055452,0,0.149897,...,0.0,0.0,0.0,1062,49.0,10.0,3.0,15.0,0.0,0.0
1,109162,1182,2,6,1,0,1,0.11067,0,0.246126,...,0.311535,0.074303,5.377973,13590,100.0,203.0,39.0,6.0,0.0,0.0
2,9702187,9510135,1,4,2,0,1,0.043831,0,0.280983,...,1.342594,0.065338,15.053612,164797,209.0,14.0,726.0,13.0,0.0,0.0
3,111048,110115,1,13,0,0,1,0.054856,0,0.246031,...,0.298419,0.221053,4.899424,3315,11.0,40.0,16.0,49.0,0.0,0.0
4,9910176,9410073,0,4,5,0,0,0.147222,0,0.20347,...,0.0,0.0,0.0,1050,1.0,6.0,144.0,6.0,0.0,0.0


In [30]:
train.head()

Unnamed: 0,Target,Source,Edge,Title overlap,Abstract overlap,Temporal difference,Common authors,Common journal,Cosine similarity,Authors in abstract,...,Ressource allocation,Jaccard coefficient,Adamic Adar,Preferential attachment,Target_indegree,Target_outdegree,Source_indegree,Source_outdegree,Common_in,Common_out
0,9510123,9502114,1,2,4,0,0,1,0.039132,0,...,0.142857,0.058824,0.513898,72,3.0,3.0,8.0,4.0,0.0,0.0
1,9707075,9604178,1,1,7,1,0,0,0.015247,0,...,0.226401,0.097087,4.320366,11613,11.0,68.0,124.0,23.0,0.0,0.0
2,9312155,9506142,0,0,6,-2,0,0,0.008888,0,...,0.0,0.0,0.0,5,1.0,0.0,2.0,3.0,0.0,0.0
3,9911255,302165,0,0,8,-4,0,0,0.00474,0,...,0.0,0.0,0.0,280,4.0,16.0,2.0,12.0,0.0,0.0
4,9701033,209076,0,0,8,-5,0,0,0.027379,0,...,0.0,0.0,0.0,168,7.0,0.0,2.0,22.0,0.0,0.0


In [7]:
train.to_csv('train_complete.csv',index=False)
test.to_csv('test_complete.csv',index=False)
# train = pd.read_csv('train_complete.csv',header=0)
# test = pd.read_csv('test_complete.csv',header=0)