# Link prediction in citation networks : feature engineering

**Plan :**

[1. Loading the libraries and the data](#1)  
[2. Feature engineering](#2)  
> [2A - Semantic features](#2a)  
> [2B - Topological features](#2b)  
> [2C - Defining the training and testing features, and the labels](#2c) 

[3. Classification with a basic SVM](#3)

<a id="1"></a>
## 1. Loading the libraries and the data

### Loading the libraries

In [1]:
import random
import numpy as np
import igraph
from sklearn import svm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.decomposition import TruncatedSVD, NMF
from sklearn import preprocessing
from sklearn import cross_validation
import nltk
import csv
import pandas as pd
import xgboost as xgb
import matplotlib.pyplot as plt
import networkx as nx
from igraph.clustering import *
from time import time



In [2]:
# Setting the seed for the rest of the work
random.seed(0)

### Loading the data

In [3]:
nltk.download('punkt') # for tokenization
nltk.download('stopwords')
stpwds = set(nltk.corpus.stopwords.words("english"))
stemmer = nltk.stem.PorterStemmer()

with open("data/testing_set.txt", "r") as f:
    reader = csv.reader(f)
    testing_set  = list(reader)

testing_set = [element[0].split(" ") for element in testing_set]

###################
# random baseline #
###################

random_predictions = np.random.choice([0, 1], size=len(testing_set))
random_predictions = zip(range(len(testing_set)),random_predictions)

with open("data/random_predictions.csv","wb") as pred:
    csv_out = csv.writer(pred)
    for row in random_predictions:
        csv_out.writerow(row)
        
# note: Kaggle requires that you add "ID" and "category" column headers

###############################
# beating the random baseline #
###############################

# the following script gets an F1 score of approximately 0.66

# data loading and preprocessing 

# the columns of the data frame below are: 
# (1) paper unique ID (integer)
# (2) publication year (integer)
# (3) paper title (string)
# (4) authors (strings separated by ,)
# (5) name of journal (optional) (string)
# (6) abstract (string) - lowercased, free of punctuation except intra-word dashes

with open("data/training_set.txt", "r") as f:
    reader = csv.reader(f)
    training_set  = list(reader)

training_set = [element[0].split(" ") for element in training_set]

with open("data/node_information.csv", "r") as f:
    reader = csv.reader(f)
    node_info  = list(reader)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/delavergne/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/delavergne/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
node_information = pd.read_csv('data/node_information.csv', header=None, names=['ID', 'Year', 'Title', 'Authors', 'Journal', 'Abstract'])
train = pd.read_csv('data/training_set.txt', header=None, names=['Target', 'Source', 'Edge'], delim_whitespace=True)
test = pd.read_csv('data/testing_set.txt', header=None, names=['Target', 'Source'], delim_whitespace=True)

In [5]:
node_information.head()

Unnamed: 0,ID,Year,Title,Authors,Journal,Abstract
0,1001,2000,compactification geometry and duality,Paul S. Aspinwall,,these are notes based on lectures given at tas...
1,1002,2000,domain walls and massive gauged supergravity p...,"M. Cvetic, H. Lu, C.N. Pope",Class.Quant.Grav.,we point out that massive gauged supergravity ...
2,1003,2000,comment on metric fluctuations in brane worlds,"Y.S. Myung, Gungwon Kang",,recently ivanov and volovich hep-th 9912242 cl...
3,1004,2000,moving mirrors and thermodynamic paradoxes,Adam D. Helfer,Phys.Rev.,quantum fields responding to moving mirrors ha...
4,1005,2000,bundles of chiral blocks and boundary conditio...,"J. Fuchs, C. Schweigert",,proceedings of lie iii clausthal july 1999 var...


<a id="2"></a>
## 2. Feature engineering

<a id="2a"></a>
### A - Semantic features

In [6]:
%%time
IDs = [element[0] for element in node_info]

# compute TFIDF vector of each paper
corpus = [element[5] for element in node_info]
vectorizer = TfidfVectorizer(stop_words="english", ngram_range=(1,2), min_df=2)
features_TFIDF = vectorizer.fit_transform(corpus)
lsa=TruncatedSVD(n_components=25,n_iter=5)
LSA = lsa.fit_transform(features_TFIDF)

CPU times: user 16.7 s, sys: 752 ms, total: 17.4 s
Wall time: 16.3 s


In [7]:
tfidf_matrix = features_TFIDF.toarray()

#### Creation of the features

In [None]:
#for each training example we need to compute features
# in this baseline we will train the model on only 5% of the training set
# randomly select 5% of training set
#to_keep = random.sample(range(len(training_set)), k=int(round(len(training_set)*1.0)))
#training_set_reduced = [training_set[i] for i in to_keep]

In [9]:
%%time
# we will use three basic features:

# number of overlapping words in title
overlap_title = []

# number of overlapping words in abstract
overlap_abstract = []

# temporal distance between the papers
temp_diff = []

# number of common authors
comm_auth = []

# is in the same journal
comm_journal = []

# Cosine similarity between abstracts
cosine_similarity = []

# Sum of authors in abstract
author_abstract = []

# LSA distance
lsa_distance_euc = []

counter = 0

#preparation phase : 
for i in xrange(len(training_set)):
#for i in xrange(len(training_set_reduced)):
    source = training_set[i][0]
    target = training_set[i][1]
    #source = training_set_reduced[i][0]
    #target = training_set_reduced[i][1]

    index_source = IDs.index(source)
    index_target = IDs.index(target)
    
    source_info = [element for element in node_info if element[0]==source][0]
    target_info = [element for element in node_info if element[0]==target][0]
    
    # convert to lowercase and tokenize
    source_title = source_info[2].lower().split(" ")
    # remove stopwords
    source_title = [token for token in source_title if token not in stpwds]
    source_title = [stemmer.stem(token) for token in source_title]
    
    target_title = target_info[2].lower().split(" ")
    target_title = [token for token in target_title if token not in stpwds]
    target_title = [stemmer.stem(token) for token in target_title]
    
    source_abstract = source_info[5].lower().split(" ")
    source_abstract = [token for token in source_abstract if token not in stpwds]
    source_abstract = [stemmer.stem(token) for token in source_abstract]
    
    target_abstract = target_info[5].lower().split(" ")
    target_abstract = [token for token in target_abstract if token not in stpwds]
    target_abstract = [stemmer.stem(token) for token in target_abstract]
    
    source_auth = source_info[3].split(",")
    target_auth = target_info[3].split(",")
    source_auths = set(source_auth)
    target_auths = set(target_auth)
    
    source_journal = source_info[4].split(".")
    target_journal = target_info[4].split(".")
    
    tfidf_source = tfidf_matrix[index_source,:]
    tfidf_target = tfidf_matrix[index_target, :]
    cosine_simil = tfidf_source.dot(tfidf_target)/(np.linalg.norm(tfidf_source)*np.linalg.norm(tfidf_target))

    author_abstract_count =0
    author_abstract_count += len(source_auths.intersection(target_abstract))
    author_abstract_count += len(target_auths.intersection(source_abstract))
    
    overlap_title.append(len(set(source_title).intersection(set(target_title))))
    overlap_abstract.append(len(set(source_abstract).intersection(set(target_abstract))))
    temp_diff.append(int(source_info[1]) - int(target_info[1]))
    comm_auth.append(len(set(source_auth).intersection(set(target_auth))))
    comm_journal.append(int(source_journal == target_journal))
    cosine_similarity.append(cosine_simil)
    author_abstract.append(author_abstract_count)
    lsa_distance_euc.append(np.linalg.norm(LSA[index_source]-LSA[index_target] ))
   
    counter += 1
    if counter % 10000 == True:
        print counter, "training examples processsed"

1 training examples processsed
10001 training examples processsed
20001 training examples processsed
30001 training examples processsed
40001 training examples processsed
50001 training examples processsed
60001 training examples processsed
70001 training examples processsed
80001 training examples processsed
90001 training examples processsed
100001 training examples processsed
110001 training examples processsed
120001 training examples processsed
130001 training examples processsed
140001 training examples processsed
150001 training examples processsed
160001 training examples processsed
170001 training examples processsed
180001 training examples processsed
190001 training examples processsed
200001 training examples processsed
210001 training examples processsed
220001 training examples processsed
230001 training examples processsed
240001 training examples processsed
250001 training examples processsed
260001 training examples processsed
270001 training examples processsed
280001

In [10]:
train['Title overlap'] = overlap_title
train['Abstract overlap'] = overlap_abstract
train['Temporal difference'] = temp_diff
train['Common authors'] = comm_auth
train['Common journal'] = comm_journal
train['Cosine similarity'] = cosine_similarity
train['Authors in abstract'] = author_abstract
train['LSA distance'] = lsa_distance_euc

In [12]:
%%time
# we will use three basic features:

# number of overlapping words in title
overlap_title_test = []

# number of overlapping words in abstract
overlap_abstract_test = []

# temporal distance between the papers
temp_diff_test = []

# number of common authors
comm_auth_test = []

# is in the same journal
comm_journal_test = []

# Cosine similarity between abstracts
cosine_similarity_test = []

# Sum of authors in abstract
author_abstract_test = []

# LSA distance
lsa_distance_euc_test = []

counter = 0

#preparation phase : 
for i in xrange(len(testing_set)):
#for i in xrange(len(testing_set_reduced)):
    source = testing_set[i][0]
    target = testing_set[i][1]
    #source = testing_set_reduced[i][0]
    #target = testing_set_reduced[i][1]

    index_source = IDs.index(source)
    index_target = IDs.index(target)
    
    source_info = [element for element in node_info if element[0]==source][0]
    target_info = [element for element in node_info if element[0]==target][0]
    
    # convert to lowercase and tokenize
    source_title = source_info[2].lower().split(" ")
    # remove stopwords
    source_title = [token for token in source_title if token not in stpwds]
    source_title = [stemmer.stem(token) for token in source_title]
    
    target_title = target_info[2].lower().split(" ")
    target_title = [token for token in target_title if token not in stpwds]
    target_title = [stemmer.stem(token) for token in target_title]
    
    source_abstract = source_info[5].lower().split(" ")
    source_abstract = [token for token in source_abstract if token not in stpwds]
    source_abstract = [stemmer.stem(token) for token in source_abstract]
    
    target_abstract = target_info[5].lower().split(" ")
    target_abstract = [token for token in target_abstract if token not in stpwds]
    target_abstract = [stemmer.stem(token) for token in target_abstract]
    
    source_auth = source_info[3].split(",")
    target_auth = target_info[3].split(",")
    source_auths = set(source_auth)
    target_auths = set(target_auth)
    
    source_journal = source_info[4].split(".")
    target_journal = target_info[4].split(".")
    
    tfidf_source = tfidf_matrix[index_source,:]
    tfidf_target = tfidf_matrix[index_target, :]
    cosine_simil = tfidf_source.dot(tfidf_target)/(np.linalg.norm(tfidf_source)*np.linalg.norm(tfidf_target))

    author_abstract_count =0
    author_abstract_count += len(source_auths.intersection(target_abstract))
    author_abstract_count += len(target_auths.intersection(source_abstract))
    
    overlap_title_test.append(len(set(source_title).intersection(set(target_title))))
    overlap_abstract_test.append(len(set(source_abstract).intersection(set(target_abstract))))
    temp_diff_test.append(int(source_info[1]) - int(target_info[1]))
    comm_auth_test.append(len(set(source_auth).intersection(set(target_auth))))
    comm_journal_test.append(int(source_journal == target_journal))
    cosine_similarity_test.append(cosine_simil)
    author_abstract_test.append(author_abstract_count)
    lsa_distance_euc_test.append(np.linalg.norm(LSA[index_source]-LSA[index_target] ))
   
    counter += 1
    if counter % 1000 == True:
        print counter, "testing examples processsed"

1 testing examples processsed
1001 testing examples processsed
2001 testing examples processsed
3001 testing examples processsed
4001 testing examples processsed
5001 testing examples processsed
6001 testing examples processsed
7001 testing examples processsed
8001 testing examples processsed
9001 testing examples processsed
10001 testing examples processsed
11001 testing examples processsed
12001 testing examples processsed
13001 testing examples processsed
14001 testing examples processsed
15001 testing examples processsed
16001 testing examples processsed
17001 testing examples processsed
18001 testing examples processsed
19001 testing examples processsed
20001 testing examples processsed
21001 testing examples processsed
22001 testing examples processsed
23001 testing examples processsed
24001 testing examples processsed
25001 testing examples processsed
26001 testing examples processsed
27001 testing examples processsed
28001 testing examples processsed
29001 testing examples proc

In [13]:
test['Title overlap'] = overlap_title_test
test['Abstract overlap'] = overlap_abstract_test
test['Temporal difference'] = temp_diff_test
test['Common authors'] = comm_auth_test
test['Common journal'] = comm_journal_test
test['Cosine similarity'] = cosine_similarity_test
test['Authors in abstract'] = author_abstract_test
test['LSA distance'] = lsa_distance_euc_test

In [14]:
train.to_csv('train_semantic.csv')
test.to_csv('test_semantic.csv')

In [15]:
train.head()

Unnamed: 0,Target,Source,Edge,Title overlap,Abstract overlap,Temporal difference,Common authors,Common journal,Cosine similarity,Authors in abstract,LSA distance
0,9510123,9502114,1,2,4,0,0,1,0.039132,0,0.176262
1,9707075,9604178,1,1,7,1,0,0,0.015247,0,0.185291
2,9312155,9506142,0,0,6,-2,0,0,0.008888,0,0.281976
3,9911255,302165,0,0,8,-4,0,0,0.00474,0,0.306004
4,9701033,209076,0,0,8,-5,0,0,0.027379,0,0.218788


In [16]:
test.head()

Unnamed: 0,Target,Source,Title overlap,Abstract overlap,Temporal difference,Common authors,Common journal,Cosine similarity,Authors in abstract,LSA distance
0,9807076,9807139,0,7,0,0,0,0.055452,0,0.137422
1,109162,1182,2,6,1,0,1,0.11067,0,0.246302
2,9702187,9510135,1,4,2,0,1,0.043831,0,0.281266
3,111048,110115,1,13,0,0,1,0.054856,0,0.251033
4,9910176,9410073,0,4,5,0,0,0.147222,0,0.199901


<a id="2b"></a>
### B - Topological features 

#### Using basic igraph library

In [17]:
## the following shows how to construct a graph with igraph
## even though in this baseline we don't use it
## look at http://igraph.org/python/doc/igraph.Graph-class.html for feature ideas
edges = [(element[0],element[1]) for element in training_set if element[2]=="1"]

## some nodes may not be connected to any other node
## hence the need to create the nodes of the graph from node_info.csv,
## not just from the edge list
nodes = IDs

#create empty directed graph
g = igraph.Graph(directed=True)
 
## add vertices
g.add_vertices(nodes)
 
## add edges
g.add_edges(edges)

In [18]:
%%time
betweenness_info = g.betweenness()

CPU times: user 1min 14s, sys: 554 ms, total: 1min 14s
Wall time: 1min 15s


In [19]:
%%time
communities = g.community_leading_eigenvector()
cluster_info = communities.membership

CPU times: user 38.3 s, sys: 234 ms, total: 38.5 s
Wall time: 38.6 s


  membership, _, q = GraphBase.community_leading_eigenvector(self, clusters, **kwds)


In [20]:
pageranks=g.personalized_pagerank(damping=0.5)

In [21]:
%%time

# Betweenness centrality
bet_centrality = []

# Is same cluster
is_same_cluster = []


#Page rank
page_rank=[]

counter = 0

#preparation phase : 
for i in xrange(len(training_set)):
#for i in xrange(len(training_set_reduced)):
    source = training_set[i][0]
    target = training_set[i][1]
    #source = training_set_reduced[i][0]
    #target = training_set_reduced[i][1]

    index_source = IDs.index(source)
    index_target = IDs.index(target)
    
    betweenness_source = betweenness_info[index_source]
    betweenness_target = betweenness_info[index_target]
    
    source_page_rank=pageranks[index_source]
    target_page_rank=pageranks[index_target]
    
    bet_centrality.append(betweenness_source - betweenness_target)
    is_same_cluster.append(int(cluster_info[index_source] == cluster_info[index_target]))
    page_rank.append(source_page_rank+target_page_rank)
   
    counter += 1
    if counter % 10000 == True:
        print counter, "training examples processsed"

1 training examples processsed
10001 training examples processsed
20001 training examples processsed
30001 training examples processsed
40001 training examples processsed
50001 training examples processsed
60001 training examples processsed
70001 training examples processsed
80001 training examples processsed
90001 training examples processsed
100001 training examples processsed
110001 training examples processsed
120001 training examples processsed
130001 training examples processsed
140001 training examples processsed
150001 training examples processsed
160001 training examples processsed
170001 training examples processsed
180001 training examples processsed
190001 training examples processsed
200001 training examples processsed
210001 training examples processsed
220001 training examples processsed
230001 training examples processsed
240001 training examples processsed
250001 training examples processsed
260001 training examples processsed
270001 training examples processsed
280001

In [22]:
train['Betweenness centrality'] = bet_centrality
train['Same cluster'] = is_same_cluster
train['Page rank'] = page_rank

In [23]:
%%time

# Betweenness centrality
bet_centrality_test = []

# Is same cluster
is_same_cluster_test = []


#Page rank
page_rank_test=[]

counter = 0

#preparation phase : 
for i in xrange(len(testing_set)):
#for i in xrange(len(testing_set_reduced)):
    source = testing_set[i][0]
    target = testing_set[i][1]
    #source = testing_set_reduced[i][0]
    #target = testing_set_reduced[i][1]

    index_source = IDs.index(source)
    index_target = IDs.index(target)
    
    betweenness_source = betweenness_info[index_source]
    betweenness_target = betweenness_info[index_target]
    
    source_page_rank=pageranks[index_source]
    target_page_rank=pageranks[index_target]
    
    bet_centrality_test.append(betweenness_source - betweenness_target)
    is_same_cluster_test.append(int(cluster_info[index_source] == cluster_info[index_target]))
    page_rank_test.append(source_page_rank+target_page_rank)
   
    counter += 1
    if counter % 1000 == True:
        print counter, "testing examples processsed"

1 testing examples processsed
1001 testing examples processsed
2001 testing examples processsed
3001 testing examples processsed
4001 testing examples processsed
5001 testing examples processsed
6001 testing examples processsed
7001 testing examples processsed
8001 testing examples processsed
9001 testing examples processsed
10001 testing examples processsed
11001 testing examples processsed
12001 testing examples processsed
13001 testing examples processsed
14001 testing examples processsed
15001 testing examples processsed
16001 testing examples processsed
17001 testing examples processsed
18001 testing examples processsed
19001 testing examples processsed
20001 testing examples processsed
21001 testing examples processsed
22001 testing examples processsed
23001 testing examples processsed
24001 testing examples processsed
25001 testing examples processsed
26001 testing examples processsed
27001 testing examples processsed
28001 testing examples processsed
29001 testing examples proc

In [24]:
test['Betweenness centrality'] = bet_centrality_test
test['Same cluster'] = is_same_cluster_test
test['Page rank'] = page_rank_test

In [25]:
train.to_csv('train_semantic_topo1.csv')
test.to_csv('test_semantic_topo1.csv')

In [26]:
train.head()

Unnamed: 0,Target,Source,Edge,Title overlap,Abstract overlap,Temporal difference,Common authors,Common journal,Cosine similarity,Authors in abstract,LSA distance,Betweenness centrality,Same cluster,Page rank
0,9510123,9502114,1,2,4,0,0,1,0.039132,0,0.176262,8166.884091,0,5.8e-05
1,9707075,9604178,1,1,7,1,0,0,0.015247,0,0.185291,31162.082411,1,0.000144
2,9312155,9506142,0,0,6,-2,0,0,0.008888,0,0.281976,-10559.734281,1,4.9e-05
3,9911255,302165,0,0,8,-4,0,0,0.00474,0,0.306004,611.223395,0,4.4e-05
4,9701033,209076,0,0,8,-5,0,0,0.027379,0,0.218788,-501.379284,1,0.000157


In [27]:
test.head()

Unnamed: 0,Target,Source,Title overlap,Abstract overlap,Temporal difference,Common authors,Common journal,Cosine similarity,Authors in abstract,LSA distance,Betweenness centrality,Same cluster,Page rank
0,9807076,9807139,0,7,0,0,0,0.055452,0,0.137422,213620.5,0,8.6e-05
1,109162,1182,2,6,1,0,1,0.11067,0,0.246302,1121793.0,1,0.00016
2,9702187,9510135,1,4,2,0,1,0.043831,0,0.281266,230572.7,0,0.001309
3,111048,110115,1,13,0,0,1,0.054856,0,0.251033,698948.6,1,5.2e-05
4,9910176,9410073,0,4,5,0,0,0.147222,0,0.199901,-26005.92,0,0.000345


#### Using the more advanced library networkx

In [28]:
def create_graph(X, y):
    graph = nx.Graph()
    edges=[]
    nodes=set()
    for i in range(len(X)):
        source = X[i][0]
        target = X[i][1]
        nodes.add(source)
        nodes.add(target)
        if y[i]==1:
            edges.append((source, target))
    graph.add_nodes_from(nodes)
    graph.add_edges_from(edges)
    return graph

def create_directed_graph(X, y):
    graph = nx.DiGraph()
    edges=[]
    nodes=set()
    for i in range(len(X)):
        source = X[i][0]
        target = X[i][1]
        nodes.add(source)
        nodes.add(target)
        if y[i]==1:
            edges.append((source, target))
    graph.add_nodes_from(nodes)
    graph.add_edges_from(edges)
    return graph

In [29]:
%%time
X = train[['Target', 'Source']].values
y = train[['Edge']].values
graph = create_graph(X,y) 

res_alloc_index = np.asarray(list(nx.resource_allocation_index(graph, X)))[:,2]
jac_coef = np.asarray(list(nx.jaccard_coefficient(graph, X)))[:,2]
ad_adar_idx = np.asarray(list(nx.adamic_adar_index(graph, X)))[:,2]
pref_att = np.asarray(list(nx.preferential_attachment(graph, X)))[:,2]

train['Ressource allocation'] = list(res_alloc_index)
train['Jaccard coefficient'] = list(jac_coef)
train['Adamic Adar'] = list(ad_adar_idx)
train['Preferential attachment'] = list(pref_att)

CPU times: user 5min 49s, sys: 6.5 s, total: 5min 56s
Wall time: 5min 53s


In [30]:
%%time
X_test = test[['Target', 'Source']].values

res_alloc_index_test = np.asarray(list(nx.resource_allocation_index(graph, X_test)))[:,2]
jac_coef_test = np.asarray(list(nx.jaccard_coefficient(graph, X_test)))[:,2]
ad_adar_idx_test = np.asarray(list(nx.adamic_adar_index(graph, X_test)))[:,2]
pref_att_test = np.asarray(list(nx.preferential_attachment(graph, X_test)))[:,2]

test['Ressource allocation'] = list(res_alloc_index_test)
test['Jaccard coefficient'] = list(jac_coef_test)
test['Adamic Adar'] = list(ad_adar_idx_test)
test['Preferential attachment'] = list(pref_att_test)

CPU times: user 17.6 s, sys: 385 ms, total: 18 s
Wall time: 17.8 s


In [31]:
train.to_csv('train_semantic_topo2.csv')
test.to_csv('test_semantic_topo2.csv')

In [32]:
train.head()

Unnamed: 0,Target,Source,Edge,Title overlap,Abstract overlap,Temporal difference,Common authors,Common journal,Cosine similarity,Authors in abstract,LSA distance,Betweenness centrality,Same cluster,Page rank,Ressource allocation,Jaccard coefficient,Adamic Adar,Preferential attachment
0,9510123,9502114,1,2,4,0,0,1,0.039132,0,0.176262,8166.884091,0,5.8e-05,0.142857,0.058824,0.513898,72
1,9707075,9604178,1,1,7,1,0,0,0.015247,0,0.185291,31162.082411,1,0.000144,0.226401,0.097087,4.320366,11613
2,9312155,9506142,0,0,6,-2,0,0,0.008888,0,0.281976,-10559.734281,1,4.9e-05,0.0,0.0,0.0,5
3,9911255,302165,0,0,8,-4,0,0,0.00474,0,0.306004,611.223395,0,4.4e-05,0.0,0.0,0.0,280
4,9701033,209076,0,0,8,-5,0,0,0.027379,0,0.218788,-501.379284,1,0.000157,0.0,0.0,0.0,168


In [33]:
test.head()

Unnamed: 0,Target,Source,Title overlap,Abstract overlap,Temporal difference,Common authors,Common journal,Cosine similarity,Authors in abstract,LSA distance,Betweenness centrality,Same cluster,Page rank,Ressource allocation,Jaccard coefficient,Adamic Adar,Preferential attachment
0,9807076,9807139,0,7,0,0,0,0.055452,0,0.137422,213620.5,0,8.6e-05,0.0,0.0,0.0,1062
1,109162,1182,2,6,1,0,1,0.11067,0,0.246302,1121793.0,1,0.00016,0.311535,0.074303,5.377973,13590
2,9702187,9510135,1,4,2,0,1,0.043831,0,0.281266,230572.7,0,0.001309,1.342594,0.065338,15.053612,164797
3,111048,110115,1,13,0,0,1,0.054856,0,0.251033,698948.6,1,5.2e-05,0.298419,0.221053,4.899424,3315
4,9910176,9410073,0,4,5,0,0,0.147222,0,0.199901,-26005.92,0,0.000345,0.0,0.0,0.0,1050


In [34]:
def neighbor_calc(graph, v):
    neighbors_in = graph.predecessors(v)
    neighbors_out = graph.successors(v)
    neighbors = list(set(neighbors_in).union(neighbors_out))
   
    return graph.in_degree(v), graph.out_degree(v), neighbors_in, neighbors_out, neighbors
        
            
X = train[['Target', 'Source']].values
y = train[['Edge']].values
target_feats=np.empty((train.shape[0], 2))
source_feats=np.empty((train.shape[0], 2))
edge_feats=np.empty((train.shape[0], 2))
print("Creating graph")
graph = create_directed_graph(X, y)
print("Generating vertex features")
l = X.shape[0]
t1 = time()
for i, x in enumerate(X):
    t=x[0]
    s=x[1]
    in_d_t, out_d_t, n_in_t, n_out_t, n_t = neighbor_calc(graph, t)
    in_d_s, out_d_s, n_in_s, n_out_s, n_s = neighbor_calc(graph, s)
    com_in = len(set(n_in_t).intersection(n_in_s))
    com_on = len(set(n_out_t).intersection(n_out_s))

    target_feats[i]=[in_d_t, out_d_t]
    source_feats[i]=[in_d_s, out_d_s]
    edge_feats[i]=[com_in, com_on]
    if i%10000==0:
        print(i, l)
        t2=time()
        print(t2-t1)
        t1=t2

train['Target_indegree'] = target_feats[:,0]
train['Target_outdegree'] = target_feats[:,1]

train['Source_indegree'] = source_feats[:,0]
train['Source_outdegree'] = source_feats[:,1]

train['Common_in'] = edge_feats[:,0]
train['Common_out'] = edge_feats[:,1]

Creating graph
Generating vertex features
(0, 615512)
0.000660181045532
(10000, 615512)
0.658829927444
(20000, 615512)
0.656039953232
(30000, 615512)
0.650640964508
(40000, 615512)
0.653007030487
(50000, 615512)
0.660380125046
(60000, 615512)
0.660248041153
(70000, 615512)
0.656295776367
(80000, 615512)
0.655925035477
(90000, 615512)
0.665270090103
(100000, 615512)
0.65466094017
(110000, 615512)
0.669471025467
(120000, 615512)
0.745337963104
(130000, 615512)
0.766157150269
(140000, 615512)
0.765100955963
(150000, 615512)
0.759486913681
(160000, 615512)
0.732270002365
(170000, 615512)
0.752366065979
(180000, 615512)
0.749993085861
(190000, 615512)
0.749178886414
(200000, 615512)
0.760818004608
(210000, 615512)
0.742151975632
(220000, 615512)
0.751518011093
(230000, 615512)
0.754862070084
(240000, 615512)
0.745887994766
(250000, 615512)
0.756701946259
(260000, 615512)
0.815802097321
(270000, 615512)
0.719415903091
(280000, 615512)
0.840296983719
(290000, 615512)
0.740258932114
(300000, 6

In [36]:
%%time
X_test = test[['Target', 'Source']].values
target_feats=np.empty((test.shape[0], 2))
source_feats=np.empty((test.shape[0], 2))
edge_feats=np.empty((test.shape[0], 2))
t1 = time()
print("Creating graph")
graph = create_directed_graph(X, y)
print("Generating vertex features")
t1 = time()
for i, x in enumerate(X_test):
    t=x[0]
    s=x[1]
    in_d_t, out_d_t, n_in_t, n_out_t, n_t = neighbor_calc(graph, t)
    in_d_s, out_d_s, n_in_s, n_out_s, n_s = neighbor_calc(graph, s)
    com_in = len(set(n_in_t).intersection(n_in_s))
    com_on = len(set(n_out_t).intersection(n_out_s))

    target_feats[i]=[in_d_t, out_d_t]
    source_feats[i]=[in_d_s, out_d_s]
    edge_feats[i]=[com_in, com_on]
    if i%10000==0:
        print(i, l)
        t2=time()
        print(t2-t1)
        t1=t2
        
test['Target_indegree'] = target_feats[:,0]
test['Target_outdegree'] = target_feats[:,1]

test['Source_indegree'] = source_feats[:,0]
test['Source_outdegree'] = source_feats[:,1]

test['Common_in'] = edge_feats[:,0]
test['Common_out'] = edge_feats[:,1]

Creating graph
Generating vertex features
(0, 615512)
0.000159025192261
(10000, 615512)
0.765555858612
(20000, 615512)
0.757110118866
(30000, 615512)
0.744807004929
CPU times: user 5.9 s, sys: 504 ms, total: 6.41 s
Wall time: 6.15 s


In [37]:
train.head()

Unnamed: 0,Target,Source,Edge,Title overlap,Abstract overlap,Temporal difference,Common authors,Common journal,Cosine similarity,Authors in abstract,...,Ressource allocation,Jaccard coefficient,Adamic Adar,Preferential attachment,Target_indegree,Target_outdegree,Source_indegree,Source_outdegree,Common_in,Common_out
0,9510123,9502114,1,2,4,0,0,1,0.039132,0,...,0.142857,0.058824,0.513898,72,3.0,3.0,8.0,4.0,0.0,0.0
1,9707075,9604178,1,1,7,1,0,0,0.015247,0,...,0.226401,0.097087,4.320366,11613,11.0,68.0,124.0,23.0,0.0,0.0
2,9312155,9506142,0,0,6,-2,0,0,0.008888,0,...,0.0,0.0,0.0,5,1.0,0.0,2.0,3.0,0.0,0.0
3,9911255,302165,0,0,8,-4,0,0,0.00474,0,...,0.0,0.0,0.0,280,4.0,16.0,2.0,12.0,0.0,0.0
4,9701033,209076,0,0,8,-5,0,0,0.027379,0,...,0.0,0.0,0.0,168,7.0,0.0,2.0,22.0,0.0,0.0


In [38]:
test.head()

Unnamed: 0,Target,Source,Title overlap,Abstract overlap,Temporal difference,Common authors,Common journal,Cosine similarity,Authors in abstract,LSA distance,...,Ressource allocation,Jaccard coefficient,Adamic Adar,Preferential attachment,Target_indegree,Target_outdegree,Source_indegree,Source_outdegree,Common_in,Common_out
0,9807076,9807139,0,7,0,0,0,0.055452,0,0.137422,...,0.0,0.0,0.0,1062,49.0,10.0,3.0,15.0,0.0,0.0
1,109162,1182,2,6,1,0,1,0.11067,0,0.246302,...,0.311535,0.074303,5.377973,13590,100.0,203.0,39.0,6.0,0.0,0.0
2,9702187,9510135,1,4,2,0,1,0.043831,0,0.281266,...,1.342594,0.065338,15.053612,164797,209.0,14.0,726.0,13.0,0.0,0.0
3,111048,110115,1,13,0,0,1,0.054856,0,0.251033,...,0.298419,0.221053,4.899424,3315,11.0,40.0,16.0,49.0,0.0,0.0
4,9910176,9410073,0,4,5,0,0,0.147222,0,0.199901,...,0.0,0.0,0.0,1050,1.0,6.0,144.0,6.0,0.0,0.0


In [39]:
train.to_csv('train_complete.csv')
test.to_csv('test_complete.csv')

<a id="2c"></a>
### C. Defining the training and testing features, and the labels

In [40]:
features = ['Title overlap', 'Abstract overlap', 'Temporal difference', 'Common authors', 'Common journal',
            'Cosine similarity', 'Authors in abstract', 'LSA distance', # Semantic features
            'Betweenness centrality', 'Same cluster', 'Page rank', 'Ressource allocation', 'Jaccard coefficient',
            'Adamic Adar', 'Preferential attachment', 'Target_indegree', 'Target_outdegree',
            'Source_indegree', 'Source_outdegree', 'Common_in', 'Common_out' #Topological features
           ]

In [41]:
training_features = train[features]
testing_features = test[features]

In [42]:
labels_array = train['Edge']

In [43]:
print(training_features.shape)
print(testing_features.shape)

(615512, 21)
(32648, 21)


<a id="3"></a>
## 3. Classification with a basic SVM

### Scaling the data

In [44]:
min_max_scaler = preprocessing.MinMaxScaler()
training_features = min_max_scaler.fit_transform(training_features)
testing_features = min_max_scaler.transform(testing_features)

### SVM

In [45]:
# initialize basic SVM
classifier = svm.LinearSVC()

In [46]:
%%time
np.mean(cross_validation.cross_val_score(classifier, training_features, labels_array, cv=3, scoring ="f1"))

CPU times: user 22.1 s, sys: 776 ms, total: 22.9 s
Wall time: 23.3 s


0.96441523689045106

In [47]:
%%time
#train
classifier.fit(training_features, labels_array)

CPU times: user 11.4 s, sys: 224 ms, total: 11.6 s
Wall time: 11.9 s


LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [48]:
# issue predictions
predictions_SVM = list(classifier.predict(testing_features))

# write predictions to .csv file suitable for Kaggle (just make sure to add the column names)
predictions_SVM = zip(range(len(testing_set)), predictions_SVM)
test = pd.DataFrame(predictions_SVM)
sub = test.copy()
sub['id']=sub.index
sub['category'] = sub[1]
sub = sub[['id', 'category']]

In [49]:
sub

Unnamed: 0,id,category
0,0,0
1,1,1
2,2,1
3,3,1
4,4,1
5,5,0
6,6,0
7,7,1
8,8,0
9,9,1


In [32]:
#sub.to_csv('sub/sub_SVM.csv', index=False)