In [1]:
%matplotlib notebook
%pylab inline 
import csv
import nltk
import numpy as np
import networkx as nx
import random
nltk.download('punkt')
nltk.download('stopwords')
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

Populating the interactive namespace from numpy and matplotlib


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rkroc\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rkroc\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### loading the node_information data, train_data and the test data

In [104]:
 ## Loads the node information
with open("node_info.csv", "r") as f:
    next(f)
    file = csv.reader(f)
    node = list(file)

ID = [int(i[0]) for i in node]
year=[int(i[1]) for i in node]
title=[i[2] for i in node]
authors=[i[3] for i in node]
name_journal=[i[4] for i in node]
abstract=[i[5] for i in node]



with open("training_set.txt", "r") as f:
    file =csv.reader(f, delimiter='\t')
    set_file=list(file)
set= np.array([values[0].split(" ") for values in set_file]).astype(int)


with open("testing_set.txt", "r") as f:
    file =csv.reader(f, delimiter='\t')
    set_file=list(file)
set_test= np.array([values[0].split(" ") for values in set_file]).astype(int)

### Graph creation using networkx package

In [105]:
## Creates the oriented graph
diG=nx.DiGraph()
#adds the list of papers' IDs
diG.add_nodes_from(ID)
#adds the corresponding links between the paper (training set), links when link_test==1
for ID_source_train,ID_sink_train,link_train in set:
    if link_train==1:
        diG.add_edge(ID_source_train,ID_sink_train)

  ## Checks the number of edges and creates the non-oriented graph G
G = nx.Graph(diG)

#### following features are created using textual information and graphical orient of the data
* Indegree and out degree for target and sorurce node(node here is paper id)
* jaccard_coef, adamic_adar_coef, pref_attachement_coef, common_neig
* co_occurence_abstract, same_authors, co_occurence_title, years_diff, same_journal, tfidf_similarity between abstract
*  some heuristic graph features such as page rank hub score
*  some addition features using edge that goes to common neighbours from both source and target node

In [4]:
# ## Useful graph-based features computed at once
page_rank = nx.pagerank_scipy(G)
hub_score, authority_score = nx.hits(G)

##One_hot vectors on abstract (usefull for co_occurence computations in features construction function)
one_hot = CountVectorizer(stop_words="english")
one_hot_matrix = one_hot.fit_transform(abstract)#.todense()

## One_hot vectors on authors (usefull for co_occurence computations in features construction function)
onehot_authors= CountVectorizer()
onehot_authors_matrix=onehot_authors.fit_transform(authors)

##One_hot vectors on titles (usefull for co_occurence computations in features construction function)
onehot_titles= CountVectorizer()
onehot_titles_matrix=onehot_titles.fit_transform(title)

##TF-IDF cosine similarity
tfidf_vectorizer = TfidfVectorizer(stop_words="english")
tfidf_matrix = tfidf_vectorizer.fit_transform(abstract)


In [None]:
degree_features = [diG.in_degree(paper1), diG.out_degree(paper1), diG.in_degree(paper2), diG.out_degree(paper2)]
heuristic_graph_features = [jaccard_coef, adamic_adar_coef, pref_attachement_coef, common_neig] 
node_info_features = [co_occurence_abstract, same_authors, co_occurence_title, years_diff, same_journal, tfidf_sim]
heuristic_graph_features.append(page_rank[paper2])
heuristic_graph_features.append(hub_score[paper1])
heuristic_graph_features.append(authority_score[paper2])

In [6]:
def features(paper1,paper2):
    """
        Outputs the array of the features to input in the prediction models
    """
    idx_paper1,idx_paper2=ID.index(paper1),ID.index(paper2)

    ## Features from contextual information of the nodes
    co_occurence_abstract=np.dot(one_hot_matrix[idx_paper1],one_hot_matrix[idx_paper2].T).toarray()[0][0]
    same_authors=np.dot(onehot_authors_matrix[idx_paper1],onehot_authors_matrix[idx_paper2].T).toarray()[0][0]
    co_occurence_title=np.dot(onehot_titles_matrix[idx_paper1],onehot_titles_matrix[idx_paper2].T).toarray()[0][0]

    #tfidf cosine similarity
    tf1 = tfidf_matrix[idx_paper1]# in case tfidf mat is so large that it's stored as a sparse matrix
    tf2 = tfidf_matrix[idx_paper2]# in case tfidf mat is so largs that it's stared as a sparse matrix
    tfidf_sim = cosine_similarity(tf1, tf2)[0][0]


    same_journal = int(name_journal[idx_paper1] == name_journal[idx_paper2])

    years_diff=int(year[idx_paper1])-int(year[idx_paper2])

    ## Features over the graph
    jaccard = nx.jaccard_coefficient(G, [(paper1, paper2)])
    for u, v, p in jaccard:
        jaccard_coef= p
    adamic_adar=nx.adamic_adar_index(G, [(paper1, paper2)])
    for u, v, p in adamic_adar:
        adamic_adar_coef= p
    pref_attachement = nx.preferential_attachment(G, [(paper1, paper2)])
    for u, v, p in pref_attachement:
        pref_attachement_coef= p
    common_neig=len(sorted(nx.common_neighbors(G, paper1, paper2)))

    ## features over the directed graph
    triad_features = [0.0]*8
    for w in sorted(nx.common_neighbors(G, paper1, paper2)):
        if G.has_edge(paper1, w) and G.has_edge(w, paper2):
            triad_features[0]+=1
        if G.has_edge(paper1, w) and G.has_edge(paper2, w):
            triad_features[1]+=1
        if G.has_edge(w, paper1) and G.has_edge(w, paper2):
            triad_features[2] += 1
        if G.has_edge(w, paper1) and G.has_edge(paper2, w):
            triad_features[3] += 1
    for i in range(4, 8):
        if triad_features[i-4]!=0:
            triad_features[i] = triad_features[i-4]/common_neig

    ## Sum up of all features
    degree_features = [diG.in_degree(paper1), diG.out_degree(paper1), diG.in_degree(paper2), diG.out_degree(paper2)]
    heuristic_graph_features = [jaccard_coef, adamic_adar_coef, pref_attachement_coef, common_neig] 
    node_info_features = [co_occurence_abstract, same_authors, co_occurence_title, years_diff, same_journal, tfidf_sim]
    heuristic_graph_features.append(page_rank[paper2])
    heuristic_graph_features.append(hub_score[paper1])
    heuristic_graph_features.append(authority_score[paper2])

    return node_info_features + heuristic_graph_features + degree_features + triad_features  ## 25 features in total

In [141]:
## To save the X_train,y_train matrices. Expansive to compute
saved = True

train_features= []
if saved:
    train_features= np.load("train_features_full.npy")
y_train=[]
print("Features construction for Learning...")
step=0
for source,sink,link in set:
    step+=1
    if step%1000==0:    print("Step:",step,"/",len(set))
    if not saved:
        train_features.append(features(source,sink))
    y_train.append(link)
train_features=np.array(train_features)
train_features = preprocessing.scale(train_features)
y_train=np.array(y_train)
if not saved:
    np.save("train_features_full.npy", train_features)


Features construction for Learning...
Step: 1000 / 615512
Step: 2000 / 615512
Step: 3000 / 615512
Step: 4000 / 615512
Step: 5000 / 615512
Step: 6000 / 615512
Step: 7000 / 615512
Step: 8000 / 615512
Step: 9000 / 615512
Step: 10000 / 615512
Step: 11000 / 615512
Step: 12000 / 615512
Step: 13000 / 615512
Step: 14000 / 615512
Step: 15000 / 615512
Step: 16000 / 615512
Step: 17000 / 615512
Step: 18000 / 615512
Step: 19000 / 615512
Step: 20000 / 615512
Step: 21000 / 615512
Step: 22000 / 615512
Step: 23000 / 615512
Step: 24000 / 615512
Step: 25000 / 615512
Step: 26000 / 615512
Step: 27000 / 615512
Step: 28000 / 615512
Step: 29000 / 615512
Step: 30000 / 615512
Step: 31000 / 615512
Step: 32000 / 615512
Step: 33000 / 615512
Step: 34000 / 615512
Step: 35000 / 615512
Step: 36000 / 615512
Step: 37000 / 615512
Step: 38000 / 615512
Step: 39000 / 615512
Step: 40000 / 615512
Step: 41000 / 615512
Step: 42000 / 615512
Step: 43000 / 615512
Step: 44000 / 615512
Step: 45000 / 615512
Step: 46000 / 615512
Step:

Step: 496000 / 615512
Step: 497000 / 615512
Step: 498000 / 615512
Step: 499000 / 615512
Step: 500000 / 615512
Step: 501000 / 615512
Step: 502000 / 615512
Step: 503000 / 615512
Step: 504000 / 615512
Step: 505000 / 615512
Step: 506000 / 615512
Step: 507000 / 615512
Step: 508000 / 615512
Step: 509000 / 615512
Step: 510000 / 615512
Step: 511000 / 615512
Step: 512000 / 615512
Step: 513000 / 615512
Step: 514000 / 615512
Step: 515000 / 615512
Step: 516000 / 615512
Step: 517000 / 615512
Step: 518000 / 615512
Step: 519000 / 615512
Step: 520000 / 615512
Step: 521000 / 615512
Step: 522000 / 615512
Step: 523000 / 615512
Step: 524000 / 615512
Step: 525000 / 615512
Step: 526000 / 615512
Step: 527000 / 615512
Step: 528000 / 615512
Step: 529000 / 615512
Step: 530000 / 615512
Step: 531000 / 615512
Step: 532000 / 615512
Step: 533000 / 615512
Step: 534000 / 615512
Step: 535000 / 615512
Step: 536000 / 615512
Step: 537000 / 615512
Step: 538000 / 615512
Step: 539000 / 615512
Step: 540000 / 615512
Step: 5410

In [142]:
train_features.shape

(615512, 25)

In [143]:
### Load the set to work on for kaggle prediction
saved=True
test_features=[]
if saved:
    test_features=np.load("test_features_full.npy")
y_test=[]

print("Features construction for Testing...")
step=0
for source,sink in set_test: ##set_test: ##
    step+=1
    if step%1000==0:    print("Step:",step,"/",len(set_test))
    if not saved:
        test_features.append(features(source,sink))
test_features=np.array(test_features)
test_features = preprocessing.scale(test_features)
if not saved:
    np.save("test_features_full.npy", test_features)

Features construction for Testing...
Step: 1000 / 32648
Step: 2000 / 32648
Step: 3000 / 32648
Step: 4000 / 32648
Step: 5000 / 32648
Step: 6000 / 32648
Step: 7000 / 32648
Step: 8000 / 32648
Step: 9000 / 32648
Step: 10000 / 32648
Step: 11000 / 32648
Step: 12000 / 32648
Step: 13000 / 32648
Step: 14000 / 32648
Step: 15000 / 32648
Step: 16000 / 32648
Step: 17000 / 32648
Step: 18000 / 32648
Step: 19000 / 32648
Step: 20000 / 32648
Step: 21000 / 32648
Step: 22000 / 32648
Step: 23000 / 32648
Step: 24000 / 32648
Step: 25000 / 32648
Step: 26000 / 32648
Step: 27000 / 32648
Step: 28000 / 32648
Step: 29000 / 32648
Step: 30000 / 32648
Step: 31000 / 32648
Step: 32000 / 32648
