In [2]:
import csv
import os
from collections import Counter
from random import randint

import gensim
import networkx as nx
import nltk
import numpy as np
from gensim.models import Doc2Vec
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize
from sklearn.linear_model import LogisticRegression
from sklearn.metrics.pairwise import cosine_similarity as cosim
from sklearn.utils import shuffle

In [3]:
import warnings
# Suppress every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation and convergence warnings raised by
# the libraries used below - consider narrowing the filter.
warnings.filterwarnings("ignore")

In [47]:
# Load the same edge list twice: as an undirected graph (G) and as a directed
# graph (diG), then report the size of the undirected graph.
edgelist_options = dict(delimiter=',', nodetype=int)
G = nx.read_edgelist('edgelist.txt', create_using=nx.Graph(), **edgelist_options)
diG = nx.read_edgelist('edgelist.txt', create_using=nx.DiGraph(), **edgelist_options)
nodes = list(G.nodes())
n, m = G.number_of_nodes(), G.number_of_edges()
print(f'Number of nodes: {n}')
print(f'Number of edges: {m}')


Number of nodes: 138499
Number of edges: 1091955


In [5]:
# Load a previously saved embedding model from disk (it is not trained in this notebook).
# NOTE(review): later cells look nodes up in n2v.wv with integer node ids -
# confirm the saved model's vocabulary keys are ints rather than strings.
n2v = Word2Vec.load('n2v_model.model')

In [6]:
# Read the abstract of each paper.
# Each line has the form "<node id>|--|<abstract text>".
abstracts = dict()
with open('abstracts.txt', 'r', encoding='utf-8') as f:
    for line in f:
        # Split on the first delimiter only, so an abstract whose text happens
        # to contain '|--|' is kept whole instead of raising ValueError.
        node, abstract = line.split('|--|', 1)
        abstracts[int(node)] = abstract

In [7]:
# Read the authors of each paper.
# Each line has the form "<node id>|--|<comma-separated author names>".
authors = dict()
with open('authors.txt', 'r', encoding='utf-8') as f:
    for line in f:
        # Split on the first delimiter only, so an author field that happens
        # to contain '|--|' is kept whole instead of raising ValueError.
        node, author = line.split('|--|', 1)
        authors[int(node)] = author

In [8]:
# Tokens removed from the abstracts: NLTK's English stop words plus
# punctuation, contraction fragments and a few modal verbs.
extra_stop_tokens = [
    '.', ',', '"', "'", '?', '!', ':', ';',
    '(', ')', '[', ']', '{', '}',
    "'d", "'ll", "'re", "'s", "'ve", '``',
    'could', 'might', 'must', "n't", 'need', 'sha', 'wo', 'would',
    '”', '“',
]
stop_words = set(stopwords.words('english')) | set(extra_stop_tokens)

In [9]:
# Shared WordNet lemmatizer, applied to every kept abstract token below.
lemmatizer = WordNetLemmatizer()

In [10]:
# Split each comma-separated author string into a list of trimmed author names.
tokenized_authors = {
    node: [name.strip() for name in author_line.split(',')]
    for node, author_line in authors.items()
}

In [11]:
# Tokenize every abstract: sentence split, word split, lower-case,
# drop stop words, then lemmatize what is left (order and repeats are kept).
tokenized_abstracts = dict()
for node, text in abstracts.items():
    lowered_tokens = (
        token.lower()
        for sentence in sent_tokenize(text)
        for token in word_tokenize(sentence)
    )
    tokenized_abstracts[node] = [
        lemmatizer.lemmatize(token)
        for token in lowered_tokens
        if token not in stop_words
    ]

In [12]:
# Replace each raw abstract string with the set of its distinct processed tokens.
abstracts.update(
    (node, set(tokenized_abstracts[node])) for node in list(abstracts)
)

In [13]:
# Replace each raw author string with the set of its distinct author names.
authors.update(
    (node, set(tokenized_authors[node])) for node in list(authors)
)

In [14]:
import math
epsilon = 1e-6
def counter_cosine_similarity(c1, c2):
    """Return the cosine similarity of two sparse count vectors.

    Parameters
    ----------
    c1, c2 : dict-like (e.g. ``collections.Counter``)
        Map each term to a numeric weight; a missing term counts as 0.

    Returns
    -------
    float
        ``dot(c1, c2) / (|c1| * |c2| + epsilon)``. The module-level
        ``epsilon`` keeps the division defined when a vector is empty, in
        which case the result is 0.0.
    """
    # Only terms present in both vectors contribute to the dot product, so it
    # is enough to walk the smaller mapping and look each term up in the other.
    small, large = (c1, c2) if len(c1) <= len(c2) else (c2, c1)
    dotprod = sum(weight * large.get(term, 0) for term, weight in small.items())
    # Each norm depends only on that vector's own weights.
    magA = math.sqrt(sum(weight ** 2 for weight in c1.values()))
    magB = math.sqrt(sum(weight ** 2 for weight in c2.values()))
    return dotprod / ((magA * magB) + epsilon)

Compute the cosine similarity between the term vectors of two abstracts.
Each vector holds the frequency of every word in the corresponding abstract.

In [15]:
import igraph as ig
# Convert the undirected networkx graph to an igraph graph (used below for betweenness).
# NOTE(review): igraph addresses vertices by position rather than by the
# networkx node id - confirm downstream lookups account for that.
ig_G = ig.Graph.from_networkx(G)

In [16]:
# Clustering coefficient of the undirected graph; indexed by node id in the feature cells.
cluster = nx.clustering(G)

In [17]:
# PageRank scores on the undirected graph (library defaults); indexed by node id in the feature cells.
rank = nx.pagerank(G)

In [18]:
# HITS scores: h = hub scores, a = authority scores, both indexed by node id below.
# NOTE(review): G is built as an undirected nx.Graph, so hub and authority
# scores are expected to coincide - confirm the two feature pairs derived from
# h and a are not duplicates (diG may have been intended).
h,a = nx.hits(G)

In [19]:
# Triangle counts on the undirected graph; indexed by node id in the feature cells.
triangles = nx.triangles(G)

In [20]:
# Skip-gram Word2Vec over the abstract tokens.
# - Train on the ordered token lists, not on `abstracts` (which by now holds
#   unordered, de-duplicated sets), so the context window sees real text.
# - Empty abstracts are fed as ['none'], the same placeholder that later cells
#   use for them, so that token is guaranteed to be in the vocabulary.
# - `workers` must be positive: with -1 gensim starts no worker threads and
#   the vectors stay at their random initial values.
model = Word2Vec(
    [tokens or ['none'] for tokens in tokenized_abstracts.values()],
    window=20, min_count=1, workers=os.cpu_count() or 1, sg=1,
)

In [21]:
# Skip-gram Word2Vec over the author names of each paper.
# - Train on the author lists (stable order) rather than on the sets stored in
#   `authors`, whose iteration order is arbitrary.
# - `workers` must be positive: with -1 gensim starts no worker threads and
#   the vectors stay at their random initial values.
model_authors = Word2Vec(
    list(tokenized_authors.values()),
    window=5, min_count=1, workers=os.cpu_count() or 1, sg=1,
)

In [22]:
#import taggeddocument
from gensim.models.doc2vec import TaggedDocument
from tqdm import tqdm

In [23]:
# One tagged document per paper, tagged with its node id, for abstracts and authors.
docs = [TaggedDocument(words=tokenized_abstracts[node], tags=[node]) for node in abstracts]
ath_doca = [TaggedDocument(words=tokenized_authors[node], tags=[node]) for node in authors]

# `workers` must be positive: with -1 gensim starts no worker threads, so the
# document vectors would never be trained.
n_workers = os.cpu_count() or 1
d2v = Doc2Vec(docs, vector_size=100, window=10, min_count=1, workers=n_workers)
d2v_ath = Doc2Vec(ath_doca, vector_size=100, window=5, min_count=1, workers=n_workers)

In [24]:
from scipy.spatial.distance import cosine

In [25]:
# Give papers with no remaining tokens a single placeholder token, so the
# similarity features below never receive an empty token list.
for node, tokens in tokenized_abstracts.items():
    if not tokens:
        tokenized_abstracts[node] = ['none']

In [48]:
# Betweenness centrality on the undirected graph, limited to paths of length <= 5.
# igraph returns the scores by vertex position, while the feature cells look
# them up by networkx node id. from_networkx keeps the original id of every
# vertex in the '_nx_name' attribute, so key the scores by that id.
bet = dict(zip(ig_G.vs['_nx_name'],
               ig_G.betweenness(directed=False, cutoff=5)))

In [50]:
# Spot-check: display the betweenness entry stored under 1.
bet[1]

205924.46228046715

In [52]:
# Build the training matrix: one positive row per edge of G (label 1) and one
# negative row per randomly drawn pair of non-adjacent nodes (label 0).
#
# Feature columns ("sum" / "|diff|" combine the per-node values of the pair):
#   0-2    abstracts: sum and |diff| of unique-term counts, number of shared terms
#   3-5    authors: sum and |diff| of unique-author counts, number of shared authors
#   6-7    cosine similarity of the author / abstract term-count vectors
#   8-9    PageRank sum, |diff|
#   10-11  clustering coefficient sum, |diff|
#   12-13  HITS hub score sum, |diff|
#   14-15  triangle count sum, |diff|
#   16-17  undirected degree sum, |diff|
#   18     number of common neighbours
#   19-20  n2v similarity of the two neighbourhoods / of the two nodes
#   21-22  Word2Vec n_similarity of the abstract tokens / author names
#   23-24  HITS authority score sum, |diff|
#   25-26  cosine distance between the Doc2Vec vectors (abstracts / authors)
#   27-30  in-degree sum, |diff|; out-degree sum, |diff| (directed graph)
#   31-32  betweenness sum, |diff|
N_FEATURES = 33


def _sum_and_absdiff(x, y):
    """Return (x + y, |x - y|) for one per-node quantity of a pair."""
    return x + y, abs(x - y)


def _pair_features(u, v):
    """Return the N_FEATURES features, in column order, for the node pair (u, v)."""
    abs_u, abs_v = abstracts[u], abstracts[v]
    ath_u, ath_v = authors[u], authors[v]
    return [
        *_sum_and_absdiff(len(abs_u), len(abs_v)),
        len(abs_u.intersection(abs_v)),
        *_sum_and_absdiff(len(ath_u), len(ath_v)),
        len(ath_u.intersection(ath_v)),
        counter_cosine_similarity(Counter(tokenized_authors[u]), Counter(tokenized_authors[v])),
        counter_cosine_similarity(Counter(tokenized_abstracts[u]), Counter(tokenized_abstracts[v])),
        *_sum_and_absdiff(rank[u], rank[v]),
        *_sum_and_absdiff(cluster[u], cluster[v]),
        *_sum_and_absdiff(h[u], h[v]),
        *_sum_and_absdiff(triangles[u], triangles[v]),
        *_sum_and_absdiff(nx.degree(G, u), nx.degree(G, v)),
        len(list(nx.common_neighbors(G, u, v))),
        n2v.wv.n_similarity(G[u], G[v]),
        n2v.wv.similarity(u, v),
        model.wv.n_similarity(tokenized_abstracts[u], tokenized_abstracts[v]),
        model_authors.wv.n_similarity(tokenized_authors[u], tokenized_authors[v]),
        *_sum_and_absdiff(a[u], a[v]),
        cosine(d2v[u], d2v[v]),
        cosine(d2v_ath[u], d2v_ath[v]),
        *_sum_and_absdiff(diG.in_degree(u), diG.in_degree(v)),
        *_sum_and_absdiff(diG.out_degree(u), diG.out_degree(v)),
        *_sum_and_absdiff(bet[u], bet[v]),
    ]


X_train = np.zeros((2*m, N_FEATURES))
y_train = np.zeros(2*m)
n = G.number_of_nodes()
for i,edge in tqdm(enumerate(G.edges()), desc='Training', total=m):
    # positive sample: an existing edge
    X_train[i] = _pair_features(edge[0], edge[1])
    y_train[i] = 1

    # negative sample: a random pair of distinct, non-adjacent nodes
    # (assumes node ids are exactly 0..n-1 - TODO confirm).
    # Redraw on n1 == n2 too, otherwise a node paired with itself could be
    # used as a "non-edge" example.
    n1 = randint(0, n-1)
    n2 = randint(0, n-1)
    while n1 == n2 or G.has_edge(n1, n2):
        n1 = randint(0, n-1)
        n2 = randint(0, n-1)
    X_train[m+i] = _pair_features(n1, n2)
    y_train[m+i] = 0

Training: 100%|██████████| 1091955/1091955 [45:33<00:00, 399.54it/s] 


In [53]:
# Report the dimensions of the assembled training matrix.
print(f'Size of training matrix: {X_train.shape}')

Size of training matrix: (2183910, 33)


In [54]:
# Load the test pairs: each line of test.txt holds two comma-separated node ids.
node_pairs = []
with open('test.txt', 'r') as pairs_file:
    for row in pairs_file:
        fields = row.split(',')
        first_node, second_node = int(fields[0]), int(fields[1])
        node_pairs.append((first_node, second_node))

In [56]:
# Build the test matrix: one row per test pair, with the same 33 feature
# columns (and the same column order) as the training matrix.
X_test = np.zeros((len(node_pairs), 33))
for i,node_pair in tqdm(enumerate(node_pairs), desc='Creating test matrix', total=len(node_pairs)):
    u, v = node_pair[0], node_pair[1]

    # abstract term sets
    n_terms_u, n_terms_v = len(abstracts[u]), len(abstracts[v])
    X_test[i,0] = n_terms_u + n_terms_v
    X_test[i,1] = abs(n_terms_u - n_terms_v)
    X_test[i,2] = len(abstracts[u].intersection(abstracts[v]))

    # author sets
    n_auth_u, n_auth_v = len(authors[u]), len(authors[v])
    X_test[i,3] = n_auth_u + n_auth_v
    X_test[i,4] = abs(n_auth_u - n_auth_v)
    X_test[i,5] = len(authors[u].intersection(authors[v]))

    # cosine similarity of the count vectors
    X_test[i,6] = counter_cosine_similarity(Counter(tokenized_authors[u]), Counter(tokenized_authors[v]))
    X_test[i,7] = counter_cosine_similarity(Counter(tokenized_abstracts[u]), Counter(tokenized_abstracts[v]))

    # per-node graph scores: sum and absolute difference
    X_test[i,8] = rank[u] + rank[v]
    X_test[i,9] = abs(rank[u] - rank[v])
    X_test[i,10] = cluster[u] + cluster[v]
    X_test[i,11] = abs(cluster[u] - cluster[v])
    X_test[i,12] = h[u] + h[v]
    X_test[i,13] = abs(h[u] - h[v])
    X_test[i,14] = triangles[u] + triangles[v]
    X_test[i,15] = abs(triangles[u] - triangles[v])
    deg_u, deg_v = nx.degree(G, u), nx.degree(G, v)
    X_test[i,16] = deg_u + deg_v
    X_test[i,17] = abs(deg_u - deg_v)
    X_test[i,18] = len(list(nx.common_neighbors(G, u, v)))

    # embedding similarities
    X_test[i,19] = n2v.wv.n_similarity(G[u], G[v])
    X_test[i,20] = n2v.wv.similarity(u, v)
    X_test[i,21] = model.wv.n_similarity(tokenized_abstracts[u], tokenized_abstracts[v])
    X_test[i,22] = model_authors.wv.n_similarity(tokenized_authors[u], tokenized_authors[v])

    # authority scores
    X_test[i,23] = a[u] + a[v]
    X_test[i,24] = abs(a[u] - a[v])

    # cosine distance between the Doc2Vec vectors
    X_test[i,25] = cosine(d2v[u], d2v[v])
    X_test[i,26] = cosine(d2v_ath[u], d2v_ath[v])

    # directed degrees
    in_u, in_v = diG.in_degree(u), diG.in_degree(v)
    X_test[i,27] = in_u + in_v
    X_test[i,28] = abs(in_u - in_v)
    out_u, out_v = diG.out_degree(u), diG.out_degree(v)
    X_test[i,29] = out_u + out_v
    X_test[i,30] = abs(out_u - out_v)

    # betweenness
    X_test[i,31] = bet[u] + bet[v]
    X_test[i,32] = abs(bet[u] - bet[v])
print(f'Size of test matrix: {X_test.shape}')

Creating test matrix: 100%|██████████| 106692/106692 [02:19<00:00, 767.22it/s]

Size of test matrix: (106692, 33)





In [57]:
# Shuffle rows and labels together; the fixed seed makes the order
# reproducible across kernel restarts.
X_train, y_train = shuffle(X_train, y_train, random_state=47)

In [58]:
import lightgbm as lgb

# Binary LightGBM classifier trained on the GPU.
# The seed keyword is `random_state`; the previous spelling `random_sate` was
# not a recognised parameter, so the seed was never applied.
lb = lgb.LGBMClassifier(objective='binary', device='gpu', random_state=47)
lb.fit(X_train, y_train)

# Keep the predicted probability of the positive class (column 1) for every test pair.
y_pred_lgb = lb.predict_proba(X_test)[:, 1]



In [59]:
# Write predictions to a file: one (row id, predicted probability) line per test pair.
# The probabilities are stored in `y_pred_lgb`; the previously referenced
# name `ypred` is never defined in this notebook.
predictions = zip(range(len(y_pred_lgb)), y_pred_lgb)
# newline="" lets the csv module control line endings (avoids blank rows on Windows).
with open("submissions.csv", "w", newline="") as pred:
    csv_out = csv.writer(pred)
    csv_out.writerow(['id','predicted'])
    csv_out.writerows(predictions)