## Predicting Missing links in a citation network

In [2]:
# global imports 
import random 
import numpy as np 
import pandas as pd
import jgraph ## this was previously known as igraph
import csv 
import matplotlib.pyplot as plt

# machine learning imports
from sklearn import svm 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel 
from sklearn import preprocessing 

import spacy

### Import datasets

In [3]:
# function to read data from txt files
nodes_info_df = pd.read_csv('./data/node_information.csv')
random_preds_df = pd.read_csv('./data/random_predictions.csv') 
test_set = pd.read_csv('./data/testing_set.txt', sep = ' ', header = None)
train_set = pd.read_csv('./data/training_set.txt', sep = ' ', header = None)
test_set.columns = ['source_id', 'target_id']
train_set.columns = ['source_id', 'target_id', 'label']
nodes_info_df.columns = ['paper_id', 'publication_year', 'title', 'author', 'journal_name', 'abstract']

In [4]:
train_set.head()

Unnamed: 0,source_id,target_id,label
0,9510123,9502114,1
1,9707075,9604178,1
2,9312155,9506142,0
3,9911255,302165,0
4,9701033,209076,0


In [5]:
test_set.head()

Unnamed: 0,source_id,target_id
0,9807076,9807139
1,109162,1182
2,9702187,9510135
3,111048,110115
4,9910176,9410073


In [6]:
nodes_info_df.tail()

Unnamed: 0,paper_id,publication_year,title,author,journal_name,abstract
27764,9912289,2002,gauge fixing in the chain by chain method,"A Shirzad, F Loran",,in a recent work we showed that for a hamilton...
27765,9912290,2000,shuffling quantum field theory,Dirk Kreimer,Lett.Math.Phys.,we discuss shuffle identities between feynman ...
27766,9912291,1999,small object limit of casimir effect and the s...,"O. Kenneth, S. Nussinov",Phys.Rev.,we show a simple way of deriving the casimir p...
27767,9912292,1999,1 4 pbgs and superparticle actions,"F.Delduc, E. Ivanov, S. Krivonos",,karpacz poland september 21-25 1999 we constru...
27768,9912293,2000,corrections to the abelian born-infeld action ...,L. Cornalba (I.H.E.S.),JHEP,noncommutative geometry in a recent paper seib...


In [7]:
random_preds_df.head()

Unnamed: 0,id,category
0,0,0
1,1,0
2,2,1
3,3,1
4,4,1


## Exploratory Analysis

In [8]:
print('Unique papers: ', len(set(nodes_info_df['paper_id'])))
sym_diff = set(test_set['source_id'].append(test_set['target_id'])).symmetric_difference(set(nodes_info_df['paper_id']))
print('Unknown papers in test set (with nodes_info):', len(sym_diff))

Unique papers:  27769
Unknown papers in test set (with nodes_info): 4369


In [9]:
# get distribution of journal names 
nodes_info_df['journal_name'] = nodes_info_df['journal_name'].fillna('unknown')
nodes_info_df.journal_name.value_counts()[:15]

unknown                  7471
Phys.Lett.               3575
Nucl.Phys.               3571
Phys.Rev.                3170
JHEP                     1957
Int.J.Mod.Phys.           938
Mod.Phys.Lett.            936
Class.Quant.Grav.         556
J.Phys.                   536
J.Math.Phys.              532
Phys.Rev.Lett.            388
Commun.Math.Phys.         377
Phys.                     377
Nucl.Phys.Proc.Suppl.     296
Prog.Theor.Phys.          281
Name: journal_name, dtype: int64

In [10]:
nodes_info_df.author

0             M. Cvetic, H. Lu, C.N. Pope
1                Y.S. Myung, Gungwon Kang
2                          Adam D. Helfer
3                 J. Fuchs, C. Schweigert
4                             Rudolf Haag
                       ...               
27764                  A Shirzad, F Loran
27765                        Dirk Kreimer
27766             O. Kenneth, S. Nussinov
27767    F.Delduc, E. Ivanov, S. Krivonos
27768              L. Cornalba (I.H.E.S.)
Name: author, Length: 27769, dtype: object

## Feature generation

### Text features generation 

In [11]:
import re 
import math

def isNaN(string):
    return string != string

def filter_bad(alphabet):
    bad = [',', None]

    if(alphabet in bad):
        return False
    else:
        return True
## possible formats of authors:
# several authors: separation via ','
# sometimes mentions the university eg '(montpellier)'
# sometimes mentions the first name 
# sometimes format is: firstname letter. lastname

def author_normalisation(authors):
    if isNaN(authors) == False:
        #print(authors)
        authors = authors.lower()
        final_authors = list()
        
        # remove universities and last space
        if '(' in authors:
            authors = re.sub(r'\(+.*\)', '', authors).strip() 
        
        # remove extra spaces
        authors = authors.split()
        authors = ' '.join(filter(filter_bad, authors))
          
        # get all authors of one paper 
        for author in authors.split(', '): 
            author.strip()            
            # get the names of an author
            names = author.split(' ')
            author_names = list()        
            if len(names) == 2:
                # check if first element is 'letter.' format:
                if re.match('\w\.', names[0]):
                    author_names.append(names[0])
                else:
                    author_names.append(names[0][0] + '.')

            if len(names) == 3:
                if re.match('\w\.', names[0]):
                    author_names.append(names[0])
                else:
                    author_names.append(names[0][0] + '.')

                if re.match('\w\.', names[1]):
                    author_names.append(names[1])
                else:
                    author_names.append(names[1][0] + '.')

            author_names.append(names[-1])
            if len(author_names) > 1:
                author_names = ' '.join(author_names)
            else:
                author_names = author_names[0]
            # append last name
            final_authors.append(author_names)


        number_of_authors = len(final_authors)
        if number_of_authors == 0:
            return np.NaN
        #final_authors = ', '.join(final_authors)
        return final_authors
    
    return np.NaN

def common_authors(string1, string2):
    if isNaN(string1):
        return False
    if isNaN(string2):
        return False
    
    #a_set = set(string1.split(','))
    #b_set = set(string2.split(','))
    a_set = set(string1)
    b_set = set(string2)
    
    if (a_set & b_set): 
        return True 
    else: 
        return False
    
def number_common_authors(string1, string2):
    if isNaN(string1):
        return False
    if isNaN(string2):
        return False
    
    #a_set = set(string1.split(','))
    #b_set = set(string2.split(','))
    a_set = set(string1)
    b_set = set(string2)
    
    if (a_set & b_set): 
        return len(a_set & b_set) 
    else: 
        return 0

def author_feature():
    pass

def get_earliest_latest_publication_years_of_author():
    pass

def tfidf_abstract():
    pass

def journal_names_feature():
    pass

# etc

In [12]:
# reaye source and target info datasets
train_source_info = train_set.merge(nodes_info_df, left_on='source_id', right_on='paper_id')
train_target_info = train_set.merge(nodes_info_df, left_on='target_id', right_on='paper_id')

test_source_info = test_set.merge(nodes_info_df, left_on='source_id', right_on='paper_id')
test_target_info = test_set.merge(nodes_info_df, left_on='target_id', right_on='paper_id')


In [13]:
train_source_info.head()

Unnamed: 0,source_id,target_id,label,paper_id,publication_year,title,author,journal_name,abstract
0,9510123,9502114,1,9510123,1995,an infinite number of potentials surrounding 2...,,Phys.Lett.,we found an infinite number of potentials surr...
1,9510123,9408144,1,9510123,1995,an infinite number of potentials surrounding 2...,,Phys.Lett.,we found an infinite number of potentials surr...
2,9510123,9302075,1,9510123,1995,an infinite number of potentials surrounding 2...,,Phys.Lett.,we found an infinite number of potentials surr...
3,9510123,9411220,0,9510123,1995,an infinite number of potentials surrounding 2...,,Phys.Lett.,we found an infinite number of potentials surr...
4,9510123,108214,0,9510123,1995,an infinite number of potentials surrounding 2...,,Phys.Lett.,we found an infinite number of potentials surr...


In [14]:
train_target_info.head()

Unnamed: 0,source_id,target_id,label,paper_id,publication_year,title,author,journal_name,abstract
0,9510123,9502114,1,9502114,1995,stability analysis of the dilatonic black hole...,"Won T. Kim, Julian Lee, , Young Jai Park",Phys.Lett.,we explicitly show that the net number of degr...
1,9509112,9502114,1,9502114,1995,stability analysis of the dilatonic black hole...,"Won T. Kim, Julian Lee, , Young Jai Park",Phys.Lett.,we explicitly show that the net number of degr...
2,9911015,9502114,0,9502114,1995,stability analysis of the dilatonic black hole...,"Won T. Kim, Julian Lee, , Young Jai Park",Phys.Lett.,we explicitly show that the net number of degr...
3,9912022,9502114,0,9502114,1995,stability analysis of the dilatonic black hole...,"Won T. Kim, Julian Lee, , Young Jai Park",Phys.Lett.,we explicitly show that the net number of degr...
4,9507085,9502114,0,9502114,1995,stability analysis of the dilatonic black hole...,"Won T. Kim, Julian Lee, , Young Jai Park",Phys.Lett.,we explicitly show that the net number of degr...


In [15]:
## apply the features to training set 
train_set['source_authors'] = train_source_info.author.apply(lambda x: author_normalisation(x))
train_set['target_authors'] = train_target_info.author.apply(lambda x: author_normalisation(x))
train_set['common_authors'] = train_set.apply(lambda row: common_authors(row['source_authors'], row['target_authors']), axis=1)
train_set['number_common_authors'] = train_set.apply(lambda row: number_common_authors(row['source_authors'], row['target_authors']), axis=1)

train_set['source_publication_year'] = train_source_info.publication_year
train_set['target_publication_year'] = train_target_info.publication_year
train_set['same_publication_year'] = np.where(train_set.source_publication_year == train_set.target_publication_year, True, False)

## apply the features to test set
test_set['source_authors'] = test_source_info.author.apply(lambda x: author_normalisation(x))
test_set['target_authors'] = test_target_info.author.apply(lambda x: author_normalisation(x))
test_set['common_authors'] = test_set.apply(lambda row: common_authors(row['source_authors'], row['target_authors']), axis=1)
test_set['number_common_authors'] = test_set.apply(lambda row: number_common_authors(row['source_authors'], row['target_authors']), axis=1)

test_set['source_publication_year'] = test_source_info.publication_year
test_set['target_publication_year'] = test_target_info.publication_year
test_set['same_publication_year'] = np.where(test_set.source_publication_year == test_set.target_publication_year, True, False)

In [16]:
train_set.iloc[725:]

Unnamed: 0,source_id,target_id,label,source_authors,target_authors,common_authors,number_common_authors,source_publication_year,target_publication_year,same_publication_year
725,212135,211206,1,,,False,False,1997.0,1995.0,False
726,9509007,9711114,0,,,False,False,1997.0,1995.0,False
727,9612130,9601029,1,,,False,False,1997.0,1995.0,False
728,301119,9202017,0,,,False,False,1997.0,1995.0,False
729,7197,9904062,0,,,False,False,1997.0,1995.0,False
...,...,...,...,...,...,...,...,...,...,...
615507,9704211,9311015,1,,,False,False,,,False
615508,9709133,9202062,0,,,False,False,,,False
615509,7141,9610152,0,,,False,False,,,False
615510,207232,111196,1,,,False,False,,,False


In [17]:
train_set.loc[(train_set.common_authors == True) & (train_set.number_common_authors > 1)]

Unnamed: 0,source_id,target_id,label,source_authors,target_authors,common_authors,number_common_authors,source_publication_year,target_publication_year,same_publication_year
32429,11094,7175,1,"[a. giveon, d. kutasov]","[s. elitzur, a. giveon, d. kutasov, d. tsabar]",True,2,1998.0,1998.0,True
32430,8249,9802032,1,"[a. giveon, d. kutasov]","[s. elitzur, a. giveon, d. kutasov, d. tsabar]",True,2,1998.0,1998.0,True
32431,9809069,9705096,0,"[a. giveon, d. kutasov]","[s. elitzur, a. giveon, d. kutasov, d. tsabar]",True,2,1998.0,1998.0,True
32432,9610221,9903197,0,"[a. giveon, d. kutasov]","[s. elitzur, a. giveon, d. kutasov, d. tsabar]",True,2,1998.0,1998.0,True
32433,9609076,9607107,1,"[a. giveon, d. kutasov]","[s. elitzur, a. giveon, d. kutasov, d. tsabar]",True,2,1998.0,1998.0,True
...,...,...,...,...,...,...,...,...,...,...
596567,9508162,9810025,0,"[h. aratyn, l.a. ferreira, a.h. zimerman]","[h. aratyn, l.a. ferreira, j.f. gomes, a.h. zi...",True,3,1999.0,1993.0,False
610910,9801182,9706039,1,"[h. lu, c.n. pope, s. schrans, k.w. xu]","[h. lu, c.n. pope]",True,2,1992.0,1996.0,False
610911,9709114,211214,0,"[h. lu, c.n. pope, s. schrans, k.w. xu]","[h. lu, c.n. pope]",True,2,1992.0,1996.0,False
610912,9408018,9306091,1,"[h. lu, c.n. pope, s. schrans, k.w. xu]","[h. lu, c.n. pope]",True,2,1992.0,1996.0,False


### Graph features generation 

In [23]:
# get some elements and then assign the attributes 


In [19]:
import networkx as nx 
import matplotlib.pyplot as plt 

train_G = nx.from_pandas_edgelist(train_set, source='source_id', target='target_id', edge_attr=None,)

### Final clean (i.e replacing nans etc)

In [40]:
test_set

Unnamed: 0,source_id,target_id,source_authors,target_authors,common_authors,number_common_authors,source_publication_year,target_publication_year,same_publication_year
0,9807076,9807139,"[s. kachru, j. kumar, e. silverstein]","[j. x. lu, s. roy]",False,0,1998.0,1998.0,True
1,109162,1182,"[s. kachru, j. kumar, e. silverstein]","[j. x. lu, s. roy]",False,0,1998.0,1998.0,True
2,9702187,9510135,"[s. kachru, j. kumar, e. silverstein]","[h. grosse, t. krajewski, r. wulkenhaar]",False,0,1998.0,2000.0,False
3,111048,110115,[r. j. szabo],"[h. grosse, t. krajewski, r. wulkenhaar]",False,0,2001.0,2000.0,False
4,9910176,9410073,[r. j. szabo],"[h. grosse, t. krajewski, r. wulkenhaar]",False,0,2001.0,2000.0,False
...,...,...,...,...,...,...,...,...,...
32643,9705209,9305083,[j. gaite],[k. suzuki],False,0,1995.0,2001.0,False
32644,9307023,9503118,,[a.a. tseytlin],False,False,,1992.0,False
32645,9608095,9205058,,"[d. marti, a. pomarol]",False,False,,2001.0,False
32646,9407008,106256,,,False,False,,,False


In [49]:
# fill nas in some way
train_set.target_publication_year.fillna(-1, inplace=True)
train_set.source_publication_year.fillna(-1, inplace=True)
train_set.fillna('unknown', inplace=True)

test_set.fillna('unknown', inplace=True)
test_set.target_publication_year.fillna(-1, inplace=True)
test_set.source_publication_year.fillna(-1, inplace=True)

In [50]:
train_set

Unnamed: 0,source_id,target_id,label,source_authors,target_authors,common_authors,number_common_authors,source_publication_year,target_publication_year,same_publication_year
0,9510123,9502114,1,unknown,"[w. t. kim, j. lee, y. j. park]",False,False,1995.0,1995.0,True
1,9707075,9604178,1,unknown,"[w. t. kim, j. lee, y. j. park]",False,False,1995.0,1995.0,True
2,9312155,9506142,0,unknown,"[w. t. kim, j. lee, y. j. park]",False,False,1995.0,1995.0,True
3,9911255,302165,0,unknown,"[w. t. kim, j. lee, y. j. park]",False,False,1995.0,1995.0,True
4,9701033,209076,0,unknown,"[w. t. kim, j. lee, y. j. park]",False,False,1995.0,1995.0,True
...,...,...,...,...,...,...,...,...,...,...
615507,9704211,9311015,1,unknown,unknown,False,False,-1.0,-1.0,False
615508,9709133,9202062,0,unknown,unknown,False,False,-1.0,-1.0,False
615509,7141,9610152,0,unknown,unknown,False,False,-1.0,-1.0,False
615510,207232,111196,1,unknown,unknown,False,False,-1.0,-1.0,False


## Learning Stuff

In [51]:
# separate features from labels:
X = train_set.loc[:, (train_set.columns != 'label') & (train_set.columns != 'source_authors') & (train_set.columns != 'target_authors')]
y = train_set['label']

In [55]:
## Train different models and compare the performance 
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, BaggingClassifier, GradientBoostingClassifier
from sklearn.metrics import  f1_score, confusion_matrix
from sklearn.model_selection import cross_validate

model = AdaBoostClassifier(n_estimators=75, learning_rate=1)
scores = cross_validate(model, X, y, scoring='f1', 
                        cv=10, n_jobs=-1) # n_jobs is the number of cpus to use -1 => all
scores

{'fit_time': array([93.59084606, 95.51652479, 93.66064382, 94.24474454, 94.05313778,
        92.06788516, 94.35529661, 92.78198671, 60.26874852, 59.89054894]),
 'score_time': array([4.20957279, 3.68140864, 4.33065462, 4.51374149, 4.8653307 ,
        4.12986469, 4.27757502, 4.57768273, 1.94765139, 1.99875259]),
 'test_score': array([0.73473214, 0.7368546 , 0.74864418, 0.73214892, 0.74855691,
        0.74845762, 0.7535947 , 0.7386482 , 0.74172551, 0.73713318])}

In [56]:
# describe results from scores
from scipy import stats 
stats.describe(scores['test_score'])

DescribeResult(nobs=10, minmax=(0.7321489222730242, 0.7535947036309094), mean=0.7420495970406745, variance=5.2731999197191086e-05, skewness=0.215197398451841, kurtosis=-1.3546554484684943)

## prior to authors:
DescribeResult(nobs=10, minmax=(0.7092423428264374, 0.7505859928392963), mean=0.7330286516063008, variance=0.0002449243278408503, skewness=-0.16892931758355367, kurtosis=-1.5003847605685021)

### Predicting using final model 

In [109]:
# 1: retrain the complete model -> don't forget to change this to optimal one @ end
final_model = AdaBoostClassifier(n_estimators=50, learning_rate=1)
final_model.fit(X, y)
predictions = final_model.predict(test_set)

# 2: predict on the test set
out_df = test_set.copy()
data = {'id': list(out_df.index), 'category': predictions}
final_df = pd.DataFrame(data)


# 3: write file out
final_df.to_csv('submission.csv',index=False, sep=',')

## The end