## Predicting Missing links in a citation network

In [68]:
# global imports 
import random 
import numpy as np 
import pandas as pd
import jgraph ## this was previously known as igraph
import csv 
import matplotlib.pyplot as plt

# machine learning imports
from sklearn import svm 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel 
from sklearn import preprocessing 

import spacy

### Import datasets

In [111]:
# function to read data from txt files
nodes_info_df = pd.read_csv('./data/node_information.csv')
random_preds_df = pd.read_csv('./data/random_predictions.csv') 
test_set = pd.read_csv('./data/testing_set.txt', sep = ' ', header = None)
train_set = pd.read_csv('./data/training_set.txt', sep = ' ', header = None)
test_set.columns = ['source_id', 'target_id']
train_set.columns = ['source_id', 'target_id', 'label']
nodes_info_df.columns = ['paper_id', 'publication_year', 'title', 'author', 'journal_name', 'abstract']

In [94]:
train_set.head()

Unnamed: 0,source_id,target_id,label
0,9510123,9502114,1
1,9707075,9604178,1
2,9312155,9506142,0
3,9911255,302165,0
4,9701033,209076,0


In [95]:
test_set.head()

Unnamed: 0,source_id,target_id
0,9807076,9807139
1,109162,1182
2,9702187,9510135
3,111048,110115
4,9910176,9410073


In [96]:
nodes_info_df.tail()

Unnamed: 0,paper_id,publication_year,title,author,journal_name,abstract
27764,9912289,2002,gauge fixing in the chain by chain method,"A Shirzad, F Loran",,in a recent work we showed that for a hamilton...
27765,9912290,2000,shuffling quantum field theory,Dirk Kreimer,Lett.Math.Phys.,we discuss shuffle identities between feynman ...
27766,9912291,1999,small object limit of casimir effect and the s...,"O. Kenneth, S. Nussinov",Phys.Rev.,we show a simple way of deriving the casimir p...
27767,9912292,1999,1 4 pbgs and superparticle actions,"F.Delduc, E. Ivanov, S. Krivonos",,karpacz poland september 21-25 1999 we constru...
27768,9912293,2000,corrections to the abelian born-infeld action ...,L. Cornalba (I.H.E.S.),JHEP,noncommutative geometry in a recent paper seib...


In [97]:
random_preds_df.head()

Unnamed: 0,id,category
0,0,0
1,1,0
2,2,1
3,3,1
4,4,1


## Exploratory Analysis

In [98]:
print('Unique papers: ', len(set(nodes_info_df['paper_id'])))
sym_diff = set(test_set['source_id'].append(test_set['target_id'])).symmetric_difference(set(nodes_info_df['paper_id']))
print('Unknown papers in test set (with nodes_info):', len(sym_diff))

Unique papers:  27769
Unknown papers in test set (with nodes_info): 4369


In [99]:
# get distribution of journal names 
nodes_info_df['journal_name'] = nodes_info_df['journal_name'].fillna('unknown')
nodes_info_df.journal_name.value_counts()[:15]

unknown                  7471
Phys.Lett.               3575
Nucl.Phys.               3571
Phys.Rev.                3170
JHEP                     1957
Int.J.Mod.Phys.           938
Mod.Phys.Lett.            936
Class.Quant.Grav.         556
J.Phys.                   536
J.Math.Phys.              532
Phys.Rev.Lett.            388
Commun.Math.Phys.         377
Phys.                     377
Nucl.Phys.Proc.Suppl.     296
Prog.Theor.Phys.          281
Name: journal_name, dtype: int64

In [None]:
## etc

## Feature generation

### Text features generation 

In [None]:
def get_author():
    pass

def get_earliest_latest_publication_years_of_author():
    pass

def tfidf_abstract():
    pass

def journal_names_feature():
    pass

# etc

### Graph features generation 

In [None]:
def get_nodes_weights():
    pass

# get shortest path between two nodes
def shortest_path_len():
    pass

# get in-betweeness
def betweeness():
    pass

# etc

## Learning Stuff

In [89]:
# separate features from labels:
X = train_set.loc[:, train_set.columns != 'label']
y = train_set['label']

In [30]:
## Train different models and compare the performance 
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, BaggingClassifier, GradientBoostingClassifier
from sklearn.metrics import  f1_score, confusion_matrix
from sklearn.model_selection import cross_validate

model = AdaBoostClassifier(n_estimators=50, learning_rate=1)
scores = cross_validate(model, X, y, scoring='f1', 
                        cv=10, n_jobs=-1) # n_jobs is the number of cpus to use -1 => all
scores

{'fit_time': array([40.51319671, 41.4107492 , 41.01260662, 40.26542211, 39.93710351,
        40.21896553, 39.99013019, 40.75845671, 22.1292243 , 21.88988447]),
 'score_time': array([0.96123648, 0.55183744, 0.87470293, 0.98230648, 0.95293164,
        0.94798112, 1.00288892, 0.79165316, 0.53994036, 0.54101014]),
 'test_score': array([0.70924234, 0.73411468, 0.74994179, 0.71633565, 0.73428369,
        0.72075992, 0.75058599, 0.74589011, 0.71969778, 0.74943456])}

In [32]:
# describe results from scores
from scipy import stats 
stats.describe(scores['test_score'])

DescribeResult(nobs=10, minmax=(0.7092423428264374, 0.7505859928392963), mean=0.7330286516063008, variance=0.0002449243278408503, skewness=-0.16892931758355367, kurtosis=-1.5003847605685021)

### Predicting using final model 

In [109]:
# 1: retrain the complete model -> don't forget to change this to optimal one @ end
final_model = AdaBoostClassifier(n_estimators=50, learning_rate=1)
final_model.fit(X, y)
predictions = final_model.predict(test_set)

# 2: predict on the test set
out_df = test_set.copy()
data = {'id': list(out_df.index), 'category': predictions}
final_df = pd.DataFrame(data)


# 3: write file out
final_df.to_csv('submission.csv',index=False, sep=',')

## The end