In [1]:
from os.path import join as j
import littleballoffur
import pandas as pd
import networkx as nx
import random

# Load data

In [2]:
# The dataset root folder(s) are listed in a local text file next to the repo.
# Each line, stripped of surrounding whitespace, becomes one entry.
with open("../data_dirs.txt", "r") as f:
    DATA_DIRS = [line.strip() for line in f.readlines()]

In [3]:
# Citation table, located under the first configured data directory.
APS_CIT_PATH = j(
    DATA_DIRS[0],
    "derived",
    "journal_citations",
    "aps_all_2010_citations.csv",
)

In [4]:
# Read only the four columns used below: DOIs as strings, years as integers.
cols = ["CITED_DOI", "CITING_DOI", "CITING_YEAR", "CITED_YEAR"]
dtypes = {
    "CITED_DOI": "str",
    "CITING_DOI": "str",
    "CITING_YEAR": "int",
    "CITED_YEAR": "int",
}
cit = pd.read_csv(APS_CIT_PATH, dtype=dtypes, usecols=cols)

In [5]:
# Preview the loaded citation table (pandas truncates the display).
cit

Unnamed: 0,CITING_DOI,CITED_DOI,CITING_YEAR,CITED_YEAR
0,10.1103/PhysRevSeriesI.11.215,10.1103/PhysRevSeriesI.1.1,1900,1893
1,10.1103/PhysRevSeriesI.11.215,10.1103/PhysRevSeriesI.2.161,1900,1894
2,10.1103/PhysRevSeriesI.17.65,10.1103/PhysRevSeriesI.2.161,1903,1894
3,10.1103/PhysRevSeriesI.12.121,10.1103/PhysRevSeriesI.1.166,1901,1893
4,10.1103/PhysRevSeriesI.7.93,10.1103/PhysRevSeriesI.1.166,1898,1893
...,...,...,...,...
4931138,10.1103/PhysRevD.82.066009,10.1103/PhysRevD.81.046004,2010,2010
4931139,10.1103/PhysRevD.82.105007,10.1103/PhysRevD.81.046004,2010,2010
4931140,10.1103/PhysRevSTAB.13.070402,10.1103/PhysRevSTAB.12.110401,2010,2009
4931141,10.1103/PhysRevB.82.212401,10.1103/PhysRevB.81.134421,2010,2010


In [6]:
# Number of citation rows whose citing paper is dated 2010.
int((cit["CITING_YEAR"] == 2010).sum())

298318

In [7]:
# Citations made by papers dated 2010.
cit_2010 = cit.loc[cit["CITING_YEAR"] == 2010]

# Merge with abstract data

In [8]:
# Abstract table, located under the first configured data directory.
APS_ABS_PATH = j(
    DATA_DIRS[0],
    "raw",
    "WOS",
    "wos_aps_abstracts.bz2",
)

In [9]:
# Load the abstracts with every column as string, indexed by DOI.
# read_csv infers the bz2 compression from the file extension by default.
abstracts = pd.read_csv(
    APS_ABS_PATH,
    index_col="DOI",
    dtype="str",
)

In [10]:
# Preview the abstract table (one ABSTRACT column, indexed by DOI).
abstracts

Unnamed: 0_level_0,ABSTRACT
DOI,Unnamed: 1_level_1
10.1103/PhysRevE.63.056310,Heat transport in He-3 above its critical temp...
10.1103/PhysRevB.64.235210,We study cyclotron resonance line shapes (CRLS...
10.1103/PhysRevLett.87.051601,The widely used relation <(<alpha>)over tilde>...
10.1103/PhysRevE.63.021115,We study the thermally activated barrier cross...
10.1103/PhysRevLett.87.016104,Unstable thin liquid films on solid substrates...
...,...
10.1103/PhysRevB.46.4693,The quantum-size-effect structure of ultrathin...
10.1103/PhysRevLett.68.1236,Y-89 NMR spectra in an oriented sample of YBa2...
10.1103/PhysRevB.46.7559,In a p+-n+-p-n- device driven by a dc-voltage ...
10.1103/PhysRevB.45.4227,A series of single-quantum-well samples doped ...


In [11]:
# Count missing values per column.
abstracts.isna().sum()

ABSTRACT    27605
dtype: int64

In [12]:
# Keep only rows that have an abstract: neither missing nor the empty string.
abstracts = abstracts[abstracts.ABSTRACT.notna() & (abstracts.ABSTRACT != "")]

In [13]:
# Number of distinct DOIs in the index (nunique() ignores missing values by default).
abstracts.index.nunique()

329600

In [14]:
# How many rows each DOI has; counts above 1 reveal duplicated DOIs.
abstracts.reset_index()["DOI"].value_counts()

10.1103/PhysRevD.58.024017    3
10.1103/PhysRevA.73.031402    2
10.1103/PhysRevX.8.031024     2
10.1103/PhysRevB.78.195115    2
10.1103/PhysRevA.73.031602    2
                             ..
10.1103/PhysRevX.8.041032     1
10.1103/PhysRevC.85.045205    1
10.1103/PhysRevB.79.104428    1
10.1103/PhysRevE.78.051901    1
10.1103/PhysRevB.89.140504    1
Name: DOI, Length: 329600, dtype: int64

In [15]:
# Inspect the most duplicated DOI: all rows sharing this index label.
abstracts.loc["10.1103/PhysRevD.58.024017"]

Unnamed: 0_level_0,ABSTRACT
DOI,Unnamed: 1_level_1
10.1103/PhysRevD.58.024017,We study analytically the initial value proble...
10.1103/PhysRevD.58.024017,We study the nonlinear gravitational collapse ...
10.1103/PhysRevD.58.024017,We study analytically the initial value proble...


In [16]:
# Rows left after dropping duplicated ABSTRACT values (the DOI index is not compared).
abstracts.drop_duplicates().shape[0]

330802

In [17]:
# Rows left when keeping a single row per DOI.
abstracts.reset_index().drop_duplicates(subset="DOI").shape[0]

329601

In [18]:
# Keep one abstract per DOI (the first occurrence) and restore the DOI index.
# TODO: why the duplicates?
# NOTE(review): the example DOI inspected earlier carries differing abstract
# texts, so keep="first" may pick an arbitrary one; confirm this is acceptable.
abstracts = (
    abstracts
    .reset_index()
    .drop_duplicates(subset="DOI", keep="first")
    .set_index("DOI")
)

In [19]:
# Preview: inner-join the abstracts twice, first for the citing paper and then
# for the cited paper, so only citations with both abstracts remain.
(
    cit
    .merge(abstracts, left_on="CITING_DOI", right_index=True)
    .merge(abstracts, left_on="CITED_DOI", right_index=True)
)

Unnamed: 0,CITING_DOI,CITED_DOI,CITING_YEAR,CITED_YEAR,ABSTRACT_x,ABSTRACT_y
1298535,10.1103/PhysRevLett.101.194503,10.1103/PhysRevE.55.3143,2008,1997,We use the Onsager principle to derive a two-p...,Simple expressions are given for the Newtonian...
1298538,10.1103/PhysRevE.75.031201,10.1103/PhysRevE.55.3143,2007,1997,We present molecular dynamics calculations of ...,Simple expressions are given for the Newtonian...
1298540,10.1103/PhysRevE.78.051404,10.1103/PhysRevE.55.3143,2008,1997,The coupled activated dynamics in dense mixtur...,Simple expressions are given for the Newtonian...
1298537,10.1103/PhysRevE.61.2967,10.1103/PhysRevE.55.3143,2000,1997,We evaluate theoretical expressions for the lo...,Simple expressions are given for the Newtonian...
1298536,10.1103/PhysRevE.81.051402,10.1103/PhysRevE.55.3143,2010,1997,This paper addresses the relative viscosity of...,Simple expressions are given for the Newtonian...
...,...,...,...,...,...,...
4931133,10.1103/PhysRevB.81.054406,10.1103/PhysRevB.80.024113,2010,2009,Magnetically induced reorientation (MIR) of tw...,This is an in situ measurement of the full str...
4931134,10.1103/PhysRevD.80.025019,10.1103/PhysRevD.80.025018,2009,2009,Classical isometrodynamics is quantized in the...,A generalization of non-Abelian gauge theories...
4931135,10.1103/PhysRevE.82.031114,10.1103/PhysRevE.80.021106,2010,2009,In many applications one is interested in find...,The problem of estimating a Markov transition ...
4931140,10.1103/PhysRevSTAB.13.070402,10.1103/PhysRevSTAB.12.110401,2010,2009,The Blumlein pulse forming line (BPFL) consist...,The flat voltage of the main pulse on the load...


In [20]:
# Citations for which both the citing and the cited paper have an abstract.
# The two ABSTRACT columns get pandas' default _x / _y suffixes.
cit_abs = (
    cit
    .merge(abstracts, left_on="CITING_DOI", right_index=True)
    .merge(abstracts, left_on="CITED_DOI", right_index=True)
)

In [21]:
# Replace the merge suffixes with explicit names: _x came from the citing-side
# join, _y from the cited-side join.
cit_abs = cit_abs.rename(
    columns={
        "ABSTRACT_x": "CITING_ABSTRACT",
        "ABSTRACT_y": "CITED_ABSTRACT",
    }
)

In [22]:
# Fraction of all citations that survive the abstract merge.
cit_abs.shape[0] / cit.shape[0]

0.18363470700403536

In [23]:
# Abstract-covered citations whose citing paper is dated 2010.
cit_abs_2010 = cit_abs.loc[cit_abs["CITING_YEAR"] == 2010]

In [24]:
# Number of candidate edges that can be held out.
cit_abs_2010.shape[0]

172258

# Generate graph

1. Create citation network G of APS papers published **up to** 2010
2. Sample negative edges: sample x% of negative citing edges from 2010 papers
3. Sample existing edges where the citing paper was published _in_ 2010
4. Get largest connected component, and final list of added/removed edges

In [25]:
# Directed citation graph: one edge per row, pointing from citing to cited DOI.
G = nx.DiGraph()
G.add_edges_from(zip(cit_abs["CITING_DOI"], cit_abs["CITED_DOI"]))

In [26]:
# Node count of the full graph.
G.number_of_nodes()

153731

In [27]:
# Cross-check the node count against the data frame: the number of distinct
# DOIs appearing as either endpoint of a citation.
# pd.concat is used because Series.append was removed in pandas 2.0.
len(pd.concat([cit_abs.CITING_DOI, cit_abs.CITED_DOI]).unique())

153731

In [28]:
# Edge count of the full graph.
G.number_of_edges()

905529

In [29]:
# Cross-check the edge count: number of distinct (citing, cited) pairs.
cit_abs.drop_duplicates(subset=["CITING_DOI", "CITED_DOI"]).shape[0]

905529

In [30]:
# Share of the 2010 citations to hold out, and the resulting edge count
# (truncated toward zero by int()).
REMOVE_EDGE_PCT = 0.5
REMOVE_EDGE_NO = int(REMOVE_EDGE_PCT * len(cit_abs_2010))

In [31]:
# Show how many edges will be held out.
REMOVE_EDGE_NO

86129

## Remove true edges

In [32]:
# Hold out a random sample of the 2010 citations as positive test edges.
# The fixed seed makes the held-out set identical on every Restart & Run All;
# without it each run silently produced a different split, so the saved
# network and positive-edge files could not be regenerated.
SAMPLE_SEED = 42
to_remove = cit_abs_2010.sample(n=REMOVE_EDGE_NO, random_state=SAMPLE_SEED)

In [33]:
# Sanity check: the sample has the requested number of rows.
to_remove.shape[0]

86129

### Check removed nodes/edges

In [34]:
# Preview the pool of 2010 citations the held-out edges were drawn from.
cit_abs_2010

Unnamed: 0,CITING_DOI,CITED_DOI,CITING_YEAR,CITED_YEAR,CITING_ABSTRACT,CITED_ABSTRACT
1298536,10.1103/PhysRevE.81.051402,10.1103/PhysRevE.55.3143,2010,1997,This paper addresses the relative viscosity of...,Simple expressions are given for the Newtonian...
1298562,10.1103/PhysRevE.81.011403,10.1103/PhysRevE.65.041405,2010,2002,The rheology of dense amorphous materials unde...,Measurements of the low-shear viscosity eta(o)...
1298558,10.1103/PhysRevE.81.051402,10.1103/PhysRevE.65.041405,2010,2002,This paper addresses the relative viscosity of...,Measurements of the low-shear viscosity eta(o)...
369065,10.1103/PhysRevB.81.081407,10.1103/PhysRevB.49.14251,2010,1994,We present first-principles calculations of el...,We present ab initio quantum-mechanical molecu...
368827,10.1103/PhysRevB.82.235423,10.1103/PhysRevB.49.14251,2010,1994,We demonstrate from density-functional theory ...,We present ab initio quantum-mechanical molecu...
...,...,...,...,...,...,...
4931132,10.1103/PhysRevSTAB.13.104402,10.1103/PhysRevSTAB.12.061002,2010,2009,We use a Vlasov-Fokker-Planck program and a li...,Microwave instability in the low energy ring o...
4931133,10.1103/PhysRevB.81.054406,10.1103/PhysRevB.80.024113,2010,2009,Magnetically induced reorientation (MIR) of tw...,This is an in situ measurement of the full str...
4931135,10.1103/PhysRevE.82.031114,10.1103/PhysRevE.80.021106,2010,2009,In many applications one is interested in find...,The problem of estimating a Markov transition ...
4931140,10.1103/PhysRevSTAB.13.070402,10.1103/PhysRevSTAB.12.110401,2010,2009,The Blumlein pulse forming line (BPFL) consist...,The flat voltage of the main pulse on the load...


In [35]:
# Preview the sampled edges that will be removed from the graph.
to_remove

Unnamed: 0,CITING_DOI,CITED_DOI,CITING_YEAR,CITED_YEAR,CITING_ABSTRACT,CITED_ABSTRACT
3507987,10.1103/PhysRevE.82.046707,10.1103/PhysRevB.80.024422,2010,2009,We discuss methods that allow us to increase t...,We have investigated the phase transition in t...
2106653,10.1103/PhysRevD.81.043001,10.1103/PhysRevLett.102.231301,2010,2009,We consider the diffeomorphism invariant gravi...,We study primordial gravitational waves produc...
4924224,10.1103/PhysRevB.81.241304,10.1103/PhysRevB.78.054107,2010,2008,"PbSe quantum dots with sizes of 2.8(1), 3.7(1)...",Understanding the loss of ferroelectricity in ...
3363322,10.1103/PhysRevB.82.035446,10.1103/PhysRevB.76.085330,2010,2007,We theoretically investigate the time-dependen...,We investigate nonstationary electronic transp...
3073063,10.1103/PhysRevA.81.053823,10.1103/PhysRevLett.101.043901,2010,2008,There has been a surge of interest in the subw...,Here we explore the radiation features of opti...
...,...,...,...,...,...,...
681175,10.1103/PhysRevB.81.115131,10.1103/PhysRevE.81.036206,2010,2010,We use nonequilibrium dynamical mean-field the...,"By means of full exact diagonalization, we stu..."
1756055,10.1103/PhysRevB.81.060505,10.1103/PhysRevLett.102.147001,2010,2009,"The London penetration depth, lambda, is direc...",In several iron-arsenide superconductors there...
4333531,10.1103/PhysRevD.81.054005,10.1103/PhysRevD.65.074021,2010,2002,We study the finite temperature behavior of li...,Chiral quark models with nonlocal covariant se...
556796,10.1103/PhysRevA.82.023424,10.1103/PhysRevLett.100.013903,2010,2008,The experimental finding of significant enhanc...,By analyzing accurate theoretical results from...


### Remove edges from graph

In [36]:
# Work on a copy so the full graph G stays intact, then delete the held-out
# (citing, cited) edges; nodes are kept even if they end up isolated.
Gr = G.copy()
Gr.remove_edges_from(zip(to_remove["CITING_DOI"], to_remove["CITED_DOI"]))

In [37]:
# Number of edges actually removed; should equal REMOVE_EDGE_NO.
G.number_of_edges() - Gr.number_of_edges()

86129

## Check largest CC

In [38]:
# Size of the largest weakly connected component after edge removal.
max(len(component) for component in nx.weakly_connected_components(Gr))

150489

In [39]:
# Node count of the full graph, for comparison with the component size.
G.number_of_nodes()

153731

In [40]:
# Edge count of the full graph, for comparison.
G.number_of_edges()

905529

In [41]:
# Largest weakly connected component of the reduced graph, as an independent
# graph (.copy() detaches it from the subgraph view of Gr).
Gcc = Gr.subgraph(max(nx.weakly_connected_components(Gr), key=len)).copy()

In [42]:
# Node count of the largest component.
Gcc.number_of_nodes()

150489

In [43]:
# Edge count of the largest component.
Gcc.number_of_edges()

818022

In [44]:
# Node count of the reduced graph (unchanged by edge removal).
Gr.number_of_nodes()

153731

In [45]:
# Edge count of the reduced graph.
Gr.number_of_edges()

819400

**Let's use the full graph with the held-out edges removed (`Gr`), not its largest connected component (`Gcc`).**

# Final list of edges

In [46]:
# The year columns are not needed in the saved edge lists.
to_remove = to_remove.drop(columns=["CITING_YEAR", "CITED_YEAR"])

In [47]:
# Edge list used for embedding: every edge of the reduced graph Gr
# (the largest-component graph Gcc is deliberately not used).
for_embed = pd.DataFrame(
    list(Gr.edges),
    columns=["CITING_DOI", "CITED_DOI"],
)

In [48]:
# Preview the remaining (training) edge list.
for_embed

Unnamed: 0,CITING_DOI,CITED_DOI
0,10.1103/PhysRevLett.101.194503,10.1103/PhysRevE.55.3143
1,10.1103/PhysRevLett.101.194503,10.1103/PhysRevE.65.041405
2,10.1103/PhysRevE.55.3143,10.1103/PhysRevA.46.7723
3,10.1103/PhysRevE.75.031201,10.1103/PhysRevE.55.3143
4,10.1103/PhysRevE.75.031201,10.1103/PhysRevLett.94.025901
...,...,...
819395,10.1103/PhysRevSTAB.13.104402,10.1103/PhysRevSTAB.12.061002
819396,10.1103/PhysRevB.81.054406,10.1103/PhysRevB.80.024113
819397,10.1103/PhysRevD.80.025019,10.1103/PhysRevD.80.025018
819398,10.1103/PhysRevSTAB.13.070402,10.1103/PhysRevSTAB.12.110401


In [49]:
# Preview: attach both abstracts to each remaining edge by joining on the
# (citing, cited) DOI pair.
for_embed.merge(
    cit_abs[["CITING_DOI", "CITED_DOI", "CITING_ABSTRACT", "CITED_ABSTRACT"]],
    on=["CITING_DOI", "CITED_DOI"],
)

Unnamed: 0,CITING_DOI,CITED_DOI,CITING_ABSTRACT,CITED_ABSTRACT
0,10.1103/PhysRevLett.101.194503,10.1103/PhysRevE.55.3143,We use the Onsager principle to derive a two-p...,Simple expressions are given for the Newtonian...
1,10.1103/PhysRevLett.101.194503,10.1103/PhysRevE.65.041405,We use the Onsager principle to derive a two-p...,Measurements of the low-shear viscosity eta(o)...
2,10.1103/PhysRevE.55.3143,10.1103/PhysRevA.46.7723,Simple expressions are given for the Newtonian...,We develop a phenomenological theory of the dy...
3,10.1103/PhysRevE.75.031201,10.1103/PhysRevE.55.3143,We present molecular dynamics calculations of ...,Simple expressions are given for the Newtonian...
4,10.1103/PhysRevE.75.031201,10.1103/PhysRevLett.94.025901,We present molecular dynamics calculations of ...,Researchers have been perplexed for the past f...
...,...,...,...,...
819395,10.1103/PhysRevSTAB.13.104402,10.1103/PhysRevSTAB.12.061002,We use a Vlasov-Fokker-Planck program and a li...,Microwave instability in the low energy ring o...
819396,10.1103/PhysRevB.81.054406,10.1103/PhysRevB.80.024113,Magnetically induced reorientation (MIR) of tw...,This is an in situ measurement of the full str...
819397,10.1103/PhysRevD.80.025019,10.1103/PhysRevD.80.025018,Classical isometrodynamics is quantized in the...,A generalization of non-Abelian gauge theories...
819398,10.1103/PhysRevSTAB.13.070402,10.1103/PhysRevSTAB.12.110401,The Blumlein pulse forming line (BPFL) consist...,The flat voltage of the main pulse on the load...


In [50]:
# Attach the citing and cited abstracts to every remaining edge, matching on
# the (citing, cited) DOI pair.
for_embed = for_embed.merge(
    cit_abs[["CITING_DOI", "CITED_DOI", "CITING_ABSTRACT", "CITED_ABSTRACT"]],
    on=["CITING_DOI", "CITED_DOI"],
)

In [51]:
# Preview the held-out edges after the year columns were dropped.
to_remove

Unnamed: 0,CITING_DOI,CITED_DOI,CITING_ABSTRACT,CITED_ABSTRACT
3507987,10.1103/PhysRevE.82.046707,10.1103/PhysRevB.80.024422,We discuss methods that allow us to increase t...,We have investigated the phase transition in t...
2106653,10.1103/PhysRevD.81.043001,10.1103/PhysRevLett.102.231301,We consider the diffeomorphism invariant gravi...,We study primordial gravitational waves produc...
4924224,10.1103/PhysRevB.81.241304,10.1103/PhysRevB.78.054107,"PbSe quantum dots with sizes of 2.8(1), 3.7(1)...",Understanding the loss of ferroelectricity in ...
3363322,10.1103/PhysRevB.82.035446,10.1103/PhysRevB.76.085330,We theoretically investigate the time-dependen...,We investigate nonstationary electronic transp...
3073063,10.1103/PhysRevA.81.053823,10.1103/PhysRevLett.101.043901,There has been a surge of interest in the subw...,Here we explore the radiation features of opti...
...,...,...,...,...
681175,10.1103/PhysRevB.81.115131,10.1103/PhysRevE.81.036206,We use nonequilibrium dynamical mean-field the...,"By means of full exact diagonalization, we stu..."
1756055,10.1103/PhysRevB.81.060505,10.1103/PhysRevLett.102.147001,"The London penetration depth, lambda, is direc...",In several iron-arsenide superconductors there...
4333531,10.1103/PhysRevD.81.054005,10.1103/PhysRevD.65.074021,We study the finite temperature behavior of li...,Chiral quark models with nonlocal covariant se...
556796,10.1103/PhysRevA.82.023424,10.1103/PhysRevLett.100.013903,The experimental finding of significant enhanc...,By analyzing accurate theoretical results from...


In [53]:
# Remaining edges stacked with the held-out edges (original row labels kept).
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
pd.concat([for_embed, to_remove])

Unnamed: 0,CITING_DOI,CITED_DOI,CITING_ABSTRACT,CITED_ABSTRACT
0,10.1103/PhysRevLett.101.194503,10.1103/PhysRevE.55.3143,We use the Onsager principle to derive a two-p...,Simple expressions are given for the Newtonian...
1,10.1103/PhysRevLett.101.194503,10.1103/PhysRevE.65.041405,We use the Onsager principle to derive a two-p...,Measurements of the low-shear viscosity eta(o)...
2,10.1103/PhysRevE.55.3143,10.1103/PhysRevA.46.7723,Simple expressions are given for the Newtonian...,We develop a phenomenological theory of the dy...
3,10.1103/PhysRevE.75.031201,10.1103/PhysRevE.55.3143,We present molecular dynamics calculations of ...,Simple expressions are given for the Newtonian...
4,10.1103/PhysRevE.75.031201,10.1103/PhysRevLett.94.025901,We present molecular dynamics calculations of ...,Researchers have been perplexed for the past f...
...,...,...,...,...
681175,10.1103/PhysRevB.81.115131,10.1103/PhysRevE.81.036206,We use nonequilibrium dynamical mean-field the...,"By means of full exact diagonalization, we stu..."
1756055,10.1103/PhysRevB.81.060505,10.1103/PhysRevLett.102.147001,"The London penetration depth, lambda, is direc...",In several iron-arsenide superconductors there...
4333531,10.1103/PhysRevD.81.054005,10.1103/PhysRevD.65.074021,We study the finite temperature behavior of li...,Chiral quark models with nonlocal covariant se...
556796,10.1103/PhysRevA.82.023424,10.1103/PhysRevLett.100.013903,The experimental finding of significant enhanc...,By analyzing accurate theoretical results from...


In [54]:
# Sanity check: remaining + held-out edges should add up to the full graph's
# edge count once exact duplicate rows are dropped.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
len(pd.concat([for_embed, to_remove]).drop_duplicates())

905529

# Save

In [55]:
# Output locations; the held-out percentage is embedded in both file names.
NETWORK_PATH = j(
    DATA_DIRS[0],
    "derived",
    "journal_citations",
    "link_prediction",
    "abstract_aps_all_2010_link_pred_{}-pct_neg-0.gz".format(int(REMOVE_EDGE_PCT*100)),
)
REMOVE_PATH = j(
    DATA_DIRS[0],
    "derived",
    "journal_citations",
    "link_prediction",
    "abstract_aps_all_2010_link_pred_{}-pct_neg-0_pos.csv".format(int(REMOVE_EDGE_PCT*100)),
)

In [56]:
# Cast every column of both tables to string before writing.
for_embed, to_remove = for_embed.astype(str), to_remove.astype(str)

In [57]:
# Write the training edge list with a header row and without the row index.
# to_csv infers gzip compression from the ".gz" extension by default.
for_embed.to_csv(NETWORK_PATH, index=False, header=True)

In [58]:
# Write the held-out (positive) edges as plain CSV, header on, row index off.
to_remove.to_csv(REMOVE_PATH, index=False, header=True)

In [59]:
# Final check of the number of held-out edges written.
to_remove.shape[0]

86129