# Analysis of Semantic Scholar's paper-author relations
(bipartite graph and collaboration complex)


Todo:
* describe the projected networks, e.g., degree distribution, clustering coefficient, mean path length, diameter
* correlation between author degree and #citations or #publications
* facet (upper) degrees: for a k-facet (i.e., a paper), number of incident k+1 simplices
    * many authors are in few groups and very few authors are in many groups
    * high facet degrees should correlate with high vertex degree and high publication rank => collaborate more with others, but also with more distinct sets of people


In [None]:
%matplotlib inline

In [None]:
import numpy as np
from scipy import sparse
import matplotlib as mpl
from matplotlib import pyplot as plt
import pandas as pd
import networkx as nx
from networkx.algorithms import bipartite as nxb
from IPython import display as ipd

import sys
sys.path.append('..')

from data.s2_5_bipartite_to_complex import build_features
#from data.s2_4_complex_to_matrices import load_matrices

In [None]:
# Default figure size (width, height) used by every plot that does not pass
# its own `figsize`.
plt.rcParams['figure.figsize'] = (17, 5)

In [None]:
def get_link(id, entity='paper'):
    """Display clickable Semantic Scholar links for an entity.

    Two links are rendered in the notebook output, in this order: the v1 API
    endpoint and the public web page.

    Parameters
    ----------
    id
        Identifier of the entity, inserted verbatim in both URLs.
    entity : str
        Kind of entity, used as a URL path component ('paper' by default).
    """
    templates = (
        'https://api.semanticscholar.org/v1/{}/{}',
        'https://www.semanticscholar.org/{}/{}',
    )
    for template in templates:
        url = template.format(entity, id)
        anchor = f'<a href="{url}">{url}</a>'
        ipd.display(ipd.HTML(anchor))

In [None]:
def bins(array):
    """Return histogram bin edges centered on consecutive integers.

    The edges run from ``array.min() - 0.5`` to ``array.max() + 0.5`` in steps
    of one, so that each integer value between the minimum and the maximum
    gets its own unit-width bin.

    Parameters
    ----------
    array
        Object exposing ``min()`` and ``max()`` (e.g., a NumPy array or a
        pandas Series).
    """
    smallest = array.min()
    largest = array.max()
    centers_and_end = np.arange(smallest, largest + 2)
    return centers_and_end - 0.5

## 1 Data loading

In [None]:
# Paper table, indexed by the first column of the CSV file.
papers = pd.read_csv('../data/s2_2_bipartite_graph/papers.csv', index_col=0)
# Paper-author edge list (its 'paper' and 'author' columns are used below).
edges = pd.read_csv('../data/s2_2_bipartite_graph/paper_author_edges.csv')

# Report the table sizes (rows and columns of `papers`, rows of `edges`).
print('paper table: {:,} papers, {:,} features'.format(*papers.shape))
print('edges table: {:,} edges'.format(edges.shape[0]))

In [None]:
# Uncomment to do (part of) the analysis on the full dataset.
# papers, edges = load('../data/s2_2_bipartite_graph/paper_author_full.pickle')

In [None]:
# Sparse biadjacency matrix of the bipartite graph; the message below reads
# its shape as (papers, authors) and its stored entries as edges.
biadjacency = sparse.load_npz('../data/s2_2_bipartite_graph/paper_author_biadjacency.npz')

print('biadjacency matrix: {:,} papers, {:,} authors, {:,} edges'.format(
    *biadjacency.shape, biadjacency.nnz))

In [None]:
# Sparse adjacency matrices of the two projected graphs (papers and authors).
adjacency_papers = sparse.load_npz('../data/s2_2_bipartite_graph/papers_adjacency.npz')
adjacency_authors = sparse.load_npz('../data/s2_2_bipartite_graph/authors_adjacency.npz')

# The number of stored entries is halved to count edges.
# NOTE(review): presumably the matrices are symmetric with an empty diagonal,
# so that every edge is stored twice -- confirm.
print('adjacency matrix: {:,} papers, {:,} edges'.format(adjacency_papers.shape[0], adjacency_papers.nnz // 2))
print('adjacency matrix: {:,} authors, {:,} edges'.format(adjacency_authors.shape[0], adjacency_authors.nnz // 2))

In [None]:
# Identifier embedded in the file names of the pre-computed complex.
# NOTE(review): presumably the node the complex was sampled from -- confirm.
s_node=150250
# allow_pickle=True makes np.load unpickle object arrays: only use it on
# trusted files.
simplices=np.load(f'../data/s2_3_collaboration_complex/{s_node}_simplices.npy',allow_pickle=True)
cochains = np.load(f'../data/s2_3_collaboration_complex/{s_node}_cochains.npy',allow_pickle=True)
# Features built from the simplices and cochains; later cells index them per
# dimension (features[dim]) and use them as 2-D arrays with one column per signal.
features=build_features(simplices, cochains)

In [None]:
# Pre-computed operators of the complex, stored as pickled object arrays
# (allow_pickle=True: only use on trusted files). Later cells iterate over
# them and read `.shape` and `.nnz` on each entry.
laplacians=np.load(f'../data/s2_3_collaboration_complex/{s_node}_laplacians.npy', allow_pickle=True)
boundaries=np.load(f'../data/s2_3_collaboration_complex/{s_node}_boundaries.npy', allow_pickle=True)


## 2 Size of collaborations

In [None]:
# Count the edges incident to each author and to each paper.
papers_per_author = edges.groupby('author')['paper'].count()
authors_per_paper = edges.groupby('paper')['author'].count()

# Show the extreme cases, with links to the corresponding entities.
print('Paper with the most authors ({}):'.format(authors_per_paper.max()))
get_link(authors_per_paper.idxmax(), 'paper')

print('Author with the most papers ({}):'.format(papers_per_author.max()))
get_link(papers_per_author.idxmax(), 'author')

# Histograms with one unit-width bin per integer count and a log-scaled y axis.
fig, ax = plt.subplots()
ax.hist(papers_per_author, bins=bins(papers_per_author), log=True);
ax.set_title('number of papers per author');
ax.set_xlabel('number of papers');
ax.set_ylabel('number of authors');

fig, ax = plt.subplots()
ax.hist(authors_per_paper, bins=bins(authors_per_paper), log=True);
ax.set_title('number of authors per paper');
ax.set_xlabel('number of authors');
ax.set_ylabel('number of papers');

In [None]:
# Histogram of the stored entries (`.data`) of the author adjacency matrix,
# labelled below as the number of papers two authors have in common.
fig, ax = plt.subplots()
ax.hist(adjacency_authors.data, bins=bins(adjacency_authors.data), log=True);
ax.set_title('collaboration between authors');
ax.set_xlabel('number of papers in common (edge weight)');

# Same for the paper adjacency matrix (number of authors two papers share).
fig, ax = plt.subplots()
ax.hist(adjacency_papers.data, bins=bins(adjacency_papers.data), log=True);
ax.set_title('collaboration between papers');
ax.set_xlabel('number of authors in common (edge weight)');

## 3 Publication year

In [None]:
# Histogram of the 'year' column with one bin per year; the title shows the
# first and last year present in the table.
ax = papers['year'].hist(bins=bins(papers['year']))
ax.set_title('number of papers published per year ({} - {})'.format(papers['year'].min(), papers['year'].max()));

## 4 Citations

In [None]:
# Minimum, maximum and total of the 'citations_2019' column over all papers.
print('papers have at least {:,} citations (2019)'.format(papers['citations_2019'].min()))
print('papers have at most {:,} citations (2019)'.format(papers['citations_2019'].max()))
print('there are {:,} citations in total (2019)'.format(papers['citations_2019'].sum()))
print()

# Disabled: the same statistics per author (no `authors` table is loaded in
# this notebook).
# print('authors have at least {:,} citations'.format(authors['citations_2019'].min()))
# print('authors have at most {:,} citations'.format(authors['citations_2019'].max()))
# print('there are {:,} in total'.format(authors['citations_2019'].sum()))
# print()

# Link to the paper with the largest 'citations_2019' value.
print('Most cited paper ({:,} citations):'.format(papers['citations_2019'].max()))
get_link(papers['citations_2019'].idxmax(), 'paper')

# print('Most cited author ({:,} citations):'.format(authors['citations_2019'].max()))
# get_link(authors['citations_2019'].idxmax(), 'author')

In [None]:
# Years for which a per-year citation column exists: the last four characters
# of the column name are the year, the rest must contain 'citations'.
years = [
    int(name[-4:])
    for name in papers.columns
    if 'citations' in name[:-4]
]
# Total number of citations over all papers, one entry per year.
citations_per_year = np.array(
    [papers[f'citations_{year}'].sum() for year in years],
    dtype=float,
)

fig, ax = plt.subplots()
ax.plot(years, citations_per_year, '.-')
ax.set_xticks(years)
# Show the y ticks as integers with thousands separators.
thousands = mpl.ticker.FuncFormatter(lambda value, pos: format(int(value), ','))
ax.yaxis.set_major_formatter(thousands)
ax.set_xlabel('year')
ax.set_ylabel('number of citations');

In [None]:
# Overlay one semi-transparent histogram per year, latest year first. All
# histograms share the bin edges derived from the 'citations_2019' column.
fig, ax = plt.subplots(figsize=(15, 5))
for year in reversed(years):
    ax.hist(
        papers[f'citations_{year}'],
        bins=bins(papers['citations_2019']),
        log=True,
        label=f'Year {year}',
        alpha=0.4,
    )
ax.set_xlabel('number of citations')
ax.set_ylabel('number of papers')
ax.legend();

## 5 References

In [None]:
# Minimum and maximum of the 'references' column over all papers.
print('papers have at least {:,} references'.format(papers['references'].min()))
print('papers have at most {:,} references'.format(papers['references'].max()))

# Link to the paper with the largest 'references' value.
print('most referencing paper ({:,} references):'.format(papers['references'].max()))
get_link(papers['references'].idxmax(), 'paper')

In [None]:
# Histogram of the number of references per paper (one bin per integer value,
# log-scaled y axis).
papers['references'].hist(bins=bins(papers['references']), log=True);

## 6 Collaboration complex between authors

In [None]:
# Number of simplices per dimension: entry k of `simplices` holds the
# k-simplices. `dimension` is the number of such entries.
sizes = np.array([len(k_simplices) for k_simplices in simplices])
dimension = len(simplices)

for k in range(dimension):
    size = sizes[k]
    print(f'{size:,} {k}-simplices')
print('{:,} simplices in total'.format(sizes.sum()))

In [None]:
# Number of simplices as a function of the simplex dimension.
fig, ax = plt.subplots()
ax.plot(range(dimension), sizes, '.-')
ax.set_xlabel('simplex dimension')
ax.set_ylabel('number of simplices');

## 7 Operators (boundaries and Laplacians)

In [None]:
# Summarize each boundary operator: shape, number of stored entries, and the
# fraction of entries that are stored. Entry `dim` of `boundaries` is reported
# as the (dim+1)-boundary matrix.
for dim, boundary in enumerate(boundaries):
    print('{}-boundary matrix: {:,} x {:,}, {:,} non-zeros ({:.2%})'.format(
        dim+1, *boundary.shape, boundary.nnz, boundary.nnz/np.prod(boundary.shape)))

In [None]:
def get_spectrum(laplacian, lowest=False, shift_invert=True):
    """Return the extreme eigenvalue(s) of a real symmetric sparse matrix.

    Parameters
    ----------
    laplacian
        Square symmetric matrix accepted by ``scipy.sparse.linalg.eigsh``.
    lowest : bool
        If True, the smallest eigenvalue is computed as well.
    shift_invert : bool
        If True, the smallest eigenvalue is computed in shift-invert mode
        around zero. That mode is much faster, but it factorizes the matrix,
        which fails with "factor is exactly singular" when the matrix is
        singular. In that case the computation falls back to the regular
        (slower) mode and a warning is emitted.

    Returns
    -------
    The largest eigenvalue if ``lowest`` is False, otherwise the tuple
    ``(smallest, largest)``.
    """
    # Imported explicitly: `from scipy import sparse` alone does not guarantee
    # that the `sparse.linalg` submodule is loaded on every SciPy version.
    import warnings
    from scipy.sparse.linalg import eigsh

    largest = eigsh(laplacian, k=1, which='LA', return_eigenvectors=False)[0]
    if not lowest:
        return largest

    smallest = None
    if shift_invert:
        try:
            smallest = eigsh(laplacian, k=1, sigma=0, which='LM',
                             return_eigenvectors=False)[0]
        except RuntimeError as error:
            # Only recover from the singular factorization; anything else
            # (e.g., no convergence) is re-raised unchanged.
            if 'singular' not in str(error).lower():
                raise
            warnings.warn('shift-invert failed on a singular matrix, '
                          'falling back to the regular mode')
    if smallest is None:
        smallest = eigsh(laplacian, k=1, which='SA', return_eigenvectors=False)[0]
    return smallest, largest

# Largest eigenvalue of each Laplacian (get_spectrum with default arguments
# returns only the largest one).
spectrums = [get_spectrum(laplacian) for laplacian in laplacians]

In [None]:
# Summarize each Laplacian: size, fraction of stored entries, and spectral
# range (the lower bound 0 is hard-coded in the message, the upper bound is
# the largest eigenvalue computed above).
for dim, (laplacian, spectrum) in enumerate(zip(laplacians, spectrums)):
    # NOTE(review): the printed percentage is nnz / (rows * columns), i.e., the
    # fraction of stored entries, although the label reads "sparse".
    print('{}-simplices: {:,} simplices, {:.2%} sparse, spectrum in [0, {:.0f}]'.format(
        dim, laplacian.shape[0], laplacian.nnz/np.prod(laplacian.shape), spectrum))
    # Sanity check: the k-Laplacian is square, with one row per k-simplex.
    assert laplacian.shape == (len(simplices[dim]), len(simplices[dim]))

## 8 Signals (cochains) on collaboration complex

In [None]:
# Rayleigh quotient between every pair of signals (for all dimensions).
n_features = features[0].shape[1]
rayleigh = np.empty((len(features), n_features, n_features))
for dim in range(len(features)):
    rayleigh[dim] = features[dim].T @ laplacians[dim] @ features[dim]
    # Division by zero will occur if a signal is all zeros.
    rayleigh[dim] /= features[dim].T @ features[dim]

Which signals are "smooth"?

In [None]:
# Names of the signals (one per feature column), used as tick labels.
# NOTE(review): the list is hard-coded; it must have as many entries as
# `features` has columns for the ticks to match the plotted points.
#columns = ['citations_1994', 'citations_1999', 'citations_2004', 'citations_2009', 'citations_2014', 'citations_2019', 'references', 'year']
columns = ['citations_2019']

fig, ax = plt.subplots()
ax.set_title('smoothness of signals')
ax.set_ylabel('Rayleigh quotient')
ax.set_xlabel("signal's name")
# One line per dimension: the diagonal of each Rayleigh matrix, i.e., the
# Rayleigh quotient of every signal with itself.
ax.semilogy(np.array([np.diag(r) for r in rayleigh]).T, 'o-')
ax.set_xticks(range(len(columns)))
ax.set_xticklabels(columns)
# Raw f-string: '\l' is not a valid escape sequence, so a non-raw literal
# triggers an invalid-escape warning. The resulting text is unchanged.
ax.legend(
    [rf'{dim}-cochains ($\lambda_{{max}} = {spectrums[dim]:.0f}$)' for dim in range(len(features))],
    loc='lower left')
fig.tight_layout();
#fig.savefig('smoothness.pdf');

Does a signal explain another?

In [None]:
# One image per dimension showing the full matrix of pairwise Rayleigh
# quotients. squeeze=False always returns a 2-D array of axes, so the loop
# also works when there is a single dimension (by default a single subplot is
# returned as a bare Axes object, which cannot be zipped).
fig, axes = plt.subplots(1, len(rayleigh), squeeze=False)
for dim, (r, ax) in enumerate(zip(rayleigh, axes[0])):
    ax.imshow(r)
    ax.set_title(f'dimension {dim}')

## 9 Laplacians' spectra (and Fourier transform)

In [None]:
# Full eigendecomposition of the 0-Laplacian, converted to a dense array
# first. np.linalg.eigh returns the eigenvalues in ascending order, with the
# matching eigenvectors as columns.
eigenvalues, eigenvectors = np.linalg.eigh(laplacians[0].toarray())

The spectrum of the 0-Laplacian has a couple of very high eigenvalues.

In [None]:
# Plot the whole spectrum and print the ten smallest eigenvalues.
plt.plot(eigenvalues);
print(eigenvalues[:10])

Some eigenvalues have high multiplicity, probably due to the connected cliques formed by the higher-dimensional simplices.

In [None]:
# Same plot without the 100 largest eigenvalues, to zoom in on the rest.
plt.plot(eigenvalues[:-100]);

In [None]:
# Multiplicity of the integer eigenvalues 0, 1, ..., 19.
# The eigenvalues come from a floating-point solver and are rarely exactly
# integral, so an exact `==` comparison undercounts: match each integer
# within an absolute tolerance instead.
tolerance = 1e-6
multiplicities = [
    int(np.count_nonzero(np.isclose(eigenvalues, value, rtol=0, atol=tolerance)))
    for value in range(20)
]

fig, ax = plt.subplots()
ax.plot(multiplicities, 'o-')
ax.set_xticks(range(len(multiplicities)))
ax.set_title('eigenvalue multiplicities')
ax.set_ylabel('multiplicity')
ax.set_xlabel('eigenvalue');

Fourier transform of 0-cochains. Their spectral content should tell how smooth they are.

In [None]:
# Project the 0-dimensional signals onto the eigenvectors of the 0-Laplacian.
# Each signal (column) is first scaled to unit Euclidean norm; a signal that
# is all zeros leads to a division by zero.
fourier = eigenvectors.T @ (features[0] / np.linalg.norm(features[0], axis=0))

In [None]:
# Negative end index: leave out the 100 largest eigenvalues.
idx_max = -100

# Magnitude of the Fourier coefficients against the eigenvalues (log-scaled
# y axis), one point series per signal; the legend uses the names in `columns`.
plt.semilogy(eigenvalues[:idx_max], np.abs(fourier)[:idx_max], '.', alpha=0.8)
plt.legend(columns);