# Measure Impact using Semantic Scholar

### Neil D. Lawrence 7th June 2021

This notebook measures the actual impact of the published papers, using the Semantic Scholar database to track their citations.

In [None]:
import matplotlib.pyplot as plt
# Enlarge the default matplotlib font size for every figure drawn below.
plt.rcParams.update({'font.size': 22})

In [None]:
import cmtutils as cu
import cmtutils.nipsy as nipsy
import cmtutils.plot as plot

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Instantiate the cmtutils Papers object.
# NOTE(review): `papers` is not referenced again in the cells visible here.
papers = cu.Papers()

The NeurIPS 2014 proceedings: <https://proceedings.neurips.cc/paper/2014>

In [None]:
# Snapshot date: passed to load_citation_counts below and used as the prefix
# of the pickle file written after the citation data is rebuilt.
date = "2021-06-11"

In [None]:
# Load the Semantic Scholar ids via the nipsy helper; they are handed to
# download_citation_counts below.
# presumably a mapping from paper id to Semantic Scholar id — TODO confirm
semantic_ids = nipsy.load_semantic_ids()

In [None]:
# Load the stored citation counts for the snapshot date chosen above.
citations = nipsy.load_citation_counts(date=date)


In [None]:
# Convert to a nested dict keyed by index label: {index -> {column -> value}}.
citations_dict = citations.to_dict(orient='index')

In [None]:
# NOTE(review): `citations_dict` is turned back into a DataFrame in the next
# cell while `sscholar` is not used again in the visible cells — presumably
# download_citation_counts updates `citations_dict` in place; confirm.
sscholar = nipsy.download_citation_counts(citations_dict=citations_dict, semantic_ids=semantic_ids)

In [None]:
# Rebuild the citations frame from the dict, one row per dict key.
citations = pd.DataFrame.from_dict(citations_dict, orient="index")

In [None]:
# Persist the frame under a date-stamped file name (relative path, so it is
# written to the current working directory).
citations.to_pickle(date + '-semantic-scholar-info.pickle')

In [None]:
# Load the review decisions, then call augment_decisions purely for its effect
# on `decisions` (its return value is discarded).
# presumably this adds the derived columns used below — TODO confirm
decisions = nipsy.load_decisions()
nipsy.augment_decisions(decisions)

In [None]:
# Inspect the `reject` column of the decisions frame.
decisions.reject

In [None]:
# Combine decisions with citation data; `joindf` is the frame that every
# analysis cell below reads.
joindf = nipsy.join_decisions_citations(decisions, citations)

In [None]:
import matplotlib.pyplot as plt
import mlai as ma

In [None]:
def plot_log_one_citations(column):
    """Scatter one ``joindf`` column against log10(1 + citations) and save it.

    The notebook-level ``joindf`` frame is read directly. Rows are drawn in
    three passes, selected by the boolean columns ``accept`` (red dots),
    ``reject_not_arxiv`` (green dots) and ``reject_arxiv`` (blue dots).

    The figure is written through ``ma.write_figure`` into ``./neurips`` as
    ``<column with "_" replaced by "-">-vs-citations.svg``.

    :param column: name of the ``joindf`` column plotted on the x axis; it
        also provides the x-axis label (underscores shown as spaces).
    """
    groups = (
        ('accept', 'r.'),
        ('reject_not_arxiv', 'g.'),
        ('reject_arxiv', 'b.'),
    )
    fig, ax = plt.subplots(figsize=plot.big_wide_figsize)
    for flag, marker in groups:
        # Select the group's rows once and reuse them for both axes.
        subset = joindf.loc[joindf[flag]]
        log_citations = np.log10(1 + subset['numCitedBy'])
        ax.plot(subset[column], log_citations, marker)
    ax.set_xlabel(column.replace("_", " "))
    ax.set_ylabel(r"log(1+citations)")

    svg_name = column.replace("_", "-") + "-vs-citations.svg"
    ma.write_figure(filename=svg_name, directory="./neurips")

In [None]:
# Plot and save the `average_calibrated_quality` column against log10(1 + citations).
plot_log_one_citations('average_calibrated_quality')



In [None]:
# Plot and save the `average_impact` column against log10(1 + citations).
plot_log_one_citations('average_impact')

In [None]:
# Correlation (pandas default: Pearson) between `average_impact` and
# ln(1 + numCitedBy) over all rows of joindf.
joindf['average_impact'].corr(np.log(1+joindf['numCitedBy']))

In [None]:
# Correlation between `average_calibrated_quality` and ln(1 + numCitedBy)
# over all rows of joindf.
joindf['average_calibrated_quality'].corr(np.log(1+joindf['numCitedBy']))

In [None]:
# Same quality/citation correlation, restricted to rows where `accept` is True.
joindf.loc[joindf.accept]['average_calibrated_quality'].corr(np.log(1+joindf.loc[joindf.accept]['numCitedBy']))

In [None]:
# Same quality/citation correlation, restricted to rows where `reject` is True.
joindf.loc[joindf.reject]['average_calibrated_quality'].corr(np.log(1+joindf.loc[joindf.reject]['numCitedBy']))

In [None]:
# Correlation between `average_confidence` and ln(1 + numCitedBy) for rows
# where `accept` is True.
joindf.loc[joindf.accept]['average_confidence'].corr(np.log(1+joindf.loc[joindf.accept]['numCitedBy']))

In [None]:
# Correlation between `average_confidence` and ln(1 + numCitedBy) for rows
# where `reject` is True.
joindf.loc[joindf.reject]['average_confidence'].corr(np.log(1+joindf.loc[joindf.reject]['numCitedBy']))

In [None]:
# Confidence/citation correlation for rows whose `average_calibrated_quality`
# is strictly greater than `thresh`.
thresh = 6
joindf.loc[joindf.average_calibrated_quality>thresh]['average_confidence'].corr(np.log(1+joindf.loc[joindf.average_calibrated_quality>thresh]['numCitedBy']))

In [None]:
# Confidence/citation correlation for rows whose `average_calibrated_quality`
# is strictly less than `thresh`. Rows exactly equal to `thresh` belong to
# neither this subset nor the "greater than" one.
thresh = 6
joindf.loc[joindf.average_calibrated_quality<thresh]['average_confidence'].corr(np.log(1+joindf.loc[joindf.average_calibrated_quality<thresh]['numCitedBy']))

In [None]:
# Count how often each `venue` value occurs among rows where `reject` is True
# and draw a bar chart of the values that occur more than once.
# `venue_counts` is reused by a later cell.
fig, ax = plt.subplots(figsize=(10,5))
venue_counts = joindf.loc[joindf.reject]['venue'].value_counts()
venue_counts[venue_counts>1].plot(kind="bar", ax=ax)

In [None]:
import os
import cmtutils.nipsy as nipsy

# The same CSV of paper-id pairs is read twice: once indexed by the
# `original` id and once by the `duplicate` id. In each read the index column
# is parsed with the pandas string dtype, and rows with a missing value in the
# remaining columns are dropped.
pairs_csv = os.path.join(nipsy.review_store, 'Duplicate_PaperID_Pairs.csv')
original_pairs = pd.read_csv(
    pairs_csv,
    index_col='original',
    dtype={'original': 'string'},
).dropna()
duplicate_pairs = pd.read_csv(
    pairs_csv,
    index_col='duplicate',
    dtype={'duplicate': 'string'},
).dropna()



In [None]:
# Inner joins on the index: keep only the pair rows whose index label (the
# `original` id for dupjoin, the `duplicate` id for dupjoin2) is also present
# in joindf's index, and attach the joindf columns to them.
dupjoin = original_pairs.join(joindf, how='inner')
dupjoin2 = duplicate_pairs.join(joindf, how='inner')

In [None]:
# Quality/citation correlation among the `original`-indexed pair rows where
# `accept` is True.
dupjoin.loc[dupjoin.accept]['average_calibrated_quality'].corr(np.log(1+dupjoin.loc[dupjoin.accept]['numCitedBy']))

In [None]:
# Scatter of `average_calibrated_quality` against log10(1 + numCitedBy) for
# the `original`-indexed pair rows, one pass per boolean flag column:
# accept (red), reject_not_arxiv (green), reject_arxiv (blue).
fig, ax = plt.subplots(figsize=(8,8))
for flag, marker in (('accept', 'r.'), ('reject_not_arxiv', 'g.'), ('reject_arxiv', 'b.')):
    group = dupjoin.loc[dupjoin[flag]]
    ax.plot(group['average_calibrated_quality'], np.log10(1 + group['numCitedBy']), marker)

In [None]:
# Scatter of `average_calibrated_quality` against log10(1 + numCitedBy) for
# the `duplicate`-indexed pair rows, one pass per boolean flag column:
# accept (red), reject_not_arxiv (green), reject_arxiv (blue).
fig, ax = plt.subplots(figsize=(8,8))
for flag, marker in (('accept', 'r.'), ('reject_not_arxiv', 'g.'), ('reject_arxiv', 'b.')):
    group = dupjoin2.loc[dupjoin2[flag]]
    ax.plot(group['average_calibrated_quality'], np.log10(1 + group['numCitedBy']), marker)

In [None]:
# Fraction of the `original`-indexed pair rows where `accept` is True
# (sum of the flags divided by the number of non-missing flags).
dupjoin.accept.sum()/dupjoin.accept.count()

In [None]:
# Show the joindf rows for every `original` id in the pairs file.
# NOTE(review): label-based .loc raises KeyError in current pandas if any of
# these ids is absent from joindf's index; the inner join used for `dupjoin`
# suggests some may be — confirm.
joindf.loc[original_pairs.index]

In [None]:
# Share of rejected rows whose `venue` value occurs more than once, after
# subtracting the counts for the 'ArXiv' and '' (empty string) venue labels.
# Series.get(label, 0) is used so that a label which never occurs among the
# rejected rows counts as zero instead of raising KeyError.
(venue_counts[venue_counts>1].sum() - venue_counts.get('ArXiv', 0) - venue_counts.get('', 0))/joindf.reject.sum()

In [None]:
# Spot check: the `arxivId` value stored for the row labelled '936'.
joindf.loc['936'].arxivId

In [None]:
# Boolean mask: True for rows whose `arxivId` is missing.
pd.isna(joindf.arxivId)

In [None]:
# Rows that have an `arxivId` but an empty-string `venue`; displaying the
# `venue` column mainly shows which index labels match.
joindf[(joindf.venue=='') & (~pd.isna(joindf.arxivId))].venue

In [None]:
# Spot check: the `url` value stored for the row labelled '945'.
joindf.loc['945'].url

In [None]:

# Scatter of `average_confidence` against log10(1 + numCitedBy) on the current
# pyplot axes: rows with `accept` True in red, rows with `reject` True in green.
plt.plot(joindf.loc[joindf.accept]['average_confidence'], np.log10(1+joindf.loc[joindf.accept]['numCitedBy']), 'r.')
plt.plot(joindf.loc[joindf.reject]['average_confidence'], np.log10(1+joindf.loc[joindf.reject]['numCitedBy']), 'g.')

In [None]:
# Histogram (100 bins) of log10(1 + numCitedBy) over all rows of joindf.
np.log10(1+ joindf['numCitedBy']).hist(bins=100)

In [None]:
# List the columns available in the joined frame.
joindf.columns