In [1]:
import sys

import numpy as np
import pandas as pd
import seaborn as sns

import matplotlib.pyplot as plt

# Put the local gustav sources on the import path so that the
# `from gustav import ...` statement that follows can be resolved.
sys.path.append('./gustav/src/')

from gustav import ncbi, nlm, nih

In [2]:
# 'main' PubMed table from gustav; only the id and publication-year columns
# are requested, which are the two columns the later merges rely on.
ml = ncbi.pubmed(
    'main',
    columns=['pubmed_id', 'year'],
)

In [99]:
# MeSH annotations of PubMed records, plus the 'ui2mn' lookup table
# (presumably descriptor UI -> MeSH tree number; confirm against gustav docs).
meshs = ncbi.pubmed('mesh_terms')
ui2mn = nlm.mesh('ui2mn')

# Work on the distinct term strings only, so the string parsing runs once per
# term rather than once per annotation row.
helper = (
    meshs[['mesh_terms']]
    .drop_duplicates()
    # The identifier is whatever precedes the first ':' in the term string.
    .assign(ui=lambda terms: terms['mesh_terms'].str.split(':', expand=True)[0])
    # A term counts as "technology" when its ui has an 'mn' value starting
    # with 'E'.
    .assign(
        is_technology=lambda terms: terms['ui'].isin(
            ui2mn.loc[ui2mn['mn'].str.startswith('E'), 'ui']
        )
    )
)

# Attach `ui` and `is_technology` to every annotation row (pandas joins on the
# columns the two frames share).
meshs = pd.merge(meshs, helper)

In [100]:
# iCite 'studies' table from gustav, limited to the listed columns.
# NOTE(review): 'relative_citation_ratio' is loaded but not referenced by the
# cells visible in this notebook.
citations = nih.icite(
    'studies',
    columns=[
        'pubmed_id',
        'year',
        'relative_citation_ratio',
        'is_research_article',
        'citations_per_year',
    ],
)

In [101]:
# Publication years included in the per-term statistics: 2006 through 2015
# inclusive (the upper bound of range() is exclusive, hence the + 1).
window = range(2006, 2015 + 1)

In [102]:
# Citation metric that gets ranked within each publication year.
c = 'citations_per_year'

# Research articles only, keeping rows where id, year and metric are all present.
cit = (
    citations.loc[citations['is_research_article'], ['pubmed_id', 'year', c]]
    .dropna()
    .reset_index(drop=True)
)
# Percentile rank of the metric among articles published in the same year.
cit = cit.assign(citation_rank=cit.groupby(['year'])[c].rank(pct=True))

# Rows pair a paper with one of its technology MeSH terms and carry the
# paper's citation rank; papers lacking either are dropped by the inner merges.
df = (
    ml
    .merge(meshs.loc[meshs['is_technology']], on='pubmed_id')
    .merge(cit, on=['pubmed_id', 'year'])
)

# Per-term groups restricted to the analysis window.
g = df.loc[df['year'].isin(window)].groupby('mesh_terms')

In [None]:
# Sanity check: number of in-window rows of `df` carrying the GWAS descriptor.
(
    df.loc[df['year'].isin(window)]
    .groupby('mesh_terms')
    .size()
    .loc['D055106:Genome-Wide Association Study']
)

In [None]:
# Shape of the array of distinct PubMed ids present in `cit`.
cit['pubmed_id'].unique().shape

In [31]:
# Per-term summary of the citation analysis, indexed by MeSH term:
#   median_citations          - median of the within-year citation percentile
#                               rank (a rank, not a raw citation count)
#   papers                    - number of rows in the term's group
#   proportion_high_citations - share of rows whose rank exceeds 0.95
#   log_papers                - log10 of `papers`
res = (
    pd.concat(
        [
            g['citation_rank']
            .median()
            .sort_values(ascending=False)
            .rename('median_citations')
            .to_frame(),
            g.size().rename('papers').to_frame(),
            g['citation_rank']
            .apply(lambda ranks: np.mean(ranks > 0.95))
            .rename('proportion_high_citations')
            .to_frame(),
        ],
        axis=1,
    )
    .reset_index()
    .assign(log_papers=lambda table: np.log10(table['papers']))
    # verify_integrity makes a duplicated term raise instead of passing silently.
    .set_index('mesh_terms', verify_integrity=True)
)

In [None]:
# Distribution of the per-term median citation rank; the red line marks the
# value for the GWAS descriptor.
sns.displot(res.loc[res['papers'] > 0, 'median_citations'])
plt.axvline(
    x=res.at['D055106:Genome-Wide Association Study', 'median_citations'],
    color='red',
)
plt.xlim(0, 1)
plt.show()



In [None]:
# Distribution of the per-term share of rows ranked above 0.95; the red line
# marks the value for the GWAS descriptor.
sns.displot(res.loc[res['papers'] > 0, 'proportion_high_citations'])
plt.axvline(
    x=res.at['D055106:Genome-Wide Association Study', 'proportion_high_citations'],
    color='red',
)
plt.xlim(0, 1)
plt.show()

In [33]:
# Gene mentions per paper from PubTator: keep (paper, gene) pairs with a
# positive 'bioconcept_title_or_abstract' count, one row per pair.
g2p = ncbi.pubtator_medline('pooled_counts_gene_unambiguous')
g2p = g2p.loc[
    g2p['bioconcept_title_or_abstract'] > 0,
    ['pubmed_id', 'gene_ncbi'],
].drop_duplicates()

# Gene table for 9606 (presumably the NCBI taxonomy id for human; confirm),
# narrowed to protein-coding genes.
gi = ncbi.gene_info(9606)
gi = gi.loc[gi['type_of_gene'] == 'protein-coding']

# Discard mentions of genes that are not in the protein-coding set.
g2p = g2p.loc[g2p['gene_ncbi'].isin(gi['gene_ncbi'])]

# Earliest publication year, per gene, among the papers that mention it.
first_years = ml.merge(g2p).groupby('gene_ncbi')['year'].min()


In [34]:
# Rows combine a paper, one of its technology MeSH terms and one gene it
# mentions. `new_gene` is True unless the gene's earliest paper is before 2000.
df = (
    ml
    .merge(meshs.loc[meshs['is_technology']], on='pubmed_id')
    .merge(g2p)
    .assign(
        new_gene=lambda rows: ~rows['gene_ncbi'].isin(
            first_years.loc[first_years < 2000].index
        )
    )
)

In [35]:
# Collapse to one row per (year, paper, term): does the paper mention at least
# one new gene?
# NOTE: `df` is overwritten with its own aggregate, so this cell works only
# once per build of `df`; re-running it fails because 'new_gene' is gone.
df = (
    df.groupby(['year', 'pubmed_id', 'mesh_terms'])['new_gene']
    .any()
    .rename('any_new_gene')
    .to_frame()
    .reset_index()
)

# Per-term groups within the analysis window. This rebinds `g`, replacing the
# citation-rank groups that were bound to the same name earlier.
g = df.loc[df['year'].isin(window)].groupby('mesh_terms')

In [37]:
# Per-term summary of the new-gene analysis, indexed by MeSH term:
#   any_new_gene - share of the term's papers flagged as mentioning a new gene
#   papers       - number of rows in the term's group
res_1 = (
    pd.concat(
        [
            g['any_new_gene']
            .mean()
            .sort_values(ascending=False)
            .rename('any_new_gene')
            .to_frame(),
            g.size().rename('papers').to_frame(),
        ],
        axis=1,
    )
    .reset_index()
    # verify_integrity makes a duplicated term raise instead of passing silently.
    .set_index('mesh_terms', verify_integrity=True)
)

In [None]:
# Distribution of the per-term share of papers with a new gene; the red line
# marks the value for the GWAS descriptor.
sns.displot(res_1.loc[res_1['papers'] > 0, 'any_new_gene'])
plt.axvline(
    x=res_1.at['D055106:Genome-Wide Association Study', 'any_new_gene'],
    color='red',
)
plt.xlim(0, 1)
plt.show()

In [None]:
# Number of terms in `res` with at least one paper.
len(res.loc[res['papers'] > 0])

In [None]:
# The 50 terms with the largest share of rows ranked above 0.95.
(
    res.loc[res['papers'] > 0]
    .sort_values('proportion_high_citations', ascending=False)
    .reset_index()
    .head(50)
)

In [None]:
# Every summary statistic for the GWAS descriptor.
res.loc['D055106:Genome-Wide Association Study', :]

In [None]:
from pathlib import Path

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

# MeSH term highlighted in both panels, and where the figure is written.
GWAS_TERM = 'D055106:Genome-Wide Association Study'
FIGURE_PATH = Path("../figures/GWAS_innovation.pdf")

# --- Global plot styling -----------------------------------------------------
sns.set_style('white', rc={
    'xtick.bottom': True,
    'ytick.left': True,
})
# set_palette installs the colour cycle; color_palette on its own only returns
# the palette. Every artist below is given an explicit colour regardless.
sns.set_palette("Set1")

matplotlib.rc('font', family='Helvetica')
matplotlib.rc('pdf', fonttype=42)  # Type 42 (TrueType) fonts in the PDF
matplotlib.rc('text', usetex=False)  # a real boolean, no string parsing involved
matplotlib.rcParams.update({
    'axes.unicode_minus': False,
    # Short, thin ticks on both axes, major and minor alike.
    'xtick.major.size': 2,
    'xtick.major.width': 0.5,
    'xtick.minor.size': 2,
    'xtick.minor.width': 0.5,
    'ytick.major.size': 2,
    'ytick.major.width': 0.5,
    'ytick.minor.size': 2,
    'ytick.minor.width': 0.5,
    # Font sizes.
    'axes.labelsize': 10,
    'xtick.labelsize': 10,
    'ytick.labelsize': 10,
    'legend.fontsize': 14,
    'font.size': 7,
})


def _plot_gwas_panel(ax, table, column, xlabel, xlim, label_xy, ylim=None):
    """Draw one histogram panel and mark where the GWAS term falls.

    Parameters
    ----------
    ax : matplotlib Axes to draw on.
    table : per-term summary frame indexed by MeSH term, with a 'papers'
        column and the column named by ``column``.
    column : name of the statistic to plot.
    xlabel : x-axis label.
    xlim : (low, high) x-axis limits.
    label_xy : (x, y) data coordinates of the "GWAS" text label.
    ylim : optional (low, high) y-axis limits; autoscaled when None.
    """
    # Histogram over every term that has at least one paper.
    sns.histplot(table.loc[table['papers'] > 0, column], ax=ax, color="grey")
    ax.axvline(x=table.loc[GWAS_TERM, column], color='maroon')
    ax.text(label_xy[0], label_xy[1], "GWAS", fontsize=12, c="maroon")

    ax.set_xlabel(xlabel)
    ax.set_ylabel("")  # drop the default y-label that histplot adds
    ax.set_xlim(xlim)
    if ylim is not None:
        ax.set_ylim(ylim)

    # Black ticks and labels, thin visible spines.
    ax.tick_params(axis='both', colors='black')
    ax.xaxis.label.set_color('black')
    ax.yaxis.label.set_color('black')
    for side in ('bottom', 'left'):
        ax.spines[side].set_linewidth(0.5)


# --- Figure ------------------------------------------------------------------
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(4, 3), dpi=300)

_plot_gwas_panel(
    ax1, res, 'proportion_high_citations',
    xlabel="Share of manuscripts among 5% top cited",
    xlim=(0.05, 0.3),
    ylim=(0, 150),
    label_xy=(0.205, 40),
)
_plot_gwas_panel(
    ax2, res_1, 'any_new_gene',
    xlabel="Share of new genes introduced",
    xlim=(0, 0.3),
    label_xy=(0.27, 400),
)

# Remove the top and right spines from both panels.
sns.despine(fig=fig)
fig.tight_layout()

# Create the output directory if needed so savefig cannot fail on a missing
# folder.
FIGURE_PATH.parent.mkdir(parents=True, exist_ok=True)
fig.savefig(FIGURE_PATH, bbox_inches="tight", dpi=300)