In [1]:
import matplotlib
%matplotlib inline
import matplotlib.pyplot as plt

import pandas as pd
import seaborn as sns
import json
import csv
import os
from collections import defaultdict
import numpy as np

def num_sents_json(fpath):
    """Collect corpus-wide entity and relation statistics from an annotation file.

    The file must contain a JSON object keyed by article id. Each value is read
    for two fields: ``'annotations'`` (a list of sentence records) and
    ``'doc_len'`` (a number). Every sentence record is read for ``'ner'`` (a
    list of entity annotations, possibly empty or null) and ``'rel'`` (a
    relation label, possibly empty or null).

    Side effects: prints the number of articles, then the total number of
    sentence records next to the sum of the ``'doc_len'`` fields.

    Args:
        fpath: Path of the JSON annotation file.

    Returns:
        A ``(ner_stats, rel_stats)`` tuple. ``ner_stats`` is a
        ``defaultdict(list)`` mapping the last element of each entity
        annotation to the list of annotations that share it. ``rel_stats`` is
        a ``defaultdict(int)`` mapping each non-empty ``'rel'`` value to the
        number of sentence records carrying it.
    """
    ner_stats = defaultdict(list)
    rel_stats = defaultdict(int)
    # JSON text is UTF-8 by specification; relying on the platform default
    # encoding breaks on systems whose locale encoding is something else.
    with open(fpath, 'r', encoding='utf-8') as f:
        dataset = json.load(f)
    print(len(dataset))
    num_sents = 0
    num = 0
    for article in dataset.values():
        sentences = article['annotations']
        num_sents += len(sentences)
        num += article['doc_len']
        for sent in sentences:
            if sent['ner']:
                for ner in sent['ner']:
                    # The last element of an annotation is used as its grouping key.
                    ner_stats[ner[-1]].append(ner)
            if sent['rel']:
                rel_stats[sent['rel']] += 1
    print(num_sents, num)
    return ner_stats, rel_stats
   
# Relative path: resolved against the notebook's current working directory.
json_fpath = "annotations/JSON/ner_rel_fulltext_full.json"
# Load the annotation file and keep the corpus-wide entity / relation statistics.
all_ner_stats, all_rel_stats = num_sents_json(json_fpath)

300
114079 114079


In [2]:
# Distinct grouping keys, i.e. the values found in the last field of the entity annotations.
all_ner_stats.keys()

dict_keys(['GP', 'OG', 'DS'])

In [3]:
# Number of sentence records carrying each relation label.
all_rel_stats

defaultdict(int, {'YGD': 636, 'NGD': 422, 'AMB': 25})

In [4]:
# Total number of collected annotations for each tag.
all_tag_stats = {tag: len(mentions) for tag, mentions in all_ner_stats.items()}

In [5]:
# Show the annotation totals per tag.
all_tag_stats

{'GP': 36414, 'OG': 21514, 'DS': 14546}

In [6]:
# Frequency of each annotation's second-to-last field, counted separately
# for the 'GP', 'DS' and 'OG' tags. Any other tag is ignored.
gp_stats = defaultdict(int)
ds_stats = defaultdict(int)
og_stats = defaultdict(int)
counters_by_tag = {'GP': gp_stats, 'DS': ds_stats, 'OG': og_stats}
for tag, mentions in all_ner_stats.items():
    counter = counters_by_tag.get(tag)
    if counter is None:
        continue
    for mention in mentions:
        counter[mention[-2]] += 1


In [7]:
# Number of distinct counted values per tag, one line each.
print(
    f'unique GP count: {len(gp_stats)}',
    f'unique DS count: {len(ds_stats)}',
    f'unique OG count: {len(og_stats)}',
    sep='\n',
)

unique GP count: 5603
unique DS count: 2044
unique OG count: 2348


In [8]:
# Ten most frequent GP entries, highest count first.
sorted(
    gp_stats.items(),
    key=lambda pair: pair[1],
    reverse=True,
)[:10]


[('GFP', 467),
 ('antibody', 278),
 ('antibodies', 248),
 ('CD4', 234),
 ('MTAP', 234),
 ('IL-10', 233),
 ('Shank3', 225),
 ('p53', 218),
 ('MLL', 214),
 ('IgG', 212)]

In [9]:
# Ten most frequent DS entries, highest count first.
sorted(
    ds_stats.items(),
    key=lambda pair: pair[1],
    reverse=True,
)[:10]

[('tumor', 536),
 ('cancer', 366),
 ('infection', 343),
 ('HCC', 236),
 ('breast cancer', 234),
 ('CF', 206),
 ('tumour', 201),
 ('depression', 191),
 ('obesity', 170),
 ('PH', 169)]

In [10]:
# Ten most frequent OG entries, highest count first.
sorted(
    og_stats.items(),
    key=lambda pair: pair[1],
    reverse=True,
)[:10]


[('mice', 1483),
 ('human', 1229),
 ('HIV', 799),
 ('mouse', 563),
 ('plants', 451),
 ('rats', 415),
 ('animals', 381),
 ('bacteria', 316),
 ('plant', 307),
 ('animal', 296)]

In [11]:
def num_sents_json_per_article(fpath):
    """Collect entity, relation and sentence statistics separately per article.

    The file must contain a JSON object keyed by article id. Each value is read
    for two fields: ``'annotations'`` (a list of sentence records) and
    ``'doc_len'`` (a number). Every sentence record is read for ``'ner'`` (a
    list of entity annotations, possibly empty or null) and ``'rel'`` (a
    relation label, possibly empty or null).

    Side effects: prints the number of articles, then the total number of
    sentence records next to the sum of the ``'doc_len'`` fields.

    Args:
        fpath: Path of the JSON annotation file.

    Returns:
        A ``(ner_stats, rel_stats, sent_stats)`` tuple, each keyed by article id:

        * ``ner_stats[pmcid]`` is a ``defaultdict(list)`` mapping the last
          element of each entity annotation to the annotations sharing it.
        * ``rel_stats[pmcid]`` is a ``defaultdict(int)`` mapping each
          non-empty ``'rel'`` value to its number of sentence records.
        * ``sent_stats`` is a ``defaultdict(int)`` mapping the article id to
          its number of sentence records.
    """
    ner_stats = {}
    rel_stats = {}
    sent_stats = defaultdict(int)
    # JSON text is UTF-8 by specification; relying on the platform default
    # encoding breaks on systems whose locale encoding is something else.
    with open(fpath, 'r', encoding='utf-8') as f:
        dataset = json.load(f)
    print(len(dataset))
    num_sents = 0
    num = 0
    for pmcid, article in dataset.items():
        # Every article gets an entry, even when it has no annotations at all.
        article_ner = ner_stats[pmcid] = defaultdict(list)
        article_rel = rel_stats[pmcid] = defaultdict(int)
        sentences = article['annotations']
        num_sents += len(sentences)
        num += article['doc_len']

        sent_stats[pmcid] = len(sentences)

        for sent in sentences:
            if sent['ner']:
                for ner in sent['ner']:
                    # The last element of an annotation is used as its grouping key.
                    article_ner[ner[-1]].append(ner)
            if sent['rel']:
                article_rel[sent['rel']] += 1
    print(num_sents, num)
    return ner_stats, rel_stats, sent_stats


In [12]:
# Read the same annotation file again, this time keeping the statistics separately for each article.
article_ner_stats, article_rel_stats, article_sent_stats = num_sents_json_per_article(json_fpath)


300
114079 114079


In [13]:
# Number of sentence records per article id.
article_sent_stats

defaultdict(int,
            {'PMC4792959': 278,
             'PMC4556948': 382,
             'PMC5993813': 257,
             'PMC3174205': 789,
             'PMC5962829': 256,
             'PMC3874094': 387,
             'PMC3792120': 264,
             'PMC4901335': 367,
             'PMC3581133': 386,
             'PMC2935479': 306,
             'PMC5225553': 291,
             'PMC5744400': 212,
             'PMC3281816': 365,
             'PMC3583137': 582,
             'PMC4022742': 286,
             'PMC3542345': 810,
             'PMC4452330': 359,
             'PMC4464872': 229,
             'PMC4872455': 358,
             'PMC3651197': 429,
             'PMC3362782': 241,
             'PMC5817132': 456,
             'PMC4313693': 781,
             'PMC4489904': 556,
             'PMC4552872': 303,
             'PMC5376652': 390,
             'PMC5070310': 366,
             'PMC5921292': 209,
             'PMC5641157': 285,
             'PMC3751948': 328,
             'PMC547229

In [14]:
# Relation label counts per article; an article with no labelled sentence maps to an empty defaultdict.
article_rel_stats

{'PMC4792959': defaultdict(int, {}),
 'PMC4556948': defaultdict(int, {}),
 'PMC5993813': defaultdict(int, {}),
 'PMC3174205': defaultdict(int, {'YGD': 24, 'NGD': 14}),
 'PMC5962829': defaultdict(int, {}),
 'PMC3874094': defaultdict(int, {}),
 'PMC3792120': defaultdict(int, {}),
 'PMC4901335': defaultdict(int, {}),
 'PMC3581133': defaultdict(int, {}),
 'PMC2935479': defaultdict(int, {}),
 'PMC5225553': defaultdict(int, {'NGD': 7}),
 'PMC5744400': defaultdict(int, {}),
 'PMC3281816': defaultdict(int, {'NGD': 7}),
 'PMC3583137': defaultdict(int, {}),
 'PMC4022742': defaultdict(int, {}),
 'PMC3542345': defaultdict(int, {'NGD': 12, 'YGD': 2}),
 'PMC4452330': defaultdict(int, {}),
 'PMC4464872': defaultdict(int, {}),
 'PMC4872455': defaultdict(int, {'YGD': 4, 'NGD': 3}),
 'PMC3651197': defaultdict(int, {}),
 'PMC3362782': defaultdict(int, {}),
 'PMC5817132': defaultdict(int, {}),
 'PMC4313693': defaultdict(int, {}),
 'PMC4489904': defaultdict(int, {}),
 'PMC4552872': defaultdict(int, {}),
 '

In [15]:
# For every article, count how often each value of the annotations'
# second-to-last field occurs, separately for the 'GP', 'DS' and 'OG' tags.
# Every article gets an (initially empty) counter for all three tags.
article_gp_stats = {}
article_ds_stats = {}
article_og_stats = {}
for pmcid, mentions_by_tag in article_ner_stats.items():
    per_tag_counts = {
        'GP': defaultdict(int),
        'DS': defaultdict(int),
        'OG': defaultdict(int),
    }
    article_gp_stats[pmcid] = per_tag_counts['GP']
    article_ds_stats[pmcid] = per_tag_counts['DS']
    article_og_stats[pmcid] = per_tag_counts['OG']
    for tag, mentions in mentions_by_tag.items():
        counts = per_tag_counts.get(tag)
        if counts is None:
            # Tags other than GP / DS / OG are not counted.
            continue
        for mention in mentions:
            counts[mention[-2]] += 1

In [16]:
#unique gp count per article
# Number of distinct counted values per article, one dict for each tag.
# unique GP count per article
unique_gp_per = {pmcid: len(counts) for pmcid, counts in article_gp_stats.items()}
# unique DS count per article
unique_ds_per = {pmcid: len(counts) for pmcid, counts in article_ds_stats.items()}
# unique OG count per article
unique_og_per = {pmcid: len(counts) for pmcid, counts in article_og_stats.items()}

In [17]:
#gp count per article
# Total number of counted annotations per article, one dict for each tag.
# GP count per article
gp_per = {pmcid: sum(counts.values()) for pmcid, counts in article_gp_stats.items()}
# DS count per article
ds_per = {pmcid: sum(counts.values()) for pmcid, counts in article_ds_stats.items()}
# OG count per article
og_per = {pmcid: sum(counts.values()) for pmcid, counts in article_og_stats.items()}

In [18]:
# Number of distinct GP values (second-to-last annotation field) found in each article.
unique_gp_per

{'PMC4792959': 54,
 'PMC4556948': 125,
 'PMC5993813': 141,
 'PMC3174205': 16,
 'PMC5962829': 3,
 'PMC3874094': 46,
 'PMC3792120': 0,
 'PMC4901335': 0,
 'PMC3581133': 0,
 'PMC2935479': 20,
 'PMC5225553': 65,
 'PMC5744400': 36,
 'PMC3281816': 90,
 'PMC3583137': 50,
 'PMC4022742': 54,
 'PMC3542345': 148,
 'PMC4452330': 25,
 'PMC4464872': 25,
 'PMC4872455': 61,
 'PMC3651197': 61,
 'PMC3362782': 0,
 'PMC5817132': 0,
 'PMC4313693': 34,
 'PMC4489904': 0,
 'PMC4552872': 0,
 'PMC5376652': 42,
 'PMC5070310': 1,
 'PMC5921292': 0,
 'PMC5641157': 19,
 'PMC3751948': 7,
 'PMC5472290': 0,
 'PMC4649626': 10,
 'PMC5502978': 40,
 'PMC4767726': 26,
 'PMC3897916': 0,
 'PMC5087830': 76,
 'PMC3585192': 0,
 'PMC5484670': 67,
 'PMC5259676': 0,
 'PMC3024232': 72,
 'PMC3097211': 75,
 'PMC5317055': 0,
 'PMC3648400': 3,
 'PMC5750880': 9,
 'PMC5100220': 0,
 'PMC4749753': 13,
 'PMC5344356': 45,
 'PMC5110973': 64,
 'PMC5708618': 1,
 'PMC3598673': 24,
 'PMC3751959': 44,
 'PMC5131611': 8,
 'PMC5891595': 0,
 'PMC3950279

In [None]:
# Median of the per-article annotation totals, for each tag.
print(
f"""gp median: {np.median(list(gp_per.values()))}
ds median: {np.median(list(ds_per.values()))}
og median: {np.median(list(og_per.values()))}"""
    )

In [None]:
# Smallest per-article annotation total, for each tag.
print(
f"""gp min: {np.min(list(gp_per.values()))}
ds min: {np.min(list(ds_per.values()))}
og min: {np.min(list(og_per.values()))}"""
    )


In [None]:
# Largest per-article annotation total, for each tag.
print(
f"""gp max: {np.max(list(gp_per.values()))}
ds max: {np.max(list(ds_per.values()))}
og max: {np.max(list(og_per.values()))}"""
    )

In [22]:
# Median of the per-article distinct-value counts, for each tag.
print(
f"""unique:
gp median: {np.median(list(unique_gp_per.values()))}
ds median: {np.median(list(unique_ds_per.values()))}
og median: {np.median(list(unique_og_per.values()))}"""
    )

unique:
gp median: 13.0
ds median: 7.0
og median: 8.0


In [23]:
# Smallest per-article distinct-value count, for each tag.
print(
f"""unique:
gp min: {np.min(list(unique_gp_per.values()))}
ds min: {np.min(list(unique_ds_per.values()))}
og min: {np.min(list(unique_og_per.values()))}"""
    )


unique:
gp min: 0
ds min: 0
og min: 0


In [24]:
# Largest per-article distinct-value count, for each tag.
print(
f"""unique:
gp max: {np.max(list(unique_gp_per.values()))}
ds max: {np.max(list(unique_ds_per.values()))}
og max: {np.max(list(unique_og_per.values()))}"""
    )

unique:
gp max: 178
ds max: 77
og max: 170
