# Notes

- body text might not be complete
- bibref might not be complete
- author name parsing for bibref might be wrong (mixing first/last names)

e.g. Clinical and Immunologic Studies in Identical Twins Discordant for Systemic Lupus Erythematosus
- body text on the references page is truncated
- bibref only contained 12/25 entries
- author names for bibrefs parsed incorrectly

In [2]:
%load_ext autoreload
%autoreload 2

In [109]:
import json
import pandas as pd
from pathlib import Path
from glom import glom
from gensim import corpora, models
from gensim.parsing import preprocessing as pp
from gensim.models.phrases import Phrases, Phraser

import sys
import traceback
import logging
from multiprocessing import cpu_count, Pool
import yaml

LOG = logging.getLogger()
logging.basicConfig()

CPU_COUNT = cpu_count()

In [56]:
def log_traceback():
    # Get current system exception
    ex_type, ex_value, ex_traceback = sys.exc_info()

    # Extract unformatter stack traces as tuples
    trace_back = traceback.extract_tb(ex_traceback)

    # Format stacktrace
    stack_trace = list()

    for trace in trace_back:
        stack_trace.append(
            "File : %s , Line : %d, Func.Name : %s, Message : %s" % (
                trace[0], trace[1], trace[2], trace[3]))

    LOG.error(f"Exception type : {ex_type.__name__}")
    LOG.error(f"Exception message : {ex_value}")
    LOG.error(f"Stack trace : {stack_trace}")


def mprun(mp_func, inputs, n_workers=CPU_COUNT, ):
    pool = Pool(n_workers)

    results = []
    try:
        results = pool.map(mp_func, inputs)
    except:
        log_traceback()
    finally:
        pool.close()
        pool.join()

    return results


In [10]:
datapath = Path('data/')

In [74]:
metadata = pd.read_csv(datapath.joinpath('metadata.csv'), low_memory=False)

In [79]:
def gen_pdf_json_path(x):
    sha = x['sha']
    if pd.isnull(sha):
        return []
    return [datapath.joinpath(
        f"{x['full_text_file']}/{x['full_text_file']}/pdf_json/{sha_item.strip()}.json"
    ) for sha_item in sha.split(';')]

def gen_pmc_json_path(x):
    pmcid = x['pmcid']
    if pd.isnull(pmcid):
        return []
    return [datapath.joinpath(
        f"{x['full_text_file']}/{x['full_text_file']}/pmc_json/{pmcid_item.strip()}.xml.json"
    ) for pmcid_item in pmcid.split(';')]

In [80]:
metadata.loc[:, 'pdf_json_paths'] = metadata.apply(gen_pdf_json_path, axis=1)
metadata.loc[:, 'pmc_json_paths'] = metadata.apply(gen_pmc_json_path, axis=1)

In [81]:
metadata.head()

Unnamed: 0,cord_uid,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,Microsoft Academic Paper ID,WHO #Covidence,has_pdf_parse,has_pmc_xml_parse,full_text_file,url,pdf_json_paths,pmc_json_paths
0,8q5ondtn,,Elsevier,Intrauterine virus infections and congenital h...,10.1016/0002-8703(72)90077-4,,4361535.0,els-covid,Abstract The etiologic basis for the vast majo...,1972-12-31,"Overall, James C.",American Heart Journal,,,False,False,custom_license,https://doi.org/10.1016/0002-8703(72)90077-4,[],[]
1,pzfd0e50,,Elsevier,Coronaviruses in Balkan nephritis,10.1016/0002-8703(80)90355-5,,6243850.0,els-covid,,1980-03-31,"Georgescu, Leonida; Diosi, Peter; Buţiu, Ioan;...",American Heart Journal,,,False,False,custom_license,https://doi.org/10.1016/0002-8703(80)90355-5,[],[]
2,22bka3gi,,Elsevier,Cigarette smoking and coronary heart disease: ...,10.1016/0002-8703(80)90356-7,,7355701.0,els-covid,,1980-03-31,"Friedman, Gary D",American Heart Journal,,,False,False,custom_license,https://doi.org/10.1016/0002-8703(80)90356-7,[],[]
3,zp9k1k3z,aecbc613ebdab36753235197ffb4f35734b5ca63,Elsevier,Clinical and immunologic studies in identical ...,10.1016/0002-9343(73)90176-9,,4579077.0,els-covid,"Abstract Middle-aged female identical twins, o...",1973-08-31,"Brunner, Carolyn M.; Horwitz, David A.; Shann,...",The American Journal of Medicine,,,True,False,custom_license,https://doi.org/10.1016/0002-9343(73)90176-9,[data/custom_license/custom_license/pdf_json/a...,[]
4,cjuzul89,,Elsevier,Epidemiology of community-acquired respiratory...,10.1016/0002-9343(85)90361-4,,4014285.0,els-covid,Abstract Upper respiratory tract infections ar...,1985-06-28,"Garibaldi, Richard A.",The American Journal of Medicine,,,False,False,custom_license,https://doi.org/10.1016/0002-9343(85)90361-4,[],[]


In [82]:
metadata.loc[16763, 'pdf_json_paths']

[PosixPath('data/comm_use_subset/comm_use_subset/pdf_json/d94115064a8168415d634f654869a4009e4c3c70.json'),
 PosixPath('data/comm_use_subset/comm_use_subset/pdf_json/3d2962558f0a2ed4ddc00989168efe60856bd792.json'),
 PosixPath('data/comm_use_subset/comm_use_subset/pdf_json/eed4a68d4e44f9887b3219ad5813eed5f0c4d42b.json')]

In [28]:
def load_json_file(file):
    with Path(file).open('r') as file:
        res = json.load(file)
    return res

In [83]:
def get_full_text(json_file_list):
    if len(json_file_list) < 1:
        return ''
    body_text_list = []
    for file in json_file_list:
        json_dict = load_json_file(file)
        body_text_list.append(' '.join([paragraph['text'] for paragraph in glom(json_dict, 'body_text')]))
    return ' '.join(body_text_list)

In [85]:
full_text_all = mprun(get_full_text, metadata['pdf_json_paths'].values)

In [93]:
x['bib_entries']

{'BIBREF0': {'ref_id': 'b0',
  'title': 'Hereditary hypergammaglobulinemia and systemic lupus erythematosus. I. Clinical and electrophoretic studies',
  'authors': [{'first': '', 'middle': [], 'last': 'Larsson 0', 'suffix': ''},
   {'first': 'T', 'middle': ['D'], 'last': 'Leonhardt', 'suffix': ''}],
  'year': 1959,
  'venue': 'Acta Med Stand',
  'volume': '165',
  'issn': '',
  'pages': '',
  'other_ids': {}},
 'BIBREF3': {'ref_id': 'b3',
  'title': 'Familial systemic lupus erythematosus',
  'authors': [{'first': 'F', 'middle': ['A'], 'last': 'Salazar', 'suffix': ''},
   {'first': 'D', 'middle': ['N'], 'last': 'Robbins', 'suffix': ''},
   {'first': 'R', 'middle': [], 'last': 'Scalettar', 'suffix': ''}],
  'year': None,
  'venue': '',
  'volume': '',
  'issn': '',
  'pages': '',
  'other_ids': {}},
 'BIBREF5': {'ref_id': 'b5',
  'title': 'Disseminated lupus erythematosus in identical twin sisters associated with diabetes mellitus in one case',
  'authors': [{'first': 'M', 'middle': ['W'

In [104]:
def preprocessing(s):
    s = pp.strip_tags(s)
    s = pp.split_alphanum(s)
    s = pp.remove_stopwords(s)
    s = pp.strip_multiple_whitespaces(s)
    s = pp.stem_text(s)
    s = pp.strip_non_alphanum(s)
    s = pp.strip_numeric(s)
    s = pp.strip_punctuation(s)
    s = pp.strip_short(s, minsize=5)
    return s.split(' ')

In [105]:
full_text_processed = mprun(preprocessing, full_text_all)

In [118]:
phrases = Phrases(full_text_processed)
bigrams = Phraser(phrases)

In [106]:
dictionary = corpora.Dictionary(full_text_processed)
corpus = mprun(dictionary.doc2bow, full_text_processed)

In [107]:
lda = models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=100)

In [108]:
lda.print_topics(num_topics=50, num_words=10)

[(50,
  '0.041*"membran" + 0.031*"protein" + 0.020*"lipid" + 0.015*"fusion" + 0.008*"membrane" + 0.007*"vesicl" + 0.007*"endosom" + 0.007*"format" + 0.007*"golgi" + 0.006*"requir"'),
 (84,
  '0.038*"bacteri" + 0.032*"bacteria" + 0.030*"pathogen" + 0.022*"resist" + 0.012*"microbi" + 0.011*"antimicrobi" + 0.009*"strain" + 0.009*"antibiot" + 0.008*"cultur" + 0.008*"studi"'),
 (67,
  '0.020*"schiller" + 0.020*"nakai" + 0.016*"yamamoto" + 0.013*"robinson" + 0.011*"bradi" + 0.007*"carlson" + 0.006*"willett" + 0.005*"torten" + 0.004*"ishida" + 0.004*"egberink"'),
 (77,
  '0.025*"primer" + 0.022*"detect" + 0.020*"assai" + 0.019*"probe" + 0.016*"reaction" + 0.014*"target" + 0.011*"fluoresc" + 0.010*"perform" + 0.009*"amplif" + 0.009*"sampl"'),
 (29,
  '0.026*"immun" + 0.023*"respons" + 0.014*"specif" + 0.013*"cytokin" + 0.013*"express" + 0.013*"cells" + 0.010*"antigen" + 0.009*"studi" + 0.008*"associ" + 0.008*"level"'),
 (45,
  '0.029*"licens" + 0.023*"vaccin" + 0.020*"ebola" + 0.020*"diseas" +

# Draft

In [52]:
class PDFParser():
    def __init__(self):
        self.title = ''
        self.authors = []
        self.abstract = ''
        self.body = ''
        self.bib_titles = []
        
    def load(self, json_file=None, json_dict=None):
        if json_file is not None:
            json_dict = load_json_file(json_file)
        if json_dict is None:
            raise ValueError(f'One of json or json_file must be valid')
        
        self.title = glom(json_dict, 'metadata.title')
        self.authors = glom(json_dict, 'metadata.authors')
        self.abstract = ' '.join(
            [paragraph['text'] for paragraph in glom(json_dict, 'abstract')])
        
        self.body = ' '.join(
            [paragraph['text'] for paragraph in glom(json_dict, 'body_text')])
        self.bib_titles = [bibref['title'] for bibref in glom(json_dict, 'bib_entries').values()]
        return self

    def __repr__(self):
        return self.title
    
    def __str__(self):
        return self.__repr__()

In [53]:
p = PDFParser()
p.load(metadata['pdf_json_path'][3])

Clinical and Immunologic Studies in Identical Twins Discordant for Systemic Lupus Erythematosus