# PubMed Extraction

Extract desired data from pubmed XML files into csv files to be used in NLP.

In [1]:
from xml.etree import ElementTree as etree
from pathlib import Path
import gzip, csv
from fastai.text import *

In [2]:
path = Path("./data") # you probably want a symlink here to your data drive
dest = Path(".")      # for faster speed if not SSD-drive, set dest to a different drive (to avoid head move madness)
#dest = path

## Download data

In [3]:
# run once to download the pubmed data 
# info: 1200+ xml.gz files totalling ~35GB compressed (2019-05)
if 0:
    ! wget -m -np -nd ftp://ftp.ncbi.nlm.nih.gov/pubmed/baseline/    -P {path}
    ! wget -m -np -nd ftp://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/ -P {path}

In [4]:
path.ls()

[PosixPath('data/pubmed19n1125_stats.html'),
 PosixPath('data/pubmed19n0243.xml.gz.md5'),
 PosixPath('data/pubmed19n1048_stats.html'),
 PosixPath('data/pubmed19n0257.xml.gz.md5'),
 PosixPath('data/pubmed19n0378.xml.gz'),
 PosixPath('data/pubmed19n0961.xml.gz.md5'),
 PosixPath('data/pubmed19n0381.xml.gz.md5'),
 PosixPath('data/pubmed19n1065.xml.gz'),
 PosixPath('data/pubmed19n1002.xml.gz.md5'),
 PosixPath('data/pubmed19n1204.xml.gz'),
 PosixPath('data/pubmed19n1154.xml.gz'),
 PosixPath('data/pubmed19n0997_stats.html'),
 PosixPath('data/pubmed19n0447.xml.gz.md5'),
 PosixPath('data/pubmed19n0322.xml.gz'),
 PosixPath('data/pubmed19n0880.xml.gz.md5'),
 PosixPath('data/pubmed19n1099_stats.html'),
 PosixPath('data/pubmed19n0599.xml.gz'),
 PosixPath('data/pubmed19n0293.xml.gz'),
 PosixPath('data/pubmed19n0081.xml.gz'),
 PosixPath('data/pubmed19n1065_stats.html'),
 PosixPath('data/pubmed19n1108_stats.html'),
 PosixPath('data/pubmed19n0581.xml.gz'),
 PosixPath('data/pubmed19n0455.xml.gz'),
 Posi

## Extraction

In [5]:
# Data to extract:

# Root: PubmedArticleSet.PubmedArticle.MedlineCitation

#  Condition: # only english articles
# -- .Language == eng

# 1. pmid | title | abstract | mesh (pubmed-abstracts.csv)
# - .PMID
# - .Article:
# -- .ArticleTitle
# -- .Abstract.AbstractText
#
# 2.  MeSH data 
# - .MeshHeadingList: converted into a single record with '|' separated entries of each MeshHeading
#    and each MeshHeading is converted into 'UI/MajorTopicYN' for DescriptorName, 
#    and optional 'UI/MajorTopicYN' for QualifierName joined with '-' 
#    here is a sample of what a record might look like: D000339/Y|D004650/N-Q000706/Y 
# 
#   the lookup table for these ids is saved in a separate pubmed-mesh.csv file

mesh_db = {}

def mesh_db_update(k, v):
    if k in mesh_db: return
    mesh_db[k]=v
    
def mesh_db_write(csv_writer_mesh):
    print(f"Saving MeSh records ({len(mesh_db)})")
    csv_writer_mesh.writerows(sorted(mesh_db.items()))
            
# https://stackoverflow.com/a/26435241/9201239 efficient RAM usage
def parse_entries(f, tag):
    """Yield *tag* elements from *f* xml (fn or fh) incrementaly."""
    context = iter(etree.iterparse(f, events=('start', 'end')))
    _, root = next(context) # get root element
    for event, elem in context:
        if event == 'end' and elem.tag == tag:
            yield elem
            root.clear() # free memory

def meshhead2rec(mh):
    if mh is None: return ''
    l = []
    for e in mh.find('DescriptorName'), mh.find('QualifierName'):
        if e is not None:
            ui, mt = e.attrib['UI'], e.attrib['MajorTopicYN']
            l.append(f"{ui}/{mt}")
            mesh_db_update(ui, e.text)
    return "-".join(l)
                
def meshlist2rec(mhl):
    if mhl is None: return ''
    l = [meshhead2rec(m) for m in mhl.findall('MeshHeading')]
    return "|".join(l)

def extract(f_in, fn, files_cnt, csv_writer_main):
    c  = 0
    c_eng = 0
    c_no_abstract = 0
    for e in parse_entries(f_in, 'MedlineCitation'):
        c += 1
        try:
            pmid = e.find('PMID').text
            #print(pmid)

            # 1. Abstracts
            a = e.find('Article')
            if a is None: continue
                
            lang = a.find('Language')
            if lang is None or lang.text != 'eng': continue

            c_eng += 1
            title    = a.find('ArticleTitle').text
            abstract = a.find('Abstract')
            if abstract is not None:
                abstract_text = abstract.find('AbstractText').text
                #print(pmid, title, abstract)
            else:
                abstract_text = ''
                c_no_abstract += 1

            
            # 2. MeSH Data
            mesh = meshlist2rec(e.find('MeshHeadingList'))
            #print(f"MeSH: {mesh}")
            
            # 
            csv_writer_main.writerow([pmid, title, abstract_text, mesh])
                    
        except: 
            #if not pmid: pmid = "unknown"
            #print(f"{pmid} failed to parse")
            raise
        #break
        
    print(f"{files_cnt:0>4d} {fn}: recs {c:0>5d}, eng {c_eng:0>5d}, no abstract {c_no_abstract:0>5d}")
    
    return c, c_eng, c_no_abstract

def process():

    csv_fn_main = dest/"pubmed-abstracts.csv"
    csv_fn_mesh = dest/"pubmed-mesh.csv"
    files_cnt = 0
    with open(csv_fn_main, "w") as f_out_main, open(csv_fn_mesh, "w") as f_out_mesh:
        csv_writer_main = csv.writer(f_out_main)
        csv_writer_mesh = csv.writer(f_out_mesh)
        
        csv_writer_main.writerow(["pmid", "title", "abstract", "mesh"])
        csv_writer_mesh.writerow(["meshid", "text"])
        
        total, total_eng, total_no_abstract = 0, 0, 0
        files = sorted(path.glob("*.xml.gz"))
        print(f"Total files to process: {len(files)}")
        for fn in files:
            files_cnt += 1
            #fn = path/"pubmed19n1206.xml.gz"
            with gzip.open(fn, 'rb') as f_in: 
                c, c_eng, c_no_abstract = extract(f_in, fn, files_cnt, csv_writer_main)
                total             += c
                total_eng         += c_eng
                total_no_abstract += c_no_abstract
        print(f"Grand total of {total_eng:,} English recs (no abstract {total_no_abstract:,}) out of {total:,} pubmed recs")
        # save MeSH records
        mesh_db_write(csv_writer_mesh)
        
    # summary
    ! wc -l {csv_fn_main} {csv_fn_mesh}
    ! ls -l {csv_fn_main} {csv_fn_mesh}
    
    # gzip, keep the original
    ! gzip -k {csv_fn_main} {csv_fn_mesh}
    ! ls -l {csv_fn_main}.gz {csv_fn_mesh}.gz
    
%time process()    

Total files to process: 1206
0001 data/pubmed19n0001.xml.gz: recs 30000, eng 23211, no abstract 10488
0002 data/pubmed19n0002.xml.gz: recs 30000, eng 23891, no abstract 12560
0003 data/pubmed19n0003.xml.gz: recs 30000, eng 24221, no abstract 13481
0004 data/pubmed19n0004.xml.gz: recs 30000, eng 22116, no abstract 10006
0005 data/pubmed19n0005.xml.gz: recs 30000, eng 17000, no abstract 07522
0006 data/pubmed19n0006.xml.gz: recs 30000, eng 22584, no abstract 08940
0007 data/pubmed19n0007.xml.gz: recs 30000, eng 23679, no abstract 09807
0008 data/pubmed19n0008.xml.gz: recs 30000, eng 24132, no abstract 10796
0009 data/pubmed19n0009.xml.gz: recs 30000, eng 19776, no abstract 14990
0010 data/pubmed19n0010.xml.gz: recs 30000, eng 19958, no abstract 11925
0011 data/pubmed19n0011.xml.gz: recs 30000, eng 21843, no abstract 10365
0012 data/pubmed19n0012.xml.gz: recs 30000, eng 20486, no abstract 10566
0013 data/pubmed19n0013.xml.gz: recs 30000, eng 20617, no abstract 10889
0014 data/pubmed19n001

0113 data/pubmed19n0113.xml.gz: recs 30000, eng 23083, no abstract 07772
0114 data/pubmed19n0114.xml.gz: recs 30000, eng 20953, no abstract 07437
0115 data/pubmed19n0115.xml.gz: recs 30000, eng 20585, no abstract 09956
0116 data/pubmed19n0116.xml.gz: recs 30000, eng 23024, no abstract 07172
0117 data/pubmed19n0117.xml.gz: recs 30000, eng 22765, no abstract 07331
0118 data/pubmed19n0118.xml.gz: recs 30000, eng 22532, no abstract 07179
0119 data/pubmed19n0119.xml.gz: recs 30000, eng 22708, no abstract 06987
0120 data/pubmed19n0120.xml.gz: recs 30000, eng 22360, no abstract 06638
0121 data/pubmed19n0121.xml.gz: recs 30000, eng 21180, no abstract 13207
0122 data/pubmed19n0122.xml.gz: recs 30000, eng 22993, no abstract 07666
0123 data/pubmed19n0123.xml.gz: recs 30000, eng 23004, no abstract 08031
0124 data/pubmed19n0124.xml.gz: recs 30000, eng 22655, no abstract 06851
0125 data/pubmed19n0125.xml.gz: recs 30000, eng 22977, no abstract 07083
0126 data/pubmed19n0126.xml.gz: recs 30000, eng 224

0226 data/pubmed19n0226.xml.gz: recs 30000, eng 23604, no abstract 10482
0227 data/pubmed19n0227.xml.gz: recs 30000, eng 24788, no abstract 09348
0228 data/pubmed19n0228.xml.gz: recs 30000, eng 23061, no abstract 07929
0229 data/pubmed19n0229.xml.gz: recs 30000, eng 22786, no abstract 08550
0230 data/pubmed19n0230.xml.gz: recs 30000, eng 19191, no abstract 16192
0231 data/pubmed19n0231.xml.gz: recs 30000, eng 21316, no abstract 12534
0232 data/pubmed19n0232.xml.gz: recs 30000, eng 21973, no abstract 09425
0233 data/pubmed19n0233.xml.gz: recs 30000, eng 22039, no abstract 09733
0234 data/pubmed19n0234.xml.gz: recs 30000, eng 22044, no abstract 10400
0235 data/pubmed19n0235.xml.gz: recs 30000, eng 24563, no abstract 10136
0236 data/pubmed19n0236.xml.gz: recs 30000, eng 22467, no abstract 09800
0237 data/pubmed19n0237.xml.gz: recs 30000, eng 21919, no abstract 09733
0238 data/pubmed19n0238.xml.gz: recs 30000, eng 18963, no abstract 08816
0239 data/pubmed19n0239.xml.gz: recs 30000, eng 202

0339 data/pubmed19n0339.xml.gz: recs 30000, eng 27520, no abstract 09468
0340 data/pubmed19n0340.xml.gz: recs 30000, eng 29995, no abstract 25933
0341 data/pubmed19n0341.xml.gz: recs 30000, eng 30000, no abstract 24446
0342 data/pubmed19n0342.xml.gz: recs 30000, eng 28738, no abstract 17997
0343 data/pubmed19n0343.xml.gz: recs 30000, eng 27321, no abstract 07989
0344 data/pubmed19n0344.xml.gz: recs 30000, eng 26577, no abstract 05504
0345 data/pubmed19n0345.xml.gz: recs 30000, eng 26679, no abstract 05036
0346 data/pubmed19n0346.xml.gz: recs 30000, eng 26143, no abstract 05288
0347 data/pubmed19n0347.xml.gz: recs 30000, eng 27288, no abstract 05782
0348 data/pubmed19n0348.xml.gz: recs 30000, eng 27114, no abstract 05668
0349 data/pubmed19n0349.xml.gz: recs 30000, eng 27279, no abstract 07089
0350 data/pubmed19n0350.xml.gz: recs 30000, eng 27106, no abstract 05816
0351 data/pubmed19n0351.xml.gz: recs 30000, eng 26672, no abstract 05743
0352 data/pubmed19n0352.xml.gz: recs 30000, eng 272

0452 data/pubmed19n0452.xml.gz: recs 30000, eng 14743, no abstract 14493
0453 data/pubmed19n0453.xml.gz: recs 30000, eng 14946, no abstract 14717
0454 data/pubmed19n0454.xml.gz: recs 30000, eng 14767, no abstract 14440
0455 data/pubmed19n0455.xml.gz: recs 30000, eng 15016, no abstract 14624
0456 data/pubmed19n0456.xml.gz: recs 30000, eng 13139, no abstract 12783
0457 data/pubmed19n0457.xml.gz: recs 30000, eng 14922, no abstract 14466
0458 data/pubmed19n0458.xml.gz: recs 30000, eng 14891, no abstract 14434
0459 data/pubmed19n0459.xml.gz: recs 30000, eng 14848, no abstract 14429
0460 data/pubmed19n0460.xml.gz: recs 30000, eng 17791, no abstract 17185
0461 data/pubmed19n0461.xml.gz: recs 30000, eng 14993, no abstract 14487
0462 data/pubmed19n0462.xml.gz: recs 30000, eng 14238, no abstract 13833
0463 data/pubmed19n0463.xml.gz: recs 30000, eng 14754, no abstract 14345
0464 data/pubmed19n0464.xml.gz: recs 30000, eng 15103, no abstract 14605
0465 data/pubmed19n0465.xml.gz: recs 30000, eng 185

0565 data/pubmed19n0565.xml.gz: recs 30000, eng 27338, no abstract 07465
0566 data/pubmed19n0566.xml.gz: recs 30000, eng 27017, no abstract 04491
0567 data/pubmed19n0567.xml.gz: recs 30000, eng 26936, no abstract 04701
0568 data/pubmed19n0568.xml.gz: recs 30000, eng 27560, no abstract 04343
0569 data/pubmed19n0569.xml.gz: recs 30000, eng 27741, no abstract 04577
0570 data/pubmed19n0570.xml.gz: recs 30000, eng 27010, no abstract 04513
0571 data/pubmed19n0571.xml.gz: recs 30000, eng 27494, no abstract 04486
0572 data/pubmed19n0572.xml.gz: recs 30000, eng 26904, no abstract 06032
0573 data/pubmed19n0573.xml.gz: recs 30000, eng 26867, no abstract 04083
0574 data/pubmed19n0574.xml.gz: recs 30000, eng 27547, no abstract 04298
0575 data/pubmed19n0575.xml.gz: recs 30000, eng 29043, no abstract 19164
0576 data/pubmed19n0576.xml.gz: recs 30000, eng 29472, no abstract 20267
0577 data/pubmed19n0577.xml.gz: recs 30000, eng 29434, no abstract 21884
0578 data/pubmed19n0578.xml.gz: recs 30000, eng 293

0678 data/pubmed19n0678.xml.gz: recs 30000, eng 27275, no abstract 05990
0679 data/pubmed19n0679.xml.gz: recs 30000, eng 27803, no abstract 03783
0680 data/pubmed19n0680.xml.gz: recs 30000, eng 28066, no abstract 04125
0681 data/pubmed19n0681.xml.gz: recs 30000, eng 27078, no abstract 03986
0682 data/pubmed19n0682.xml.gz: recs 30000, eng 27143, no abstract 04071
0683 data/pubmed19n0683.xml.gz: recs 30000, eng 27460, no abstract 03492
0684 data/pubmed19n0684.xml.gz: recs 30000, eng 28232, no abstract 05967
0685 data/pubmed19n0685.xml.gz: recs 30000, eng 28150, no abstract 04821
0686 data/pubmed19n0686.xml.gz: recs 30000, eng 28075, no abstract 05323
0687 data/pubmed19n0687.xml.gz: recs 30000, eng 27636, no abstract 03680
0688 data/pubmed19n0688.xml.gz: recs 30000, eng 28098, no abstract 05654
0689 data/pubmed19n0689.xml.gz: recs 30000, eng 27679, no abstract 05173
0690 data/pubmed19n0690.xml.gz: recs 30000, eng 27533, no abstract 04751
0691 data/pubmed19n0691.xml.gz: recs 30000, eng 279

0791 data/pubmed19n0791.xml.gz: recs 30000, eng 28209, no abstract 03751
0792 data/pubmed19n0792.xml.gz: recs 30000, eng 28188, no abstract 04115
0793 data/pubmed19n0793.xml.gz: recs 30000, eng 28637, no abstract 03865
0794 data/pubmed19n0794.xml.gz: recs 30000, eng 28453, no abstract 04472
0795 data/pubmed19n0795.xml.gz: recs 30000, eng 28356, no abstract 04006
0796 data/pubmed19n0796.xml.gz: recs 30000, eng 28903, no abstract 03904
0797 data/pubmed19n0797.xml.gz: recs 30000, eng 28148, no abstract 03755
0798 data/pubmed19n0798.xml.gz: recs 30000, eng 28771, no abstract 03663
0799 data/pubmed19n0799.xml.gz: recs 30000, eng 28550, no abstract 03810
0800 data/pubmed19n0800.xml.gz: recs 30000, eng 28468, no abstract 04121
0801 data/pubmed19n0801.xml.gz: recs 30000, eng 28368, no abstract 03617
0802 data/pubmed19n0802.xml.gz: recs 30000, eng 28500, no abstract 03938
0803 data/pubmed19n0803.xml.gz: recs 30000, eng 28647, no abstract 04107
0804 data/pubmed19n0804.xml.gz: recs 30000, eng 287

0904 data/pubmed19n0904.xml.gz: recs 30000, eng 29030, no abstract 04146
0905 data/pubmed19n0905.xml.gz: recs 30000, eng 29044, no abstract 03569
0906 data/pubmed19n0906.xml.gz: recs 30000, eng 28973, no abstract 03936
0907 data/pubmed19n0907.xml.gz: recs 30000, eng 28805, no abstract 03968
0908 data/pubmed19n0908.xml.gz: recs 30000, eng 29402, no abstract 05493
0909 data/pubmed19n0909.xml.gz: recs 30000, eng 28815, no abstract 03714
0910 data/pubmed19n0910.xml.gz: recs 30000, eng 28616, no abstract 03973
0911 data/pubmed19n0911.xml.gz: recs 30000, eng 29004, no abstract 03932
0912 data/pubmed19n0912.xml.gz: recs 30000, eng 28936, no abstract 03744
0913 data/pubmed19n0913.xml.gz: recs 30000, eng 29035, no abstract 03598
0914 data/pubmed19n0914.xml.gz: recs 30000, eng 28788, no abstract 03999
0915 data/pubmed19n0915.xml.gz: recs 30000, eng 29380, no abstract 03521
0916 data/pubmed19n0916.xml.gz: recs 30000, eng 29033, no abstract 03609
0917 data/pubmed19n0917.xml.gz: recs 30000, eng 290

1017 data/pubmed19n1017.xml.gz: recs 11040, eng 09999, no abstract 01130
1018 data/pubmed19n1018.xml.gz: recs 27644, eng 26732, no abstract 02400
1019 data/pubmed19n1019.xml.gz: recs 30000, eng 29012, no abstract 04285
1020 data/pubmed19n1020.xml.gz: recs 01728, eng 01680, no abstract 00228
1021 data/pubmed19n1021.xml.gz: recs 30000, eng 29831, no abstract 02962
1022 data/pubmed19n1022.xml.gz: recs 00495, eng 00459, no abstract 00039
1023 data/pubmed19n1023.xml.gz: recs 10014, eng 09637, no abstract 00926
1024 data/pubmed19n1024.xml.gz: recs 05170, eng 05071, no abstract 00254
1025 data/pubmed19n1025.xml.gz: recs 11275, eng 11148, no abstract 01122
1026 data/pubmed19n1026.xml.gz: recs 15846, eng 15637, no abstract 01722
1027 data/pubmed19n1027.xml.gz: recs 24761, eng 24128, no abstract 03470
1028 data/pubmed19n1028.xml.gz: recs 23619, eng 23272, no abstract 03581
1029 data/pubmed19n1029.xml.gz: recs 23983, eng 23242, no abstract 03034
1030 data/pubmed19n1030.xml.gz: recs 15839, eng 154

1130 data/pubmed19n1130.xml.gz: recs 15104, eng 14846, no abstract 01950
1131 data/pubmed19n1131.xml.gz: recs 26362, eng 25695, no abstract 03747
1132 data/pubmed19n1132.xml.gz: recs 19486, eng 19139, no abstract 02085
1133 data/pubmed19n1133.xml.gz: recs 17217, eng 16838, no abstract 02405
1134 data/pubmed19n1134.xml.gz: recs 23444, eng 23177, no abstract 03520
1135 data/pubmed19n1135.xml.gz: recs 12809, eng 12569, no abstract 01316
1136 data/pubmed19n1136.xml.gz: recs 06701, eng 06700, no abstract 00727
1137 data/pubmed19n1137.xml.gz: recs 17988, eng 17830, no abstract 02556
1138 data/pubmed19n1138.xml.gz: recs 30000, eng 28583, no abstract 03121
1139 data/pubmed19n1139.xml.gz: recs 08196, eng 07898, no abstract 00664
1140 data/pubmed19n1140.xml.gz: recs 20782, eng 20360, no abstract 02847
1141 data/pubmed19n1141.xml.gz: recs 30000, eng 30000, no abstract 10511
1142 data/pubmed19n1142.xml.gz: recs 30000, eng 30000, no abstract 10208
1143 data/pubmed19n1143.xml.gz: recs 30000, eng 300

## Extract just text

To make the feeding of the LM faster, let's extract just text, combining f"{title}\n{abstract}". We won't do it from scratch but use the already extracted multi-column csv file.

In [15]:
fn_in  = dest/"pubmed-abstracts.csv.gz"
fn_out = dest/"pubmed-texts.csv.gz"

convert2text = 1 # set to 1 if you want to run the conversion - it does it in chunks so under 10GB RAM or lower the chunksize
if convert2text: 
    chunksize = 1000000 # 1M records chunk
    
    cols_to_keep = [1,2] # title + abstract
    #cols_names = ['title', 'abstract']
    df_iter = pd.read_csv(fn_in, compression='gzip', chunksize=chunksize, keep_default_na=False, usecols=cols_to_keep)
    
    with gzip.open(fn_out, "wt") as f:
        c = 0
        for df in df_iter:        
            #df.head(2)
            c += chunksize
            df['texts'] = df[['title', 'abstract']].apply(lambda x: '\n'.join(x), axis=1)
            df.drop(columns=['title', 'abstract'], inplace=True)
            df.to_csv(f, index=False)
            print(c)
            #df.head(2)
            #break

1000000
2000000
3000000
4000000
5000000
6000000
7000000
8000000
9000000
10000000
11000000
12000000
13000000
14000000
15000000
16000000
17000000
18000000
19000000
20000000
21000000
22000000
23000000
24000000
25000000
26000000
27000000
28000000
29000000
