# Pubmed data acquisition and munging
- Download pubmed data and convert it into a handy csv.

## Note on downloading PubMed data.
The easiest way I have found to do this is to simply download everything from here: ftp://ftp.ncbi.nlm.nih.gov/pubmed/baseline

- Windows: use Windows Subsystem for Linux (WSL)
- Mac/Linux: just use a terminal.

Replace 'destinationPath' with the path to an empty directory where you want the data to be downloaded. In my case, this was `D:\pubmed\data`

`wget -P destinationPath 'ftp://ftp.ncbi.nlm.nih.gov/pubmed/baseline/*'`

This will take a while to complete. On my connection, it took around 1 day.

In [1]:
import os
import gzip
from bs4 import BeautifulSoup as bs
import glob
import pandas as pd
import datetime

How many files are there? Get all of the filepaths and store them in a list called `filepaths`.

In [2]:
# Directory in which the extracted CSV ('pubmed_sample.csv') is read from and written to.
working_data_path = r'C:\Users\aday\OneDrive - SAGE Publishing\PROJECT_DATA\pubmed_case_report_classifier\data'

In [3]:
# Directory holding the downloaded PubMed baseline archives.
pubmed_data_path = r'C:\Users\aday\OneDrive - SAGE Publishing\DATA\PubMed'

# Match every entry directly inside that directory, then keep only the
# gzipped XML archives.
p = os.path.abspath(os.path.join(pubmed_data_path, '*'))
filepaths = [candidate for candidate in glob.glob(p) if candidate.endswith('.xml.gz')]
len(filepaths)

1062

Flip the ordering of filepaths. This means that we start with recent stuff and work our way backwards in time.

In [4]:
# glob.glob() gives no ordering guarantee, so simply reversing its result does
# not reliably put the newest files first. Sort the names in descending order
# instead (assumes the numeric part of the file names is zero-padded, so that
# lexicographic order matches file-number order -- TODO confirm for new dumps).
filepaths = sorted(filepaths, reverse=True)
len(filepaths)

1062

Now iterate over the files and pull out all of the data from the 'years_of_interest'. When we reach a file which no longer has data from these years, we can stop.

# Create a class for handling pubmed XML

In [5]:

import datetime
from bs4 import BeautifulSoup as bs
import pandas as pd

class PubMedDataSet:
    """Extract article metadata from gzipped PubMed XML files into a CSV.

    Files are processed in the order given. After each file the accumulated
    rows (restricted to ``years_of_interest``) are written to
    ``pubmed_sample.csv`` so that progress survives an interruption.

    Relies on the module-level names ``os``, ``gzip``, ``pd``, ``datetime``
    and ``bs`` (BeautifulSoup), and -- when ``output_dir`` is not given -- on
    the module-level ``working_data_path``.
    """

    def __init__(self, years_of_interest: set, output_dir=None,
                 min_rows_per_file: int = 500):
        """
        Args:
            years_of_interest: publication years to keep, as strings
                (e.g. ``{'2020', '2021'}``); years are compared as text.
            output_dir: directory for ``pubmed_sample.csv``. ``None`` means
                "use the module-level ``working_data_path``".
            min_rows_per_file: stop processing once a file contributes fewer
                than this many rows from the years of interest.
        """
        self.years_of_interest = years_of_interest
        self.output_dir = output_dir
        self.min_rows_per_file = min_rows_per_file

    def process_files(self, filepaths):
        """Process ``filepaths`` in order and return the accumulated DataFrame.

        Processing stops early when the previous file contained none of the
        years of interest, or when a file contributes fewer than
        ``min_rows_per_file`` matching rows. A failure on one file is printed
        and the loop moves on to the next file.
        """
        years_in_file = set()
        years_of_interest = self.years_of_interest
        n_rows_cutoff = self.min_rows_per_file

        output_dir = working_data_path if self.output_dir is None else self.output_dir
        dataframe_path = os.path.join(output_dir, 'pubmed_sample.csv')

        if os.path.exists(dataframe_path):
            # Read everything back as text: otherwise 'year' is re-inferred as
            # an integer column and the string-based year filter below would
            # silently drop every previously saved row.
            df = pd.read_csv(dataframe_path, dtype=str, encoding='utf-8-sig')
        else:
            df = pd.DataFrame()

        for filepath in filepaths:
            try:
                print(datetime.datetime.now(), '| Processing:', filepath)
                # Recall that we are reading our files in reverse-chronological
                # order. Each file spans many years, but older files should not
                # contain recent years. So if NONE of our years of interest
                # were in the previous file, assume we have reached older data
                # and stop reading.
                if years_in_file and years_in_file.isdisjoint(years_of_interest):
                    print('Breaking loop - we are no longer looking at our years of interest.')
                    break
                years_in_file = set()
                data = list(self.process_file(filepath))
                years_in_file = {row['year'] for row in data}
                print(len(data), 'rows found. Years in file:', years_in_file)
                if not data:
                    # Nothing to append; skip rather than failing on the
                    # missing 'year' column of an empty DataFrame.
                    continue
                new_rows = pd.DataFrame(data)
                n_rows_found = int(new_rows['year'].isin(years_of_interest).sum())
                df = new_rows if df.empty else pd.concat([df, new_rows])
                df = df[df['year'].isin(years_of_interest)]
                print(datetime.datetime.now(), '| File complete. DataFrame shape:', df.shape)
                print()
                # Checkpoint after every file.
                df.to_csv(dataframe_path, index=False, encoding='utf-8-sig')
                if n_rows_found < n_rows_cutoff:
                    print(f'BREAKING loop - fewer than {n_rows_cutoff} rows found in last file.')
                    break
            except Exception as e:
                # Best effort: report the problem and carry on with the next file.
                print('Failure for ', filepath)
                print(e)
        return df

    def process_file(self, filepath):
        """Yield one row dict per ``pubmedarticle`` element in a gzipped XML file."""
        with gzip.open(filepath) as f:
            # TODO - soup is slow, switch to etree
            soup = bs(f.read(), 'lxml')
        for article in self.get_articles(soup):
            yield self.build_row(article)

    def build_row(self, article):
        """Collect the extracted fields of one article into a flat dict."""
        # The revised date is extracted but is not part of the output row.
        articledate, _reviseddate = self.get_article_dates(article)
        doi, pii, pmid, pmc = self.get_article_ids(article)
        journal_title, journal_issn, journal_iso = self.get_journal_dets(article)
        article_type1, article_type2, article_type3 = self.get_art_types(article)
        return {
            'title': self.get_title(article),
            'articletitle': self.get_article_title(article),
            'abstract': self.get_abstract(article),
            'year': self.get_year(article),
            'articledate': articledate,
            'doi': doi,
            'pmid': pmid,
            'pii': pii,
            'pmc': pmc,
            'articletype1': article_type1,
            'articletype2': article_type2,
            'articletype3': article_type3,
            'journal_title': journal_title,
            'journal_issn': journal_issn,
            'journal_iso': journal_iso,
        }

    @staticmethod
    def _text_at(node, *names):
        """Follow ``names`` as nested attributes from ``node``; return the final ``.text`` or None."""
        for name in names:
            if node is None:
                return None
            node = getattr(node, name, None)
        return getattr(node, 'text', None)

    def _date_at(self, article, tag_name):
        """Return 'year-month-day' for ``article.<tag_name>``, or None if any part is missing."""
        parts = [self._text_at(article, tag_name, part) for part in ('year', 'month', 'day')]
        if any(part is None for part in parts):
            return None
        return '-'.join(parts)

    def get_year(self, article):
        """Return the text of ``article.pubdate.year``, or None if absent."""
        return self._text_at(article, 'pubdate', 'year')

    def get_articles(self, soup):
        """Return all ``pubmedarticle`` elements of the parsed document."""
        return soup.find_all('pubmedarticle')

    def get_art_types(self, article):
        """Return the first three publication types as 'ui|text' strings, padded with None."""
        try:
            art_types = [x.attrs['ui'] + '|' + x.text
                         for x in article.find_all('publicationtype')]
        except (AttributeError, KeyError, TypeError):
            # A malformed entry invalidates the whole list.
            art_types = []
        return tuple((art_types + [None, None, None])[:3])

    def get_article_title(self, article):
        """Return the text of ``article.articletitle``, or None if absent."""
        return self._text_at(article, 'articletitle')

    def get_title(self, article):
        """Return the text of ``article.title``, or None if absent."""
        return self._text_at(article, 'title')

    def get_abstract(self, article):
        """Return all ``abstracttext`` parts joined by spaces, or None if there is no abstract."""
        abstract = getattr(article, 'abstract', None)
        if abstract is None:
            return None
        try:
            return ' '.join([x.text for x in abstract.find_all('abstracttext')])
        except (AttributeError, TypeError):
            return None

    def get_article_dates(self, article):
        """Return ``(articledate, reviseddate)`` as 'year-month-day' strings (None when incomplete)."""
        return self._date_at(article, 'articledate'), self._date_at(article, 'daterevised')

    def get_article_ids(self, article):
        """Return ``(doi, pii, pmid, pmc)``; each is None when not found.

        The PMID is taken from ``article.pmid`` first and falls back to the
        ``articleid`` entry whose ``idtype`` is 'pmid'.
        """
        try:
            ids = {x.attrs['idtype']: x.text for x in article.find_all('articleid')}
        except (AttributeError, KeyError, TypeError):
            # If the id list cannot be read, no ids are reported at all.
            return None, None, None, None
        pmid = self._text_at(article, 'pmid')
        if pmid is None:
            pmid = ids.get('pmid')
        return ids.get('doi'), ids.get('pii'), pmid, ids.get('pmc')

    def get_journal_dets(self, article):
        """Return ``(journal_title, journal_issn, journal_iso)`` from ``article.journal``."""
        journal = getattr(article, 'journal', None)
        return (
            self._text_at(journal, 'title'),
            self._text_at(journal, 'issn'),
            self._text_at(journal, 'isoabbreviation'),
        )

### Example XML
(requires reading a whole file into memory, so it's a bit slow!)

In [6]:
# %%time
# filepath = filepaths[0]
# with gzip.open(filepath) as f:
#     # TODO - soup is slow, switch to etree?
#     soup = bs(f.read(),'lxml')
# articles = soup.find_all('pubmedarticle')
# articles[0]

In [7]:
# Publication years to keep; years are compared as strings.
years_of_interest = {'2018', '2019', '2020', '2021'}

# Run the extraction over every discovered file and show the resulting shape.
df = PubMedDataSet(years_of_interest=years_of_interest).process_files(filepaths)
df.shape

2021-04-09 12:42:58.565573 | Processing: C:\Users\aday\OneDrive - SAGE Publishing\DATA\PubMed\pubmed21n1062.xml.gz
20051 rows found. Years in file: {'2012', None, '2021', '2018', '2000', '2019', '2010', '2020', '2007', '2008', '1986', '2016', '2017', '2003', '2015', '1993', '1998', '2014', '1997', '2006', '2005', '2011'}
2021-04-09 12:45:30.575564 | File complete. DataFrame shape: (19547, 15)

2021-04-09 12:45:32.007050 | Processing: C:\Users\aday\OneDrive - SAGE Publishing\DATA\PubMed\pubmed21n1061.xml.gz
30000 rows found. Years in file: {'2006', None, '2021', '2018', '2017', '2000', '2019', '2020'}
2021-04-09 12:48:57.586595 | File complete. DataFrame shape: (48541, 15)

2021-04-09 12:48:59.390604 | Processing: C:\Users\aday\OneDrive - SAGE Publishing\DATA\PubMed\pubmed21n1060.xml.gz
30000 rows found. Years in file: {'2016', '2013', None, '2021', '2018', '2017', '2019', '2020', '1999', '2015'}
2021-04-09 12:52:42.993298 | File complete. DataFrame shape: (77933, 15)

2021-04-09 12:52:

(3579099, 15)

In [8]:
df.head()

Unnamed: 0,title,articletitle,abstract,year,articledate,doi,pmid,pii,pmc,articletype1,articletype2,articletype3,journal_title,journal_issn,journal_iso
0,International journal of molecular sciences,Insight into Cisplatin-Resistance Signaling of...,The microenvironment possesses a strong impact...,2020,2020-12-03,10.3390/ijms21239240,33287446,ijms21239240,PMC7730285,D016428|Journal Article,,,International journal of molecular sciences,1422-0067,Int J Mol Sci
1,"Journal of fungi (Basel, Switzerland)",Rye Snow Mold-Associated Microdochium nivale S...,Snow mold is a severe plant disease caused by ...,2020,2020-12-03,10.3390/jof6040335,33287447,jof6040335,,D016428|Journal Article,,,"Journal of fungi (Basel, Switzerland)",2309-608X,J Fungi (Basel)
2,Animals : an open access journal from MDPI,Feeding Tall Fescue Seed Reduces Ewe Milk Prod...,Endophyte-infected tall fescue (E+) produces e...,2020,2020-12-03,10.3390/ani10122291,33287449,ani10122291,,D016428|Journal Article,,,Animals : an open access journal from MDPI,2076-2615,Animals (Basel)
3,Micromachines,"Composites, Fabrication and Application of Pol...",The technological development of piezoelectric...,2020,2020-12-03,10.3390/mi11121076,33287450,mi11121076,,D016428|Journal Article,D016454|Review,,Micromachines,2072-666X,Micromachines (Basel)
4,International journal of molecular sciences,Comparison of Bisulfite Pyrosequencing and Met...,Different methodological approaches are availa...,2020,2020-12-03,10.3390/ijms21239242,33287451,ijms21239242,PMC7730915,D016428|Journal Article,,,International journal of molecular sciences,1422-0067,Int J Mol Sci


# Filter out non-research / case report article types

In [9]:
df.articletype1.value_counts().head(10)

D016428|Journal Article      3067435
D002363|Case Reports          151047
D016422|Letter                107331
D016421|Editorial              85085
D003160|Comparative Study      61925
D016425|Published Erratum      18568
D023362|Evaluation Study       15425
D016430|Clinical Trial         13324
D016433|News                   12688
D004740|English Abstract        8608
Name: articletype1, dtype: int64

In [10]:
df.articletype2.value_counts().head(10)

D013485|Research Support, Non-U.S. Gov't        540791
D016454|Review                                  249295
D016428|Journal Article                         196838
D052061|Research Support, N.I.H., Extramural    156470
D016420|Comment                                 114902
D016448|Multicenter Study                        30291
D016449|Randomized Controlled Trial              30119
D017418|Meta-Analysis                            27762
D064888|Observational Study                      25378
D016425|Published Erratum                        24147
Name: articletype2, dtype: int64

In [11]:
df.articletype3.value_counts().head(10)

D013485|Research Support, Non-U.S. Gov't            150792
D016454|Review                                       60521
D013486|Research Support, U.S. Gov't, Non-P.H.S.     29170
D052061|Research Support, N.I.H., Extramural         24735
D000078182|Systematic Review                         18972
D016449|Randomized Controlled Trial                  12215
D016448|Multicenter Study                            10992
D016420|Comment                                       8928
D016428|Journal Article                               8285
D064888|Observational Study                           8030
Name: articletype3, dtype: int64

Given the above, it looks like the first article-type is the main one. We will use this to separate case-reports and regular journal articles. For now, we will ignore other article types. 

In [12]:
# dfout = df[df['articletype1'].isin({'D016428|Journal Article','D002363|Case Reports'})]
# dfout.shape

## Write-out

In [13]:
# Save the final DataFrame as 'pubmed_sample.csv' in the working data directory.
df_path = os.path.join(working_data_path, 'pubmed_sample.csv')
df.to_csv(df_path, encoding='utf-8-sig', index=False)