This notebook downloads the dataset and prepares it for later model fine-tuning.

Import required libraries

In [1]:
import xml.etree.ElementTree as ET
import urllib3
import re
import pickle
import progressbar
import ftplib
import numpy as np
import pandas as pd
import os

Searched PubMed Central for "algorithmic bias" and filtered for articles that are fully and freely accessible on PubMed. The resulting list of PMC IDs was then exported to a file.

In [2]:
# Collect the exported PMC identifiers: one per line of the file, with
# surrounding whitespace (including the trailing newline) stripped.
pmcs = []
with open("pmc_result_algorithmicbias.txt") as id_file:
    pmcs.extend(line.strip() for line in id_file)


Downloaded a table with the FTP information on all freely accessible articles on PMC from https://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_file_list.txt, which is loaded into python below

In [3]:
# Labels applied to the columns of the tab-separated OA file list.
oa_columns = ["ftppath", "source", "PMCID", "PMID", "Copyright"]

# The first line of the file is skipped and the columns are named explicitly.
oa_files = pd.read_table(
    "oa_file_list.txt",
    sep='\t',
    names=oa_columns,
    skiprows=1,
)


Download the articles from the list using FTP.

In [None]:
# Download the archive of every listed article that is not already on disk.
# PMC IDs that could not be fetched are collected in `erroredpmcs`.
os.makedirs("papers", exist_ok=True)

# Resolve PMCID -> FTP path once instead of scanning the whole table for every
# article. drop_duplicates keeps the first row per PMCID, which is the row the
# per-article lookup would have picked.
wanted = oa_files.loc[oa_files["PMCID"].isin(pmcs)].drop_duplicates("PMCID")
ftp_paths = dict(zip(wanted["PMCID"], wanted["ftppath"]))

bar = progressbar.ProgressBar(max_value=len(pmcs))
i=0
erroredpmcs = list()
for i, pmc in enumerate(pmcs, start=1):
    target = "papers/" + pmc + ".tar.gz"
    if not os.path.exists(target):
        # Write to a temporary name and rename only after the transfer has
        # finished, so an interrupted download is never mistaken for a
        # complete archive by the os.path.exists() check on a later run.
        partial = target + ".part"
        try:
            remote_path = ftp_paths[pmc]  # KeyError: article not in the OA list
            # The context manager closes the connection even if the transfer fails.
            with ftplib.FTP("ftp.ncbi.nlm.nih.gov") as ftp:
                ftp.login()
                ftp.cwd("pub/pmc")
                with open(partial, 'wb') as f:
                    ftp.retrbinary("RETR " + remote_path, f.write)
            os.replace(partial, target)
        except (KeyError,) + ftplib.all_errors:
            # Record the failure and carry on with the next article; unlike a
            # bare `except`, this still lets KeyboardInterrupt stop the loop.
            erroredpmcs.append(pmc)
            if os.path.exists(partial):
                os.remove(partial)
    bar.update(i)


  5% (571 of 10322) |#                   | Elapsed Time: 0:05:16 ETA:   5:56:59

Run the following on a Unix command line to extract the article XMLs from the downloaded tarballs.

In [53]:
#Run on command line:
#for var in *.tar.gz
#do
#  tar -xf $var
#  cp ${var/.tar.gz/}/*xml xmls/${var/.tar.gz/.nxml}
#  tar -czf $var ${var/.tar.gz}/
#  rm -r ${var/.tar.gz}
#done

A function to remove other article references and figure and table references, to allow the text to conform better to natural language.

In [4]:
def clean_data(data):
    '''
    Strip citation markers and figure/table call-outs from a piece of text.

    Removes, together with one optional leading space:
      * anything in square brackets, e.g. " [12]" or " [3, 4]"
      * parenthesised groups starting with "Fig", e.g. " (Fig. 2)" or " (Figure 1(a))"
      * parenthesised groups starting with "Table", e.g. " (Table 3)"

    Each match ends at the closing bracket that belongs to it, so ordinary
    text between two separate call-outs is kept. Matches never span a newline.

    data: the string to clean.
    Returns the cleaned string.
    '''
    # Raw strings keep the regex escapes (\[ and \() from being read as
    # invalid Python string escapes.
    # A bracketed group may not contain another bracket or a newline.
    newdata = re.sub(r" ?\[[^\[\]\n]*\]", '', data)
    # Inside a call-out, allow plain characters or one level of nested
    # parentheses, so "(Fig. 1(a))" is removed as a whole.
    newdata = re.sub(r" ?\(Fig(?:[^()\n]|\([^()\n]*\))*\)", '', newdata)
    newdata = re.sub(r" ?\(Table(?:[^()\n]|\([^()\n]*\))*\)", '', newdata)
    return newdata

Read in each of the files and attempt to parse them into Python dictionaries. All common sections of a paper are included except for methods, references, and supplementary information, so as to provide the most natural type of language to the model.

In [5]:
relevantSections = np.array(["title","abstract", "introduction", "result", "discussion", "conclusion"])


def _all_text(element):
    '''
    Return all text inside `element`, including text nested in child
    elements; returns "" when `element` is None.
    '''
    if element is None:
        return ""
    return "".join(element.itertext())


# Parse every extracted article XML into a dict with one string per relevant
# section, and collect the dicts in `articles`, keyed by PMC ID.
bar = progressbar.ProgressBar(max_value=len(pmcs))
i=0
articles = dict()
for i, pmc in enumerate(pmcs, start=1):
    # Updated up front so articles skipped by `continue` still advance the bar.
    bar.update(i)
    path = "papers/xmls/" + pmc + ".nxml"
    if not os.path.exists(path):
        continue
    try:
        root = ET.parse(path).getroot()
    except (ET.ParseError, OSError):
        # Unreadable or malformed file: report the ID and move on.
        print(pmc)
        continue

    # Plain-str keys (np.str_ compares and hashes equal to str).
    article = {str(x): "" for x in relevantSections}

    # Elements are compared with None explicitly: an Element's truth value
    # reflects whether it has children, not whether it was found.
    article["title"] = _all_text(root.find("front/article-meta/title-group/article-title"))
    abstract = root.find("front/article-meta/abstract")
    if abstract is not None:
        # Covers both plain abstracts and structured ones whose paragraphs
        # sit inside (possibly nested) <sec> elements.
        article["abstract"] = " ".join(_all_text(p) for p in abstract.iter("p"))

    body = root.find("body")
    if body is None or len(body) == 0:
        # No body content: the article is not stored.
        continue
    for x in body.findall("sec"):
        heading = _all_text(x.find("title")).lower()
        # The first relevant section name contained in the heading wins.
        sec = next((str(s) for s in relevantSections if s in heading), None)
        if sec is None:
            continue
        paragraphs = [clean_data(_all_text(y)) for y in x.iter("p")]
        # Append to what is already stored under this key, so two body
        # sections that map to the same name do not overwrite each other,
        # and separate paragraphs with a space.
        article[sec] = " ".join(filter(None, [article[sec]] + paragraphs))
    articles[pmc] = article


 99% (10309 of 10322) |################# | Elapsed Time: 0:00:43 ETA:   0:00:00

Save the processed articles to a pickle file

In [6]:
with open("articles.obj", 'wb') as f:
    pickle.dump(articles, f)