In [2]:
#imports used throughout the notebook (XML parsing, dataframes, arrays, pickling)
import xml.etree.ElementTree as ET
import pandas as pd
import numpy as np
import pickle

In [3]:
# Parse the local copy of the training-set export (this file only exists on the
# local machine) and keep its root element for the cells below.
xml_file = ET.parse(source='training_set_article.xml')
root = xml_file.getroot()

In [4]:
#read in pmids
# The list file is a single comma-separated line of PMIDs. The context manager
# guarantees the handle is closed once the contents have been read.
with open("article_replication_PMIDs_list.txt", "r") as text_file:
    # Strip stray whitespace/newlines around each id so later membership tests
    # against the PMID text parsed from the XML compare like with like.
    pmids = [pmid.strip() for pmid in text_file.read().split(',')]
print(len(pmids))
print(type(pmids[0]))

354
<type 'str'>


In [5]:
#testing that we got all the records: yes! success.
# Every PubmedArticle in the XML should carry a PMID from the requested list.
# Prints 'Error' once per unexpected record, then the number of records seen.
known_pmids = set(pmids)  # set membership instead of scanning the list per record
i = 0
for record in root.findall('PubmedArticle'):
    pmid = record.find('MedlineCitation').find('PMID').text
    if pmid not in known_pmids:
        print('Error')
    i += 1

print(i)

354


### Convert to Pandas dataframe (title, abstract, mesh terms, label)

In [6]:
#convert XML subset to pandas dataframe
def iter_docs(author):
    """Yield one flat record (dict) per ``PubmedArticle`` directly under ``author``.

    Each record starts as a copy of ``author.attrib`` and gains the keys
    ``title``, ``pmid``, ``abstract``, ``qualifier_terms`` and
    ``descriptor_terms``. Title and abstract text is UTF-8 encoded whenever
    the parser hands back a non-``str`` value; MeSH terms are space-joined
    exactly as parsed (no re-encoding).

    Optional pieces that are absent (no ``ArticleTitle`` text, no ``Abstract``,
    no ``MeshHeadingList``, term elements without text) produce empty strings
    instead of raising. The parsed tree itself is never modified.
    """
    def _utf8_text(elem):
        # Full text of `elem`, including text nested inside inline child tags;
        # '' when the element is missing or has no text at all.
        if elem is None:
            return ''
        text = ''.join(elem.itertext())
        if type(text) != str:
            text = text.encode('utf8')
        return text

    def _joined_terms(headings, tag):
        # Space-separated text of every `tag` child across all MeSH headings,
        # in document order; children without text are skipped.
        return ' '.join(
            term.text
            for heading in headings
            for term in heading.findall(tag)
            if term.text is not None
        )

    author_attr = author.attrib
    for record in author.findall('PubmedArticle'):
        doc_dict = author_attr.copy()
        citation = record.find('MedlineCitation')
        article = citation.find('Article')

        doc_dict['title'] = _utf8_text(article.find('ArticleTitle'))
        doc_dict['pmid'] = citation.find('PMID').text

        # An abstract may consist of several AbstractText sections; they are
        # concatenated in document order with no separator.
        abstract_elem = article.find('Abstract')
        if abstract_elem is None:
            sections = []
        else:
            sections = abstract_elem.findall('AbstractText')
        doc_dict['abstract'] = ''.join(_utf8_text(section) for section in sections)

        # MeSH qualifier and descriptor names, each flattened to one string.
        mesh_list = citation.find('MeshHeadingList')
        headings = [] if mesh_list is None else mesh_list.findall('MeshHeading')
        doc_dict['qualifier_terms'] = _joined_terms(headings, 'QualifierName')
        doc_dict['descriptor_terms'] = _joined_terms(headings, 'DescriptorName')

        yield doc_dict
        
# Materialise every record first, then keep the modelling columns in a fixed order.
records = list(iter_docs(root))
df = pd.DataFrame(records)
df = df[['pmid', 'title', 'abstract', 'qualifier_terms', 'descriptor_terms']]

In [7]:
df.head(10)  # preview only; avoids rendering every row of the frame

Unnamed: 0,pmid,title,abstract,qualifier_terms,descriptor_terms
0,23195993,Gene expression profiles in peripheral blood m...,Occupational exposure to nickel (Ni) is associ...,genetics metabolism epidemiology drug effects ...,Adult Asian Continental Ancestry Group Biomark...
1,25077433,SOX2 is a cancer-specific regulator of tumour ...,Although the principles that balance stem cell...,genetics metabolism pathology metabolism patho...,"Animals Carcinoma, Squamous Cell Cell Line, Tu..."
2,24107601,Imaging and cerebrospinal fluid biomarkers in ...,The pathophysiological process of Alzheimer's ...,cerebrospinal fluid genetics radionuclide imag...,"Aged Aged, 80 and over Alzheimer Disease Amylo..."
3,24891029,Preliminary evidence of cognitive and brain ab...,To ascertain whether pediatric obesity without...,pathology physiopathology physiopathology,Adolescent Attention Brain Cognition Executive...
4,21691448,Obese Adolescents with Type 2 Diabetes Mellitu...,The rates of type 2 diabetes (T2DM) continue t...,,
5,22765222,Diversity of 5S rRNA genes within individual p...,We examined intragenomic variation of paralogo...,chemistry classification genetics chemistry ge...,"Bacteria DNA, Ribosomal Databases, Nucleic Aci..."
6,24401686,Myoinositol and glutamate complex neurometabol...,To obtain quantitative neurometabolite measure...,analogs & derivatives metabolism metabolism me...,Adult Aspartic Acid Brain Injuries Case-Contro...
7,22914093,Antibiotics in early life alter the murine col...,Antibiotics administered in low doses have bee...,drug effects physiology administration & dosag...,Adiposity Age Factors Animals Anti-Bacterial A...
8,23426830,Elevated serum anti-Müllerian hormone in adole...,Serum anti-Müllerian hormone (AMH) is linked t...,blood pathology ultrasonography blood ultrason...,Adolescent Anti-Mullerian Hormone Child Female...
9,24344399,Association of obesity-mediated insulin resist...,The hypothalamus is important in hunger and me...,blood analysis anatomy & histology blood blood...,Adolescent Adult Brain-Derived Neurotrophic Fa...


In [10]:
# Descriptor string of the first row, shown in full (the table view truncates it).
df['descriptor_terms'].iloc[0]

'Adult Asian Continental Ancestry Group Biomarkers Case-Control Studies China Gene Expression Profiling Humans Leukocytes, Mononuclear Male Metallurgy Middle Aged Nickel Occupational Diseases Occupational Exposure Oligonucleotide Array Sequence Analysis Prognosis RNA, Messenger Real-Time Polymerase Chain Reaction Reverse Transcriptase Polymerase Chain Reaction Risk Factors'

### Add labels to dataframe from Excel file

In [8]:
#read in Excel file-- didn't work
#labs = pd.read_excel('/home/sarahwie/Documents/pubmed-nlp-research/article_replication/data_files/TrainingSet.xlsx')

In [8]:
# Labels were exported to CSV by hand instead; load that file.
labels_csv_path = '/home/sarahwie/Documents/pubmed-nlp-research/article_replication/data_files/labels.csv'
labs = pd.read_csv(labels_csv_path)

In [9]:
# Plain-text dump of the pmid/label table.
print(labs)

         pmid  label
0    23195993     T0
1    25077433     T0
2    24107601     T0
3    24891029     T0
4    21691448     T0
5    22765222     T0
6    24401686     T0
7    22914093     T0
8    23426830     T0
9    24344399     T0
10   22761902     T0
11   23344457     T0
12   24444828     T0
13   22527432     T0
14   22311477     T0
15   22021878     T0
16   21527502     T0
17   23455502     T0
18   21350433     T0
19   23661059     T0
20   22869155     T0
21   23595360     T0
22   22554617     T0
23   24286176     T0
24   24520065     T0
25   21333643     T0
26   22891239     T0
27   23720714     T0
28   20160346     T0
29   22611242     T0
..        ...    ...
324  24556849  T3/T4
325  22496918  T3/T4
326  20603056  T3/T4
327  23311405  T3/T4
328  22871534  T3/T4
329  22613170  T3/T4
330  23104617  T3/T4
331  23428947  T3/T4
332  22291119  T3/T4
333  24321266  T3/T4
334  22966967  T3/T4
335  20451875     TX
336  24636091     TX
337  22290875     TX
338  24485987     TX
339  23129109

In [10]:
#append labels
# Look each label up by PMID instead of by row position, so the result does not
# depend on the XML and the CSV listing the articles in the same order.
# labs['pmid'] is cast to str because the PMIDs parsed from the XML are text.
label_by_pmid = dict(zip(labs['pmid'].astype(str), labs['label']))
unlabeled = [pmid for pmid in df['pmid'] if pmid not in label_by_pmid]
if unlabeled:
    # Fail loudly rather than leave rows without a label.
    raise ValueError('No label found for PMIDs: %s' % ', '.join(unlabeled))
df['label'] = df['pmid'].map(label_by_pmid)

In [11]:
df.head(10)  # preview only; avoids rendering every row of the frame

Unnamed: 0,pmid,title,abstract,qualifier_terms,descriptor_terms,label
0,23195993,Gene expression profiles in peripheral blood m...,Occupational exposure to nickel (Ni) is associ...,genetics metabolism epidemiology drug effects ...,Adult Asian Continental Ancestry Group Biomark...,T0
1,25077433,SOX2 is a cancer-specific regulator of tumour ...,Although the principles that balance stem cell...,genetics metabolism pathology metabolism patho...,"Animals Carcinoma, Squamous Cell Cell Line, Tu...",T0
2,24107601,Imaging and cerebrospinal fluid biomarkers in ...,The pathophysiological process of Alzheimer's ...,cerebrospinal fluid genetics radionuclide imag...,"Aged Aged, 80 and over Alzheimer Disease Amylo...",T0
3,24891029,Preliminary evidence of cognitive and brain ab...,To ascertain whether pediatric obesity without...,pathology physiopathology physiopathology,Adolescent Attention Brain Cognition Executive...,T0
4,21691448,Obese Adolescents with Type 2 Diabetes Mellitu...,The rates of type 2 diabetes (T2DM) continue t...,,,T0
5,22765222,Diversity of 5S rRNA genes within individual p...,We examined intragenomic variation of paralogo...,chemistry classification genetics chemistry ge...,"Bacteria DNA, Ribosomal Databases, Nucleic Aci...",T0
6,24401686,Myoinositol and glutamate complex neurometabol...,To obtain quantitative neurometabolite measure...,analogs & derivatives metabolism metabolism me...,Adult Aspartic Acid Brain Injuries Case-Contro...,T0
7,22914093,Antibiotics in early life alter the murine col...,Antibiotics administered in low doses have bee...,drug effects physiology administration & dosag...,Adiposity Age Factors Animals Anti-Bacterial A...,T0
8,23426830,Elevated serum anti-Müllerian hormone in adole...,Serum anti-Müllerian hormone (AMH) is linked t...,blood pathology ultrasonography blood ultrason...,Adolescent Anti-Mullerian Hormone Child Female...,T0
9,24344399,Association of obesity-mediated insulin resist...,The hypothalamus is important in hunger and me...,blood analysis anatomy & histology blood blood...,Adolescent Adult Brain-Derived Neurotrophic Fa...,T0


In [12]:
# Number of rows carrying each label, printed in the order TX, T0, T1/T2, T3/T4.
for tag in ('TX', 'T0', 'T1/T2', 'T3/T4'):
    print((df['label'] == tag).sum())
#counts match

30
162
67
94


In [13]:
# Persist the labelled frame; the context manager flushes and closes the file
# as soon as the dump finishes.
with open('/home/sarahwie/Documents/pubmed-nlp-research/article_replication/df.p', 'wb') as pickle_file:
    pickle.dump(df, pickle_file)

### Construct pandas dataframe only using abstracts (no titles or mesh terms)

In [15]:
#convert XML subset to pandas dataframe
def iter_docs(author):
    """Yield a dict for every ``PubmedArticle`` child of ``author``.

    The dict is seeded with a copy of ``author.attrib`` and then filled with
    ``title``, ``pmid``, ``abstract``, ``qualifier_terms`` and
    ``descriptor_terms``. Title and abstract are UTF-8 encoded when the parser
    returns a non-``str`` value; MeSH terms are joined with single spaces as
    parsed. Absent titles, abstracts, MeSH lists or term text yield empty
    strings rather than an exception, and the XML tree is left untouched.

    NOTE(review): the notebook section this cell sits under is titled
    "only using abstracts", yet title and MeSH columns are still emitted here;
    confirm whether that is intended.
    """
    def _text_as_utf8(node):
        # Whole text content of `node` (nested inline tags included), or ''.
        if node is None:
            return ''
        content = ''.join(node.itertext())
        return content if type(content) == str else content.encode('utf8')

    def _collect(mesh_headings, child_tag):
        # Gather the text of each `child_tag` element under every heading.
        found = []
        for heading in mesh_headings:
            for child in heading.findall(child_tag):
                if child.text is not None:
                    found.append(child.text)
        return ' '.join(found)

    author_attr = author.attrib
    for record in author.findall('PubmedArticle'):
        medline = record.find('MedlineCitation')
        article = medline.find('Article')
        doc_dict = author_attr.copy()

        # Encode the title explicitly here so the result does not depend on
        # any other cell having already rewritten the parsed tree.
        doc_dict['title'] = _text_as_utf8(article.find('ArticleTitle'))
        doc_dict['pmid'] = medline.find('PMID').text

        # All AbstractText sections, concatenated in order with no separator;
        # '' when the article has no Abstract element.
        abstract_node = article.find('Abstract')
        pieces = []
        if abstract_node is not None:
            for section in abstract_node.findall('AbstractText'):
                pieces.append(_text_as_utf8(section))
        doc_dict['abstract'] = ''.join(pieces)

        heading_list = medline.find('MeshHeadingList')
        if heading_list is None:
            mesh_headings = []
        else:
            mesh_headings = heading_list.findall('MeshHeading')
        doc_dict['qualifier_terms'] = _collect(mesh_headings, 'QualifierName')
        doc_dict['descriptor_terms'] = _collect(mesh_headings, 'DescriptorName')

        yield doc_dict
        
# Collect the records into a frame, then fix the column selection and order.
absonly_records = list(iter_docs(root))
df_absonly = pd.DataFrame(absonly_records)
df_absonly = df_absonly[['pmid', 'title', 'abstract', 'qualifier_terms', 'descriptor_terms']]

In [16]:
#append labels
# Match labels on PMID rather than on row position, so a difference in ordering
# between the XML and the CSV cannot attach a label to the wrong article.
# labs['pmid'] is cast to str because the PMIDs parsed from the XML are text.
absonly_label_by_pmid = dict(zip(labs['pmid'].astype(str), labs['label']))
absonly_unlabeled = [pmid for pmid in df_absonly['pmid'] if pmid not in absonly_label_by_pmid]
if absonly_unlabeled:
    # Fail loudly rather than leave rows without a label.
    raise ValueError('No label found for PMIDs: %s' % ', '.join(absonly_unlabeled))
df_absonly['label'] = df_absonly['pmid'].map(absonly_label_by_pmid)

In [17]:
# Persist the frame; the context manager flushes and closes the file as soon
# as the dump finishes.
with open('/home/sarahwie/Documents/pubmed-nlp-research/article_replication/df_absOnly.p', 'wb') as pickle_file:
    pickle.dump(df_absonly, pickle_file)

## Get External Validation set

In [2]:
#Decided to use the same training/test set and hold out a subset of it for validation.
#The actual external validation set consists of the records that were scored really highly by their model,
#so we don't think performance scores on those records can be compared.