In [3]:
#imports: XML parsing, dataframes, pickling, and timing
import xml.etree.ElementTree as ET
import pandas as pd
import numpy as np
import pickle
import datetime

### Function for converting XML to pandas dataframe:

In [26]:
#convert XML subset to pandas dataframe
def _element_text(node):
    """Return the direct text of an XML element as a plain ``str``, or ''.

    A missing element (``None``) and an element without text both map to the
    empty string. The element itself is never modified.
    """
    # Compare against None explicitly: an Element with no children is falsy.
    if node is None or node.text is None:
        return ''
    value = node.text
    # Same conversion as before: anything that is not already ``str`` (e.g. a
    # Python 2 ``unicode`` object) is encoded to UTF-8.
    if type(value) != str:
        value = value.encode('utf8')
    return value


def iter_docs(author):
    """Yield one flat dict per ``PubmedArticle`` child of ``author``.

    Each dict starts as a copy of ``author.attrib`` and then gains four keys:

    * ``title`` - text of ``MedlineCitation/Article/ArticleTitle``.
    * ``abstract`` - texts of every ``AbstractText`` under the article's
      ``Abstract``, concatenated with no separator.
    * ``qualifier_terms`` - space-joined ``QualifierName`` texts from all
      ``MeshHeading`` elements.
    * ``descriptor_terms`` - space-joined ``DescriptorName`` texts from all
      ``MeshHeading`` elements.

    Any element that is absent, or present without text, contributes an empty
    string instead of raising, so every ``PubmedArticle`` produces a record.
    The parsed tree is left unmodified.
    """
    author_attr = author.attrib
    for record in author.findall('PubmedArticle'):
        doc_dict = author_attr.copy()

        # Resolve the nesting step by step so a missing level yields None
        # rather than an AttributeError on the next .find().
        citation = record.find('MedlineCitation')
        article = citation.find('Article') if citation is not None else None

        title_node = article.find('ArticleTitle') if article is not None else None
        doc_dict['title'] = _element_text(title_node)

        abst_root = article.find('Abstract') if article is not None else None
        if abst_root is not None:
            # Sections are glued together without a separator, as before.
            doc_dict['abstract'] = ''.join(
                _element_text(abst) for abst in abst_root.findall('AbstractText'))
        else:
            doc_dict['abstract'] = ''

        # MeSH terms are kept exactly as ElementTree returns them (no encoding
        # step); headings whose name element carries no text are skipped.
        descriptors = []
        qualifiers = []
        heading_list = citation.find('MeshHeadingList') if citation is not None else None
        if heading_list is not None:
            for heading in heading_list.findall('MeshHeading'):
                descriptors.extend(
                    desc.text for desc in heading.findall('DescriptorName') if desc.text)
                qualifiers.extend(
                    qual.text for qual in heading.findall('QualifierName') if qual.text)
        doc_dict['qualifier_terms'] = ' '.join(qualifiers)
        doc_dict['descriptor_terms'] = ' '.join(descriptors)

        yield doc_dict

### Read in T0 file

In [4]:
# Wall-clock timing (whole seconds) around the parse of the T0 export.
parse_started = datetime.datetime.now().replace(microsecond=0)

#using this file for our example (on local only)
xml_file = ET.parse('/mnt/mypartition/pubmed_result_t0.xml')
root = xml_file.getroot()

parse_finished = datetime.datetime.now().replace(microsecond=0)
# print(...) with a single argument gives the same output on Python 2 and 3.
print(parse_finished - parse_started)

0:02:12


### Convert T0 file to pandas dataframe, and assign labels.

In [27]:
# Build the T0 frame in one chain: materialise the records yielded by
# iter_docs, tag every row with the 'T0' label, then keep only the label and
# text columns in a fixed order.
df0 = (
    pd.DataFrame(list(iter_docs(root)))
    .assign(sentiment='T0')
    [['sentiment', 'title', 'abstract', 'qualifier_terms', 'descriptor_terms']]
)

In [30]:
# (rows, columns) of the T0 frame.
# Note: many records don't have any MeSH terms.
df0.shape

(35786, 5)

### Read in T1/T2 file

In [42]:
# Wall-clock timing (whole seconds) around the parse of the T1/T2 export.
parse_started = datetime.datetime.now().replace(microsecond=0)

#using this file for our example (on local only)
xml_file1 = ET.parse('/mnt/mypartition/pubmed_result_t1t2.xml')
root1 = xml_file1.getroot()

parse_finished = datetime.datetime.now().replace(microsecond=0)
# print(...) with a single argument gives the same output on Python 2 and 3.
print(parse_finished - parse_started)

0:02:06


### Convert T1/T2 file to pandas dataframe, and assign labels.

In [43]:
# Build the T1/T2 frame in one chain: materialise the records yielded by
# iter_docs, tag every row with the 'T1/T2' label, then keep only the label
# and text columns in a fixed order.
df1 = (
    pd.DataFrame(list(iter_docs(root1)))
    .assign(sentiment='T1/T2')
    [['sentiment', 'title', 'abstract', 'qualifier_terms', 'descriptor_terms']]
)

In [44]:
# (rows, columns) of the T1/T2 frame.
df1.shape

(26796, 5)

### Read in T3/T4 file part 1

In [35]:
# Wall-clock timing (whole seconds) around the parse of the first T3/T4 export.
parse_started = datetime.datetime.now().replace(microsecond=0)

#using this file for our example (on local only)
xml_file2 = ET.parse('/mnt/mypartition/pubmed_result_t3t4_firsthalf.xml')
root2 = xml_file2.getroot()

parse_finished = datetime.datetime.now().replace(microsecond=0)
# print(...) with a single argument gives the same output on Python 2 and 3.
print(parse_finished - parse_started)

0:00:12


### Convert T3/T4 file part 1 to pandas dataframe, and assign labels.

In [36]:
# Build the first T3/T4 frame in one chain: materialise the records yielded by
# iter_docs, tag every row with the 'T3/T4' label, then keep only the label
# and text columns in a fixed order.
df2 = (
    pd.DataFrame(list(iter_docs(root2)))
    .assign(sentiment='T3/T4')
    [['sentiment', 'title', 'abstract', 'qualifier_terms', 'descriptor_terms']]
)

In [37]:
# (rows, columns) of the first T3/T4 frame.
df2.shape

(6087, 5)

### Read in T3/T4 file part 2

In [39]:
# Wall-clock timing (whole seconds) around the parse of the second T3/T4 export.
parse_started = datetime.datetime.now().replace(microsecond=0)

#using this file for our example (on local only)
xml_file3 = ET.parse('/mnt/mypartition/pubmed_result_t3t4_secondhalf.xml')
root3 = xml_file3.getroot()

parse_finished = datetime.datetime.now().replace(microsecond=0)
# print(...) with a single argument gives the same output on Python 2 and 3.
print(parse_finished - parse_started)

0:02:04


### Convert T3/T4 file part 2 to pandas dataframe, and assign labels.

In [40]:
# Build the second T3/T4 frame in one chain: materialise the records yielded
# by iter_docs, tag every row with the 'T3/T4' label, then keep only the label
# and text columns in a fixed order.
df3 = (
    pd.DataFrame(list(iter_docs(root3)))
    .assign(sentiment='T3/T4')
    [['sentiment', 'title', 'abstract', 'qualifier_terms', 'descriptor_terms']]
)

In [41]:
# (rows, columns) of the second T3/T4 frame.
df3.shape

(28380, 5)

### Concatenate dataframes

In [45]:
# Stack the four per-file frames into one. ignore_index=True gives the result
# a fresh 0..n-1 row index instead of repeating each input frame's own index.
frames = [df0, df1, df2, df3]
df = pd.concat(frames, ignore_index=True)

In [46]:
# (rows, columns) of the combined frame. print(...) with a single argument
# gives the same output on Python 2 and 3.
print(df.shape)

(97049, 5)


In [48]:
# Sanity check: number of rows carrying each label in the combined frame.
# Summing the boolean mask counts the matching rows; int() keeps the printed
# value a plain integer, as len() produced before.
for label in ('T0', 'T1/T2', 'T3/T4'):
    print(int((df['sentiment'] == label).sum()))
#counts match

35786
26796
34467


In [49]:
# Persist the combined frame. The context manager closes (and therefore
# flushes) the file handle even if pickling raises; the bare open(...) used
# before left the handle to be closed whenever it happened to be collected.
with open('/mnt/mypartition/Desktop2/dfJournals_trans_categories.p', 'wb') as pickle_file:
    pickle.dump(df, pickle_file)