In [5]:
from xml.etree.ElementTree import iterparse
import pandas as pd
from tqdm import tqdm

In [51]:
def parse_element(elem):
    """Flatten one UniProt ``<entry>`` element into a ``pandas.Series``.

    Parameters
    ----------
    elem : xml.etree.ElementTree.Element
        An element whose children use the ``http://uniprot.org/uniprot``
        namespace.

    Returns
    -------
    pandas.Series
        In insertion order: ``'accession'`` (text of the first ``accession``
        child), one item per distinct ``type`` among the direct
        ``dbReference`` children (value is the ``id``), the organism's
        cross-reference ``type`` -> ``id`` when present, ``'length'`` (int)
        and ``'sequence'`` (text of the ``sequence`` child).

    Raises
    ------
    AttributeError
        If the ``accession`` or ``sequence`` child is missing.
    """
    ns = '{http://uniprot.org/uniprot}'

    data = {}
    data['accession'] = elem.find(ns + 'accession').text

    # Only one id is kept per cross-reference type: when a type occurs several
    # times, later dbReference children overwrite earlier ones.
    data.update(
        {db.get('type'): db.get('id') for db in elem.findall(ns + 'dbReference')}
    )

    # The organism cross-reference is treated as optional so that an entry
    # lacking it produces a row with a missing value instead of aborting the
    # whole parse. Elements are tested with `is not None` because an Element
    # without children is falsy.
    organism = elem.find(ns + 'organism')
    organism_db = organism.find(ns + 'dbReference') if organism is not None else None
    if organism_db is not None:
        data[organism_db.get('type')] = organism_db.get('id')

    # Look the sequence node up once and read both attributes from it.
    sequence = elem.find(ns + 'sequence')
    data['length'] = int(sequence.get('length'))
    data['sequence'] = sequence.text

    return pd.Series(data)

def xml_gen(path='/projects/bpms/pstjohn/swissprot/uniprot_sprot.xml'):
    """Stream a UniProt XML file and yield one parsed record per ``<entry>``.

    Parameters
    ----------
    path : str, optional
        Location of the XML file. Defaults to the path this notebook was
        originally run against, so ``xml_gen()`` keeps working unchanged.

    Yields
    ------
    The value returned by ``parse_element`` for each completed ``entry``
    element, in document order.
    """
    entry_tag = '{http://uniprot.org/uniprot}entry'

    # Binary mode hands raw bytes to the XML parser, which then honours the
    # encoding declared in the document instead of the locale's default.
    with open(path, 'rb') as source:
        context = iterparse(source, events=('start', 'end'))

        # The first event is the 'start' of the document root; keep a handle
        # on it so already-processed children can be discarded below.
        _, root = next(context)

        for event, elem in context:
            if event == 'end' and elem.tag == entry_tag:
                yield parse_element(elem)
                # Drop everything accumulated under the root so the tree does
                # not grow with every entry.
                root.clear()
                
                
# NOTE(review): this only wraps the generator in a tqdm progress bar; nothing
# iterates it on this line, so no entries appear to be parsed here — looks
# like a leftover smoke test, confirm before removing.
tqdm(xml_gen())

In [53]:
# Consume xml_gen() under a tqdm progress bar and stack the per-entry Series
# it yields into a single DataFrame. The recorded run below processed
# 561,568 entries in roughly ten minutes.
swissprot_df = pd.DataFrame(tqdm(xml_gen()))

561568it [09:56, 941.03it/s] 


In [58]:
# Number of missing values in each column of the parsed table.
isnull = swissprot_df.isna().sum(axis=0)

accession            0
EMBL             12061
RefSeq           95397
KEGG             85618
GO               25127
InterPro         18801
Pfam             39112
NCBI Taxonomy        0
length               0
sequence             0
dtype: int64

In [67]:
# Keep only the columns with fewer than 100,000 missing values, then drop 'GO'
# explicitly (it passes the threshold but is excluded from the saved table).
# The mask is aligned to `isnull` by label, so no sorting is needed, and the
# axis is named via `columns=` because pandas 2.0 no longer accepts it
# positionally in `drop`.
df_to_save = swissprot_df.loc[
    :, swissprot_df.columns.isin(isnull[isnull < 100000].index)
].drop(columns=['GO'])

In [69]:
# Write the filtered table to a Parquet file at a hard-coded absolute path.
df_to_save.to_parquet('/projects/bpms/pstjohn/parsed_swissprot.parquet')