In [2]:
from xml.etree.ElementTree import iterparse
import pandas as pd
from tqdm import tqdm

In [23]:
def parse_element(elem):
    """Flatten one element of the UniProt XML namespace into a ``pd.Series``.

    ``xml_gen`` calls this with each completed ``entry`` element.

    Fields are added in this order, which fixes the order of the Series index:

    * ``accession`` -- text of the first ``accession`` child.
    * one key per direct-child ``dbReference`` ``type`` attribute, mapped to
      its ``id``; when a type occurs several times the last ``id`` wins.
    * the ``type``/``id`` of the first ``dbReference`` inside ``organism``.
    * ``length`` -- the ``length`` attribute of ``sequence`` as an ``int``.
    * ``sequence`` -- the text of ``sequence``.
    * ``subcellularLocalization`` -- text of the first ``location`` child of
      every ``subcellularLocation`` descendant, joined with newlines (an
      empty string when there are none).

    Raises ``AttributeError`` if a required child element is absent.
    """
    record = {'accession': elem.find('{http://uniprot.org/uniprot}accession').text}

    # Only direct children are visited here; later duplicates overwrite earlier ones.
    for xref in elem.findall('{http://uniprot.org/uniprot}dbReference'):
        record[xref.get('type')] = xref.get('id')

    organism = elem.find('{http://uniprot.org/uniprot}organism')
    taxon_ref = organism.find('{http://uniprot.org/uniprot}dbReference')
    record[taxon_ref.get('type')] = taxon_ref.get('id')

    seq_node = elem.find('{http://uniprot.org/uniprot}sequence')
    record['length'] = int(seq_node.get('length'))
    record['sequence'] = seq_node.text

    # Walk the whole subtree: subcellularLocation nodes are matched at any depth.
    locations = []
    for node in elem.iter('{http://uniprot.org/uniprot}subcellularLocation'):
        locations.append(node.find('{http://uniprot.org/uniprot}location').text)
    record['subcellularLocalization'] = '\n'.join(locations)

    return pd.Series(record)

def xml_gen(path='/projects/bpms/pstjohn/swissprot/uniprot_sprot.xml'):
    """Stream the XML file at ``path`` and yield one parsed record per entry.

    The file is parsed incrementally with ``iterparse``; each time an
    ``entry`` element in the UniProt namespace is complete it is passed to
    ``parse_element`` and the resulting ``pd.Series`` is yielded.

    Parameters
    ----------
    path : str
        Location of the XML file. Defaults to the path this notebook was
        originally run against.

    Yields
    ------
    pd.Series
        The value returned by ``parse_element`` for each entry.
    """
    # Binary mode lets the XML parser decode the bytes using the encoding
    # declared in the document instead of the locale's default text encoding.
    with open(path, 'rb') as source:
        context = iterparse(source, events=('start', 'end'))
        # The first event is the 'start' of the document root; keep a handle
        # on it so already-processed children can be released below.
        _, root = next(context)

        for event, elem in context:
            if event == 'end' and elem.tag == '{http://uniprot.org/uniprot}entry':
                yield parse_element(elem)
                # Runs when the consumer asks for the next record: discard
                # everything attached to the root so memory stays bounded.
                root.clear()

In [24]:
# Collect every Series yielded by xml_gen() into one DataFrame (one row per
# yielded record); tqdm wraps the generator to report progress.
swissprot_df = pd.DataFrame(data=tqdm(iterable=xml_gen()))

561568it [10:32, 888.01it/s] 


In [25]:
# Number of missing values in each column of the parsed table.
isnull = swissprot_df.isna().sum(axis=0)

In [27]:
# Keep only the columns with fewer than 100000 missing values, then remove
# the 'GO' column. `columns=` is used because passing the axis positionally
# to DataFrame.drop is rejected by pandas >= 2.0.
df_to_save = (
    swissprot_df
    .loc[:, swissprot_df.columns.isin(isnull[isnull < 100000].index)]
    .drop(columns=['GO'])
)

In [28]:
# Preview the first five rows of the filtered table.
df_to_save.head(n=5)

Unnamed: 0,accession,EMBL,RefSeq,KEGG,InterPro,Pfam,NCBI Taxonomy,length,sequence,subcellularLocalization
0,Q6GZX4,AY548484,YP_031579.1,vg:2947773,IPR007031,PF04947,654924,256,MAFSAEDVLKEYDRRRRMEALLLSLYYPNDRKLLDYKEWSPPRVQV...,
1,Q6GZX3,AY548484,YP_031580.1,vg:2947774,IPR004251,PF03003,654924,320,MSIIGATRLQNDKSDTYSAGPCYAGGCSAFTPRGTCGKDWDLGEQT...,Host membrane
2,Q197F8,DQ643392,YP_654574.1,vg:4156251,,,345201,458,MASNTVSAQGGSNRPVRDFSNIQDVAQFLLFDPIWNEQPGSIVPWK...,
3,Q197F7,DQ643392,YP_654575.1,vg:4156252,,,345201,156,MYQAINPCPQSWYGSPQLEREIVCKMSGAPHYPNYYPVHPNALGGA...,
4,Q6GZX2,AY548484,YP_031581.1,vg:2947775,,,654924,438,MARPLLGKTSSVRRRLESLSACSIFFFLRKFCQKMASLVFLNSPVY...,


In [34]:
# Write the filtered table to a Parquet file at this path.
df_to_save.to_parquet('/projects/bpms/pstjohn/swissprot/parsed_swissprot.parquet')