# Load taxonomy labels from uniprot

This notebook queries UniProt or UniRef for the genes listed in a FASTA file (or an a2m alignment file) and outputs the taxonomy of each gene.

1. Query UniRef or UniProt for each gene name (one at a time) to get the taxonomy ID.
   Save the results to the filesystem for caching.
2. Use NCBITaxa (from the ete3 package) to load the lineage for each taxonomy ID.

Requires `sequence_space_data.zip` from https://evcouplings.org/3Dseq

Requires: ete3 (provides NCBITaxa), requests, BioPython, and pandas

In [0]:
# IPython setup for the notebook session. Comments stay on their own lines
# because text after a magic command is treated as part of its arguments.
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Enable the autoreload extension; mode 2 reloads imported modules before each cell runs.
%load_ext autoreload
%autoreload 2
# Emit inline figures as PNG images.
%config InlineBackend.figure_format='png'

In [0]:
from ete3 import NCBITaxa
import requests
import xml.etree.ElementTree as ET
from Bio import SeqIO
import pandas as pd

In [0]:
# URL templates for single-record XML downloads. '{0}' is filled with the
# record ID via str.format() by the getTaxonomyIdFrom* functions.
UNIREF_URL = 'https://www.uniprot.org/uniref/{0}.xml'
UNIPROT_URL = 'https://www.uniprot.org/uniprot/{0}.xml'

# Functions to query genes from Uniref and Uniprot

In [0]:
def getAndSaveExternalRecord(url, output_file, timeout=60):
    '''
    Request an XML record from `url` and save it to `output_file` so that
    later lookups can be served from the filesystem.

    url:         full URL of the record to download.
    output_file: path the record is written to; missing parent directories
                 are created.
    timeout:     seconds to wait for the server before giving up, so a
                 stalled connection cannot block the whole run.

    Returns the record text.
    Raises IOError if the response status is not OK or the body is empty.
    Connection errors and timeouts raised by requests propagate unchanged
    (requests documents its exceptions as IOError subclasses).
    '''
    import os  # local import: only needed to prepare the cache directory

    response = requests.get(url, timeout=timeout)
    if not response.ok:
        raise IOError('{0} had invalid response: {1}'.format(url, response))
    if len(response.text) < 1:
        raise IOError('{0} had invalid response: {1}'.format(url, 'returned length is zero'))

    cache_dir = os.path.dirname(output_file)
    if cache_dir:
        os.makedirs(cache_dir, exist_ok=True)

    # Store the raw bytes so the cached file still matches the encoding
    # declared in its own XML header, whatever the local default encoding is.
    # The context manager closes the file even if the write fails.
    with open(output_file, 'wb') as f:
        f.write(response.content)
    return response.text

def getTaxonomyIdFromUniref(unirefid, output_dir, local_cache_only = False, recurse=True):
    '''
    Load the taxonomy ID for a uniref record from the local filesystem cache
    (if available) or from uniref directly.

    unirefid:         uniref record ID; the cache file is <output_dir>/<unirefid>.xml
    output_dir:       directory holding the cached XML records
    local_cache_only: if True, never query uniref; only the cache is used
    recurse:          if True, a cache miss triggers one download followed by
                      a single re-read of the cache file

    Returns the taxonomy id as a string, or None if the record was read but
    contains no element with type "NCBI taxonomy" and a "value" attribute.
    Raises IOError if the record cannot be read from the cache and either
    local_cache_only is set, the download fails (404 or other error), or the
    record is still unreadable after downloading.
    '''
    filename = '{0}/{1}.xml'.format(output_dir, unirefid)

    try:
        xmlroot = ET.parse(filename).getroot()
    except (OSError, ET.ParseError, ValueError) as err:
        # Cache file is missing or unreadable.
        if local_cache_only:
            raise IOError(
                '{0}.xml does not exist and local_cache_only set to True'.format(unirefid)
            ) from err
        if not recurse:
            raise IOError(
                'Unable to load uniref {0} after 2 attempts'.format(unirefid)
            ) from err
        getAndSaveExternalRecord(
            UNIREF_URL.format(unirefid), filename
        )
        # Only try once more: without recurse=False a download that cannot be
        # parsed would be re-fetched without bound.
        return getTaxonomyIdFromUniref(unirefid, output_dir, recurse=False)

    for element in xmlroot.iter():
        attributes = element.attrib
        if attributes.get('type') == 'NCBI taxonomy' and 'value' in attributes:
            return attributes['value']
    return None
        
def getTaxonomyIdFromUniprot(uniprotid, output_dir, local_cache_only = False, recurse=True):
    '''
    Load the taxonomy ID for a uniprot record from the local filesystem cache
    (if available) or from uniprot directly.

    uniprotid:        uniprot record ID; the cache file is <output_dir>/<uniprotid>.xml
    output_dir:       directory holding the cached XML records
    local_cache_only: if True, never query uniprot; only the cache is used
    recurse:          if True, a cache miss triggers one download followed by
                      a single re-read of the cache file

    Returns the taxonomy id as a string, or None if the record was read but
    contains no element with type "NCBI Taxonomy" and an "id" attribute.
    Raises IOError if the record cannot be read from the cache and either
    local_cache_only is set, the download fails (404 or other error), or the
    record is still unreadable after downloading.
    '''
    filename = '{0}/{1}.xml'.format(output_dir, uniprotid)

    try:
        xmlroot = ET.parse(filename).getroot()
    except (OSError, ET.ParseError, ValueError) as err:
        # Cache file is missing or unreadable.
        if local_cache_only:
            raise IOError(
                '{0}.xml does not exist and local_cache_only set to True'.format(uniprotid)
            ) from err
        if not recurse:
            raise IOError(
                'Unable to load uniprot {0} after 2 attempts'.format(uniprotid)
            ) from err
        getAndSaveExternalRecord(
            UNIPROT_URL.format(uniprotid), filename
        )
        return getTaxonomyIdFromUniprot(uniprotid, output_dir, recurse=False) #only try once

    for element in xmlroot.iter():
        attributes = element.attrib
        if attributes.get('type') == 'NCBI Taxonomy' and 'id' in attributes:
            return attributes['id']
    return None
    
        
def getSequencenameTaxonomyidHM(alignment_filename, output_dir, local_cache_only = False):
    '''
    Look up the taxonomy id of every sequence in a fasta-formatted alignment.

    Records whose name contains "up|" are resolved through uniprot, records
    whose name contains "UniRef" through uniref. Records of any other kind,
    records whose lookup raises IOError, and records without a taxonomy id
    are left out of the result; each such IOError is printed with a running
    count.

    alignment_filename: path of the alignment, parsed as fasta
    output_dir:         directory used as the XML record cache
    local_cache_only:   If set to True, the remote services are not queried
                        and only the local file system is used

    returns: a DataFrame with the columns seq_record, seq_name, sequence
             and tax_id (one row per resolved sequence)
    '''
    columns = {
        'seq_record': [],
        'seq_name': [],
        'sequence': [],
        'tax_id': []
    }
    failed_lookups = 0

    records = list(SeqIO.parse(alignment_filename, 'fasta'))
    for record in records:
        name = record.name

        # Choose the lookup function and extract the record ID from the name.
        if 'up|' in name:
            lookup = getTaxonomyIdFromUniprot
            record_id = name.split('|')[1]
        elif 'UniRef' in name:
            # "ur|"-prefixed names carry the uniref ID in the third field;
            # anything after "/" is stripped from the ID.
            cluster_name = name.split('|')[2] if 'ur|' in name else name
            lookup = getTaxonomyIdFromUniref
            record_id = cluster_name.split('/')[0]
        else:
            print('warning: unable to determine sequence record type: {0}'.format(name))
            continue

        try:
            taxon = lookup(record_id, output_dir, local_cache_only=local_cache_only)
        except IOError as err:
            failed_lookups += 1
            print('{0}: {1}'.format(failed_lookups, err))
            continue

        if not taxon:
            continue

        columns['seq_record'].append(record)
        columns['seq_name'].append(name)
        columns['sequence'].append(str(record.seq))
        columns['tax_id'].append(taxon)

    return pd.DataFrame.from_dict(columns)




# Query taxonomy from NCBITaxa

In [0]:
# Module-level NCBITaxa handle; loadTaxonomyLineage reads this global.
# NOTE(review): constructing it presumably loads (or first builds) ete3's local
# copy of the NCBI taxonomy database - confirm before running offline.
ncbi = NCBITaxa()

In [0]:
def loadTaxonomyLineage(tax_ids):
    '''
    Look up the lineage of each taxonomy id with the module-level `ncbi`
    handle and return the names found at a fixed set of ranks.

    tax_ids: iterable of taxonomy ids (strings or integers)

    returns: a DataFrame with one column per rank (superkingdom, phylum,
             genus, class, subphylum, family, order, species) and exactly
             one row per input id, in input order. A rank missing from a
             lineage is None. An id that cannot be converted or looked up
             prints a warning and yields a row of None values, so the
             result stays row-aligned with the input and can be
             concatenated column-wise with the table the ids came from.
    '''
    ranks = (
        'superkingdom',
        'phylum',
        'genus',
        'class',
        'subphylum',
        'family',
        'order',
        'species',
    )
    rank_sequencevalue_hm = {rank: [] for rank in ranks} #will become part of the dataframe

    for tax_id in tax_ids:
        rank_name_dict = {}
        try:
            # Convert before touching ncbi so malformed ids fail without a lookup.
            numeric_tax_id = int(tax_id)
            lineage = ncbi.get_lineage(numeric_tax_id)
            lineageid_name_dict = ncbi.get_taxid_translator(lineage) #dict: key=lineageid, value=name
            lineageid_rank_dict = ncbi.get_rank(lineage)
            # If several lineage entries share a rank, the last one wins.
            rank_name_dict = {
                rank: lineageid_name_dict.get(lineageid)
                for lineageid, rank in lineageid_rank_dict.items()
            }
        except (ValueError, TypeError) as e:
            print('Warning: {0}'.format(str(e)))

        # Always emit a row, even after a failed lookup; skipping it would
        # shift every later row up by one.
        for rank in ranks:
            rank_sequencevalue_hm[rank].append(rank_name_dict.get(rank))

    return pd.DataFrame.from_dict(rank_sequencevalue_hm)

# Load and Save AAC6 Lineages

In [0]:
# Inputs and outputs for the AAC6 run.
ALIGNMENT_DIR = 'data.AAC6'
SEQUENCE_ALIGNMENT = ALIGNMENT_DIR+'/44883374318b63406a7415d2f4d4cfc1_b0.4.a2m'
# Directory where the downloaded XML records are cached (one <id>.xml file per record).
CACHE_DIR = 'UNIREF_RECORDS_AAC6'
# NOTE: the output cell writes this file tab-separated, despite the .csv extension.
OUTPUT_FILENAME = ALIGNMENT_DIR+'/AAC6_NATURAL_TAXONOMY.csv'

# Load taxonomy IDs; local_cache_only=False allows records missing from
# CACHE_DIR to be downloaded.
#  Note:  loaded 3740 uniref records and saved them to the file system.
#         unable to recover 253 of the IDs in the AAC6 natural alignment (404 response)
#         2 records were manually loaded due to retry errors (connection issues?)
aac6_df = getSequencenameTaxonomyidHM(SEQUENCE_ALIGNMENT, CACHE_DIR, local_cache_only=False)

In [0]:
#output to csv
pd.concat([aac6_df, loadTaxonomyLineage(aac6_df.tax_id)], 
          axis=1, 
          sort=False).drop('seq_record', axis=1).to_csv(
    
    OUTPUT_FILENAME, sep='\t', index=False
)

# Load and Save PSE1 Lineages

In [0]:
# Inputs and outputs for the PSE1 run. These reassign the same global names
# used by the AAC6 cells.
ALIGNMENT_DIR = 'data.PSE1'
# Directory where the downloaded XML records are cached (one <id>.xml file per record).
CACHE_DIR = 'UNIREF_UNIPROT_RECORDS_PSE1'
# NOTE: the output cell writes this file tab-separated, despite the .csv extension.
OUTPUT_FILENAME = ALIGNMENT_DIR+'/PSE1_NATURAL_TAXONOMY.csv'
SEQUENCE_ALIGNMENT = ALIGNMENT_DIR+'/7fa1c5691376beab198788a726917d48_b0.4.a2m'

In [0]:
# Load taxonomy IDs; local_cache_only=False allows records missing from
# CACHE_DIR to be downloaded.
#  Note:  loaded 2208 uniref or uniprot records and saved them to the file system.
#         unable to recover 408 of the IDs in the PSE1 natural alignment (404 response
#         with uniref or blank response with uniprot)
pse1_df = getSequencenameTaxonomyidHM(SEQUENCE_ALIGNMENT, CACHE_DIR, local_cache_only=False)



In [0]:
#output to csv
pd.concat([pse1_df, loadTaxonomyLineage(pse1_df.tax_id)], 
          axis=1, 
          sort=False).drop('seq_record', axis=1).to_csv(
    
    OUTPUT_FILENAME, sep='\t', index=False
)