In [55]:
import os, re, requests, json
import numpy as np
import pandas as pd
from urllib.request import urlopen
from time import sleep

In [2]:
def get_url(url, **kwargs):
    '''
    Obatin a response from a given url
    '''
    response = requests.get(url, **kwargs);

    if not response.ok:
        print(response.text)
        response.raise_for_status()
        sys.exit()

    return response

# Step 1: Scan raw data, judge AH or not, obtain organism and gene names, etc

In [None]:
# uniprot API URL
WEBSITE_API = "https://rest.uniprot.org/uniprotkb/"

In [2]:
# raw data import
files = os.listdir('./SourceData/Results from the paper/')
print(len(files))

11761


In [47]:
# prepare a dataframe and array for the ID not found in uniprot
df = pd.DataFrame()

# scan raw data files, judge AH_or_not, and get the gene info from uniprot based on the ID called "entry"
for i, file in enumerate(files):
    
    # check the file format
    if file.endswith('result'):
        
        # obtain uniprot ID called entry
        entry = file.replace('.result', '')
        
        # open the file
        with open('./SourceData/Results from the paper/' + file) as f:
            lines = f.readlines()
    
        # get the amino-acid sequence and prediction
        aa_sequence = lines[1]
        prediction = lines[3]

        # judge if the protein predicted to contain AH or not
        if '1' in prediction:
            AH_or_not = 'AH'
        else:
            AH_or_not = 'Non-AH'

        # get the result of query
        try:
            r = get_url(f"{WEBSITE_API}/search?query=(accession:{entry})&fields=organism_name,gene_primary,protein_name,cc_subcellular_location")
            result = r.json()['results'][0]
        except requests.exceptions.ConnectionError:
            r.status_code = "Connection refused"
            break
        
        # extract wanted texts from the result
        ## organism name
        ### put try-except in case the gene info is obsolete
        try:
            organism_name = result['organism']['scientificName']
        except:
            organism_name = 'Unknown'
        
        ## gene name
        ### put try-except in case it lacks geneName
        try: 
            gene_name = result['genes'][0]['geneName']['value']
        except: gene_name = 'Unknown'

        ## protein name
        try: 
            protein_name = result['proteinDescription']['recommendedName']['fullName']['value'] 
        except: protein_name = 'Unknown'

        ## subcellular locations
        subcell_loc = list()
        try:
            for v in result['comments'][0]['subcellularLocations']:
                subcell_loc.append(v['location']['value'])
                subcell_loc = ', '.join(subcell_loc)
        except:
            subcell_loc = 'Unknown'
        
        
        # put the values to the dataframe
        df.loc[i, 'Entry_original'] = entry
        df.loc[i, 'Organism'] = organism_name
        df.loc[i, 'Gene_name'] = gene_name
        df.loc[i, 'Protein_name'] = protein_name
        df.loc[i, 'AH_or_Not'] = AH_or_not
        df.loc[i, 'AA_sequence'] = aa_sequence
        df.loc[i, 'Prediction'] = prediction
        df.loc[i, 'SubCell_Uniprot'] = subcell_loc
        
        # log every 100 genes
        if i % 100 == 0: print(i, entry, organism_name, protein_name)
    
    # take a break, go next
    sleep(1)
            

0 Q9CAN8 Arabidopsis thaliana Lysophospholipid acyltransferase 2
100 P53389 Saccharomyces cerevisiae (strain ATCC 204508 / S288c) Protein HOL1
200 Q12155 Saccharomyces cerevisiae (strain ATCC 204508 / S288c) Uncharacterized membrane protein YLR050C
300 O26830 Methanothermobacter thermautotrophicus (strain ATCC 29096 / DSM 1053 / JCM 10044 / NBRC 100330 / Delta H) Putative phospho-N-acetylmuramoyl-pentapeptide-transferase
400 Q51MB1 Magnaporthe oryzae (strain 70-15 / ATCC MYA-4617 / FGSC 8958) pH-response regulator protein palI/RIM9
500 Q3E8Z8 Arabidopsis thaliana Putative pectinesterase/pectinesterase inhibitor 28
600 Q5NCP0 Mus musculus E3 ubiquitin-protein ligase RNF43
700 Q22549 Caenorhabditis elegans Innexin-10
800 O07888 Treponema pallidum (strain Nichols) Flagellar protein FliL
900 Q7Z695 Homo sapiens Uncharacterized aarF domain-containing protein kinase 2
1000 Q674R7 Homo sapiens Autophagy-related protein 9B
1100 O43085 Schizosaccharomyces pombe (strain 972 / ATCC 24843) DSC E3 

In [48]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11759 entries, 0 to 11760
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Entry_original   11759 non-null  object
 1   Organism         11759 non-null  object
 2   Gene_name        11759 non-null  object
 3   Protein_name     11759 non-null  object
 4   AH_or_Not        11759 non-null  object
 5   AA_sequence      11759 non-null  object
 6   Prediction       11759 non-null  object
 7   SubCell_Uniprot  11759 non-null  object
dtypes: object(8)
memory usage: 1.1+ MB


In [49]:
# export
df.to_csv('./IntermediateProducts/Results_step_1.csv', index=False)

# Step 2: Sort genes of metazoans

In [8]:
WEBSITE_API = 'https://rest.uniprot.org/taxonomy/'

In [3]:
# Import the df from step 1
df = pd.read_csv('./IntermediateProducts/Results_step_1.csv')

In [150]:
# create organism list
organism_list = df['Organism'].unique().tolist()
print('Number of organism species in the data is ', len(organism_list))

Number of organism species in the data is  1521


In [134]:
# a list for storage
lineage_list = list()

# regex for extracing organism name WITHOUT items in parenthesis
regex = re.compile(r'([^()]+)(\(.+\))?')

# scan organism list and get lineage from Uniprot Taxonomy
for i, organism in enumerate(organism_list):
    # extract organism name WITHOUT items in parenthesis such as strain name
    mo = regex.search(organism)
    if mo is None:
        lineage_list.append('Unknown')
        continue
    else: 
        organism = mo.group(1)
    
    # get response that contains lineage
    try:
        r = get_url(f'{WEBSITE_API}/search?query=(scientific:"{organism}")&fields=lineage')
        result = r.json()
    except requests.exceptions.ConnectionError:
        r.status_code = "Connection refused"
        break
    
    # extract lineage from the result
    lineage = list()
    lineage_full = ''
    try:
        for l in result['results'][0]['lineage']:
            name = l['scientificName']
            lineage.append(name)
        lineage_full = ', '.join(lineage)
    except:
        lineage_full = 'NotFound' # in case the result is empty
        
    # add to the found lineage to the list
    lineage_list.append(lineage_full)
    
    # log every 100
    if i % 100 == 0: print(i, organism, lineage[-2:])
    
    # take a break, go next
    sleep(1)

0 Arabidopsis thaliana ['Eukaryota', 'cellular organisms']
100 Salmonella phage P22 ['Duplodnaviria', 'Viruses']
200 Acropora millepora ['Eukaryota', 'cellular organisms']
300 Streptococcus pneumoniae  ['Bacteria', 'cellular organisms']
400 Acidianus bottle-shaped virus  []
500 Petunia integrifolia ['Eukaryota', 'cellular organisms']
600 Haloquadratum walsbyi  ['Archaea', 'cellular organisms']
700 Xanthomonas campestris pv. campestris  ['Bacteria', 'cellular organisms']
800 Friend murine leukemia virus  ['Riboviria', 'Viruses']
900 Oceanobacillus iheyensis  ['Bacteria', 'cellular organisms']
1000 Artemia salina ['Eukaryota', 'cellular organisms']
1100 Citrifermentans bemidjiense  ['Bacteria', 'cellular organisms']
1200 Quaranfil virus  ['Riboviria', 'Viruses']
1300 Planococcus maritimus ['Bacteria', 'cellular organisms']
1400 Macaca radiata ['Eukaryota', 'cellular organisms']
1500 Dechloromonas aromatica  ['Bacteria', 'cellular organisms']


In [141]:
# Add to the df_org
df_org = pd.DataFrame(organism_list, columns=['Organism'])
df_org['Lineage'] = lineage_list

# export
df_org.to_csv('./IntermediateProducts/Organisms_and_Lineage.csv', index=False)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 122 entries, 4 to 1518
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Organism  122 non-null    object
 1   Lineage   122 non-null    object
dtypes: object(2)
memory usage: 2.9+ KB
None


In [152]:
# Select metazoans
df_org_metazoa = df_org[df_org['Lineage'].str.contains('Metazoa')]
print('Number of metazoan species is ', len(df_org_metazoa))

Number of metazoan species is  122


In [165]:
# How many are eukaryotes, bacteria, archea, and virus
n_eukaryote = len(df_org[df_org['Lineage'].str.contains('Eukaryot')])
n_bacteria = len(df_org[df_org['Lineage'].str.contains('Bacteria')])
n_archaea = len(df_org[df_org['Lineage'].str.contains('Archaea')])
n_virus = len(df_org[df_org['Lineage'].str.contains('Virus')])
n_notFound = len(df_org) - np.sum([n_eukaryote, n_bacteria, n_archaea, n_virus])
print('Number of eukaryotic species is ', n_eukaryote)
print('Number of bacteria species is ', n_bacteria)
print('Number of archea species is ', n_archaea)
print('Number of virus species is ', n_virus)
print('Number of species not found in database is ', n_notFound)

Number of eukaryotic species is  431
Number of bacteria species is  732
Number of archea species is  61
Number of virus species is  281
Number of species not found in database is  16


In [153]:
# Merge df_org_metazoa with the main df
# by doing so, genes from metazoans are sorted
df_metazoanGenes = df.merge(df_org_metazoa, how='inner', on='Organism')
print(df_metazoanGenes.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2733 entries, 0 to 2732
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Entry_original   2733 non-null   object
 1   Organism         2733 non-null   object
 2   Gene_name        2733 non-null   object
 3   Protein_name     2733 non-null   object
 4   AH_or_Not        2733 non-null   object
 5   AA_sequence      2733 non-null   object
 6   Prediction       2733 non-null   object
 7   SubCell_Uniprot  2733 non-null   object
 8   Lineage          2733 non-null   object
dtypes: object(9)
memory usage: 213.5+ KB
None


In [166]:
# remove Lineage column
df_metazoanGenes = df_metazoanGenes.drop(['Lineage'], axis=1)

# export
df_metazoanGenes.to_csv('./IntermediateProducts/Results_step_2.csv', index=False)

# Step 3: Get human and mouse entry

In [167]:
# import df from step 3
df_metazoanGenes = pd.read_csv('./IntermediateProducts/Results_step_2.csv')

In [168]:
df_metazoanGenes.head()

Unnamed: 0,Entry_original,Organism,Gene_name,Protein_name,AH_or_Not,AA_sequence,Prediction,SubCell_Uniprot
0,Q16928,Anopheles albimanus,W,Protein white,Non-AH,MTINTDDQYADGESKTTISSNRRYSTSSFQDQSMEDDGINATLTND...,0000000000000000000000000000000000000000000000...,Membrane
1,Q01151,Homo sapiens,CD83,CD83 antigen,Non-AH,MSRGLQLLLLSCAYSLAPATPEVKVACSEDVDLPCTAPWDPQVPYT...,0000000000000000000000000000000000000000000000...,Membrane
2,Q0P6D2,Homo sapiens,DIPK1C,Divergent protein kinase domain 1C,AH,MARAAGARGPAGWCRRRGRCGRGTLLAFAAWTAGWVLAAALLLRAH...,0000000000000000000000000000000000000000000000...,Endoplasmic reticulum membrane
3,Q06136,Homo sapiens,KDSR,3-ketodihydrosphingosine reductase,AH,MLLLAAAFLVAFVLLLYMVSPLISPKPLALPGAHVVVTGGSSGIGK...,0000000000000000000000000000000000000000000000...,Endoplasmic reticulum membrane
4,Q3C1V0,Homo sapiens,MS4A18,Membrane-spanning 4-domains subfamily A member 18,AH,MTEQVIGANSVPGIIAPDNVHVIQPSNPVASGNHLQPSEVTTYPIS...,0000000000000000000000000000000000000000000000...,Membrane


In [206]:
WEBSITE_API = "https://rest.uniprot.org/uniprotkb/"

organism_id_list = {'Homo sapiens': '9606', 'Mus musculus': '10090'}

In [277]:
def entry_convert(df, organism):
    """
    For each entry in a given df, 
    if the entry is from the specified organism (human or mouse),
    just reuse the entry
    if not,
    convert the entry into the entry from the specified organism
    """
    # For final output
    converted_id_list = list()
    
    # Organism of interest either human or mouse
    organism_id = organism_id_list[organism]
    
    # for counting how many genes were not found
    non_match = 0
    not_found = 0
    
    # scan entrys in the df
    for i in range(len(df)):
        gene = df.iloc[i, 2]
        
        # in case the gene name is unknown, it matches a gene Q9Y6S7, whose gene name is 'Unknown'
        # thus below precludes this gene
        if gene == 'Unknown': #
            entry_converted = 'Not_found'
            not_found += 1
            # add to the final output list
            converted_id_list.append(entry_converted)
            continue
        
        # judge if the entry comes from human or mouse
        if df.iloc[i, 1] == organism:
            entry_converted = df.iloc[i, 0] # reuse
        else:
            
            # obtain the human or mouse entry from uniprot
            try:
                r = get_url(f'{WEBSITE_API}/search?query=(gene:{gene})&(organism_id:{organism_id})&(reviewed:true)&fields=accession,gene_names')
                result = r.json()['results'][0]
                
                ## human or mouse entry
                entry_converted = result['primaryAccession']
                ## check the gene names are identical
                gene_obtained = result['genes'][0]['geneName']['value']
                if gene.lower() != gene_obtained.lower():
                    entry_converted = 'Not_matched'
                    non_match += 1
            except:
                entry_converted = 'Not_found'
                not_found += 1
            
            # take a break
            sleep(1)
        
        # add to the final output list
        converted_id_list.append(entry_converted)
        
        # log every 100
        if i % 100 == 0: print(i, gene, entry_converted)

    
    return converted_id_list, non_match, not_found

In [276]:
# get human entry
converted_entry_Hs, non_match, not_found = entry_convert(df_metazoanGenes, 'Homo sapiens')

print('Total number of genes that got converted to Hs: ', len(df_metazoanGenes) - non_match - not_found)
print('Number of genes for which similar gene name was found in Hs: ', non_match)
print('Number of genes for which Hs homolog was not found: ', not_found)

0 W Not_matched
100 SEMA6B Q9H3T3
200 LRP12 Q9Y561
300 DCBLD2 Q96PD2
400 SMIM41 A0A2R8YCJ5
500 PREB Q9HCU5
600 TMEM61 Q8N0U2
700 BRICD5 Q6PL45
800 HHLA2 Q9UM44
900 Nrros Q86YC3
1000 Zdhhc19 Q8WVZ1
1100 Tomm70 O94826
1200 Ms4a13 Q5J8X5
1300 PLPPR2 Q96GM1
1400 TMEM54 Q969K7
1500 clc-5 Not_matched
1600 nra-2 A5JYX8
1900 Klri2 Q5DT37
2000 TMEM196 Q5HYL7
2100 ATP1B4 Q9UN42
2200 frc Q95YI5
2300 DppIII Q9VHR8
2400 slc39a10 Q6P5F6
2500 PAPI Q9VQ91
{"url":"http://rest.uniprot.org/uniprotkb/search","messages":["query parameter has an invalid syntax"]}
{"url":"http://rest.uniprot.org/uniprotkb/search","messages":["query parameter has an invalid syntax"]}
2600 chrnd Q07001
{"url":"http://rest.uniprot.org/uniprotkb/search","messages":["query parameter has an invalid syntax"]}
{"url":"http://rest.uniprot.org/uniprotkb/search","messages":["query parameter has an invalid syntax"]}
Total number of genes that got converted to Hs:  2616
Number of genes for which similar gene name was found in Hs:  113
Nu

In [278]:
# Add the result to Entry_Hs column
df_metazoanGenes['Entry_Hs'] = converted_entry_Hs

In [280]:
# get mouse entry
converted_entry_Mm, non_match, not_found = entry_convert(df_metazoanGenes, 'Mus musculus')

print('Total number of genes that got converted to Mm: ', len(df_metazoanGenes) - non_match - not_found)
print('Number of genes for which similar gene name was found in Hs: ', non_match)
print('Number of genes for which Hs homolog was not found: ', not_found)

0 W Not_matched
100 SEMA6B Q9H3T3
200 LRP12 Q9Y561
300 DCBLD2 Q96PD2
400 SMIM41 A0A2R8YCJ5
500 PREB Q9HCU5
600 TMEM61 Q8N0U2
700 BRICD5 Q6PL45
800 HHLA2 Q9UM44
900 Nrros Q8BMT4
1000 Zdhhc19 Q810M5
1100 Tomm70 Q9CZW5
1200 Ms4a13 Q5FWC3
1300 PLPPR2 Q96GM1
1400 TMEM54 Q969K7
1500 clc-5 Not_matched
1600 nra-2 A5JYX8
1900 Klri2 Q5DT37
2000 TMEM196 Q5HYL7
2100 ATP1B4 Q9UN42
2200 frc Q95YI5
2300 DppIII Q9VHR8
2400 slc39a10 Q6P5F6
2500 PAPI Q9VQ91
{"url":"http://rest.uniprot.org/uniprotkb/search","messages":["query parameter has an invalid syntax"]}
{"url":"http://rest.uniprot.org/uniprotkb/search","messages":["query parameter has an invalid syntax"]}
2600 chrnd Q07001
{"url":"http://rest.uniprot.org/uniprotkb/search","messages":["query parameter has an invalid syntax"]}
{"url":"http://rest.uniprot.org/uniprotkb/search","messages":["query parameter has an invalid syntax"]}
Total number of genes that got converted to Mm:  2432
Number of genes for which similar gene name was found in Hs:  115
Nu

In [281]:
# Add the result to Entry_Mm column
df_metazoanGenes['Entry_Mm'] = converted_entry_Mm

In [293]:
organism_id = organism_id_list['Mus musculus']
gene = 'DIPK1C'
# r = get_url('https://www.uniprot.org/uniprotkb?format=json&query=(gene:DIPK1C)%20AND%20(taxonomy_id:10090)%20AND%20(reviewed:true)')
r = get_url(f'{WEBSITE_API}/search?query=gene:{gene}+AND+organism_id:{organism_id}+AND+reviewed:true')
result = r.json()
result

{'results': [{'entryType': 'UniProtKB reviewed (Swiss-Prot)',
   'primaryAccession': 'Q8BQT2',
   'uniProtkbId': 'DIK1C_MOUSE',
   'entryAudit': {'firstPublicDate': '2007-05-15',
    'lastAnnotationUpdateDate': '2022-05-25',
    'lastSequenceUpdateDate': '2010-05-18',
    'entryVersion': 129,
    'sequenceVersion': 2},
   'annotationScore': 51.59999999999999,
   'organism': {'scientificName': 'Mus musculus',
    'commonName': 'Mouse',
    'taxonId': 10090,
    'lineage': ['Eukaryota',
     'Metazoa',
     'Chordata',
     'Craniata',
     'Vertebrata',
     'Euteleostomi',
     'Mammalia',
     'Eutheria',
     'Euarchontoglires',
     'Glires',
     'Rodentia',
     'Myomorpha',
     'Muroidea',
     'Muridae',
     'Murinae',
     'Mus',
     'Mus']},
   'proteinExistence': '2: Evidence at transcript level',
   'proteinDescription': {'recommendedName': {'fullName': {'value': 'Divergent protein kinase domain 1C'}},
    'alternativeNames': [{'fullName': {'value': 'Protein FAM69C'}}]},


In [272]:
df_test = df_metazoanGenes.iloc[-30:, :]

In [275]:
df_test

Unnamed: 0,Entry_original,Organism,Gene_name,Protein_name,AH_or_Not,AA_sequence,Prediction,SubCell_Uniprot,Entry_Hs
2703,Q9ZXY3,Papio hamadryas,MT-ND2,NADH-ubiquinone oxidoreductase chain 2,Non-AH,MNPLAQLIIYTTVITGTLITMLSSHWFLAWAGLEMNMLAFIPTLIK...,0000000000000000000000000000000000000000000000...,Mitochondrion inner membrane,P03891
2704,A7RYM7,Nematostella vectensis,Unknown,Single-pass membrane and coiled-coil domain-co...,AH,MRQLPGKAAKETRKMKRERKQQNKEGHNRVVTVAIPVCLAVFVMLI...,0000000000000111111111111111111110000000000000...,Membrane,Q9Y6S7
2705,A7RM45,Nematostella vectensis,Unknown,Probable lysosomal cobalamin transporter,AH,MTIPHEVLAQGWIPFTVVVVLAILFSWFYIRYYQDHSQSEVSSTIT...,0000000000000000000000000000000000000000000000...,Lysosome membrane,Q9Y6S7
2706,A7SIM4,Nematostella vectensis,HAP2,Hapless 2,AH,MGRGQIIMILVGLLCLANESYSDVIAKSSLQMCENTGNSDDPYNVV...,0000000000000000000000000000000000000000000000...,Cell membrane,Not_matched
2707,Q9MYW3,Equus caballus,TLR4,Toll-like receptor 4,AH,MMPPTRLAGTLIPAMAFLSCLRPESWDPCVQVVPNTTYQCMDLNLY...,0000000000000000000000000000000000000000000000...,Unknown,O00206
2708,F6S3G9,Equus caballus,AQP11,Aquaporin-11,AH,MTALRALWSEMQDTCTSLGLMLSVVLLAGLARVVARQQQLHRPMAH...,0000000000000000000000000000001111111110000000...,Unknown,Q8NBQ7
2709,O46512,Equus caballus,CYP19A1,Aromatase,AH,MILEMLNPMHYNLTSMVPEVMPVATLPILLLTGFLFFVWNHEETSS...,0000000000000000000000000000000000000000000000...,Unknown,P11511
2710,Q95V11,Asterias rubens,cnh,Cytidine monophosphate-N-acetylneuraminic acid...,AH,MEQEREIVFSLSPEETSELKNGVNLISRSEKEKFVIYKDPTAENTV...,0000000000000000000000000000000000000000000000...,Membrane,Q95V11
2711,V5NAL9,Pinctada imbricata,Unknown,Toll-like receptor 4,AH,MCPLQIHVLHLIQGNQKNRKGKYVNMTRQLWYILPLLFLLCHCVTS...,0000000000000000000000000000000000000000000000...,Cell membrane,Q9Y6S7
2712,Q28487,Macaca fuscata fuscata,SLC18A3,Vesicular acetylcholine transporter,Non-AH,GMGLANLLYAPVLLLLRNVGLLTRSRSERDVLLDEPPQGLYDAVRL...,0000000000000000000000000000000000000000000000...,Membrane,Q16572


In [273]:
converted_entry_Hs, non_match, not_found = entry_convert(df_test, 'Homo sapiens')

0 MT-ND2 P03891
{"url":"http://rest.uniprot.org/uniprotkb/search","messages":["query parameter has an invalid syntax"]}


In [274]:
converted_entry_Hs

['P03891',
 'Not_found',
 'Not_found',
 'Not_matched',
 'O00206',
 'Q8NBQ7',
 'P11511',
 'Q95V11',
 'Not_found',
 'Q16572',
 'Not_found',
 'Not_found',
 'Q4H3K6',
 'P03901',
 'P03928',
 'P00846',
 'P03928',
 'P05067',
 'Not_found',
 'Not_found',
 'Not_found',
 'Q05996',
 'Not_matched',
 'Q9UQF0',
 'Not_matched',
 'P03928',
 'Not_found',
 'Not_found',
 'Not_found',
 'Not_found']

In [260]:
df_metazoanGenes['Entry_Hs'] = converted_entry_Hs

In [263]:
df_metazoanGenes.tail(n=40)

Unnamed: 0,Entry_original,Organism,Gene_name,Protein_name,AH_or_Not,AA_sequence,Prediction,SubCell_Uniprot,Entry_Hs
2693,Q31651,Anser caerulescens,MT-ND5,NADH-ubiquinone oxidoreductase chain 5,AH,GSVDLQPSLNTSYLNTWALLLTLMATAFTATYSIRMTILVQAGQTR...,0000000000000000000000000000000000000000000000...,Mitochondrion inner membrane,P03915
2694,P41308,Didelphis virginiana,MT-ND4,NADH-ubiquinone oxidoreductase chain 4,AH,MLKILLPTLMLIPLTWLSKNKWLWINTTTYSLLISITSLPMLYHPM...,0000000000000000000000000000000000000000000000...,Mitochondrion membrane,P03905
2695,P41314,Didelphis virginiana,MT-ATP8,ATP synthase protein 8,Non-AH,MPQLNTSTWTLTISLMIISLFCIYQLKMMNQTLIQITPSTEQSKLT...,0000000000000000000000000000000000000000000000...,Mitochondrion membrane,P03928
2696,B5X3B2,Salmo salar,apmap,Adipocyte plasma membrane-associated protein,AH,MNEPEGLRFRRLNRPQIITDELQEPQYKGTSTYSGKVFRVILVTLG...,0000000000000000000000000000000000000000000000...,Membrane,Q9HDC9
2697,O03168,Latimeria chalumnae,MT-ATP8,ATP synthase protein 8,Non-AH,MPQLNPSPWLLILLFSWLIFLTMLPSKTQLHTFPNMPSTQNMCKQE...,0000000000000000000000000000000000000000000000...,Mitochondrion membrane,P03928
2698,O42335,Cynops pyrrhogaster,PTC1,Protein patched homolog 1,Non-AH,AKLQTGTAYLPGKHAPLQWTQFDPLEFLEELKKINYQTDSWEELLN...,0000000000000000000000000000000000000000000000...,Membrane,Not_matched
2699,P22648,Schistocerca americana,FAS2,Fasciclin-2,Non-AH,MRTVACAVLLACFMGCLAGAWAQSAGLEILPNSENQTKPIGRSMLL...,0000000000000000000000000000000000000000000000...,Membrane,P34082
2700,B4GJC1,Drosophila persimilis,Unknown,Protein adenylyltransferase Fic,AH,MAMTILHASEKVNAEAEATTCPPTEKVKEEQQQQEQLQHSKTSKRV...,0000000000000000000000000000000000000000111111...,Membrane,Q9Y6S7
2701,B4GMC9,Drosophila persimilis,Snmp1,Sensory neuron membrane protein 1,AH,MKLDRMKLLFVSAGTLVFAILFGWVMFPKILKFMISKQVTLKPGTD...,0000000000000000000000000000000000000000000000...,Cell membrane,Q9VDD3
2702,Q8CHJ0,Cricetulus griseus,PIGU,Phosphatidylinositol glycan anchor biosynthesi...,AH,MAAPLALVLVVAVTVRAALFRSSLAEFISERVEVVSPLSSWKRVVE...,0000000000000000000000000000000000000000000000...,Endoplasmic reticulum membrane,Q9H490


In [268]:
gene = 'Unknown'
organism_id = '9606'
r = get_url(f'{WEBSITE_API}/search?query=(gene:{gene})&(organism_id:{organism_id})&(reviewed:true)&fields=accession,gene_names')
result = r.json()

{"url":"http://rest.uniprot.org/uniprotkb/search","messages":["query parameter has an invalid syntax"]}


HTTPError: 400 Client Error: Bad Request for url: https://rest.uniprot.org/uniprotkb//search?query=((gene:Unknown)&(organism_id:9606)&(reviewed:true))&fields=accession,gene_names

In [267]:
result

{'results': [{'primaryAccession': 'Q9Y6S7',
   'genes': [{'geneName': {'evidences': [{'evidenceCode': 'ECO:0000313',
        'source': 'EMBL',
        'id': 'AAD30184.1'}],
      'value': 'unknown'}}]},
  {'primaryAccession': 'A0A5P9XVI5',
   'genes': [{'geneName': {'evidences': [{'evidenceCode': 'ECO:0000313',
        'source': 'EMBL',
        'id': 'QFX97383.1'}],
      'value': 'unknown'},
     'orfNames': [{'evidences': [{'evidenceCode': 'ECO:0000313',
         'source': 'EMBL',
         'id': 'QFX97383.1'}],
       'value': 'GCD22_03296'}]}]},
  {'primaryAccession': 'Q9UHI2',
   'genes': [{'geneName': {'evidences': [{'evidenceCode': 'ECO:0000313',
        'source': 'EMBL',
        'id': 'AAF22284.1'}],
      'value': 'unknown'}}]},
  {'primaryAccession': 'A0A5P9XS33',
   'genes': [{'geneName': {'evidences': [{'evidenceCode': 'ECO:0000313',
        'source': 'EMBL',
        'id': 'QFX96831.1'}],
      'value': 'unknown'},
     'orfNames': [{'evidences': [{'evidenceCode': 'ECO:00003

In [244]:
result['results'][0]

{'primaryAccession': 'O00198',
 'genes': [{'geneName': {'value': 'HRK'}, 'synonyms': [{'value': 'BID3'}]}]}

In [252]:
df_test = df_metazoanGenes_non_Hs.iloc[:20, :]
entry_convert(df_test, 'Homo sapiens')

W Small integral membrane protein 4 Unknown
Ptpra Small integral membrane protein 4 P18433
Hrk Small integral membrane protein 4 O00198
Shisa6 Small integral membrane protein 4 Q6ZSJ9
Cd44 Small integral membrane protein 4 P26051
Cd200r3 Small integral membrane protein 4 Q5UKY4
Ltk Small integral membrane protein 4 P29376
Cmtm3 Small integral membrane protein 4 Q96MX0
Tmem269 Small integral membrane protein 4 A0A1B0GVZ9
Slc6a8 Small integral membrane protein 4 P48029
Vmn2r1 Small integral membrane protein 4 O70410
Clic3 Small integral membrane protein 4 O95833
Smim4 Small integral membrane protein 4 Q8WVI0
Rtp3 Small integral membrane protein 4 Q9BQQ7
Sigmar1 Small integral membrane protein 4 Q99720
Cd274 Small integral membrane protein 4 Q9NZQ7
Unknown Small integral membrane protein 4 Q9Y6S7
Lrrc24 Small integral membrane protein 4 Q50LG9
Tmem176a Small integral membrane protein 4 Q96HP8
Scamp3 Small integral membrane protein 4 O14828


['Unknown',
 'P18433',
 'O00198',
 'Q6ZSJ9',
 'P26051',
 'Q5UKY4',
 'P29376',
 'Q96MX0',
 'A0A1B0GVZ9',
 'P48029',
 'O70410',
 'O95833',
 'Q8WVI0',
 'Q9BQQ7',
 'Q99720',
 'Q9NZQ7',
 'Q9Y6S7',
 'Q50LG9',
 'Q96HP8',
 'O14828']