In [55]:
import os, re, requests, json
import numpy as np
import pandas as pd
from urllib.request import urlopen
from time import sleep

In [2]:
def get_url(url, **kwargs):
    '''
    Obatin a response from a given url
    '''
    response = requests.get(url, **kwargs);

    if not response.ok:
        print(response.text)
        response.raise_for_status()
        sys.exit()

    return response

# Step 1

In [None]:
# uniprot API URL
WEBSITE_API = "https://rest.uniprot.org/uniprotkb/"

In [2]:
# raw data import
files = os.listdir('./SourceData/Results from the paper/')
print(len(files))

11761


In [47]:
# prepare a dataframe and array for the ID not found in uniprot
df = pd.DataFrame()

# scan raw data files, judge AH_or_not, and get the gene info from uniprot based on the ID called "entry"
for i, file in enumerate(files):
    
    # check the file format
    if file.endswith('result'):
        
        # obtain uniprot ID called entry
        entry = file.replace('.result', '')
        
        # open the file
        with open('./SourceData/Results from the paper/' + file) as f:
            lines = f.readlines()
    
        # get the amino-acid sequence and prediction
        aa_sequence = lines[1]
        prediction = lines[3]

        # judge if the protein predicted to contain AH or not
        if '1' in prediction:
            AH_or_not = 'AH'
        else:
            AH_or_not = 'Non-AH'

        # get the result of query
        try:
            r = get_url(f"{WEBSITE_API}/search?query=(accession:{entry})&fields=organism_name,gene_primary,protein_name,cc_subcellular_location")
            result = r.json()['results'][0]
        except requests.exceptions.ConnectionError:
            r.status_code = "Connection refused"
            break
        
        # extract wanted texts from the result
        ## organism name
        ### put try-except in case the gene info is obsolete
        try:
            organism_name = result['organism']['scientificName']
        except:
            organism_name = 'Unknown'
        
        ## gene name
        ### put try-except in case it lacks geneName
        try: 
            gene_name = result['genes'][0]['geneName']['value']
        except: gene_name = 'Unknown'

        ## protein name
        try: 
            protein_name = result['proteinDescription']['recommendedName']['fullName']['value'] 
        except: protein_name = 'Unknown'

        ## subcellular locations
        subcell_loc = list()
        try:
            for v in result['comments'][0]['subcellularLocations']:
                subcell_loc.append(v['location']['value'])
                subcell_loc = ', '.join(subcell_loc)
        except:
            subcell_loc = 'Unknown'
        
        
        # put the values to the dataframe
        df.loc[i, 'Entry_original'] = entry
        df.loc[i, 'Organism'] = organism_name
        df.loc[i, 'Gene_name'] = gene_name
        df.loc[i, 'Protein_name'] = protein_name
        df.loc[i, 'AH_or_Not'] = AH_or_not
        df.loc[i, 'AA_sequence'] = aa_sequence
        df.loc[i, 'Prediction'] = prediction
        df.loc[i, 'SubCell_Uniprot'] = subcell_loc
        
        # log every 100 genes
        if i % 100 == 0: print(i, entry, organism_name, protein_name)
    
    # take a break, go next
    sleep(1)
            

0 Q9CAN8 Arabidopsis thaliana Lysophospholipid acyltransferase 2
100 P53389 Saccharomyces cerevisiae (strain ATCC 204508 / S288c) Protein HOL1
200 Q12155 Saccharomyces cerevisiae (strain ATCC 204508 / S288c) Uncharacterized membrane protein YLR050C
300 O26830 Methanothermobacter thermautotrophicus (strain ATCC 29096 / DSM 1053 / JCM 10044 / NBRC 100330 / Delta H) Putative phospho-N-acetylmuramoyl-pentapeptide-transferase
400 Q51MB1 Magnaporthe oryzae (strain 70-15 / ATCC MYA-4617 / FGSC 8958) pH-response regulator protein palI/RIM9
500 Q3E8Z8 Arabidopsis thaliana Putative pectinesterase/pectinesterase inhibitor 28
600 Q5NCP0 Mus musculus E3 ubiquitin-protein ligase RNF43
700 Q22549 Caenorhabditis elegans Innexin-10
800 O07888 Treponema pallidum (strain Nichols) Flagellar protein FliL
900 Q7Z695 Homo sapiens Uncharacterized aarF domain-containing protein kinase 2
1000 Q674R7 Homo sapiens Autophagy-related protein 9B
1100 O43085 Schizosaccharomyces pombe (strain 972 / ATCC 24843) DSC E3 

In [48]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11759 entries, 0 to 11760
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Entry_original   11759 non-null  object
 1   Organism         11759 non-null  object
 2   Gene_name        11759 non-null  object
 3   Protein_name     11759 non-null  object
 4   AH_or_Not        11759 non-null  object
 5   AA_sequence      11759 non-null  object
 6   Prediction       11759 non-null  object
 7   SubCell_Uniprot  11759 non-null  object
dtypes: object(8)
memory usage: 1.1+ MB


In [49]:
# export
df.to_csv('./IntermediateProducts/Results_step_1.csv', index=False)

# Step 2

In [8]:
WEBSITE_API = 'https://rest.uniprot.org/taxonomy/'

In [3]:
# Import the df from step1
df = pd.read_csv('./IntermediateProducts/Results_step_1.csv')

In [133]:
# create organism list
organism_list = df['Organism'].unique().tolist()
print(len(organism_list))

1521


In [134]:
# a list for storage
lineage_list = list()

# regex for extracing organism name WITHOUT items in parenthesis
regex = re.compile(r'([^()]+)(\(.+\))?')

# scan organism list and get lineage from Uniprot Taxonomy
for i, organism in enumerate(organism_list):
    # extract organism name WITHOUT items in parenthesis such as strain name
    mo = regex.search(organism)
    if mo is None:
        lineage_list.append('Unknown')
        continue
    else: 
        organism = mo.group(1)
    
    # get response that contains lineage
    try:
        r = get_url(f'{WEBSITE_API}/search?query=(scientific:"{organism}")&fields=lineage')
        result = r.json()
    except requests.exceptions.ConnectionError:
        r.status_code = "Connection refused"
        break
    
    # extract lineage from the result
    lineage = list()
    lineage_full = ''
    try:
        for l in result['results'][0]['lineage']:
            name = l['scientificName']
            lineage.append(name)
        lineage_full = ', '.join(lineage)
    except:
        lineage_full = 'NotFound' # in case the result is empty
        
    # add to the found lineage to the list
    lineage_list.append(lineage_full)
    
    # log every 100
    if i % 100 == 0: print(i, organism, lineage[-2:])
    
    # take a break, go next
    sleep(1)

0 Arabidopsis thaliana ['Eukaryota', 'cellular organisms']
100 Salmonella phage P22 ['Duplodnaviria', 'Viruses']
200 Acropora millepora ['Eukaryota', 'cellular organisms']
300 Streptococcus pneumoniae  ['Bacteria', 'cellular organisms']
400 Acidianus bottle-shaped virus  []
500 Petunia integrifolia ['Eukaryota', 'cellular organisms']
600 Haloquadratum walsbyi  ['Archaea', 'cellular organisms']
700 Xanthomonas campestris pv. campestris  ['Bacteria', 'cellular organisms']
800 Friend murine leukemia virus  ['Riboviria', 'Viruses']
900 Oceanobacillus iheyensis  ['Bacteria', 'cellular organisms']
1000 Artemia salina ['Eukaryota', 'cellular organisms']
1100 Citrifermentans bemidjiense  ['Bacteria', 'cellular organisms']
1200 Quaranfil virus  ['Riboviria', 'Viruses']
1300 Planococcus maritimus ['Bacteria', 'cellular organisms']
1400 Macaca radiata ['Eukaryota', 'cellular organisms']
1500 Dechloromonas aromatica  ['Bacteria', 'cellular organisms']


In [141]:
# Add to the df_org
df_org = pd.DataFrame(organism_list, columns=['Organism'])
df_org['Lineage'] = lineage_list

# Select metazoans
df_org_metazoa = df_org[df_org['Lineage'].str.contains('Metazoa')]
print(df_org_metazoa.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 122 entries, 4 to 1518
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Organism  122 non-null    object
 1   Lineage   122 non-null    object
dtypes: object(2)
memory usage: 2.9+ KB
None


In [142]:
# export
df_org.to_csv('./IntermediateProducts/Organisms_and_Lineage.csv', index=False)