# **1. Setup**

In [1]:
import os, requests, sys, json
import numpy as np
import pandas as pd
from time import sleep

# Base URL of the UniProt REST API; every request in this notebook is built on it.
# Docs: https://rest.uniprot.org/beta/docs/
# NOTE(review): the "/beta" prefix presumably dates from the API's beta period --
# TODO confirm this endpoint is still served before re-running the notebook.
WEBSITE_API = "https://rest.uniprot.org/beta"

# Helper function to download data
def get_url(url, **kwargs):
  """Send a GET request and return the response, raising on HTTP errors.

  Args:
    url: Address to request.
    **kwargs: Forwarded unchanged to ``requests.get``.

  Returns:
    The ``requests.Response`` object when the status is not an error.

  Raises:
    requests.exceptions.HTTPError: If the server answers with an error
      status. The response body is printed first to help debugging.
  """
  response = requests.get(url, **kwargs)

  if not response.ok:
    print(response.text)
    # `ok` is False exactly when raise_for_status() raises, so nothing after
    # this call can run; no separate exit is needed.
    response.raise_for_status()

  return response

In [2]:
# Path of the input CSV file.
# NOTE: despite its name, `directory` holds a file path, not a folder.
directory = './IntermediateProducts/Result_Selected_Organisms.csv'

## File import

In [4]:
# Load the input CSV into the DataFrame that the rest of the notebook works on.
df = pd.read_csv(directory)

In [6]:
# Display the first 10 rows as a quick look at the loaded columns.
df.head(n=10)

Unnamed: 0,Uniprot_ID,Organism,Protein_name,AH_or_Not,AA_sequence,Prediction
0,Q8N4K4,Homo sapiens (Human),Reprimo-like protein,Non-AH,MNATFLNHSGLEEVDGVGGGAGAALGNRTHGLGTWLGCCPGGAPLA...,0000000000000000000000000000000000000000000000...
1,Q8N4S7,Homo sapiens (Human),Progestin and adipoQ receptor family member 4,Non-AH,MAFLAGPRLLDWASSPPHLQFNKFVLTGYRPASSGSGCLRSLFYLH...,0000000000000000000000000000000000000000000000...
2,Q8N5G0,Homo sapiens (Human),Small integral membrane protein 20,Non-AH,MSRNLRTALIFGGFISLIGAAFYPIYFRPLMRLEEYKKEQAINRAG...,0000000000000000000000000000000000000000000000...
3,Q8N614,Homo sapiens (Human),Transmembrane protein 156,AH,MTKTALLKLFVAIVITFILILPEYFKTPKERTLELSCLEVCLQSNF...,0011111100000000000000000000000000000000000000...
4,Q8N7C4,Homo sapiens (Human),Transmembrane protein 217,Non-AH,MKQQQWCGMTAKMGTVLSGVFTIMAVDMYLIFEQKHLGNGSCTEIT...,0000000000000000000000000000000000000000000000...
5,Q8N7S6,Homo sapiens (Human),Uncharacterized protein ARIH2OS,Non-AH,MLGQRAGDGERPGLPGDGEGGVPARPGRRAERPPQRPAKVNKAVTC...,0000000000000000000000000000000000000000000000...
6,Q8N7X8,Homo sapiens (Human),SIGLEC family-like protein 1,Non-AH,MLPLLQLVPAKLLNSSCSLEKTLQCSCSFHGIPTPSVQWWMGGVPV...,0000000000000000000000000000000000000000000000...
7,Q8N816,Homo sapiens (Human),Uncharacterized protein KRT10-AS1,Non-AH,MVGILPLCCSGCVPSLCCSSYVPSVAPTAAHSVRVPHSAGHCGQRV...,0000000000000000000000000000000000000000000000...
8,Q8N8F6,Homo sapiens (Human),Protein YIPF7,Non-AH,MDLLKISHTKLHLLEDLSIKNKQRMSNLAQFDSDFYQSNFTIDNQE...,0000000000000000000000000000000000000000000000...
9,Q8N8F7,Homo sapiens (Human),Leucine-rich single-pass membrane protein 1,AH,MTHSSQDTGSCGIQEDGKLYVVDSINDLNKLNLCPAGSQHLFPLED...,0000000000000000000000000000000000000000000000...


## **2. Examine**

In [7]:
# (number of rows, number of columns) of the loaded table.
df.shape

(3240, 6)

In [8]:
# Compare the number of rows with the number of distinct protein names.
# TODO(review): an unfinished statement here tried to build an
# 'Org + ProteinName' column. If duplicates should be judged per organism,
# compare on Organism + Protein_name instead -- confirm the intent.
proteins = df['Protein_name'].tolist()
proteins_unique = df['Protein_name'].unique().tolist()
print('The total number of protein names: ', len(proteins))
print('The total number of unique protein names: ', len(proteins_unique))

The total number of protein names:  3240
The total number of unique protein names:  3099


In [9]:
# Protein names that occur on two or more rows, in order of first appearance.
# duplicated(keep=False) marks every row whose name is repeated, which avoids
# rescanning the whole list once per unique name.
is_repeated = df['Protein_name'].duplicated(keep=False)
duplicated_proteins = df.loc[is_repeated, 'Protein_name'].unique().tolist()
print('The number of the duplicated_proteins: ', len(duplicated_proteins))

The number of the duplicated_proteins:  120


##### Note: 3 protein names occur three times and 2 occur four times,
##### so the total count minus the unique count comes to 100 (TODO: verify this figure; the outputs above give 3240 − 3099 = 141).

## **3. Subcellular locations**

### From 0 to 299

In [10]:
# Fetch the subcellular-location annotation for the first 300 rows.
# .items() yields (index label, accession) pairs, so the label used in
# df.loc below always refers to the row the accession came from.
for i, uniprot_id in df['Uniprot_ID'].iloc[:300].items():
    try:
        r = get_url(f"{WEBSITE_API}/uniprotkb/{uniprot_id}?fields=cc_subcellular_location")
    except requests.exceptions.ConnectionError:
        # No response exists for this accession. Record a marker instead of
        # touching `r`, which is either unbound (first iteration) or still
        # holds the previous protein's response.
        subcellular_location = ['Connection refused']
    else:
        # get the data as a json
        data = r.json()

        # get the subcellular location (only the first comment is read)
        try:
            subcellular_location = [
                location_and_topology['location']['value']
                for location_and_topology in data['comments'][0]['subcellularLocations']
            ]
        except (KeyError, IndexError, TypeError):
            # Entry has no subcellular-location annotation in the expected shape.
            subcellular_location = ['Not_found']

    # put into df as a joined string
    joined = ', '.join(subcellular_location)
    df.loc[i, 'Subcellular_location'] = joined

    # log
    print(i, uniprot_id, joined)

    # pause between requests
    sleep(1)

0 Q8N4K4 Membrane
1 Q8N4S7 Membrane
2 Q8N5G0 Mitochondrion inner membrane
3 Q8N614 Membrane
4 Q8N7C4 Membrane
5 Q8N7S6 Membrane
6 Q8N7X8 Membrane
7 Q8N816 Membrane
8 Q8N8F6 Endoplasmic reticulum membrane, Golgi apparatus, cis-Golgi network membrane, Golgi apparatus, trans-Golgi network membrane
9 Q8N8F7 Membrane
10 Q8N8V8 Membrane
11 Q8N8Z6 Membrane
12 Q8N9F0 Cytoplasm, Microsome membrane, Mitochondrion membrane, Endoplasmic reticulum membrane
13 Q8N9I5 Membrane
14 Q8N9R8 Membrane, Nucleus, Cytoplasm
15 Q8N9X5 Membrane
16 Q8NA29 Cell membrane, Endoplasmic reticulum membrane
17 Q8NAC3 Cell membrane
18 Q8NAN2 Mitochondrion outer membrane
19 Q8NBP5 Membrane
20 Q8NBR0 Cell membrane, Cytoplasm
21 Q8NBS3 Cell membrane, Basolateral cell membrane
22 Q8NC44 Membrane
23 Q8NCG7 Cell membrane
24 Q8NCK7 Endoplasmic reticulum membrane, Cell membrane
25 Q8NCQ3 Membrane
26 Q8NCS4 Membrane
27 Q8NDB6 Membrane
28 Q8NDY8 Membrane
29 Q8NEA5 Membrane
30 Q8NEQ5 Membrane
31 Q8NET5 Cell membrane
32 Q8NEW7 Memb

### From 300 to 599

In [11]:
# Fetch the subcellular-location annotation for rows 300-599.
index_start = 300
# Positional slice: stops at the end of the frame instead of raising if the
# table has fewer rows than expected; .items() yields (index label, accession).
batch = df['Uniprot_ID'].iloc[index_start:index_start + 300]
for i, uniprot_id in batch.items():
    try:
        r = get_url(f"{WEBSITE_API}/uniprotkb/{uniprot_id}?fields=cc_subcellular_location")
    except requests.exceptions.ConnectionError:
        # No response exists for this accession. Record a marker instead of
        # touching `r`, which is either unbound or still holds the previous
        # protein's response.
        subcellular_location = ['Connection refused']
    else:
        # get the data as a json
        data = r.json()

        # get the subcellular location (only the first comment is read)
        try:
            subcellular_location = [
                location_and_topology['location']['value']
                for location_and_topology in data['comments'][0]['subcellularLocations']
            ]
        except (KeyError, IndexError, TypeError):
            # Entry has no subcellular-location annotation in the expected shape.
            subcellular_location = ['Not_found']

    # put into df as a joined string
    joined = ', '.join(subcellular_location)
    df.loc[i, 'Subcellular_location'] = joined

    # log
    print(i, uniprot_id, joined)

    # pause between requests
    sleep(1)

300 Q96KR6 Mitochondrion, Mitochondrion outer membrane
301 Q96KV6 Membrane
302 Q96LL3 Cell membrane
303 Q96LL9 Mitochondrion inner membrane
304 Q96LR9 Cell membrane
305 Q96LU7 Membrane
306 Q96M19 Membrane
307 Q96N19 Lysosome membrane
308 Q96N35 Membrane
309 Q96N68 Membrane
310 Q96NA8 Membrane
311 Q96PD2 Membrane
312 Q96PG2 Membrane
313 Q96PH1 Membrane
314 Q96PL5 Cell membrane, Cytoplasm
315 Q96PS6 Membrane
316 Q96Q91 Membrane
317 Q96QE4 Membrane
318 Q96QS1 Membrane
319 Q96RD9 Cell membrane
320 Q96RL6 Membrane
321 Q96S66 Endoplasmic reticulum membrane, Golgi apparatus membrane, Nucleus membrane
322 Q96T54 Membrane
323 Q99523 Golgi apparatus, Golgi stack membrane, Endosome membrane, Endoplasmic reticulum membrane, Nucleus membrane, Cell membrane, Lysosome membrane
324 Q99650 Membrane
325 Q99665 Membrane
326 Q99795 Membrane
327 Q99J93 Cell membrane, Lysosome membrane, Late endosome membrane
328 Q99K24 Membrane
329 Q99LJ5 Membrane
330 Q99LY2 Membrane, Cytoplasm, perinuclear region, Endopla

### From 600 to 1599

In [13]:
# Fetch the subcellular-location annotation for rows 600-1599.
index_start = 600
# Positional slice: stops at the end of the frame instead of raising if the
# table has fewer rows than expected; .items() yields (index label, accession).
batch = df['Uniprot_ID'].iloc[index_start:index_start + 1000]
for i, uniprot_id in batch.items():
    try:
        r = get_url(f"{WEBSITE_API}/uniprotkb/{uniprot_id}?fields=cc_subcellular_location")
    except requests.exceptions.ConnectionError:
        # No response exists for this accession. Record a marker instead of
        # touching `r`, which is either unbound or still holds the previous
        # protein's response.
        subcellular_location = ['Connection refused']
    else:
        # get the data as a json
        data = r.json()

        # get the subcellular location (only the first comment is read)
        try:
            subcellular_location = [
                location_and_topology['location']['value']
                for location_and_topology in data['comments'][0]['subcellularLocations']
            ]
        except (KeyError, IndexError, TypeError):
            # Only lookup failures mean "no annotation"; anything else
            # (including KeyboardInterrupt) must still propagate.
            subcellular_location = ['Not_found']

    # put into df as a joined string
    joined = ', '.join(subcellular_location)
    df.loc[i, 'Subcellular_location'] = joined

    # log
    print(i, uniprot_id, joined)

    # pause between requests
    sleep(1)

600 Q9LSI9 Cell membrane
601 Q9LSK9 Membrane
602 Q9LSP7 Membrane
603 Q9LSR8 Cell membrane
604 Q9LSS4 Membrane
605 Q9LSX7 Peroxisome membrane
606 Q9LT68 Plastid, chloroplast membrane, Plastid, chloroplast envelope
607 Q9LT84 Membrane
608 Q9LTD9 Membrane
609 Q9LTI3 Membrane
610 Q9LTR2 Plastid, chloroplast inner membrane
611 Q9LTZ9 Golgi apparatus membrane
612 Q9LU77 Cell membrane
613 Q9LUL8 Membrane
614 Q9LUZ9 Membrane
615 Q9LV85 Membrane, Nucleus membrane
616 Q9LVF4 Endoplasmic reticulum membrane, Vacuole membrane
617 Q9LVR3 Membrane
618 Q9LVZ7 Membrane
619 Q9LX93 Cell membrane
620 Q9LXL9 Membrane, Nucleus
621 Q9LY03 Cell membrane
622 Q9LY09 Secreted, extracellular space, extracellular matrix, pollen coat, Lipid droplet, Membrane
623 Q9LY41 Membrane
624 Q9LY50 Membrane
625 Q9LYR6 Membrane
626 Q9LYU7 Cell membrane, Endoplasmic reticulum membrane
627 Q9LZ31 Membrane
628 Q9LZA6 Lipid droplet, Membrane, Peroxisome
629 Q9LZB8 Plastid, chloroplast membrane
630 Q9LZD0 Plastid, chloroplast enve

### From 1600 to end (3239)

In [16]:
# Fetch the subcellular-location annotation for row 1600 through the last row.
index_start = 1600
# .items() yields (index label, accession) pairs for the remaining rows.
batch = df['Uniprot_ID'].iloc[index_start:]
for i, uniprot_id in batch.items():
    try:
        r = get_url(f"{WEBSITE_API}/uniprotkb/{uniprot_id}?fields=cc_subcellular_location")
    except requests.exceptions.ConnectionError:
        # No response exists for this accession. Record a marker instead of
        # touching `r`, which is either unbound or still holds the previous
        # protein's response.
        subcellular_location = ['Connection refused']
    else:
        # get the data as a json
        data = r.json()

        # get the subcellular location (only the first comment is read)
        try:
            subcellular_location = [
                location_and_topology['location']['value']
                for location_and_topology in data['comments'][0]['subcellularLocations']
            ]
        except (KeyError, IndexError, TypeError):
            # Only lookup failures mean "no annotation"; anything else
            # (including KeyboardInterrupt) must still propagate.
            subcellular_location = ['Not_found']

    # put into df as a joined string
    joined = ', '.join(subcellular_location)
    df.loc[i, 'Subcellular_location'] = joined

    # log
    print(i, uniprot_id, joined)

    # pause between requests
    sleep(1)

1600 Q0D289 Cell projection, cilium membrane, Cell membrane
1601 Q0D2G3 Membrane
1602 Q0D2K0 Membrane
1603 Q0EEE2 Cell projection, cilium, flagellum membrane, Endoplasmic reticulum membrane
1604 Q0GA42 Cell membrane
1605 Q0IH73 Membrane
1606 Q0IHA5 Golgi apparatus, cis-Golgi network membrane, Endoplasmic reticulum-Golgi intermediate compartment membrane
1607 Q0IHC4 Cytoplasm, perinuclear region, Cell membrane, Cell projection, axon
1608 Q0IHQ3 Cell membrane, Cell junction, tight junction
1609 Q0IIE5 Cell junction, adherens junction, Cell membrane
1610 Q0IIL4 Golgi apparatus membrane
1611 Q0IJ20 Membrane
1612 Q0P496 Golgi apparatus membrane
1613 Q0P4A4 Membrane
1614 Q0P4Q4 Membrane
1615 Q0P5H8 Membrane
1616 Q0P5M9 Nucleus inner membrane
1617 Q0P5V9 Membrane
1618 Q0P6D2 Endoplasmic reticulum membrane
1619 Q0P6H9 Membrane
1620 Q0V7T5 Cell membrane
1621 Q0V881 Cell membrane
1622 Q0V8G3 Golgi apparatus membrane
1623 Q0V947 Membrane
1624 Q0V9J0 Membrane
1625 Q0V9Z3 Cell membrane
1626 Q0VA06 

In [17]:
# Display the last rows, including the Subcellular_location column filled in above.
df.tail()

Unnamed: 0,Uniprot_ID,Organism,Protein_name,AH_or_Not,AA_sequence,Prediction,Subcellular_location
3235,O44252,Drosophila melanogaster (Fruit fly),Protein rolling stone,AH,MQLFDDFCKSFNKELQRANFGFAYNRVHLFYRSQWQKDEINTIYLL...,0001111111111100000000000000000000000000000000...,Membrane
3236,O44443,Caenorhabditis elegans,EGF-like domain-containing protein C02B10.3,Non-AH,MTGALCIVLFGVTMVTAERPKIKDTHGNLLVKLSDIPIGSCGDESY...,0000000000000000000000000000000000000000000000...,Membrane
3237,O45145,Caenorhabditis elegans,Alkaline ceramidase,AH,MESSSINRWFEYESGHAWCESAYKYQTLPYVAEFANTCTNLPIIVL...,0000000000000000000000000000000000000000000000...,Membrane
3238,O45150,Caenorhabditis elegans,Serpentine receptor class gamma-30,Non-AH,MLKCHPGFNTKLELLKYGIQFVYFIVGLGFHFAVIKVLHKKWSVYS...,0000000000000000000000000000000000000000000000...,Membrane
3239,O45300,Caenorhabditis elegans,Serpentine receptor class epsilon-13,AH,MSFNISQENEFQTMYIKYFNKTYSIIEGSYNYYLFVFYIQIALIFI...,0000000000000000000000000000000000000000000000...,Membrane


In [18]:
# Write the annotated table to disk; index=False leaves the row index out of the file.
output_path = './IntermediateProducts/Result_Selected_Organisms_SubCellLoc.csv'
df.to_csv(output_path, index=False)