### **1. Setup**

In [1]:
import os
import numpy as np
import pandas as pd

from time import sleep

import requests, sys, json

In [2]:
# Folder that holds the MemBrain prediction input and output files.
# The value deliberately ends with a trailing '/'.
directory = (
    '/Users/ShokenLEE/Desktop/DATA/Bioinformatics/'
    'MemBrain AH prediction/'
)

### Functions to get AH residue numbers and symbols

In [27]:
def find_AH_residue_number(pred):
    """Return the 0-based indices at which ``pred`` holds the character '1'.

    Every position of ``pred`` is compared with the string ``'1'``; the
    indices that match are returned as a list in ascending order. The list
    is empty when nothing matches or when ``pred`` itself is empty.
    """
    return [idx for idx in range(len(pred)) if pred[idx] == '1']

In [39]:
def residues_at_positions(aa_seq, positions):
    """Return the elements of ``aa_seq`` found at the given ``positions``.

    Each entry of ``positions`` is converted with ``int()`` before it is
    used as an index into ``aa_seq``. The result is a list in the same
    order as ``positions``.
    """
    return [aa_seq[int(pos)] for pos in positions]

### File import

In [3]:
# Make the data folder the working directory so that relative paths used
# further down (e.g. './<name>.csv') resolve inside it.
os.chdir(directory)

In [4]:
# Display the current working directory.
os.getcwd()

'/Users/ShokenLEE/Desktop/DATA/Bioinformatics/MemBrain AH prediction'

In [6]:
# Name of the input table inside `directory`.
filename = 'Result_Selected_Organisms_SubCellLoc.csv'

# Build the full path to the CSV and load it into the main DataFrame.
csv_path = os.path.join(directory, filename)
df = pd.read_csv(csv_path)

### Add 'AH positions' and 'AH residues'

In [41]:
# For every row, store the list of indices at which its 'Prediction' value
# contains the character '1' (computed by find_AH_residue_number).
df['AH_positions'] = df['Prediction'].apply(find_AH_residue_number)

In [45]:
def _ah_residues_for_row(row):
    """Look up the elements of ``row['AA_sequence']`` at ``row['AH_positions']``."""
    return residues_at_positions(row['AA_sequence'], row['AH_positions'])


# Applied row-wise (axis=1) because each result needs two columns of the
# same row.
df['AH_residues'] = df.apply(_ah_residues_for_row, axis=1)

In [46]:
# Preview the first rows, including the two columns added above.
df.head()

Unnamed: 0,Uniprot_ID,Organism,Protein_name,AH_or_Not,AA_sequence,Prediction,Subcellular_location,AH_positions,AH_residues
0,Q8N4K4,Homo sapiens (Human),Reprimo-like protein,Non-AH,MNATFLNHSGLEEVDGVGGGAGAALGNRTHGLGTWLGCCPGGAPLA...,0000000000000000000000000000000000000000000000...,Membrane,[],[]
1,Q8N4S7,Homo sapiens (Human),Progestin and adipoQ receptor family member 4,Non-AH,MAFLAGPRLLDWASSPPHLQFNKFVLTGYRPASSGSGCLRSLFYLH...,0000000000000000000000000000000000000000000000...,Membrane,[],[]
2,Q8N5G0,Homo sapiens (Human),Small integral membrane protein 20,Non-AH,MSRNLRTALIFGGFISLIGAAFYPIYFRPLMRLEEYKKEQAINRAG...,0000000000000000000000000000000000000000000000...,Mitochondrion inner membrane,[],[]
3,Q8N614,Homo sapiens (Human),Transmembrane protein 156,AH,MTKTALLKLFVAIVITFILILPEYFKTPKERTLELSCLEVCLQSNF...,0011111100000000000000000000000000000000000000...,Membrane,"[2, 3, 4, 5, 6, 7]","[K, T, A, L, L, K]"
4,Q8N7C4,Homo sapiens (Human),Transmembrane protein 217,Non-AH,MKQQQWCGMTAKMGTVLSGVFTIMAVDMYLIFEQKHLGNGSCTEIT...,0000000000000000000000000000000000000000000000...,Membrane,[],[]


In [7]:
# Distinct values of the 'Organism' column, sorted in ascending order.
organisms = np.sort(df['Organism'].unique())
organisms

array(['Arabidopsis thaliana (Mouse-ear cress)', 'Bos taurus (Bovine)',
       'Caenorhabditis elegans',
       'Danio rerio (Zebrafish) (Brachydanio rerio)',
       'Drosophila melanogaster (Fruit fly)', 'Gallus gallus (Chicken)',
       'Homo sapiens (Human)', 'Mus musculus (Mouse)',
       'Rattus norvegicus (Rat)'], dtype=object)

#### ER or nuclear proteins

In [47]:
# Rows whose 'Subcellular_location' text contains 'Nucleus' or
# 'Endoplasmic reticulum' (a row matching both is kept once).
mentions_nucleus = df['Subcellular_location'].str.contains('Nucleus')
mentions_er = df['Subcellular_location'].str.contains('Endoplasmic reticulum')
df_ER_or_N = df[mentions_nucleus | mentions_er]
print('ER or Nuclear proteins are: ' + str(len(df_ER_or_N)))

ER or Nuclear proteins are: 469


#### Proteins in neither the ER nor the nucleus

In [48]:
# Complement of the ER-or-nucleus selection: rows whose
# 'Subcellular_location' text contains neither 'Nucleus' nor
# 'Endoplasmic reticulum'.
location_column = df['Subcellular_location']
nucleus_or_er = (
    location_column.str.contains('Nucleus')
    | location_column.str.contains('Endoplasmic reticulum')
)
df_ER_nor_N = df[~nucleus_or_er]
print('Non-ER nor non-Nuclear proteins are: ' + str(len(df_ER_nor_N)))

Non-ER nor non-Nuclear proteins are: 2523


#### AH proteins in non-ER, non-nuclear proteins

In [49]:
# Subset of the non-ER, non-nuclear rows labelled 'AH', plus its share of
# that group as a percentage.
labelled_ah_outside = df_ER_nor_N['AH_or_Not'] == 'AH'
df_ER_nor_N_and_AH = df_ER_nor_N[labelled_ah_outside]
n_ah_outside = len(df_ER_nor_N_and_AH)
n_all_outside = len(df_ER_nor_N)
print('AH proteins in non-ER nor non-Nuclear proteins are: ' + str(n_ah_outside))
print(f'{n_ah_outside/n_all_outside*100} % of proteins are AH in non-ER, non-nuclear proteins')

AH proteins in non-ER nor non-Nuclear proteins are: 1282
50.812524772096715 % of proteins are AH in non-ER, non-nuclear proteins


#### AH proteins in ER or nuclear proteins

In [50]:
# Subset of the ER-or-nuclear rows labelled 'AH', plus its share of that
# group as a percentage.
labelled_ah_inside = df_ER_or_N['AH_or_Not'] == 'AH'
df_ER_or_N_and_AH = df_ER_or_N[labelled_ah_inside]
n_ah_inside = len(df_ER_or_N_and_AH)
n_all_inside = len(df_ER_or_N)
print('AH proteins in ER or Nuclear proteins are: ' + str(n_ah_inside))
print(f'{n_ah_inside/n_all_inside*100} % of proteins are AH in ER or nuclear proteins')

AH proteins in ER or Nuclear proteins are: 311
66.31130063965884 % of proteins are AH in ER or nuclear proteins


#### AH proteins in ER proteins

In [51]:
# Rows of df_ER_or_N that are labelled 'AH' and whose location text
# contains 'Endoplasmic reticulum'.
er_and_ah = (
    df_ER_or_N['Subcellular_location'].str.contains('Endoplasmic reticulum')
    & df_ER_or_N['AH_or_Not'].eq('AH')
)
df_ER_AH = df_ER_or_N.loc[er_and_ah]
print('AH proteins in ER proteins are: ' + str(len(df_ER_AH)))

AH proteins in ER proteins are: 267


#### AH proteins in nuclear proteins

In [52]:
# Rows of df_ER_or_N that are labelled 'AH' and whose location text
# contains 'Nucleus'.
df_N_AH = df_ER_or_N[df_ER_or_N['Subcellular_location'].str.contains('Nucleus') &
                     (df_ER_or_N['AH_or_Not'] == 'AH')]
# The label previously said "ER proteins" (copied from the ER cell); this
# count is for nuclear proteins.
print('AH proteins in nuclear proteins are: ' + str(len(df_N_AH)))

AH proteins in ER proteins are: 63


In [53]:
# Preview the first rows of the nuclear AH selection.
df_N_AH.head()

Unnamed: 0,Uniprot_ID,Organism,Protein_name,AH_or_Not,AA_sequence,Prediction,Subcellular_location,AH_positions,AH_residues
14,Q8N9R8,Homo sapiens (Human),Protein SCAI,AH,MVRGARQPQQPRSRLAPRLTGTVEKPPRKRRSRTEFALKEIMSSGG...,0000000000000000000000000000000000000000000000...,"Membrane, Nucleus, Cytoplasm","[506, 507, 508, 509, 510, 511]","[L, R, K, I, N, R]"
50,Q8R411,Mus musculus (Mouse),Myc target protein 1,AH,MANNTTSLGSPWPENFWEDLIMSFTVSVAIGLAIGGFLWALFVFLS...,0000000000000111111000000000000000000000000000...,Nucleus membrane,"[13, 14, 15, 16, 17, 18]","[E, N, F, W, E, D]"
102,Q8VEG4,Mus musculus (Mouse),Exonuclease 3'-5' domain-containing protein 2,AH,MSRQNLVALTVTTLLGVAMGGFVLWKGIQRRWSKTSRVMQQQPQQP...,0000000000000000000000111111111111111000000000...,"Mitochondrion outer membrane, Mitochondrion ma...","[22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 3...","[V, L, W, K, G, I, Q, R, R, W, S, K, T, S, R]"
255,Q96DY5,Mus musculus (Mouse),RING finger protein 112,AH,MPRPVLSVTAFCHRLGKRESKRSFMGNSSNSWVLPREEAQGWMGQA...,0000000000000000000000000000000000000000000000...,"Membrane, Membrane, Cytoplasm, Nucleus, Nucleu...","[450, 451, 452, 453, 454, 455, 456]","[A, Q, E, I, K, N, L]"
278,Q96KC8,Homo sapiens (Human),DnaJ homolog subfamily C member 1,AH,MTAPCSQPAQLPGRRQLGLVPFPPPPPRTPLLWLLLLLLAAVAPAR...,0000000000000000000000000000000000000000000000...,"Endoplasmic reticulum membrane, Nucleus membra...","[230, 231, 232, 233, 234, 235, 236, 237, 238, ...","[F, C, L, T, L, K, A, L, P, H, D, C, I, A, R, ..."


### Export to CSV: AH or Non-AH, ER or nucleus

In [54]:
# Write one CSV per (AH label, organelle) pair to the current working
# directory, each sorted by organism and then protein name. A row whose
# location text mentions both organelles ends up in both files.
for AH_or_not in ['AH', 'Non-AH']:
    df_sub = df.loc[df['AH_or_Not'] == AH_or_not]
    for organelle in ['Nucleus', 'Endoplasmic reticulum']:
        in_organelle = df_sub['Subcellular_location'].str.contains(organelle)
        df_sub_sub = df_sub.loc[in_organelle].sort_values(by=['Organism', 'Protein_name'])
        df_sub_sub.to_csv(f'./{organelle}_{AH_or_not}.csv')

In [11]:
# Print, for each organism, the alphabetically sorted names of its nuclear
# AH proteins.
# df_N_AH (the 'AH' + 'Nucleus' selection built earlier) replaces df_AH_Nuc,
# which is not defined anywhere in the visible notebook and would raise
# NameError on a fresh kernel.
for organism in df_N_AH['Organism'].unique():
    _df = df_N_AH[df_N_AH['Organism'] == organism]
    proteins = sorted(_df['Protein_name'])
    print(organism, proteins)

Homo sapiens (Human) ['Bcl-2-like protein 10', 'Bcl-2-like protein 13', 'CAAX prenyl protease 1 homolog', 'Chloride channel CLIC-like protein 1', 'DnaJ homolog subfamily C member 1', 'Etoposide-induced protein 2.4 homolog', 'Interferon alpha-inducible protein 27, mitochondrial', 'Membrane-associated progesterone receptor component 2', 'Protein LYRIC', 'Protein SCAI', 'Thioredoxin-related transmembrane protein 4', 'Transmembrane protein 201', 'Transmembrane protein 250']
Mus musculus (Mouse) ['1-acyl-sn-glycerol-3-phosphate acyltransferase epsilon', '3-beta-hydroxysteroid-Delta(8),Delta(7)-isomerase', 'E3 ubiquitin-protein ligase RNF43', "Exonuclease 3'-5' domain-containing protein 2", 'Inner nuclear membrane protein Man1', 'LEM domain-containing protein 2', 'Major facilitator superfamily domain-containing protein 10', 'Membrane-anchored junction protein', 'Myc target protein 1', 'Prostaglandin E synthase 2', 'RING finger protein 112', 'SUN domain-containing protein 1', 'SUN domain-cont

In [57]:
# Fraction of non-ER, non-nuclear proteins labelled 'AH'. Computed from the
# frames built above rather than from hard-coded counts, so the value stays
# correct if the input data changes.
len(df_ER_nor_N_and_AH) / len(df_ER_nor_N)

0.5081252477209671

In [5]:
# List the contents of the current working directory (IPython shell escape).
!ls

[34mAll results_original[m[m
[34mCodes[m[m
Endoplasmic reticulum_AH.csv
Endoplasmic reticulum_Non-AH.csv
Nucleus_AH.csv
Nucleus_Non-AH.csv
Organisms.csv
Result_Selected_Organisms.csv
Result_Selected_Organisms_SubCellLoc.csv
[34mResults from the paper[m[m
