In [1]:
import os, re, requests, json, warnings
import numpy as np
import pandas as pd
from urllib.request import urlopen
from time import sleep

warnings.filterwarnings('ignore')

In [2]:
def get_url(url, **kwargs):
    '''
    Obatin a response from a given url
    '''
    response = requests.get(url, **kwargs);

    if not response.ok:
        print(response.text)
        response.raise_for_status()
        sys.exit()

    return response

# Step 1: Scan raw data, judge AH or not, obtain organism and gene names, etc

In [None]:
# uniprot API URL
WEBSITE_API = "https://rest.uniprot.org/uniprotkb/"

In [2]:
# raw data import
files = os.listdir('./SourceData/Results from the paper/')
print(len(files))

11761


In [47]:
# prepare a dataframe and array for the ID not found in uniprot
df = pd.DataFrame()

# scan raw data files, judge AH_or_not, and get the gene info from uniprot based on the ID called "entry"
for i, file in enumerate(files):
    
    # check the file format
    if file.endswith('result'):
        
        # obtain uniprot ID called entry
        entry = file.replace('.result', '')
        
        # open the file
        with open('./SourceData/Results from the paper/' + file) as f:
            lines = f.readlines()
    
        # get the amino-acid sequence and prediction
        aa_sequence = lines[1]
        prediction = lines[3]

        # judge if the protein predicted to contain AH or not
        if '1' in prediction:
            AH_or_not = 'AH'
        else:
            AH_or_not = 'Non-AH'

        # get the result of query
        try:
            r = get_url(f"{WEBSITE_API}/search?query=(accession:{entry})&fields=organism_name,gene_primary,protein_name,cc_subcellular_location")
            result = r.json()['results'][0]
        except requests.exceptions.ConnectionError:
            r.status_code = "Connection refused"
            break
        
        # extract wanted texts from the result
        ## organism name
        ### put try-except in case the gene info is obsolete
        try:
            organism_name = result['organism']['scientificName']
        except:
            organism_name = 'Unknown'
        
        ## gene name
        ### put try-except in case it lacks geneName
        try: 
            gene_name = result['genes'][0]['geneName']['value']
        except: gene_name = 'Unknown'

        ## protein name
        try: 
            protein_name = result['proteinDescription']['recommendedName']['fullName']['value'] 
        except: protein_name = 'Unknown'

        ## subcellular locations
        subcell_loc = list()
        try:
            for v in result['comments'][0]['subcellularLocations']:
                subcell_loc.append(v['location']['value'])
                subcell_loc = ', '.join(subcell_loc)
        except:
            subcell_loc = 'Unknown'
        
        
        # put the values to the dataframe
        df.loc[i, 'Entry_original'] = entry
        df.loc[i, 'Organism'] = organism_name
        df.loc[i, 'Gene_name'] = gene_name
        df.loc[i, 'Protein_name'] = protein_name
        df.loc[i, 'AH_or_Not'] = AH_or_not
        df.loc[i, 'AA_sequence'] = aa_sequence
        df.loc[i, 'Prediction'] = prediction
        df.loc[i, 'SubCell_Uniprot'] = subcell_loc
        
        # log every 100 genes
        if i % 100 == 0: print(i, entry, organism_name, protein_name)
    
    # take a break, go next
    sleep(1)
            

0 Q9CAN8 Arabidopsis thaliana Lysophospholipid acyltransferase 2
100 P53389 Saccharomyces cerevisiae (strain ATCC 204508 / S288c) Protein HOL1
200 Q12155 Saccharomyces cerevisiae (strain ATCC 204508 / S288c) Uncharacterized membrane protein YLR050C
300 O26830 Methanothermobacter thermautotrophicus (strain ATCC 29096 / DSM 1053 / JCM 10044 / NBRC 100330 / Delta H) Putative phospho-N-acetylmuramoyl-pentapeptide-transferase
400 Q51MB1 Magnaporthe oryzae (strain 70-15 / ATCC MYA-4617 / FGSC 8958) pH-response regulator protein palI/RIM9
500 Q3E8Z8 Arabidopsis thaliana Putative pectinesterase/pectinesterase inhibitor 28
600 Q5NCP0 Mus musculus E3 ubiquitin-protein ligase RNF43
700 Q22549 Caenorhabditis elegans Innexin-10
800 O07888 Treponema pallidum (strain Nichols) Flagellar protein FliL
900 Q7Z695 Homo sapiens Uncharacterized aarF domain-containing protein kinase 2
1000 Q674R7 Homo sapiens Autophagy-related protein 9B
1100 O43085 Schizosaccharomyces pombe (strain 972 / ATCC 24843) DSC E3 

In [48]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11759 entries, 0 to 11760
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Entry_original   11759 non-null  object
 1   Organism         11759 non-null  object
 2   Gene_name        11759 non-null  object
 3   Protein_name     11759 non-null  object
 4   AH_or_Not        11759 non-null  object
 5   AA_sequence      11759 non-null  object
 6   Prediction       11759 non-null  object
 7   SubCell_Uniprot  11759 non-null  object
dtypes: object(8)
memory usage: 1.1+ MB


In [49]:
# export
df.to_csv('./IntermediateProducts/Results_step_1.csv', index=False)

# Step 2: Sort genes of metazoans

In [8]:
WEBSITE_API = 'https://rest.uniprot.org/taxonomy/'

In [3]:
# Import the df from step 1
df = pd.read_csv('./IntermediateProducts/Results_step_1.csv')

In [150]:
# create organism list
organism_list = df['Organism'].unique().tolist()
print('Number of organism species in the data is ', len(organism_list))

Number of organism species in the data is  1521


In [134]:
# a list for storage
lineage_list = list()

# regex for extracing organism name WITHOUT items in parenthesis
regex = re.compile(r'([^()]+)(\(.+\))?')

# scan organism list and get lineage from Uniprot Taxonomy
for i, organism in enumerate(organism_list):
    # extract organism name WITHOUT items in parenthesis such as strain name
    mo = regex.search(organism)
    if mo is None:
        lineage_list.append('Unknown')
        continue
    else: 
        organism = mo.group(1)
    
    # get response that contains lineage
    try:
        r = get_url(f'{WEBSITE_API}/search?query=(scientific:"{organism}")&fields=lineage')
        result = r.json()
    except requests.exceptions.ConnectionError:
        r.status_code = "Connection refused"
        break
    
    # extract lineage from the result
    lineage = list()
    lineage_full = ''
    try:
        for l in result['results'][0]['lineage']:
            name = l['scientificName']
            lineage.append(name)
        lineage_full = ', '.join(lineage)
    except:
        lineage_full = 'NotFound' # in case the result is empty
        
    # add to the found lineage to the list
    lineage_list.append(lineage_full)
    
    # log every 100
    if i % 100 == 0: print(i, organism, lineage[-2:])
    
    # take a break, go next
    sleep(1)

0 Arabidopsis thaliana ['Eukaryota', 'cellular organisms']
100 Salmonella phage P22 ['Duplodnaviria', 'Viruses']
200 Acropora millepora ['Eukaryota', 'cellular organisms']
300 Streptococcus pneumoniae  ['Bacteria', 'cellular organisms']
400 Acidianus bottle-shaped virus  []
500 Petunia integrifolia ['Eukaryota', 'cellular organisms']
600 Haloquadratum walsbyi  ['Archaea', 'cellular organisms']
700 Xanthomonas campestris pv. campestris  ['Bacteria', 'cellular organisms']
800 Friend murine leukemia virus  ['Riboviria', 'Viruses']
900 Oceanobacillus iheyensis  ['Bacteria', 'cellular organisms']
1000 Artemia salina ['Eukaryota', 'cellular organisms']
1100 Citrifermentans bemidjiense  ['Bacteria', 'cellular organisms']
1200 Quaranfil virus  ['Riboviria', 'Viruses']
1300 Planococcus maritimus ['Bacteria', 'cellular organisms']
1400 Macaca radiata ['Eukaryota', 'cellular organisms']
1500 Dechloromonas aromatica  ['Bacteria', 'cellular organisms']


In [141]:
# Add to the df_org
df_org = pd.DataFrame(organism_list, columns=['Organism'])
df_org['Lineage'] = lineage_list

# export
df_org.to_csv('./IntermediateProducts/Organisms_and_Lineage.csv', index=False)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 122 entries, 4 to 1518
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Organism  122 non-null    object
 1   Lineage   122 non-null    object
dtypes: object(2)
memory usage: 2.9+ KB
None


In [152]:
# Select metazoans
df_org_metazoa = df_org[df_org['Lineage'].str.contains('Metazoa')]
print('Number of metazoan species is ', len(df_org_metazoa))

Number of metazoan species is  122


In [165]:
# How many are eukaryotes, bacteria, archea, and virus
n_eukaryote = len(df_org[df_org['Lineage'].str.contains('Eukaryot')])
n_bacteria = len(df_org[df_org['Lineage'].str.contains('Bacteria')])
n_archaea = len(df_org[df_org['Lineage'].str.contains('Archaea')])
n_virus = len(df_org[df_org['Lineage'].str.contains('Virus')])
n_notFound = len(df_org) - np.sum([n_eukaryote, n_bacteria, n_archaea, n_virus])
print('Number of eukaryotic species is ', n_eukaryote)
print('Number of bacteria species is ', n_bacteria)
print('Number of archea species is ', n_archaea)
print('Number of virus species is ', n_virus)
print('Number of species not found in database is ', n_notFound)

Number of eukaryotic species is  431
Number of bacteria species is  732
Number of archea species is  61
Number of virus species is  281
Number of species not found in database is  16


In [153]:
# Merge df_org_metazoa with the main df
# by doing so, genes from metazoans are sorted
df_metazoanGenes = df.merge(df_org_metazoa, how='inner', on='Organism')
print(df_metazoanGenes.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2733 entries, 0 to 2732
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Entry_original   2733 non-null   object
 1   Organism         2733 non-null   object
 2   Gene_name        2733 non-null   object
 3   Protein_name     2733 non-null   object
 4   AH_or_Not        2733 non-null   object
 5   AA_sequence      2733 non-null   object
 6   Prediction       2733 non-null   object
 7   SubCell_Uniprot  2733 non-null   object
 8   Lineage          2733 non-null   object
dtypes: object(9)
memory usage: 213.5+ KB
None


In [166]:
# remove Lineage column
df_metazoanGenes = df_metazoanGenes.drop(['Lineage'], axis=1)

# export
df_metazoanGenes.to_csv('./IntermediateProducts/Results_step_2.csv', index=False)

# Step 3: Get human and mouse entry

In [3]:
# import df from step 3
df_metazoanGenes = pd.read_csv('./IntermediateProducts/Results_step_2.csv')

In [4]:
df_metazoanGenes.head()

Unnamed: 0,Entry_original,Organism,Gene_name,Protein_name,AH_or_Not,AA_sequence,Prediction,SubCell_Uniprot
0,Q16928,Anopheles albimanus,W,Protein white,Non-AH,MTINTDDQYADGESKTTISSNRRYSTSSFQDQSMEDDGINATLTND...,0000000000000000000000000000000000000000000000...,Membrane
1,Q01151,Homo sapiens,CD83,CD83 antigen,Non-AH,MSRGLQLLLLSCAYSLAPATPEVKVACSEDVDLPCTAPWDPQVPYT...,0000000000000000000000000000000000000000000000...,Membrane
2,Q0P6D2,Homo sapiens,DIPK1C,Divergent protein kinase domain 1C,AH,MARAAGARGPAGWCRRRGRCGRGTLLAFAAWTAGWVLAAALLLRAH...,0000000000000000000000000000000000000000000000...,Endoplasmic reticulum membrane
3,Q06136,Homo sapiens,KDSR,3-ketodihydrosphingosine reductase,AH,MLLLAAAFLVAFVLLLYMVSPLISPKPLALPGAHVVVTGGSSGIGK...,0000000000000000000000000000000000000000000000...,Endoplasmic reticulum membrane
4,Q3C1V0,Homo sapiens,MS4A18,Membrane-spanning 4-domains subfamily A member 18,AH,MTEQVIGANSVPGIIAPDNVHVIQPSNPVASGNHLQPSEVTTYPIS...,0000000000000000000000000000000000000000000000...,Membrane


In [5]:
WEBSITE_API = "https://rest.uniprot.org/uniprotkb/"

organism_id_list = {'Homo sapiens': '9606', 'Mus musculus': '10090'}

In [9]:
def entry_convert(df, organism):
    """
    For each entry in a given df, 
    if the entry is from the specified organism (human or mouse),
    just reuse the entry
    if not,
    convert the entry into that of the specified organism (human or mouse)
    """
    # For final output
    converted_id_list = list()
    
    # Organism of interest either human or mouse
    organism_id = organism_id_list[organism]
    
    # for counting how many genes were not found
    reused = 0
    non_match = 0
    not_found = 0
    
    # scan entrys in the df
    for i in range(len(df)):
        gene = df.iloc[i, 2]
        
#         # in case the gene name is unknown, it matches a gene Q9Y6S7, whose gene name is 'Unknown'
#         # thus below precludes this gene
#         if gene == 'Unknown': #
#             entry_converted = 'Not_found'
#             not_found += 1
#             # add to the final output list
#             converted_id_list.append(entry_converted)
#             continue
        
        # judge if the entry comes from human or mouse
        if df.iloc[i, 1] == organism:
            entry_converted = df.iloc[i, 0] # reuse the original gene name
            reused += 1
        else:
            
            # obtain the human or mouse entry from uniprot
            try:
                r = get_url(f'{WEBSITE_API}/search?query=gene:{gene}+AND+organism_id:{organism_id}+AND+reviewed:true&fields=accession,gene_names')
                result = r.json()['results'][0]
                
                ## get human or mouse entry
                entry_converted = result['primaryAccession']
                ## get gene name and check the names matche between human's or mouse's and the given organism's
                gene_obtained = result['genes'][0]['geneName']['value']
                if gene.lower() != gene_obtained.lower():
                    entry_converted = 'Not_found'
                    non_match += 1
            except:
                entry_converted = 'Not_found'
                not_found += 1
            
            # take a break
            sleep(1)
        
        # add to the final output list
        converted_id_list.append(entry_converted)
        
        # log every 100
        if i % 100 == 0: print(i, gene, entry_converted)

    
    return converted_id_list, reused, non_match, not_found

In [10]:
# get human entry
converted_entry_Hs, reused, non_match, not_found = entry_convert(df_metazoanGenes, 'Homo sapiens')

print('Total number of genes that got converted to Hs: ', len(df_metazoanGenes) - reused - non_match - not_found)
print('Number of genes for which similar gene name was found in Hs: ', non_match)
print('Number of genes for which Hs homolog was not found: ', not_found)

0 W Not_found
100 SEMA6B Q9H3T3
200 LRP12 Q9Y561
300 DCBLD2 Q96PD2
400 SMIM41 A0A2R8YCJ5
500 PREB Q9HCU5
600 TMEM61 Q8N0U2
700 BRICD5 Q6PL45
800 HHLA2 Q9UM44
900 Nrros Q86YC3
1000 Zdhhc19 Q8WVZ1
1100 Tomm70 O94826
1200 Ms4a13 Q5J8X5
1300 PLPPR2 Q96GM1
1400 TMEM54 Q969K7
1500 clc-5 Not_found
1600 nra-2 Not_found
1700 Unknown Not_found
1800 Unknown Not_found
1900 Klri2 Not_found
2000 TMEM196 Q5HYL7
2100 ATP1B4 Q9UN42
2200 frc Not_found
2300 DppIII Not_found
2400 slc39a10 Q9ULF5
2500 PAPI Not_found
{"url":"http://rest.uniprot.org/uniprotkb/search","messages":["query parameter has an invalid syntax"]}
{"url":"http://rest.uniprot.org/uniprotkb/search","messages":["query parameter has an invalid syntax"]}
2600 chrnd Q07001
{"url":"http://rest.uniprot.org/uniprotkb/search","messages":["query parameter has an invalid syntax"]}
2700 Unknown Not_found
{"url":"http://rest.uniprot.org/uniprotkb/search","messages":["query parameter has an invalid syntax"]}
Total number of genes that got converted t

In [11]:
# Add the result to Entry_Hs column
df_metazoanGenes['Entry_Hs'] = converted_entry_Hs

In [12]:
# get mouse entry
converted_entry_Mm, reused, non_match, not_found = entry_convert(df_metazoanGenes, 'Mus musculus')

print('Total number of genes that got converted to Mm: ', len(df_metazoanGenes) - reused - non_match - not_found)
print('Number of genes for which similar gene name was found in Mm: ', non_match)
print('Number of genes for which Mm homolog was not found: ', not_found)

0 W Not_found
100 SEMA6B O54951
200 LRP12 Q8BUJ9
300 DCBLD2 Q91ZV3
400 SMIM41 Not_found
500 PREB Q9WUQ2
600 TMEM61 Not_found
700 BRICD5 Not_found
800 HHLA2 Not_found
900 Nrros Q8BMT4
1000 Zdhhc19 Q810M5
1100 Tomm70 Q9CZW5
1200 Ms4a13 Q5FWC3
1300 PLPPR2 Q8VCY8
1400 TMEM54 Q9D7S1
1500 clc-5 Not_found
1600 nra-2 Not_found
1700 Unknown Not_found
1800 Unknown Not_found
1900 Klri2 Q5DT36
2000 TMEM196 Not_found
2100 ATP1B4 Q99ME6
2200 frc Not_found
2300 DppIII Not_found
2400 slc39a10 Q6P5F6
2500 PAPI Not_found
{"url":"http://rest.uniprot.org/uniprotkb/search","messages":["query parameter has an invalid syntax"]}
{"url":"http://rest.uniprot.org/uniprotkb/search","messages":["query parameter has an invalid syntax"]}
2600 chrnd P02716
{"url":"http://rest.uniprot.org/uniprotkb/search","messages":["query parameter has an invalid syntax"]}
2700 Unknown Not_found
{"url":"http://rest.uniprot.org/uniprotkb/search","messages":["query parameter has an invalid syntax"]}
Total number of genes that got con

In [13]:
# Add the result to Entry_Mm column
df_metazoanGenes['Entry_Mm'] = converted_entry_Mm

In [17]:
df_metazoanGenes.head(n=5)

Unnamed: 0,Entry_original,Organism,Gene_name,Protein_name,AH_or_Not,AA_sequence,Prediction,SubCell_Uniprot,Entry_Hs,Entry_Mm
0,Q16928,Anopheles albimanus,W,Protein white,Non-AH,MTINTDDQYADGESKTTISSNRRYSTSSFQDQSMEDDGINATLTND...,0000000000000000000000000000000000000000000000...,Membrane,Not_found,Not_found
1,Q01151,Homo sapiens,CD83,CD83 antigen,Non-AH,MSRGLQLLLLSCAYSLAPATPEVKVACSEDVDLPCTAPWDPQVPYT...,0000000000000000000000000000000000000000000000...,Membrane,Q01151,O88324
2,Q0P6D2,Homo sapiens,DIPK1C,Divergent protein kinase domain 1C,AH,MARAAGARGPAGWCRRRGRCGRGTLLAFAAWTAGWVLAAALLLRAH...,0000000000000000000000000000000000000000000000...,Endoplasmic reticulum membrane,Q0P6D2,Q8BQT2
3,Q06136,Homo sapiens,KDSR,3-ketodihydrosphingosine reductase,AH,MLLLAAAFLVAFVLLLYMVSPLISPKPLALPGAHVVVTGGSSGIGK...,0000000000000000000000000000000000000000000000...,Endoplasmic reticulum membrane,Q06136,Q6GV12
4,Q3C1V0,Homo sapiens,MS4A18,Membrane-spanning 4-domains subfamily A member 18,AH,MTEQVIGANSVPGIIAPDNVHVIQPSNPVASGNHLQPSEVTTYPIS...,0000000000000000000000000000000000000000000000...,Membrane,Q3C1V0,Not_found


In [16]:
# Export
df_metazoanGenes.to_csv('./IntermediateProducts/Results_step_3.csv', index=False)

# Step 4. Merge with proteome data

## 4-1. Human Protein Atlas (HPA)

In [30]:
# MemBrain
df_MB = pd.read_csv('./IntermediateProducts/Results_step_3.csv')

# Human protein atlas
df_HPA = pd.read_csv('./IntermediateProducts/HumanProteinAtlas/HPA_val_supp_nucleus.csv')

In [53]:
# Merge
df_MB = df_MB.merge(df_HPA, how='left', left_on='Entry_Hs', right_on='Uniprot')

In [54]:
# null check
print(df_MB.isnull().sum())

Entry_original          0
Organism                0
Gene_name               0
Protein_name            0
AH_or_Not               0
AA_sequence             0
Prediction              0
SubCell_Uniprot         0
Entry_Hs                0
Entry_Mm                0
Gene                 2648
Uniprot              2648
Reliability          2648
IF location score    2648
dtype: int64


In [55]:
#fill NaN
df_MB.fillna('Not_found', inplace=True)

In [56]:
df_MB.columns

Index(['Entry_original', 'Organism', 'Gene_name', 'Protein_name', 'AH_or_Not',
       'AA_sequence', 'Prediction', 'SubCell_Uniprot', 'Entry_Hs', 'Entry_Mm',
       'Gene', 'Uniprot', 'Reliability', 'IF location score'],
      dtype='object')

In [57]:
df_MB = df_MB[['Entry_original', 'Organism', 'Gene_name', 'Protein_name', 'AH_or_Not',
       'AA_sequence', 'Prediction', 'Entry_Hs', 'Entry_Mm', 'SubCell_Uniprot', 'IF location score',
       'Reliability']]

df_MB = df_MB.rename(columns={'Reliability': 'HPA_reliability', 'IF location score': 'SubCell_HPA'})

In [58]:
df_MB.head()

Unnamed: 0,Entry_original,Organism,Gene_name,Protein_name,AH_or_Not,AA_sequence,Prediction,Entry_Hs,Entry_Mm,SubCell_Uniprot,SubCell_HPA,HPA_reliability
0,Q16928,Anopheles albimanus,W,Protein white,Non-AH,MTINTDDQYADGESKTTISSNRRYSTSSFQDQSMEDDGINATLTND...,0000000000000000000000000000000000000000000000...,Not_found,Not_found,Membrane,Not_found,Not_found
1,Q01151,Homo sapiens,CD83,CD83 antigen,Non-AH,MSRGLQLLLLSCAYSLAPATPEVKVACSEDVDLPCTAPWDPQVPYT...,0000000000000000000000000000000000000000000000...,Q01151,O88324,Membrane,Not_found,Not_found
2,Q0P6D2,Homo sapiens,DIPK1C,Divergent protein kinase domain 1C,AH,MARAAGARGPAGWCRRRGRCGRGTLLAFAAWTAGWVLAAALLLRAH...,0000000000000000000000000000000000000000000000...,Q0P6D2,Q8BQT2,Endoplasmic reticulum membrane,Not_found,Not_found
3,Q06136,Homo sapiens,KDSR,3-ketodihydrosphingosine reductase,AH,MLLLAAAFLVAFVLLLYMVSPLISPKPLALPGAHVVVTGGSSGIGK...,0000000000000000000000000000000000000000000000...,Q06136,Q6GV12,Endoplasmic reticulum membrane,Not_found,Not_found
4,Q3C1V0,Homo sapiens,MS4A18,Membrane-spanning 4-domains subfamily A member 18,AH,MTEQVIGANSVPGIIAPDNVHVIQPSNPVASGNHLQPSEVTTYPIS...,0000000000000000000000000000000000000000000000...,Q3C1V0,Not_found,Membrane,Not_found,Not_found


In [59]:
# Export
df_MB.to_csv('./IntermediateProducts/Results_step_4-1_HPA.csv', index=False)

## 4-2. Schirmer et al 2003

In [2]:
# MemBrain
df_MB = pd.read_csv('./IntermediateProducts/Results_step_4-1_HPA.csv')

# Schirmer 2003 data
df_Schirmer = pd.read_csv('./IntermediateProducts/ProteomePapers/Schirmer_2003.csv')

In [3]:
# select columns
df_Schirmer = df_Schirmer[['Entry', 'Entry name']]

In [4]:
# Merge
df_MB = df_MB.merge(df_Schirmer, how='left', left_on='Entry_Hs', right_on='Entry')

In [11]:
# null check
print(df_MB.isnull().sum())
print('----- \n')
print("The number of genes that are found in Schirmer et al Human genes is ", len(df_MB) - df_MB['Entry'].isnull().sum())

Entry_original        0
Organism              0
Gene_name             0
Protein_name          0
AH_or_Not             0
AA_sequence           0
Prediction            0
Entry_Hs              0
Entry_Mm              0
SubCell_Uniprot       0
SubCell_HPA           0
HPA_reliability       0
Entry              2726
Entry name         2726
dtype: int64
----- 

The number of genes that are found in Schirmer et al Human genes is  7


In [12]:
# Drop "Entry name"
df_MB = df_MB.drop(columns=['Entry name'], axis=1)

In [13]:
# Repeat Merge for Mouse entry because the Schirmer data contains mouse entry as well
df_MB = df_MB.merge(df_Schirmer, how='left', left_on='Entry_Mm', right_on='Entry')

In [14]:
# null check
print(df_MB.isnull().sum())
print('----- \n')
print("The number of genes that are found in Schirmer et al Mouse genes is ", len(df_MB) - df_MB['Entry_y'].isnull().sum())

Entry_original        0
Organism              0
Gene_name             0
Protein_name          0
AH_or_Not             0
AA_sequence           0
Prediction            0
Entry_Hs              0
Entry_Mm              0
SubCell_Uniprot       0
SubCell_HPA           0
HPA_reliability       0
Entry_x            2726
Entry_y            2728
Entry name         2728
dtype: int64
----- 

The number of genes that are found in Schirmer et al Mouse genes is  5


In [15]:
# replace NaN with 0 for the following lambda
df_MB.fillna(0, inplace=True)

# Put Subcell_Schirmer2003
df_MB['Subcell_Schirmer2003'] = df_MB.apply(lambda x: 'Nuclear membrane' if (x['Entry_x'] != 0) | (x['Entry_y'] != 0) else 'Not_found', axis=1)

In [16]:
# Drop unnecessary columns
df_MB = df_MB.drop(columns=['Entry name', 'Entry_x', 'Entry_y'], axis=1)

# Check
# df_MB[df_MB['Subcell_Schirmer2003'] != 'Not_found']

In [17]:
df_MB.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2733 entries, 0 to 2732
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Entry_original        2733 non-null   object
 1   Organism              2733 non-null   object
 2   Gene_name             2733 non-null   object
 3   Protein_name          2733 non-null   object
 4   AH_or_Not             2733 non-null   object
 5   AA_sequence           2733 non-null   object
 6   Prediction            2733 non-null   object
 7   Entry_Hs              2733 non-null   object
 8   Entry_Mm              2733 non-null   object
 9   SubCell_Uniprot       2733 non-null   object
 10  SubCell_HPA           2733 non-null   object
 11  HPA_reliability       2733 non-null   object
 12  Subcell_Schirmer2003  2733 non-null   object
dtypes: object(13)
memory usage: 298.9+ KB


In [18]:
# Export
df_MB.to_csv('./IntermediateProducts/Results_step_4-2_Schirmer2003.csv', index=False)

## 4-3. Korfali et al 2012

In [36]:
# MemBrain
df_MB = pd.read_csv('./IntermediateProducts/Results_step_4-2_Schirmer2003.csv')

# Korfali 2012 data
df_Korfali2012 = pd.read_csv('./IntermediateProducts/ProteomePapers/Korfali_2012.csv')

# Limit the Korfali data to those that have Entry
df_Korfali2012 = df_Korfali2012[df_Korfali2012['Entry_Korfali_2012'] != 'Not_found']

In [37]:
# Merge
df_MB = df_MB.merge(df_Korfali2012, how='left', left_on='Entry_Hs', right_on='Entry_Korfali_2012')

In [41]:
# null check
print(df_MB.isnull().sum())
print('----- \n')
print("The number of genes that are found in Korfali et al 2012 is ", len(df_MB) - df_MB['Entry_Korfali_2012'].isnull().sum())

Entry_original             0
Organism                   0
Gene_name                  0
Protein_name               0
AH_or_Not                  0
AA_sequence                0
Prediction                 0
Entry_Hs                   0
Entry_Mm                   0
SubCell_Uniprot            0
SubCell_HPA                0
HPA_reliability            0
Subcell_Schirmer2003       0
tissue                  2675
gene name               2675
NE:MM ratio by dNSAF    2675
Entry_Korfali_2012      2675
dtype: int64
----- 

The number of genes that are found in Korfali et al 2012 is  58


In [42]:
#fill NaN
df_MB.fillna('Not_found', inplace=True)

In [43]:
df_MB.columns

Index(['Entry_original', 'Organism', 'Gene_name', 'Protein_name', 'AH_or_Not',
       'AA_sequence', 'Prediction', 'Entry_Hs', 'Entry_Mm', 'SubCell_Uniprot',
       'SubCell_HPA', 'HPA_reliability', 'Subcell_Schirmer2003', 'tissue',
       'gene name', 'NE:MM ratio by dNSAF', 'Entry_Korfali_2012'],
      dtype='object')

In [44]:
df_MB = df_MB[['Entry_original', 'Organism', 'Gene_name', 'Protein_name', 'AH_or_Not',
       'AA_sequence', 'Prediction', 'Entry_Hs', 'Entry_Mm', 'SubCell_Uniprot',
       'SubCell_HPA', 'HPA_reliability', 'Subcell_Schirmer2003', 'tissue', 'NE:MM ratio by dNSAF']]

df_MB = df_MB.rename(columns={'tissue': 'Tissue_Korfali_2012', 'NE:MM ratio by dNSAF': 'NE:MM_ratio_Korfali_2012'})

In [49]:
df_MB['Subcell_Korfali2012'] = df_MB.apply(lambda x: 'Nuclear membrane' if x['Tissue_Korfali_2012'] != 'Not_found' else 'Not_found', axis=1)

In [52]:
# Export
df_MB.to_csv('./IntermediateProducts/Results_step_4-3_Korfali2012.csv', index=False)

## 4-4. Wilkie et al 2010

In [73]:
# MemBrain
df_MB = pd.read_csv('./IntermediateProducts/Results_step_4-3_Korfali2012.csv')

# Wilkie 2010 data
df_Wilkie2010 = pd.read_csv('./IntermediateProducts/ProteomePapers/Wilkie_2010.csv')

In [74]:
# Limit the data to those that have Entry
df_Wilkie2010 = df_Wilkie2010[df_Wilkie2010['Entry_Wilkie'] != 'Not_found']

In [75]:
# Merge
df_MB = df_MB.merge(df_Wilkie2010, how='left', left_on='Entry_Hs', right_on='Entry_Wilkie')

In [76]:
# null check
print(df_MB.isnull().sum())
print('----- \n')
print("The number of genes that are found in Wilike et al 2010 is ", len(df_MB) - df_MB['Entry_Wilkie'].isnull().sum())

Entry_original                 0
Organism                       0
Gene_name                      0
Protein_name                   0
AH_or_Not                      0
AA_sequence                    0
Prediction                     0
Entry_Hs                       0
Entry_Mm                       0
SubCell_Uniprot                0
SubCell_HPA                    0
HPA_reliability                0
Subcell_Schirmer2003           0
Tissue_Korfali_2012            0
NE:MM_ratio_Korfali_2012       0
Subcell_Korfali2012            0
Entry_Wilkie                2682
Gene_name_obtained          2682
dtype: int64
----- 

The number of genes that are found in Wilike et al 2010 is  51


In [77]:
# fillna
df_MB.fillna('Not_found', inplace=True)

# Assign subcell
df_MB['Subcell_Wilkie2010'] = df_MB.apply(lambda x: 'Nuclear membrane' if x['Entry_Wilkie'] != 'Not_found' else 'Not_found', axis=1)

print(df_MB.columns)

Index(['Entry_original', 'Organism', 'Gene_name', 'Protein_name', 'AH_or_Not',
       'AA_sequence', 'Prediction', 'Entry_Hs', 'Entry_Mm', 'SubCell_Uniprot',
       'SubCell_HPA', 'HPA_reliability', 'Subcell_Schirmer2003',
       'Tissue_Korfali_2012', 'NE:MM_ratio_Korfali_2012',
       'Subcell_Korfali2012', 'Entry_Wilkie', 'Gene_name_obtained',
       'Subcell_Wilkie2010'],
      dtype='object')


In [84]:
# drop columns and reorder
df_MB = df_MB[['Entry_original', 'Organism', 'Gene_name', 'Protein_name', 'AH_or_Not',
       'AA_sequence', 'Prediction', 'Entry_Hs', 'Entry_Mm', 'SubCell_Uniprot',
       'SubCell_HPA', 'HPA_reliability', 'Subcell_Schirmer2003','Subcell_Korfali2012','Subcell_Wilkie2010',
       'Tissue_Korfali_2012', 'NE:MM_ratio_Korfali_2012',]]

In [86]:
# Export
df_MB.to_csv('./IntermediateProducts/Results_step_4-4_Wilkie2010.csv', index=False)

## 4-5. Korfali et al 2010

In [9]:
# MemBrain
df_MB = pd.read_csv('./IntermediateProducts/Results_step_4-4_Wilkie2010.csv')

# Wilkie 2010 data
df_Korfali2010 = pd.read_csv('./IntermediateProducts/ProteomePapers/Korfali_2010.csv')

In [11]:
# Limit the Korfali data to those that have Entry
df_Korfali2010 = df_Korfali2010[df_Korfali2010['Entry_Korfali_2010'] != 'Not_found']

In [13]:
# Merge
df_MB = df_MB.merge(df_Korfali2010, how='left', left_on='Entry_Hs', right_on='Entry_Korfali_2010')

In [17]:
# null check
print(df_MB.isnull().sum())
print('\n------')
print('The number of genes found in df_MB is ', len(df_MB) - df_MB['Entry_Korfali_2010'].isnull().sum())

Entry_original                 0
Organism                       0
Gene_name                      0
Protein_name                   0
AH_or_Not                      0
AA_sequence                    0
Prediction                     0
Entry_Hs                       0
Entry_Mm                       0
SubCell_Uniprot                0
SubCell_HPA                    0
HPA_reliability                0
Subcell_Schirmer2003           0
Subcell_Korfali2012            0
Subcell_Wilkie2010             0
Tissue_Korfali_2012            0
NE:MM_ratio_Korfali_2012       0
Entry_Korfali_2010          2666
Gene_name_obtained          2666
dtype: int64

------
The number of genes found in df_MB is  67


In [18]:
# fillna
df_MB.fillna('Not_found', inplace=True)

# make a column 'Subcell_Korfali2010'
df_MB['Subcell_Korfali2010'] = df_MB.apply(lambda x: 'Nuclear membrane' if x['Entry_Korfali_2010'] != 'Not_found' else 'Not_found', axis=1)

In [19]:
print(df_MB.columns)

Index(['Entry_original', 'Organism', 'Gene_name', 'Protein_name', 'AH_or_Not',
       'AA_sequence', 'Prediction', 'Entry_Hs', 'Entry_Mm', 'SubCell_Uniprot',
       'SubCell_HPA', 'HPA_reliability', 'Subcell_Schirmer2003',
       'Subcell_Korfali2012', 'Subcell_Wilkie2010', 'Tissue_Korfali_2012',
       'NE:MM_ratio_Korfali_2012', 'Entry_Korfali_2010', 'Gene_name_obtained',
       'Subcell_Korfali2010'],
      dtype='object')


In [20]:
df_MB = df_MB[['Entry_original', 'Organism', 'Gene_name', 'Protein_name', 'AH_or_Not',
       'AA_sequence', 'Prediction', 'Entry_Hs', 'Entry_Mm', 'SubCell_Uniprot',
       'SubCell_HPA', 'HPA_reliability', 'Subcell_Schirmer2003','Subcell_Korfali2010',
       'Subcell_Korfali2012', 'Subcell_Wilkie2010', 'Tissue_Korfali_2012',
       'NE:MM_ratio_Korfali_2012']]

In [21]:
# export
df_MB.to_csv('./IntermediateProducts/Results_step_4-5_Korfali2010.csv', index=False)

## 4-6. Cheng et al 2019: 243 proteins

In [45]:
# MemBrain
df_MB = pd.read_csv('./IntermediateProducts/Results_step_4-5_Korfali2010.csv')

# Cheng 2019 data
df_Cheng2019 = pd.read_excel('./SourceData/Cheng_2019/Supplementary Table S3_clustering & peptide seq.xlsx')

In [46]:
# column names
columns = df_Cheng2019.iloc[0, :].tolist()

# extract values
df_Cheng2019 = df_Cheng2019.iloc[1:, :]

# rename columns
df_Cheng2019.columns = columns

In [47]:
df_Cheng2019 = df_Cheng2019[['Accession No.', 'NE Enrich Score in U', 'NE Enrich Score in A', 'NE Enrich Score in M']]

df_Cheng2019 = df_Cheng2019.rename({'Accession No.': 'Entry_Cheng2019'}, axis=1)

In [48]:
# merge
df_MB = df_MB.merge(df_Cheng2019, how='left', left_on='Entry_Mm', right_on='Entry_Cheng2019')

In [49]:
# null check
print(df_MB.isnull().sum())
print('\n------')
print('The number of genes found in df_MB is ', len(df_MB) - df_MB['Entry_Cheng2019'].isnull().sum())

Entry_original                 0
Organism                       0
Gene_name                      0
Protein_name                   0
AH_or_Not                      0
AA_sequence                    0
Prediction                     0
Entry_Hs                       0
Entry_Mm                       0
SubCell_Uniprot                0
SubCell_HPA                    0
HPA_reliability                0
Subcell_Schirmer2003           0
Subcell_Korfali2010            0
Subcell_Korfali2012            0
Subcell_Wilkie2010             0
Tissue_Korfali_2012            0
NE:MM_ratio_Korfali_2012       0
Entry_Cheng2019             2622
NE Enrich Score in U        2622
NE Enrich Score in A        2622
NE Enrich Score in M        2622
dtype: int64

------
The number of genes found in df_MB is  111


In [50]:
# fillna
df_MB.fillna('Not_found', inplace=True)

# make a column 'Subcell_Korfali2010'
df_MB['Subcell_Cheng2019'] = df_MB.apply(lambda x: 'Nuclear membrane' if x['Entry_Cheng2019'] != 'Not_found' else 'Not_found', axis=1)

In [51]:
print(df_MB.columns)

Index(['Entry_original', 'Organism', 'Gene_name', 'Protein_name', 'AH_or_Not',
       'AA_sequence', 'Prediction', 'Entry_Hs', 'Entry_Mm', 'SubCell_Uniprot',
       'SubCell_HPA', 'HPA_reliability', 'Subcell_Schirmer2003',
       'Subcell_Korfali2010', 'Subcell_Korfali2012', 'Subcell_Wilkie2010',
       'Tissue_Korfali_2012', 'NE:MM_ratio_Korfali_2012', 'Entry_Cheng2019',
       'NE Enrich Score in U', 'NE Enrich Score in A', 'NE Enrich Score in M',
       'Subcell_Cheng2019'],
      dtype='object')


In [55]:
df_MB = df_MB[['Entry_original', 'Organism', 'Gene_name', 'Protein_name', 'AH_or_Not',
       'AA_sequence', 'Prediction', 'Entry_Hs', 'Entry_Mm', 'SubCell_Uniprot',
       'SubCell_HPA', 'HPA_reliability', 'Subcell_Schirmer2003',
       'Subcell_Korfali2010', 'Subcell_Wilkie2010','Subcell_Korfali2012', 'Subcell_Cheng2019',
        'Tissue_Korfali_2012', 'NE:MM_ratio_Korfali_2012', 'NE Enrich Score in U', 'NE Enrich Score in A', 'NE Enrich Score in M']]

df_MB = df_MB.rename({'NE Enrich Score in U':'NEscore_Undiff_Cheng2019', 'NE Enrich Score in A':'NEscore_Adipo_Cheng2019', 'NE Enrich Score in M':'NEscore_Myo_Cheng2019'}, axis=1)

In [56]:
df_MB.head()

Unnamed: 0,Entry_original,Organism,Gene_name,Protein_name,AH_or_Not,AA_sequence,Prediction,Entry_Hs,Entry_Mm,SubCell_Uniprot,...,Subcell_Schirmer2003,Subcell_Korfali2010,Subcell_Wilkie2010,Subcell_Korfali2012,Subcell_Cheng2019,Tissue_Korfali_2012,NE:MM_ratio_Korfali_2012,NEscore_Undiff_Cheng2019,NEscore_Adipo_Cheng2019,NEscore_Myo_Cheng2019
0,Q16928,Anopheles albimanus,W,Protein white,Non-AH,MTINTDDQYADGESKTTISSNRRYSTSSFQDQSMEDDGINATLTND...,0000000000000000000000000000000000000000000000...,Not_found,Not_found,Membrane,...,Not_found,Not_found,Not_found,Not_found,Not_found,Not_found,Not_found,Not_found,Not_found,Not_found
1,Q01151,Homo sapiens,CD83,CD83 antigen,Non-AH,MSRGLQLLLLSCAYSLAPATPEVKVACSEDVDLPCTAPWDPQVPYT...,0000000000000000000000000000000000000000000000...,Q01151,O88324,Membrane,...,Not_found,Not_found,Not_found,Not_found,Not_found,Not_found,Not_found,Not_found,Not_found,Not_found
2,Q0P6D2,Homo sapiens,DIPK1C,Divergent protein kinase domain 1C,AH,MARAAGARGPAGWCRRRGRCGRGTLLAFAAWTAGWVLAAALLLRAH...,0000000000000000000000000000000000000000000000...,Q0P6D2,Q8BQT2,Endoplasmic reticulum membrane,...,Not_found,Not_found,Not_found,Not_found,Not_found,Not_found,Not_found,Not_found,Not_found,Not_found
3,Q06136,Homo sapiens,KDSR,3-ketodihydrosphingosine reductase,AH,MLLLAAAFLVAFVLLLYMVSPLISPKPLALPGAHVVVTGGSSGIGK...,0000000000000000000000000000000000000000000000...,Q06136,Q6GV12,Endoplasmic reticulum membrane,...,Not_found,Not_found,Not_found,Not_found,Nuclear membrane,Not_found,Not_found,0.253493,0.420695,0.720657
4,Q3C1V0,Homo sapiens,MS4A18,Membrane-spanning 4-domains subfamily A member 18,AH,MTEQVIGANSVPGIIAPDNVHVIQPSNPVASGNHLQPSEVTTYPIS...,0000000000000000000000000000000000000000000000...,Q3C1V0,Not_found,Membrane,...,Not_found,Not_found,Not_found,Not_found,Not_found,Not_found,Not_found,Not_found,Not_found,Not_found


In [58]:
# export: all 2733 proteins
df_MB.to_csv('./FinalOutput/Results_Whole2733.csv', index=False)

# export: only proteins with Entry_Hs or Mm was found
df_MB_w_entry = df_MB[(df_MB['Entry_Hs'] != 'Not_found')|(df_MB.Entry_Mm != 'Not_found')]
print(df_MB_w_entry.shape)
df_MB_w_entry.to_csv('./FinalOutput/Results_w_Entry.csv', index=False)

(2050, 22)


In [65]:
# export: only with NE proteins
df_MB_NE = df_MB[(df_MB.SubCell_Uniprot.str.contains('Nucleaus'))|
                (df_MB.SubCell_HPA.str.contains('Nucle'))|
                (df_MB.Subcell_Schirmer2003.str.contains('Nuclear'))|
                (df_MB.Subcell_Korfali2010.str.contains('Nuclear'))|
                (df_MB.Subcell_Wilkie2010.str.contains('Nuclear'))|
                (df_MB.Subcell_Korfali2012.str.contains('Nuclear'))|
                (df_MB.Subcell_Cheng2019.str.contains('Nuclear'))]
print(df_MB_NE.shape)
df_MB_NE.to_csv('./FinalOutput/Results_NE.csv', index=False)

(235, 22)
