In [1]:
import requests
import numpy as np
from time import sleep
import pandas as pd
from my_utils import get_url
import my_config

In [2]:
df = pd.read_csv('./Output/Proteomics-HPA_Merged.csv')

In [3]:
print(len(df))

410


In [4]:
NE_names = ['Nucleus outer membrane','Nucleus membrane','Nucleus inner membrane','Nucleus, nuclear pore complex', 'Nucleus envelope', 'Nucleus lamina']
ER_names = ['Endoplasmic reticulum membrane','Endoplasmic reticulum','Sarcoplasmic reticulum membrane',
            'Endoplasmic reticulum-Golgi intermediate compartment membrane', 'Endoplasmic reticulum lumen']
NE_ER_names = NE_names + ER_names

good_evidence = ['ECO:0000269', # experimental
                 'ECO:0000305', # inferrence from paper
                 'ECO:0000250', # seq similarity
                 'ECO:0000255', # seq model
                 'ECO:0000312', # imported from other database
                 'ECO:0007744'] # a combination of experimental and computational evidence

five_papers = ['12958361', '20693407', '20876400', '22990521', '31142202']

In [5]:
for i, entry in enumerate(df.UniprotID):
    
    # List for data storage
    location_store = []
    evidences_store = []
    pmids_store = []
    # For judging Uniprot supports NE or ER localization
    location_judge = 0
    evidence_judge = 0
    pmid_judge = 0 # if pmid is in five papers, values becomes 
    
    try:
        # Query the gene name to get the Uniprot ID and the Uniprot-registered gene name
        params = {
        "query": f'accession:{entry}',
        "fields": "cc_subcellular_location",
        "format": "json"
        }
        
        r = get_url(my_config.WEBSITE_API, params=params)
        result = r.json()['results'][0]
        if 'comments' in result:
            for comment in result['comments']:
                if comment['commentType'] == 'SUBCELLULAR LOCATION':
                    subcell_loc_info = comment.get('subcellularLocations', []) # this can contain subcellular location name and evidence information
                    for info in subcell_loc_info: # each info could contain location name in "value" and evidence info in "evidences", the latter can be multiple
                        if 'location' in info:
                            location = info['location'].get('value', '')
                            evidences = info['location'].get('evidences', [])
                            
                            # Location
                            if location:
                                location_store.append(location)
                                # determine if location is NE or ER
                                if location in NE_ER_names:
                                    location_judge = 1
                        
                            # Evidence
                            for evidence in evidences: # each evidence contains one evidence code and pmid
                                # Evidence code
                                evidenceCode = evidence.get('evidenceCode', '')
                                if evidenceCode not in evidences_store:
                                    evidences_store.append(evidenceCode)
                                # determine if the evidence is reliable
                                if evidenceCode in good_evidence:
                                    evidence_judge = 1
                                
                                # PMID if any
                                if 'id' in evidence:
                                    pmids_store.append(evidence['id'])
                                    # determine if the paper is one from the 5 proteome paper
                                    if evidence['id'] in five_papers:
                                        pmid_judge = 1
                            

    except Exception as e:
        print(f'Error in fetching {entry}: {e}')

    # store to df
    df.loc[i, 'Uniprot_loc'] = ", ".join(location_store)
    df.loc[i, 'Uniprot_loc_evi'] = ", ".join(evidences_store)
    df.loc[i, 'Uniprot_loc_pmid'] = ", ".join(pmids_store)
    df.loc[i, 'Uniprot_loc_judgeNEER'] = location_judge
    df.loc[i, 'Uniprot_loc_judgeEvi'] = evidence_judge
    df.loc[i, 'Uniprot_loc_judgePMID'] = pmid_judge
    
    
    if i % 40 == 0: print(i, entry, location_store, evidences_store, pmids_store, location_judge, evidence_judge, pmid_judge)
    
    sleep(1)

0 Q92604 ['Endoplasmic reticulum membrane'] ['ECO:0000269'] ['15485873'] 1 1 0


KeyboardInterrupt: 

In [44]:
df.to_csv('./Output/MergedProteome-HPA-UP_crude.csv', index=False)

#### Judge if 1) the locations contain any NE or ER AND 2) with good evidence AND 3) PMID is not any of the 5 proteome papers

In [40]:
df['Uniprot'] = ((df['Uniprot_loc_judgeNEER'] == 1) 
                 & (df['Uniprot_loc_judgeEvi'] == 1) 
                 & (df['Uniprot_loc_judgePMID'] == 0)).astype('int')

In [41]:
df_cleaned = df.drop(['Uniprot_loc_judgeNEER', 'Uniprot_loc_judgeEvi', 'Uniprot_loc_judgePMID'], axis=1)

In [43]:
df_cleaned.tail()

Unnamed: 0,UniprotID,GeneName,Schirmer_2003,Korfali_2010,Wilkie_2010,Korfali_2012,Korfali_2012_NE:MM-ratio,Cheng_2019,Cheng_2019_Score:Undiff,Cheng_2019_Score:Adipo,Cheng_2019_Score:Myo,#ProteomePapers,HPA,HPA_reliability,HPA_loc,Uniprot_loc,Uniprot_loc_evi,Uniprot_loc_pmid,Uniprot
405,Q9P0I2,Emc3,0.0,0.0,0.0,0.0,0.0,1.0,0.512254,0.425723,0.489759,1.0,0,NO,NO,Endoplasmic reticulum membrane,ECO:0000269,22119785,1
406,Q96HA1,Pom121,0.0,0.0,0.0,0.0,0.0,1.0,0.990009,0.938691,1.0,1.0,1,Validated,Nucleoplasm: Validated;Nuclear membrane: Valid...,"Nucleus, nuclear pore complex, Nucleus membran...","ECO:0000269, ECO:0000250","17900573, 17900573",1
407,Q86Y07,Vrk2,0.0,0.0,0.0,0.0,0.0,1.0,0.933302,0.730169,0.907452,1.0,0,NO,NO,"Cytoplasm, Endoplasmic reticulum membrane, Mit...","ECO:0000269, ECO:0000250","16704422, 16704422, 16704422, Q8BN21, 16704422...",1
408,Q14728,Mfsd10,0.0,0.0,0.0,0.0,0.0,1.0,0.8988,0.68708,0.886931,1.0,0,NO,NO,"Nucleus inner membrane, Cell membrane","ECO:0000250, ECO:0000305","Q9D2V8, 18638446",1
409,O43292,Gpaa1,0.0,0.0,0.0,0.0,0.0,1.0,0.671657,0.595264,0.944521,1.0,0,NO,NO,Endoplasmic reticulum membrane,ECO:0000269,11483512,1


In [62]:
five_papers = ['12958361', '20693407', '20876400', '22990521', '31142202']
def checkPMID(x):
    """
    Returns 1 if x contains any of the five papers ID
    """
    
    return_value = False
    for pmid in five_papers:
        if pmid in x:
            return_value = True
    
    return return_value

In [63]:
# Uniprot score
# check if certain evidenceCode is in either evidence_NE or _ER
df['Uniprot'] = np.where((df['Uniprot_loc_evi'].apply(str).apply(checkEvidence) & ~df['Uniprot_loc_pmid'].apply(str).apply(checkPMID)), 1, 0)
# df['Uniprot_subCell'] = df.Uniprot_Subcell_evi_ER.apply(lambda x: 1 if checkEvidence(x) else 0)

In [204]:
# df['Uniprot_Subcell_pmid-NE-ER'] = df['Uniprot_Subcell_pmid-NE-ER'].apply(str)
df['5_papers'] = df['Uniprot_Subcell_pmid-NE-ER'].apply(str).apply(checkPMID)

In [228]:
df.to_csv('./Output/Merged_Proteome-HPA-UP_011023.csv', index=False)

In [23]:
df = pd.read_csv('./Output/Merged_Proteome-HPA-UP_011023.csv')

In [24]:
df.fillna('Not_found', inplace=True)

In [25]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3405 entries, 0 to 3404
Data columns (total 19 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   UniprotID                   3405 non-null   object 
 1   GeneName                    3405 non-null   object 
 2   Schirmer_2003               3405 non-null   float64
 3   Korfali_2010                3405 non-null   float64
 4   Wilkie_2010                 3405 non-null   float64
 5   Korfali_2012                3405 non-null   float64
 6   Cheng_2019                  3405 non-null   float64
 7   HPA                         3405 non-null   int64  
 8   HPA_loc                     3405 non-null   object 
 9   HPA_reliability             3405 non-null   object 
 10  NE Enrich Score in U        3405 non-null   float64
 11  NE Enrich Score in A        3405 non-null   float64
 12  NE Enrich Score in M        3405 non-null   float64
 13  Uniprot_Subcell_loc         3405 

In [26]:
df['Sum_Proteme'] = df.Schirmer_2003 + df.Korfali_2010 + df.Wilkie_2010 + df.Korfali_2012 + df.Cheng_2019
df['HPA-UP'] = df.HPA + df.Uniprot_subCell

In [27]:
df.loc[df.Sum_Proteme >= 2,'Tier'] = 1
df.loc[(df.Sum_Proteme == 1)&(df['HPA-UP'] == 1),'Tier'] = 2
df.fillna(3, inplace=True)

In [31]:
df_tier1_2 = df[df.Tier != 3]

In [32]:
df_tier1_2.to_csv('./Output/Merged_Proteome-HPA-UP_tier12_012023.csv', index=False)

In [47]:
df_tier1_2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 260 entries, 0 to 405
Data columns (total 22 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   UniprotID                   260 non-null    object 
 1   GeneName                    260 non-null    object 
 2   Schirmer_2003               260 non-null    float64
 3   Korfali_2010                260 non-null    float64
 4   Wilkie_2010                 260 non-null    float64
 5   Korfali_2012                260 non-null    float64
 6   Cheng_2019                  260 non-null    float64
 7   HPA                         260 non-null    int64  
 8   HPA_loc                     260 non-null    object 
 9   HPA_reliability             260 non-null    object 
 10  NE Enrich Score in U        260 non-null    float64
 11  NE Enrich Score in A        260 non-null    float64
 12  NE Enrich Score in M        260 non-null    float64
 13  Uniprot_Subcell_loc         260 non

In [37]:
df_tier1_2.tail()

Unnamed: 0,UniprotID,GeneName,Schirmer_2003,Korfali_2010,Wilkie_2010,Korfali_2012,Cheng_2019,HPA,HPA_loc,HPA_reliability,...,NE Enrich Score in M,Uniprot_Subcell_loc,Uniprot_Subcell_evi_NE,Uniprot_Subcell_evi_ER,Uniprot_Subcell_pmid-NE-ER,Uniprot_subCell,5_papers,Sum_Proteme,HPA-UP,Tier
400,P35610,Soat1,0.0,0.0,0.0,0.0,1.0,0,0,0,...,0.677791,Endoplasmic reticulum membrane,Not_found,ECO:0000269,"10438503, 16154994",1,0,1.0,1,2.0
401,Q9P0I2,Emc3,0.0,0.0,0.0,0.0,1.0,0,0,0,...,0.489759,Endoplasmic reticulum membrane,Not_found,ECO:0000269,22119785,1,0,1.0,1,2.0
403,Q86Y07,Vrk2,0.0,0.0,0.0,0.0,1.0,0,0,0,...,0.907452,"Cytoplasm, Endoplasmic reticulum membrane, Mit...",ECO:0000250,ECO:0000269,"16704422, Q8BN21",1,0,1.0,1,2.0
404,Q14728,Mfsd10,0.0,0.0,0.0,0.0,1.0,0,0,0,...,0.886931,Nucleus inner membrane,ECO:0000250,Not_found,Q9D2V8,1,0,1.0,1,2.0
405,O43292,Gpaa1,0.0,0.0,0.0,0.0,1.0,0,0,0,...,0.944521,Endoplasmic reticulum membrane,Not_found,ECO:0000269,11483512,1,0,1.0,1,2.0


## Crossing with MemBrain result

In [33]:
# MemBrain
df_MB = pd.read_csv('./Output/Results_step_3.csv')

In [51]:
df_MB = df_MB.drop_duplicates(subset=['Entry_original'])

In [63]:
_df = df_MB.merge(df_tier1_2, how='outer', left_on='Entry_Hs', right_on='UniprotID')
_df = _df.drop_duplicates(subset=['UniprotID'])

In [64]:
_df.fillna('Not_found', inplace=True)

In [65]:
df_tier1_2_NOT_MB = _df[(_df.Entry_original == 'Not_found')&(_df.UniprotID != 'Not_found')]

In [66]:
df_tier1_2___ = _df[_df.UniprotID != 'Not_found']

In [67]:
df_tier1_2___.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 259 entries, 749 to 2876
Data columns (total 32 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   Entry_original              259 non-null    object
 1   Organism                    259 non-null    object
 2   Gene_name                   259 non-null    object
 3   Protein_name                259 non-null    object
 4   AH_or_Not                   259 non-null    object
 5   AA_sequence                 259 non-null    object
 6   Prediction                  259 non-null    object
 7   SubCell_Uniprot             259 non-null    object
 8   Entry_Hs                    259 non-null    object
 9   Entry_Mm                    259 non-null    object
 10  UniprotID                   259 non-null    object
 11  GeneName                    259 non-null    object
 12  Schirmer_2003               259 non-null    object
 13  Korfali_2010                259 non-null    obj

In [68]:
df_tier1_2_NOT_MB.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 144 entries, 2733 to 2876
Data columns (total 32 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   Entry_original              144 non-null    object
 1   Organism                    144 non-null    object
 2   Gene_name                   144 non-null    object
 3   Protein_name                144 non-null    object
 4   AH_or_Not                   144 non-null    object
 5   AA_sequence                 144 non-null    object
 6   Prediction                  144 non-null    object
 7   SubCell_Uniprot             144 non-null    object
 8   Entry_Hs                    144 non-null    object
 9   Entry_Mm                    144 non-null    object
 10  UniprotID                   144 non-null    object
 11  GeneName                    144 non-null    object
 12  Schirmer_2003               144 non-null    object
 13  Korfali_2010                144 non-null    ob

In [69]:
df_tier1_2_MB = _df[(_df.Entry_original != 'Not_found')&(_df.UniprotID != 'Not_found')]

In [70]:
df_tier1_2_MB.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 115 entries, 749 to 2724
Data columns (total 32 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   Entry_original              115 non-null    object
 1   Organism                    115 non-null    object
 2   Gene_name                   115 non-null    object
 3   Protein_name                115 non-null    object
 4   AH_or_Not                   115 non-null    object
 5   AA_sequence                 115 non-null    object
 6   Prediction                  115 non-null    object
 7   SubCell_Uniprot             115 non-null    object
 8   Entry_Hs                    115 non-null    object
 9   Entry_Mm                    115 non-null    object
 10  UniprotID                   115 non-null    object
 11  GeneName                    115 non-null    object
 12  Schirmer_2003               115 non-null    object
 13  Korfali_2010                115 non-null    obj

In [71]:
df_tier1_2_MB_AH = df_tier1_2_MB[df_tier1_2_MB.AH_or_Not == 'AH']

In [72]:
df_tier1_2_MB_AH.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 83 entries, 749 to 2724
Data columns (total 32 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   Entry_original              83 non-null     object
 1   Organism                    83 non-null     object
 2   Gene_name                   83 non-null     object
 3   Protein_name                83 non-null     object
 4   AH_or_Not                   83 non-null     object
 5   AA_sequence                 83 non-null     object
 6   Prediction                  83 non-null     object
 7   SubCell_Uniprot             83 non-null     object
 8   Entry_Hs                    83 non-null     object
 9   Entry_Mm                    83 non-null     object
 10  UniprotID                   83 non-null     object
 11  GeneName                    83 non-null     object
 12  Schirmer_2003               83 non-null     object
 13  Korfali_2010                83 non-null     obje

In [185]:
print(df_tier1_2.shape, df_MB.shape, _df.shape)

(383, 17) (2733, 10) (4450, 27)


In [182]:
_df = _df.drop_duplicates()
_df.head(n=23)

AttributeError: 'tuple' object has no attribute 'drop_duplicates'

In [99]:
_df.tail()

Unnamed: 0,Entry_original,Organism,Gene_name,Protein_name,AH_or_Not,AA_sequence,Prediction,SubCell_Uniprot,Entry_Hs,Entry_Mm,...,Schirmer_2003,Korfali_2010,Wilkie_2010,Korfali_2012,Cheng_2019,HPA,HPA_reliability,NE Enrich Score in U,NE Enrich Score in A,NE Enrich Score in M
7422,,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,Nucleoplasm: Supported;Cytosol: Supported,Supported,0.0,0.0,0.0
7423,,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,Nucleus: Validated,Validated,0.0,0.0,0.0
7424,,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,Nucleus: Supported,Supported,0.0,0.0,0.0
7425,,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,Nucleus: Validated;Nucleoli: Supported,Validated,0.0,0.0,0.0
7426,,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,Nucleoli: Supported;Vesicles: Approved,Supported,0.0,0.0,0.0


In [49]:
_df_HPA = _df[_df.HPA != 0]

In [50]:
_df_HPA = _df_HPA[_df_HPA.HPA.str.contains('Nucle')]

In [52]:
_df_HPA.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 85 entries, 2242 to 2492
Data columns (total 22 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Entry_original        85 non-null     object 
 1   Organism              85 non-null     object 
 2   Gene_name             85 non-null     object 
 3   Protein_name          85 non-null     object 
 4   AH_or_Not             85 non-null     object 
 5   AA_sequence           85 non-null     object 
 6   Prediction            85 non-null     object 
 7   SubCell_Uniprot       85 non-null     object 
 8   Entry_Hs              85 non-null     object 
 9   Entry_Mm              85 non-null     object 
 10  UniprotID             85 non-null     object 
 11  GeneName              85 non-null     object 
 12  Schirmer_2003         85 non-null     float64
 13  Korfali_2010          85 non-null     float64
 14  Wilkie_2010           85 non-null     float64
 15  Korfali_2012        