### Obtain Subcell loc info from Uniprot

In [1]:
import pandas as pd
from my_utils import find_duplicate

In [2]:
df = pd.read_csv('./Output/Proteomics-HPA_Merged.csv')

In [123]:
NE_names = ['Nucleus outer membrane','Nucleus membrane','Nucleus inner membrane','Nucleus, nuclear pore complex', 'Nucleus envelope', 'Nucleus lamina']
ER_names = ['Endoplasmic reticulum membrane','Endoplasmic reticulum','Sarcoplasmic reticulum membrane',
            'Endoplasmic reticulum-Golgi intermediate compartment membrane', 'Endoplasmic reticulum lumen']

In [200]:
for i, entry in enumerate(df.UniprotID):
    
    locations = []
    evidence_NE = []
    evidence_ER = []
    pmids = []
    
    try:
        r = get_url(f'{WEBSITE_API}/search?query=accession:{entry}&fields=cc_subcellular_location')
        subcell_loc_info = r.json()['results'][0]['comments'][0]['subcellularLocations']
        sleep(1)
        
        for info in subcell_loc_info:
            keys = info['location'].keys()
            if 'evidences' in keys: # check if the localization info contains an evidence
                location = info['location']['value']; locations.append(location) # if so the loc info is stored
                
                # Below check if localzation is either NE or ER
                # if so, the evidenceCode and PMID if any are stored
                if location in NE_names:
                    evidences = info['location']['evidences']
                    for evidence in evidences:
                        evidence_keys = evidence.keys()
                        evidenceCode = evidence['evidenceCode'];
                        if evidenceCode not in evidence_NE: # check if the Code is new
                            evidence_NE.append(evidenceCode)
                        if 'id' in evidence_keys: # check if PMID is available
                            pmids.append(evidence['id'])

                if location in ER_names:
                    evidences = info['location']['evidences']
                    for evidence in evidences:
                        evidence_keys = evidence.keys()
                        evidenceCode = evidence['evidenceCode'];
                        if evidenceCode not in evidence_ER:
                            evidence_ER.append(evidenceCode)
                        if 'id' in evidence_keys:
                            pmids.append(evidence['id'])
                            

    except:
        pass

    # store to df
    df.loc[i, 'Uniprot_Subcell_loc'] = ", ".join(locations)
    df.loc[i, 'Uniprot_Subcell_evi_NE'] = ", ".join(evidence_NE)
    df.loc[i, 'Uniprot_Subcell_evi_ER'] = ", ".join(evidence_ER)
    df.loc[i, 'Uniprot_Subcell_pmid-NE-ER'] = ", ".join(pmids)
    
    if i%200 == 0: print(i, entry, locations)

0 Q92604 ['Endoplasmic reticulum membrane']
{"url":"http://rest.uniprot.org/uniprotkb/search","messages":["The 'accession' filter value 'Not_found' has invalid format. It should be a valid UniProtKB accession"]}
200 Q9BVT8 ['Membrane', 'Postsynaptic cell membrane', 'Recycling endosome', 'Cytoplasm', 'Nucleus', 'Nucleus, nucleolus']
400 P35610 ['Endoplasmic reticulum membrane']
{"url":"http://rest.uniprot.org/uniprotkb/search","messages":["The 'accession' filter value '0' has invalid format. It should be a valid UniProtKB accession"]}
600 Q9UIG0 ['Nucleus']
{"url":"http://rest.uniprot.org/uniprotkb/search","messages":["The 'accession' filter value 'P42771;Q8N726' has invalid format. It should be a valid UniProtKB accession"]}
800 P53567 ['Nucleus']
{"url":"http://rest.uniprot.org/uniprotkb/search","messages":["The 'accession' filter value 'P0CG12;P0CG13' has invalid format. It should be a valid UniProtKB accession"]}
{"url":"http://rest.uniprot.org/uniprotkb/search","messages":["The 'ac

In [201]:
good_evidence = ['ECO:0000269', # experimental
                 'ECO:0000305', # inferrence from paper
                 'ECO:0000250', # seq similarity
                 'ECO:0000255', # seq model
                 'ECO:0000312', # imported from other database
                 'ECO:0007744'] # a combination of experimental and computational evidence

def checkEvidence(x):
    if x in good_evidence:
        return True
    else:
        return False

In [218]:
import numpy as np
np.where((True|False), 1, 0)

array(1)

In [202]:
five_papers = ['12958361', '20693407', '20876400', '22990521', '31142202']
def checkPMID(x):
    return_value = 0
    for pmid in five_papers:
        if pmid in x:
            return_value = 1
    
    return return_value

In [224]:
# Uniprot score
# check if certain evidenceCode is in either evidence_NE or _ER
df['Uniprot_subCell'] = np.where((df['Uniprot_Subcell_evi_NE'].apply(str).apply(checkEvidence) | df['Uniprot_Subcell_evi_ER'].apply(str).apply(checkEvidence)), 1, 0)
# df['Uniprot_subCell'] = df.Uniprot_Subcell_evi_ER.apply(lambda x: 1 if checkEvidence(x) else 0)

In [204]:
# df['Uniprot_Subcell_pmid-NE-ER'] = df['Uniprot_Subcell_pmid-NE-ER'].apply(str)
df['5_papers'] = df['Uniprot_Subcell_pmid-NE-ER'].apply(str).apply(checkPMID)

In [205]:
df.head(n=30)

Unnamed: 0,UniprotID,GeneName,Schirmer_2003,Korfali_2010,Wilkie_2010,Korfali_2012,Cheng_2019,HPA,HPA_loc,HPA_reliability,NE Enrich Score in U,NE Enrich Score in A,NE Enrich Score in M,Uniprot_Subcell_loc,Uniprot_Subcell_evi_NE,Uniprot_Subcell_evi_ER,Uniprot_Subcell_pmid-NE-ER,Uniprot_subCell,5_papers
0,Q92604,LPGAT1 FAM34A KIAA0205,1.0,1.0,1.0,1.0,0.0,0,0,0,0.0,0.0,0.0,Endoplasmic reticulum membrane,,ECO:0000269,15485873,1,0
1,Q9Y3T9,NOC2L NIR,1.0,1.0,1.0,0.0,0.0,1,Nucleoli: Validated,Validated,0.0,0.0,0.0,,,,,0,0
2,Q8NBX0,SCCPDH CGI-49,1.0,1.0,1.0,1.0,0.0,0,0,0,0.0,0.0,0.0,,,,,0,0
3,A0A384NPM7,SCCPDH hCG_1782151,1.0,0.0,0.0,0.0,0.0,0,0,0,0.0,0.0,0.0,,,,,0,0
4,Q5VTL8,PRPF38B,1.0,1.0,0.0,0.0,0.0,0,0,0,0.0,0.0,0.0,Nucleus,,,,0,0
5,Q9BTX1,NDC1 TMEM48,1.0,1.0,1.0,1.0,1.0,1,Nuclear membrane: Supported;Actin filaments: U...,Supported,0.674122,0.740816,0.896719,,,,,0,0
6,Q8NFQ8,TOR1AIP2 IFRG15 LULL1,1.0,1.0,0.0,1.0,0.0,0,0,0,0.0,0.0,0.0,,,,,0,0
7,Q9NXE4,SMPD4 KIAA1418 SKNY,1.0,1.0,1.0,1.0,1.0,0,0,0,0.9327,0.895236,0.983445,"Endoplasmic reticulum membrane, Golgi apparatu...",ECO:0000269,ECO:0000269,"16517606, 18505924, 31495489, 31495489",1,0
8,A0AV96,RBM47,1.0,1.0,0.0,0.0,0.0,1,Nucleoplasm: Supported;Cytosol: Supported,Supported,0.0,0.0,0.0,Nucleus,,,,0,0
9,Q7LBC6,KDM3B C5orf7 JHDM2B JMJD1B KIAA1082,1.0,1.0,0.0,0.0,0.0,1,Nucleoplasm: Validated,Validated,0.0,0.0,0.0,Nucleus,,,,0,0


In [228]:
df.to_csv('./Output/Merged_Proteome-HPA-UP_011023.csv', index=False)

In [23]:
df = pd.read_csv('./Output/Merged_Proteome-HPA-UP_011023.csv')

In [24]:
df.fillna('Not_found', inplace=True)

In [25]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3405 entries, 0 to 3404
Data columns (total 19 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   UniprotID                   3405 non-null   object 
 1   GeneName                    3405 non-null   object 
 2   Schirmer_2003               3405 non-null   float64
 3   Korfali_2010                3405 non-null   float64
 4   Wilkie_2010                 3405 non-null   float64
 5   Korfali_2012                3405 non-null   float64
 6   Cheng_2019                  3405 non-null   float64
 7   HPA                         3405 non-null   int64  
 8   HPA_loc                     3405 non-null   object 
 9   HPA_reliability             3405 non-null   object 
 10  NE Enrich Score in U        3405 non-null   float64
 11  NE Enrich Score in A        3405 non-null   float64
 12  NE Enrich Score in M        3405 non-null   float64
 13  Uniprot_Subcell_loc         3405 

In [26]:
df['Sum_Proteme'] = df.Schirmer_2003 + df.Korfali_2010 + df.Wilkie_2010 + df.Korfali_2012 + df.Cheng_2019
df['HPA-UP'] = df.HPA + df.Uniprot_subCell

In [27]:
df.loc[df.Sum_Proteme >= 2,'Tier'] = 1
df.loc[(df.Sum_Proteme == 1)&(df['HPA-UP'] == 1),'Tier'] = 2
df.fillna(3, inplace=True)

In [31]:
df_tier1_2 = df[df.Tier != 3]

In [32]:
df_tier1_2.to_csv('./Output/Merged_Proteome-HPA-UP_tier12_012023.csv', index=False)

In [47]:
df_tier1_2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 260 entries, 0 to 405
Data columns (total 22 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   UniprotID                   260 non-null    object 
 1   GeneName                    260 non-null    object 
 2   Schirmer_2003               260 non-null    float64
 3   Korfali_2010                260 non-null    float64
 4   Wilkie_2010                 260 non-null    float64
 5   Korfali_2012                260 non-null    float64
 6   Cheng_2019                  260 non-null    float64
 7   HPA                         260 non-null    int64  
 8   HPA_loc                     260 non-null    object 
 9   HPA_reliability             260 non-null    object 
 10  NE Enrich Score in U        260 non-null    float64
 11  NE Enrich Score in A        260 non-null    float64
 12  NE Enrich Score in M        260 non-null    float64
 13  Uniprot_Subcell_loc         260 non

In [37]:
df_tier1_2.tail()

Unnamed: 0,UniprotID,GeneName,Schirmer_2003,Korfali_2010,Wilkie_2010,Korfali_2012,Cheng_2019,HPA,HPA_loc,HPA_reliability,...,NE Enrich Score in M,Uniprot_Subcell_loc,Uniprot_Subcell_evi_NE,Uniprot_Subcell_evi_ER,Uniprot_Subcell_pmid-NE-ER,Uniprot_subCell,5_papers,Sum_Proteme,HPA-UP,Tier
400,P35610,Soat1,0.0,0.0,0.0,0.0,1.0,0,0,0,...,0.677791,Endoplasmic reticulum membrane,Not_found,ECO:0000269,"10438503, 16154994",1,0,1.0,1,2.0
401,Q9P0I2,Emc3,0.0,0.0,0.0,0.0,1.0,0,0,0,...,0.489759,Endoplasmic reticulum membrane,Not_found,ECO:0000269,22119785,1,0,1.0,1,2.0
403,Q86Y07,Vrk2,0.0,0.0,0.0,0.0,1.0,0,0,0,...,0.907452,"Cytoplasm, Endoplasmic reticulum membrane, Mit...",ECO:0000250,ECO:0000269,"16704422, Q8BN21",1,0,1.0,1,2.0
404,Q14728,Mfsd10,0.0,0.0,0.0,0.0,1.0,0,0,0,...,0.886931,Nucleus inner membrane,ECO:0000250,Not_found,Q9D2V8,1,0,1.0,1,2.0
405,O43292,Gpaa1,0.0,0.0,0.0,0.0,1.0,0,0,0,...,0.944521,Endoplasmic reticulum membrane,Not_found,ECO:0000269,11483512,1,0,1.0,1,2.0


## Crossing with MemBrain result

In [33]:
# MemBrain
df_MB = pd.read_csv('./Output/Results_step_3.csv')

In [51]:
df_MB = df_MB.drop_duplicates(subset=['Entry_original'])

In [63]:
_df = df_MB.merge(df_tier1_2, how='outer', left_on='Entry_Hs', right_on='UniprotID')
_df = _df.drop_duplicates(subset=['UniprotID'])

In [64]:
_df.fillna('Not_found', inplace=True)

In [65]:
df_tier1_2_NOT_MB = _df[(_df.Entry_original == 'Not_found')&(_df.UniprotID != 'Not_found')]

In [66]:
df_tier1_2___ = _df[_df.UniprotID != 'Not_found']

In [67]:
df_tier1_2___.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 259 entries, 749 to 2876
Data columns (total 32 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   Entry_original              259 non-null    object
 1   Organism                    259 non-null    object
 2   Gene_name                   259 non-null    object
 3   Protein_name                259 non-null    object
 4   AH_or_Not                   259 non-null    object
 5   AA_sequence                 259 non-null    object
 6   Prediction                  259 non-null    object
 7   SubCell_Uniprot             259 non-null    object
 8   Entry_Hs                    259 non-null    object
 9   Entry_Mm                    259 non-null    object
 10  UniprotID                   259 non-null    object
 11  GeneName                    259 non-null    object
 12  Schirmer_2003               259 non-null    object
 13  Korfali_2010                259 non-null    obj

In [68]:
df_tier1_2_NOT_MB.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 144 entries, 2733 to 2876
Data columns (total 32 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   Entry_original              144 non-null    object
 1   Organism                    144 non-null    object
 2   Gene_name                   144 non-null    object
 3   Protein_name                144 non-null    object
 4   AH_or_Not                   144 non-null    object
 5   AA_sequence                 144 non-null    object
 6   Prediction                  144 non-null    object
 7   SubCell_Uniprot             144 non-null    object
 8   Entry_Hs                    144 non-null    object
 9   Entry_Mm                    144 non-null    object
 10  UniprotID                   144 non-null    object
 11  GeneName                    144 non-null    object
 12  Schirmer_2003               144 non-null    object
 13  Korfali_2010                144 non-null    ob

In [69]:
df_tier1_2_MB = _df[(_df.Entry_original != 'Not_found')&(_df.UniprotID != 'Not_found')]

In [70]:
df_tier1_2_MB.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 115 entries, 749 to 2724
Data columns (total 32 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   Entry_original              115 non-null    object
 1   Organism                    115 non-null    object
 2   Gene_name                   115 non-null    object
 3   Protein_name                115 non-null    object
 4   AH_or_Not                   115 non-null    object
 5   AA_sequence                 115 non-null    object
 6   Prediction                  115 non-null    object
 7   SubCell_Uniprot             115 non-null    object
 8   Entry_Hs                    115 non-null    object
 9   Entry_Mm                    115 non-null    object
 10  UniprotID                   115 non-null    object
 11  GeneName                    115 non-null    object
 12  Schirmer_2003               115 non-null    object
 13  Korfali_2010                115 non-null    obj

In [71]:
df_tier1_2_MB_AH = df_tier1_2_MB[df_tier1_2_MB.AH_or_Not == 'AH']

In [72]:
df_tier1_2_MB_AH.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 83 entries, 749 to 2724
Data columns (total 32 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   Entry_original              83 non-null     object
 1   Organism                    83 non-null     object
 2   Gene_name                   83 non-null     object
 3   Protein_name                83 non-null     object
 4   AH_or_Not                   83 non-null     object
 5   AA_sequence                 83 non-null     object
 6   Prediction                  83 non-null     object
 7   SubCell_Uniprot             83 non-null     object
 8   Entry_Hs                    83 non-null     object
 9   Entry_Mm                    83 non-null     object
 10  UniprotID                   83 non-null     object
 11  GeneName                    83 non-null     object
 12  Schirmer_2003               83 non-null     object
 13  Korfali_2010                83 non-null     obje

In [185]:
print(df_tier1_2.shape, df_MB.shape, _df.shape)

(383, 17) (2733, 10) (4450, 27)


In [182]:
_df = _df.drop_duplicates()
_df.head(n=23)

AttributeError: 'tuple' object has no attribute 'drop_duplicates'

In [99]:
_df.tail()

Unnamed: 0,Entry_original,Organism,Gene_name,Protein_name,AH_or_Not,AA_sequence,Prediction,SubCell_Uniprot,Entry_Hs,Entry_Mm,...,Schirmer_2003,Korfali_2010,Wilkie_2010,Korfali_2012,Cheng_2019,HPA,HPA_reliability,NE Enrich Score in U,NE Enrich Score in A,NE Enrich Score in M
7422,,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,Nucleoplasm: Supported;Cytosol: Supported,Supported,0.0,0.0,0.0
7423,,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,Nucleus: Validated,Validated,0.0,0.0,0.0
7424,,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,Nucleus: Supported,Supported,0.0,0.0,0.0
7425,,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,Nucleus: Validated;Nucleoli: Supported,Validated,0.0,0.0,0.0
7426,,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,Nucleoli: Supported;Vesicles: Approved,Supported,0.0,0.0,0.0


In [49]:
_df_HPA = _df[_df.HPA != 0]

In [50]:
_df_HPA = _df_HPA[_df_HPA.HPA.str.contains('Nucle')]

In [52]:
_df_HPA.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 85 entries, 2242 to 2492
Data columns (total 22 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Entry_original        85 non-null     object 
 1   Organism              85 non-null     object 
 2   Gene_name             85 non-null     object 
 3   Protein_name          85 non-null     object 
 4   AH_or_Not             85 non-null     object 
 5   AA_sequence           85 non-null     object 
 6   Prediction            85 non-null     object 
 7   SubCell_Uniprot       85 non-null     object 
 8   Entry_Hs              85 non-null     object 
 9   Entry_Mm              85 non-null     object 
 10  UniprotID             85 non-null     object 
 11  GeneName              85 non-null     object 
 12  Schirmer_2003         85 non-null     float64
 13  Korfali_2010          85 non-null     float64
 14  Wilkie_2010           85 non-null     float64
 15  Korfali_2012        