In [124]:
import pandas as pd
import requests, json
from time import sleep

In [108]:
def get_url(url, **kwargs):
    '''
    Obatin a response from a given url
    '''
    response = requests.get(url, **kwargs);

    if not response.ok:
        print(response.text)
        response.raise_for_status()
        sys.exit()

    return response

### 1. Data import

In [59]:
df_schirmer2003 = pd.read_csv('./Output/Schirmer2003/Output.csv')

In [60]:
df_schirmer2003.tail()

Unnamed: 0,Entry,Gene names
39,Q8N163,Ccar2
40,Q6NW34,Nepro
41,Q969X5,Ergic1
42,Q8NC56,Lemd2
43,Q5GFL6,Vwa2


In [61]:
df_korfali2010 = pd.read_csv('./Output/Korfali_2010.csv')

In [8]:
df_korfali2010.head()

Unnamed: 0,Entry_Korfali_2010,Gene_name_obtained
0,Q9UH99,SUN2
1,P42166,TMPO
2,Q14739,LBR
3,Q8TEM1,NUP210
4,Q9BTV4,TMEM43


In [62]:
df_wilkie2010 = pd.read_csv('./Output/Wilkie_2010.csv')

In [10]:
df_wilkie2010.head()

Unnamed: 0,Entry_Wilkie,Gene_name_obtained
0,Q9UH99,SUN2
1,Q5JTV8,TOR1AIP1
2,Q14739,LBR
3,O94901,SUN1
4,Q8TEM1,NUP210


In [63]:
df_korfali2012 = pd.read_csv('./Output/Korfali2012/Korfali2012_Hs.csv')

In [6]:
df_korfali2012.head()

Unnamed: 0.1,Unnamed: 0,tissue,gene name,alternate names,accession numbers,NE:MM ratio by dNSAF,reference,Uniprot_id
0,0,liver enriched,TMEM53,"NET4, transmembrane protein 53",ref|NP_081113.1|,2.57,"This study and Schirmer, E.C., et al. (2003). ...",Q6P2H8
1,1,liver enriched,TMEM120A,"NET29, transmembrane protein induced by tumor ...",ref|NP_766129.1|,inf,"This study and Malik, P., et al. (2010) Cell M...",Q9BXJ8
2,2,liver enriched,SCARA5,"NET33, PREDICTED: similar to protease, serine,...",gi|109502608|ref|XP_001066668.1|,0.1,"This study and Malik, P., et al. (2010) Cell M...",Q6ZMJ2
3,3,liver enriched,TMEM74,"NET36, PREDICTED: hypothetical protein [Rattus...",ref|XP_001063530.1|,3.36,"This study and Malik, P., et al. (2010) Cell M...",Q96NL1
4,4,liver enriched,PPAPDC3,"NET39, phosphatidic acid phosphatase type 2 do...",gi|59891419|ref|NP_001012349.1|;gi|34147436|re...,4.42,"This study and Schirmer, E.C., et al. (2003). ...",Q8NBV4


In [64]:
df_cheng2019 = pd.read_csv('./Output/Cheng2019.csv')

In [11]:
df_cheng2019.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 242 entries, 0 to 241
Data columns (total 5 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   gene_name             242 non-null    object 
 1   entry_h               242 non-null    object 
 2   NE Enrich Score in U  242 non-null    float64
 3   NE Enrich Score in A  242 non-null    float64
 4   NE Enrich Score in M  242 non-null    float64
dtypes: float64(3), object(2)
memory usage: 9.6+ KB


In [65]:
df_HPA = pd.read_csv('./Output/HPA_val_supp_nucleus.csv')

In [4]:
df_HPA.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3077 entries, 0 to 3076
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Gene               3077 non-null   object
 1   Uniprot            3067 non-null   object
 2   Reliability        3077 non-null   object
 3   IF location score  3077 non-null   object
dtypes: object(4)
memory usage: 96.3+ KB


### 2. Merge

### 2-1. Schirmer + Korfali2010

In [66]:
df = df_schirmer2003.merge(df_korfali2010, left_on='Entry', right_on='Entry_Korfali_2010', how='outer')

In [29]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 180 entries, 0 to 179
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   NCBI ID             56 non-null     object
 1   Entry               56 non-null     object
 2   Entry name          56 non-null     object
 3   Protein names       56 non-null     object
 4   Gene names          53 non-null     object
 5   Organism            56 non-null     object
 6   Entry_Korfali_2010  139 non-null    object
 7   Gene_name_obtained  139 non-null    object
dtypes: object(8)
memory usage: 12.7+ KB


In [67]:
df['UniprotID'] = df.Entry.fillna(df.Entry_Korfali_2010)
df['GeneName'] = df['Gene names'].fillna(df.Gene_name_obtained)

In [106]:
df.columns

Index(['NCBI ID', 'Entry', 'Entry name', 'Protein names', 'Gene names',
       'Organism', 'Entry_Korfali_2010', 'Gene_name_obtained', 'UniprotID',
       'GeneName'],
      dtype='object')

In [68]:
df = df[['UniprotID', 'GeneName', 'Entry', 'Entry_Korfali_2010']]

df.fillna(0, inplace=True)

df.Entry = df.Entry.apply(lambda x: 1 if x != 0 else 0)
df.Entry_Korfali_2010 = df.Entry_Korfali_2010.apply(lambda x: 1 if x != 0 else 0)

df = df.rename(columns={'Entry':'Schirmer_2003', 'Entry_Korfali_2010':'Korfali_2010'})

In [13]:
df.tail()

Unnamed: 0,UniprotID,GeneName,Schirmer_2003,Korfali_2010
152,Q86V85,GPR180,0,1
153,Q8N386,LRRC25,0,1
154,Q5TGY1,TMCO4,0,1
155,Q9NX61,TMEM161A,0,1
156,Q96AA3,RFT1,0,1


### 2-2. Add Wilkie 2010

In [69]:
df = df.merge(df_wilkie2010, left_on='UniprotID', right_on='Entry_Wilkie', how='outer')

df.UniprotID = df.UniprotID.fillna(df.Entry_Wilkie)
df.GeneName = df.GeneName.fillna(df.Gene_name_obtained)

df.fillna(0, inplace=True)

In [15]:
df.tail()

Unnamed: 0,UniprotID,GeneName,Schirmer_2003,Korfali_2010,Entry_Wilkie,Gene_name_obtained
182,Q6ZV29,PNPLA7,0.0,0.0,Q6ZV29,PNPLA7
183,Q0P6H9,TMEM62,0.0,0.0,Q0P6H9,TMEM62
184,Q8IZF2,ADGRF5,0.0,0.0,Q8IZF2,ADGRF5
185,A6NMS7,LRRC37A,0.0,0.0,A6NMS7,LRRC37A
186,Q7Z407,CSMD3,0.0,0.0,Q7Z407,CSMD3


In [70]:
df.Entry_Wilkie = df.Entry_Wilkie.apply(lambda x: 1 if x != 0 else 0)

In [71]:
df = df.drop(columns=['Gene_name_obtained'], axis=1)
df = df.rename(columns={'Entry_Wilkie': 'Wilkie_2010'})

### 2-3. Add Korfali 2012

In [72]:
df = df.merge(df_korfali2012, left_on='UniprotID', right_on='Uniprot_id', how='outer')

In [73]:
df.UniprotID = df.UniprotID.fillna(df.Uniprot_id)
df.GeneName = df.GeneName.fillna(df['gene name'])

df.fillna(0, inplace=True)

df.Uniprot_id = df.Uniprot_id.apply(lambda x: 1 if x != 0 else 0)

In [74]:
df.columns

Index(['UniprotID', 'GeneName', 'Schirmer_2003', 'Korfali_2010', 'Wilkie_2010',
       'Unnamed: 0', 'tissue', 'gene name', 'alternate names',
       'accession numbers', 'NE:MM ratio by dNSAF', 'reference', 'Uniprot_id'],
      dtype='object')

In [75]:
df = df[['UniprotID', 'GeneName', 'Schirmer_2003', 'Korfali_2010', 'Wilkie_2010',
           'Uniprot_id']]
df = df.rename(columns={'Uniprot_id':'Korfali_2012'})

In [76]:
df.tail()

Unnamed: 0,UniprotID,GeneName,Schirmer_2003,Korfali_2010,Wilkie_2010,Korfali_2012
212,Q96PC5,CTAGE5,0.0,0.0,0.0,1
213,Q8WUY1,C8orf55,0.0,0.0,0.0,1
214,Q7Z2K6,ERMP1,0.0,0.0,0.0,1
215,Q7Z3C6,ATG9A,0.0,0.0,0.0,1
216,Q6DD88,ATLA3,0.0,0.0,0.0,1


### 2-4. Add Cheng 2019

In [77]:
df = df.merge(df_cheng2019, left_on='UniprotID', right_on='entry_h', how='outer')

In [78]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 413 entries, 0 to 412
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   UniprotID             218 non-null    object 
 1   GeneName              218 non-null    object 
 2   Schirmer_2003         218 non-null    float64
 3   Korfali_2010          218 non-null    float64
 4   Wilkie_2010           218 non-null    float64
 5   Korfali_2012          218 non-null    float64
 6   gene_name             243 non-null    object 
 7   entry_h               243 non-null    object 
 8   NE Enrich Score in U  243 non-null    float64
 9   NE Enrich Score in A  243 non-null    float64
 10  NE Enrich Score in M  243 non-null    float64
dtypes: float64(7), object(4)
memory usage: 38.7+ KB


In [79]:
df.UniprotID = df.UniprotID.fillna(df.entry_h)
df.GeneName = df.GeneName.fillna(df.gene_name)

df.fillna(0, inplace=True)

df.entry_h = df.entry_h.apply(lambda x: 1 if x != 0 else 0)

In [80]:
df.columns

Index(['UniprotID', 'GeneName', 'Schirmer_2003', 'Korfali_2010', 'Wilkie_2010',
       'Korfali_2012', 'gene_name', 'entry_h', 'NE Enrich Score in U',
       'NE Enrich Score in A', 'NE Enrich Score in M'],
      dtype='object')

In [81]:
df = df.rename(columns={'entry_h':'Cheng_2019'})
df = df.drop(columns=['gene_name'], axis=1)

In [82]:
df.head()

Unnamed: 0,UniprotID,GeneName,Schirmer_2003,Korfali_2010,Wilkie_2010,Korfali_2012,Cheng_2019,NE Enrich Score in U,NE Enrich Score in A,NE Enrich Score in M
0,Q92604,LPGAT1 FAM34A KIAA0205,1.0,1.0,1.0,1.0,0,0.0,0.0,0.0
1,Q9Y3T9,NOC2L NIR,1.0,1.0,1.0,0.0,0,0.0,0.0,0.0
2,Q9Y3T9,Noc2l,1.0,1.0,1.0,0.0,0,0.0,0.0,0.0
3,Q8NBX0,SCCPDH CGI-49,1.0,1.0,1.0,1.0,0,0.0,0.0,0.0
4,A0A384NPM7,SCCPDH hCG_1782151,1.0,0.0,0.0,0.0,0,0.0,0.0,0.0


### 2-5. HPA

In [83]:
df_HPA.head()

Unnamed: 0,Gene,Uniprot,Reliability,IF location score
0,A1CF,Q9NQ94,Supported,Nucleoplasm: Supported
1,AAGAB,Q6PD74,Supported,Nuclear speckles: Approved;Cytosol: Supported
2,ABCB6,Q9NP58,Validated,Nucleoplasm: Supported;Golgi apparatus: Valida...
3,ABCB8,Q9NUT2,Supported,Nucleus: Approved;Mitochondria: Supported
4,ABCC5,O15440,Supported,Nucleus: Approved;Cell Junctions: Approved;Pla...


In [84]:
df = df.merge(df_HPA, left_on="UniprotID", right_on="Uniprot", how='outer')

In [85]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3424 entries, 0 to 3423
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   UniprotID             413 non-null    object 
 1   GeneName              413 non-null    object 
 2   Schirmer_2003         413 non-null    float64
 3   Korfali_2010          413 non-null    float64
 4   Wilkie_2010           413 non-null    float64
 5   Korfali_2012          413 non-null    float64
 6   Cheng_2019            413 non-null    float64
 7   NE Enrich Score in U  413 non-null    float64
 8   NE Enrich Score in A  413 non-null    float64
 9   NE Enrich Score in M  413 non-null    float64
 10  Gene                  3079 non-null   object 
 11  Uniprot               3069 non-null   object 
 12  Reliability           3079 non-null   object 
 13  IF location score     3079 non-null   object 
dtypes: float64(8), object(6)
memory usage: 401.2+ KB


In [86]:
df.head()

Unnamed: 0,UniprotID,GeneName,Schirmer_2003,Korfali_2010,Wilkie_2010,Korfali_2012,Cheng_2019,NE Enrich Score in U,NE Enrich Score in A,NE Enrich Score in M,Gene,Uniprot,Reliability,IF location score
0,Q92604,LPGAT1 FAM34A KIAA0205,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,,,,
1,Q9Y3T9,NOC2L NIR,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,NOC2L,Q9Y3T9,Validated,Nucleoli: Validated
2,Q9Y3T9,Noc2l,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,NOC2L,Q9Y3T9,Validated,Nucleoli: Validated
3,Q8NBX0,SCCPDH CGI-49,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,,,,
4,A0A384NPM7,SCCPDH hCG_1782151,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,


In [87]:
df.UniprotID = df.UniprotID.fillna(df.Uniprot)
df.GeneName = df.GeneName.fillna(df.Gene)

In [88]:
df.fillna(0, inplace=True)

df = df.rename(columns={'Reliability': 'HPA_reliability', 'IF location score':'HPA'})
df = df.drop(columns=['Gene', 'Uniprot'], axis=1)

In [89]:
df.columns

Index(['UniprotID', 'GeneName', 'Schirmer_2003', 'Korfali_2010', 'Wilkie_2010',
       'Korfali_2012', 'Cheng_2019', 'NE Enrich Score in U',
       'NE Enrich Score in A', 'NE Enrich Score in M', 'HPA_reliability',
       'HPA'],
      dtype='object')

In [90]:
df = df[['UniprotID', 'GeneName', 'Schirmer_2003', 'Korfali_2010', 'Wilkie_2010',
       'Korfali_2012', 'Cheng_2019', 'HPA', 'HPA_reliability', 'NE Enrich Score in U',
       'NE Enrich Score in A', 'NE Enrich Score in M'
       ]]

In [91]:
df.to_csv('./Output/Merged_Proteome-HPA_010423.csv', index=False)

### Obtain Subcell loc info from Uniprot

In [113]:
# uniprot API URL
WEBSITE_API = "https://rest.uniprot.org/uniprotkb"

In [131]:
for i, entry in enumerate(df.UniprotID):
    
    try:
        # get response with the query, then the gene name in it
        r = get_url(f'{WEBSITE_API}/search?query=accession:{entry}&fields=cc_subcellular_location')
        subcell_loc_info = r.json()['results'][0]['comments'][0]['subcellularLocations'][0]['location']
        subcell_loc = subcell_loc_info['value']
        subcell_loc_evidence = subcell_loc_info['evidences'][0]['evidenceCode']
        
    except:
        subcell_loc = 'Not_found'
        subcell_loc_evidence = 'Not_found'
        
    df.loc[i, 'Uniprot_Subcell_loc'] = subcell_loc
    df.loc[i, 'Uniprot_Subcell_evi'] = subcell_loc_evidence
    
    sleep(1)
    
    if i%200 == 0: print(i, entry, subcell_loc)

0 Q92604 Endoplasmic reticulum membrane
{"url":"http://rest.uniprot.org/uniprotkb/search","messages":["The 'accession' filter value 'Not_found' has invalid format. It should be a valid UniProtKB accession"]}
{"url":"http://rest.uniprot.org/uniprotkb/search","messages":["The 'accession' filter value 'Not_found' has invalid format. It should be a valid UniProtKB accession"]}
{"url":"http://rest.uniprot.org/uniprotkb/search","messages":["The 'accession' filter value 'Not_found' has invalid format. It should be a valid UniProtKB accession"]}
200 Q8IWT6 Cell membrane
400 Q2PZI1 Membrane
{"url":"http://rest.uniprot.org/uniprotkb/search","messages":["The 'accession' filter value '0' has invalid format. It should be a valid UniProtKB accession"]}
{"url":"http://rest.uniprot.org/uniprotkb/search","messages":["The 'accession' filter value '0' has invalid format. It should be a valid UniProtKB accession"]}
{"url":"http://rest.uniprot.org/uniprotkb/search","messages":["The 'accession' filter value

In [137]:
df.to_csv('./Output/Merged_Proteome-HPA-UP_010423.csv', index=False)

In [152]:
df['Proteome_score'] = df['Schirmer_2003'] + df['Korfali_2010'] + df['Wilkie_2010'] + df['Korfali_2012'] + df['Cheng_2019']

In [153]:
df.loc[(df['HPA'] != 0)&(df['Uniprot_Subcell_loc'] != 'Not_found'), 'UP-HPA_score'] = 2
df.loc[(df['HPA'] == 0)&(df['Uniprot_Subcell_loc'] == 'Not_found'), 'UP-HPA_score'] = 0
df.loc[(df['UP-HPA_score'].isnull()), 'UP-HPA_score'] = 1

In [159]:
df.loc[(df['Proteome_score'] > 1), 'Tier'] = 1
df.loc[(df['UP-HPA_score'] != 0)&(df['Proteome_score'] == 1), 'Tier'] = 2
df.loc[(df['Tier'].isnull()), 'Tier'] = 3

In [161]:
df_tier1_2 = df[df.Tier != 3]

In [164]:
df_tier1_2.head()

Unnamed: 0,UniprotID,GeneName,Schirmer_2003,Korfali_2010,Wilkie_2010,Korfali_2012,Cheng_2019,HPA,HPA_reliability,NE Enrich Score in U,NE Enrich Score in A,NE Enrich Score in M,Uniprot_Subcell_loc,Uniprot_Subcell_evi,Proteome_score,UP-HPA_score,Tier
0,Q92604,LPGAT1 FAM34A KIAA0205,1.0,1.0,1.0,1.0,0.0,0,0,0.0,0.0,0.0,Endoplasmic reticulum membrane,ECO:0000269,4.0,1.0,1.0
1,Q9Y3T9,NOC2L NIR,1.0,1.0,1.0,0.0,0.0,Nucleoli: Validated,Validated,0.0,0.0,0.0,Not_found,Not_found,3.0,1.0,1.0
2,Q9Y3T9,Noc2l,1.0,1.0,1.0,0.0,0.0,Nucleoli: Validated,Validated,0.0,0.0,0.0,Not_found,Not_found,3.0,1.0,1.0
3,Q8NBX0,SCCPDH CGI-49,1.0,1.0,1.0,1.0,0.0,0,0,0.0,0.0,0.0,Not_found,Not_found,4.0,0.0,1.0
5,Q5VTL8,PRPF38B,1.0,1.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,Nucleus,ECO:0000305,2.0,1.0,1.0


## Crossing with MemBrain result

In [173]:
# MemBrain
df_MB = pd.read_csv('./Output/Results_step_3.csv')

In [184]:
_df = df_MB.merge(df_tier1_2, how='outer', left_on='Entry_Hs', right_on='UniprotID')

In [185]:
print(df_tier1_2.shape, df_MB.shape, _df.shape)

(383, 17) (2733, 10) (4450, 27)


In [182]:
_df = _df.drop_duplicates()
_df.head(n=23)

AttributeError: 'tuple' object has no attribute 'drop_duplicates'

In [99]:
_df.tail()

Unnamed: 0,Entry_original,Organism,Gene_name,Protein_name,AH_or_Not,AA_sequence,Prediction,SubCell_Uniprot,Entry_Hs,Entry_Mm,...,Schirmer_2003,Korfali_2010,Wilkie_2010,Korfali_2012,Cheng_2019,HPA,HPA_reliability,NE Enrich Score in U,NE Enrich Score in A,NE Enrich Score in M
7422,,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,Nucleoplasm: Supported;Cytosol: Supported,Supported,0.0,0.0,0.0
7423,,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,Nucleus: Validated,Validated,0.0,0.0,0.0
7424,,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,Nucleus: Supported,Supported,0.0,0.0,0.0
7425,,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,Nucleus: Validated;Nucleoli: Supported,Validated,0.0,0.0,0.0
7426,,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,Nucleoli: Supported;Vesicles: Approved,Supported,0.0,0.0,0.0


In [49]:
_df_HPA = _df[_df.HPA != 0]

In [50]:
_df_HPA = _df_HPA[_df_HPA.HPA.str.contains('Nucle')]

In [52]:
_df_HPA.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 85 entries, 2242 to 2492
Data columns (total 22 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Entry_original        85 non-null     object 
 1   Organism              85 non-null     object 
 2   Gene_name             85 non-null     object 
 3   Protein_name          85 non-null     object 
 4   AH_or_Not             85 non-null     object 
 5   AA_sequence           85 non-null     object 
 6   Prediction            85 non-null     object 
 7   SubCell_Uniprot       85 non-null     object 
 8   Entry_Hs              85 non-null     object 
 9   Entry_Mm              85 non-null     object 
 10  UniprotID             85 non-null     object 
 11  GeneName              85 non-null     object 
 12  Schirmer_2003         85 non-null     float64
 13  Korfali_2010          85 non-null     float64
 14  Wilkie_2010           85 non-null     float64
 15  Korfali_2012        