In [1]:
import pandas as pd
import requests, json
from time import sleep

In [2]:
def get_url(url, **kwargs):
    '''
    Obatin a response from a given url
    '''
    response = requests.get(url, **kwargs);

    if not response.ok:
        print(response.text)
        response.raise_for_status()
        sys.exit()

    return response

In [6]:
# uniprot API URL
WEBSITE_API = "https://rest.uniprot.org/uniprotkb"

### 1. Data import

In [35]:
df_schirmer2003 = pd.read_csv('./Output/Schirmer2003/Output.csv')
df_korfali2010 = pd.read_csv('./Output/Korfali_2010.csv')
df_wilkie2010 = pd.read_csv('./Output/Wilkie_2010.csv')
df_korfali2012 = pd.read_csv('./Output/Korfali2012/Korfali2012_Hs.csv')
df_cheng2019 = pd.read_csv('./Output/Cheng2019.csv')
df_HPA = pd.read_csv('./Output/HPA_val_supp_nucleus.csv')

### 2. Merge

### 2-1. Schirmer + Korfali2010

In [89]:
df = df_schirmer2003.merge(df_korfali2010, left_on='Entry', right_on='Entry_Korfali_2010', how='outer')

In [90]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 157 entries, 0 to 156
Data columns (total 4 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Entry               44 non-null     object
 1   Gene names          44 non-null     object
 2   Entry_Korfali_2010  142 non-null    object
 3   Gene_name_obtained  142 non-null    object
dtypes: object(4)
memory usage: 6.1+ KB


In [91]:
df['UniprotID'] = df.Entry.fillna(df.Entry_Korfali_2010)
df['GeneName'] = df['Gene names'].fillna(df.Gene_name_obtained)

In [92]:
df.columns

Index(['Entry', 'Gene names', 'Entry_Korfali_2010', 'Gene_name_obtained',
       'UniprotID', 'GeneName'],
      dtype='object')

In [93]:
df = df[['UniprotID', 'GeneName', 'Entry', 'Entry_Korfali_2010']]

df.fillna(0, inplace=True)

df.Entry = df.Entry.apply(lambda x: 1 if x != 0 else 0)
df.Entry_Korfali_2010 = df.Entry_Korfali_2010.apply(lambda x: 1 if x != 0 else 0)

df = df.rename(columns={'Entry':'Schirmer_2003', 'Entry_Korfali_2010':'Korfali_2010'})

In [94]:
df.tail()

Unnamed: 0,UniprotID,GeneName,Schirmer_2003,Korfali_2010
152,Q86V85,GPR180,0,1
153,Q8N386,LRRC25,0,1
154,Q5TGY1,TMCO4,0,1
155,Q9NX61,TMEM161A,0,1
156,Q96AA3,RFT1,0,1


### 2-2. Add Wilkie 2010

In [95]:
df = df.merge(df_wilkie2010, left_on='UniprotID', right_on='Entry_Wilkie', how='outer')

df.UniprotID = df.UniprotID.fillna(df.Entry_Wilkie)
df.GeneName = df.GeneName.fillna(df.Gene_name_obtained)

df.fillna(0, inplace=True)

In [96]:
df.tail()

Unnamed: 0,UniprotID,GeneName,Schirmer_2003,Korfali_2010,Entry_Wilkie,Gene_name_obtained
182,Q6ZV29,PNPLA7,0.0,0.0,Q6ZV29,PNPLA7
183,Q0P6H9,TMEM62,0.0,0.0,Q0P6H9,TMEM62
184,Q8IZF2,ADGRF5,0.0,0.0,Q8IZF2,ADGRF5
185,A6NMS7,LRRC37A,0.0,0.0,A6NMS7,LRRC37A
186,Q7Z407,CSMD3,0.0,0.0,Q7Z407,CSMD3


In [97]:
df.Entry_Wilkie = df.Entry_Wilkie.apply(lambda x: 1 if x != 0 else 0)

In [98]:
df = df.drop(columns=['Gene_name_obtained'], axis=1)
df = df.rename(columns={'Entry_Wilkie': 'Wilkie_2010'})

### 2-3. Add Korfali 2012

In [99]:
df = df.merge(df_korfali2012, left_on='UniprotID', right_on='Uniprot_id', how='outer')

In [100]:
df.UniprotID = df.UniprotID.fillna(df.Uniprot_id)
df.GeneName = df.GeneName.fillna(df['gene name'])

df.fillna(0, inplace=True)

df.Uniprot_id = df.Uniprot_id.apply(lambda x: 1 if x != 0 else 0)

In [101]:
df.columns

Index(['UniprotID', 'GeneName', 'Schirmer_2003', 'Korfali_2010', 'Wilkie_2010',
       'Unnamed: 0', 'tissue', 'gene name', 'alternate names',
       'accession numbers', 'NE:MM ratio by dNSAF', 'reference', 'Uniprot_id'],
      dtype='object')

In [102]:
df = df[['UniprotID', 'GeneName', 'Schirmer_2003', 'Korfali_2010', 'Wilkie_2010',
           'Uniprot_id']]
df = df.rename(columns={'Uniprot_id':'Korfali_2012'})

In [103]:
df.tail()

Unnamed: 0,UniprotID,GeneName,Schirmer_2003,Korfali_2010,Wilkie_2010,Korfali_2012
212,Q96PC5,CTAGE5,0.0,0.0,0.0,1
213,Q8WUY1,C8orf55,0.0,0.0,0.0,1
214,Q7Z2K6,ERMP1,0.0,0.0,0.0,1
215,Q7Z3C6,ATG9A,0.0,0.0,0.0,1
216,Q6DD88,ATLA3,0.0,0.0,0.0,1


### 2-4. Add Cheng 2019

In [104]:
df = df.merge(df_cheng2019, left_on='UniprotID', right_on='entry_h', how='outer')

In [105]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 413 entries, 0 to 412
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   UniprotID             218 non-null    object 
 1   GeneName              218 non-null    object 
 2   Schirmer_2003         218 non-null    float64
 3   Korfali_2010          218 non-null    float64
 4   Wilkie_2010           218 non-null    float64
 5   Korfali_2012          218 non-null    float64
 6   gene_name             243 non-null    object 
 7   entry_h               243 non-null    object 
 8   NE Enrich Score in U  243 non-null    float64
 9   NE Enrich Score in A  243 non-null    float64
 10  NE Enrich Score in M  243 non-null    float64
dtypes: float64(7), object(4)
memory usage: 38.7+ KB


In [106]:
df.UniprotID = df.UniprotID.fillna(df.entry_h)
df.GeneName = df.GeneName.fillna(df.gene_name)

df.fillna(0, inplace=True)

df.entry_h = df.entry_h.apply(lambda x: 1 if x != 0 else 0)

In [107]:
df.columns

Index(['UniprotID', 'GeneName', 'Schirmer_2003', 'Korfali_2010', 'Wilkie_2010',
       'Korfali_2012', 'gene_name', 'entry_h', 'NE Enrich Score in U',
       'NE Enrich Score in A', 'NE Enrich Score in M'],
      dtype='object')

In [108]:
df = df.rename(columns={'entry_h':'Cheng_2019'})
df = df.drop(columns=['gene_name'], axis=1)

In [109]:
df.head()

Unnamed: 0,UniprotID,GeneName,Schirmer_2003,Korfali_2010,Wilkie_2010,Korfali_2012,Cheng_2019,NE Enrich Score in U,NE Enrich Score in A,NE Enrich Score in M
0,Q92604,LPGAT1 FAM34A KIAA0205,1.0,1.0,1.0,1.0,0,0.0,0.0,0.0
1,Q9Y3T9,NOC2L NIR,1.0,1.0,1.0,0.0,0,0.0,0.0,0.0
2,Q9Y3T9,Noc2l,1.0,1.0,1.0,0.0,0,0.0,0.0,0.0
3,Q8NBX0,SCCPDH CGI-49,1.0,1.0,1.0,1.0,0,0.0,0.0,0.0
4,A0A384NPM7,SCCPDH hCG_1782151,1.0,0.0,0.0,0.0,0,0.0,0.0,0.0


### 2-5. HPA

In [110]:
df_HPA.head()

Unnamed: 0,Gene,Uniprot,Reliability,IF location score
0,A1CF,Q9NQ94,Supported,Nucleoplasm: Supported
1,AAGAB,Q6PD74,Supported,Nuclear speckles: Approved;Cytosol: Supported
2,ABCB6,Q9NP58,Validated,Nucleoplasm: Supported;Golgi apparatus: Valida...
3,ABCB8,Q9NUT2,Supported,Nucleus: Approved;Mitochondria: Supported
4,ABCC5,O15440,Supported,Nucleus: Approved;Cell Junctions: Approved;Pla...


In [111]:
df = df.merge(df_HPA, left_on="UniprotID", right_on="Uniprot", how='outer')

In [112]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3424 entries, 0 to 3423
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   UniprotID             413 non-null    object 
 1   GeneName              413 non-null    object 
 2   Schirmer_2003         413 non-null    float64
 3   Korfali_2010          413 non-null    float64
 4   Wilkie_2010           413 non-null    float64
 5   Korfali_2012          413 non-null    float64
 6   Cheng_2019            413 non-null    float64
 7   NE Enrich Score in U  413 non-null    float64
 8   NE Enrich Score in A  413 non-null    float64
 9   NE Enrich Score in M  413 non-null    float64
 10  Gene                  3079 non-null   object 
 11  Uniprot               3069 non-null   object 
 12  Reliability           3079 non-null   object 
 13  IF location score     3079 non-null   object 
dtypes: float64(8), object(6)
memory usage: 401.2+ KB


In [113]:
df.head()

Unnamed: 0,UniprotID,GeneName,Schirmer_2003,Korfali_2010,Wilkie_2010,Korfali_2012,Cheng_2019,NE Enrich Score in U,NE Enrich Score in A,NE Enrich Score in M,Gene,Uniprot,Reliability,IF location score
0,Q92604,LPGAT1 FAM34A KIAA0205,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,,,,
1,Q9Y3T9,NOC2L NIR,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,NOC2L,Q9Y3T9,Validated,Nucleoli: Validated
2,Q9Y3T9,Noc2l,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,NOC2L,Q9Y3T9,Validated,Nucleoli: Validated
3,Q8NBX0,SCCPDH CGI-49,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,,,,
4,A0A384NPM7,SCCPDH hCG_1782151,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,


In [114]:
df.UniprotID = df.UniprotID.fillna(df.Uniprot)
df.GeneName = df.GeneName.fillna(df.Gene)

In [115]:
df.fillna(0, inplace=True)

df = df.rename(columns={'Reliability': 'HPA_reliability', 'IF location score':'HPA_loc'})
df = df.drop(columns=['Gene', 'Uniprot'], axis=1)

In [116]:
# scoring
df['HPA'] = df.HPA_loc.apply(lambda x: 1 if x != 0 else 0)

In [117]:
df.columns

Index(['UniprotID', 'GeneName', 'Schirmer_2003', 'Korfali_2010', 'Wilkie_2010',
       'Korfali_2012', 'Cheng_2019', 'NE Enrich Score in U',
       'NE Enrich Score in A', 'NE Enrich Score in M', 'HPA_reliability',
       'HPA_loc', 'HPA'],
      dtype='object')

In [118]:
df = df[['UniprotID', 'GeneName', 'Schirmer_2003', 'Korfali_2010', 'Wilkie_2010',
       'Korfali_2012', 'Cheng_2019', 'HPA', 'HPA_loc', 'HPA_reliability', 'NE Enrich Score in U',
       'NE Enrich Score in A', 'NE Enrich Score in M'
       ]]

In [119]:
df.to_csv('./Output/Merged_Proteome-HPA_011023.csv', index=False)

### Obtain Subcell loc info from Uniprot

In [187]:
df = pd.read_csv('./Output/Merged_Proteome-HPA_011023.csv')

In [188]:
df.head()

Unnamed: 0,UniprotID,GeneName,Schirmer_2003,Korfali_2010,Wilkie_2010,Korfali_2012,Cheng_2019,HPA,HPA_loc,HPA_reliability,NE Enrich Score in U,NE Enrich Score in A,NE Enrich Score in M
0,Q92604,LPGAT1 FAM34A KIAA0205,1.0,1.0,1.0,1.0,0.0,0,0,0,0.0,0.0,0.0
1,Q9Y3T9,NOC2L NIR,1.0,1.0,1.0,0.0,0.0,1,Nucleoli: Validated,Validated,0.0,0.0,0.0
2,Q9Y3T9,Noc2l,1.0,1.0,1.0,0.0,0.0,1,Nucleoli: Validated,Validated,0.0,0.0,0.0
3,Q8NBX0,SCCPDH CGI-49,1.0,1.0,1.0,1.0,0.0,0,0,0,0.0,0.0,0.0
4,A0A384NPM7,SCCPDH hCG_1782151,1.0,0.0,0.0,0.0,0.0,0,0,0,0.0,0.0,0.0


In [189]:
# remove Uniprot ID duplicates
df = df.drop_duplicates(subset='UniprotID').reset_index(drop=True)

In [123]:
NE_names = ['Nucleus outer membrane','Nucleus membrane','Nucleus inner membrane','Nucleus, nuclear pore complex', 'Nucleus envelope', 'Nucleus lamina']
ER_names = ['Endoplasmic reticulum membrane','Endoplasmic reticulum','Sarcoplasmic reticulum membrane',
            'Endoplasmic reticulum-Golgi intermediate compartment membrane', 'Endoplasmic reticulum lumen']

In [200]:
for i, entry in enumerate(df.UniprotID):
    
    locations = []
    evidence_NE = []
    evidence_ER = []
    pmids = []
    
    try:
        r = get_url(f'{WEBSITE_API}/search?query=accession:{entry}&fields=cc_subcellular_location')
        subcell_loc_info = r.json()['results'][0]['comments'][0]['subcellularLocations']
        sleep(1)
        
        for info in subcell_loc_info:
            keys = info['location'].keys()
            if 'evidences' in keys: # check if the localization info contains an evidence
                location = info['location']['value']; locations.append(location) # if so the loc info is stored
                
                # Below check if localzation is either NE or ER
                # if so, the evidenceCode and PMID if any are stored
                if location in NE_names:
                    evidences = info['location']['evidences']
                    for evidence in evidences:
                        evidence_keys = evidence.keys()
                        evidenceCode = evidence['evidenceCode'];
                        if evidenceCode not in evidence_NE: # check if the Code is new
                            evidence_NE.append(evidenceCode)
                        if 'id' in evidence_keys: # check if PMID is available
                            pmids.append(evidence['id'])

                if location in ER_names:
                    evidences = info['location']['evidences']
                    for evidence in evidences:
                        evidence_keys = evidence.keys()
                        evidenceCode = evidence['evidenceCode'];
                        if evidenceCode not in evidence_ER:
                            evidence_ER.append(evidenceCode)
                        if 'id' in evidence_keys:
                            pmids.append(evidence['id'])
                            

    except:
        pass

    # store to df
    df.loc[i, 'Uniprot_Subcell_loc'] = ", ".join(locations)
    df.loc[i, 'Uniprot_Subcell_evi_NE'] = ", ".join(evidence_NE)
    df.loc[i, 'Uniprot_Subcell_evi_ER'] = ", ".join(evidence_ER)
    df.loc[i, 'Uniprot_Subcell_pmid-NE-ER'] = ", ".join(pmids)
    
    if i%200 == 0: print(i, entry, locations)

0 Q92604 ['Endoplasmic reticulum membrane']
{"url":"http://rest.uniprot.org/uniprotkb/search","messages":["The 'accession' filter value 'Not_found' has invalid format. It should be a valid UniProtKB accession"]}
200 Q9BVT8 ['Membrane', 'Postsynaptic cell membrane', 'Recycling endosome', 'Cytoplasm', 'Nucleus', 'Nucleus, nucleolus']
400 P35610 ['Endoplasmic reticulum membrane']
{"url":"http://rest.uniprot.org/uniprotkb/search","messages":["The 'accession' filter value '0' has invalid format. It should be a valid UniProtKB accession"]}
600 Q9UIG0 ['Nucleus']
{"url":"http://rest.uniprot.org/uniprotkb/search","messages":["The 'accession' filter value 'P42771;Q8N726' has invalid format. It should be a valid UniProtKB accession"]}
800 P53567 ['Nucleus']
{"url":"http://rest.uniprot.org/uniprotkb/search","messages":["The 'accession' filter value 'P0CG12;P0CG13' has invalid format. It should be a valid UniProtKB accession"]}
{"url":"http://rest.uniprot.org/uniprotkb/search","messages":["The 'ac

In [201]:
good_evidence = ['ECO:0000269', # experimental
                 'ECO:0000305', # inferrence from paper
                 'ECO:0000250', # seq similarity
                 'ECO:0000255', # seq model
                 'ECO:0000312', # imported from other database
                 'ECO:0007744'] # a combination of experimental and computational evidence

def checkEvidence(x):
    if x in good_evidence:
        return True
    else:
        return False

In [218]:
import numpy as np
np.where((True|False), 1, 0)

array(1)

In [202]:
five_papers = ['12958361', '20693407', '20876400', '22990521', '31142202']
def checkPMID(x):
    return_value = 0
    for pmid in five_papers:
        if pmid in x:
            return_value = 1
    
    return return_value

In [224]:
# Uniprot score
# check if certain evidenceCode is in either evidence_NE or _ER
df['Uniprot_subCell'] = np.where((df['Uniprot_Subcell_evi_NE'].apply(str).apply(checkEvidence) | df['Uniprot_Subcell_evi_ER'].apply(str).apply(checkEvidence)), 1, 0)
# df['Uniprot_subCell'] = df.Uniprot_Subcell_evi_ER.apply(lambda x: 1 if checkEvidence(x) else 0)

In [204]:
# df['Uniprot_Subcell_pmid-NE-ER'] = df['Uniprot_Subcell_pmid-NE-ER'].apply(str)
df['5_papers'] = df['Uniprot_Subcell_pmid-NE-ER'].apply(str).apply(checkPMID)

In [205]:
df.head(n=30)

Unnamed: 0,UniprotID,GeneName,Schirmer_2003,Korfali_2010,Wilkie_2010,Korfali_2012,Cheng_2019,HPA,HPA_loc,HPA_reliability,NE Enrich Score in U,NE Enrich Score in A,NE Enrich Score in M,Uniprot_Subcell_loc,Uniprot_Subcell_evi_NE,Uniprot_Subcell_evi_ER,Uniprot_Subcell_pmid-NE-ER,Uniprot_subCell,5_papers
0,Q92604,LPGAT1 FAM34A KIAA0205,1.0,1.0,1.0,1.0,0.0,0,0,0,0.0,0.0,0.0,Endoplasmic reticulum membrane,,ECO:0000269,15485873,1,0
1,Q9Y3T9,NOC2L NIR,1.0,1.0,1.0,0.0,0.0,1,Nucleoli: Validated,Validated,0.0,0.0,0.0,,,,,0,0
2,Q8NBX0,SCCPDH CGI-49,1.0,1.0,1.0,1.0,0.0,0,0,0,0.0,0.0,0.0,,,,,0,0
3,A0A384NPM7,SCCPDH hCG_1782151,1.0,0.0,0.0,0.0,0.0,0,0,0,0.0,0.0,0.0,,,,,0,0
4,Q5VTL8,PRPF38B,1.0,1.0,0.0,0.0,0.0,0,0,0,0.0,0.0,0.0,Nucleus,,,,0,0
5,Q9BTX1,NDC1 TMEM48,1.0,1.0,1.0,1.0,1.0,1,Nuclear membrane: Supported;Actin filaments: U...,Supported,0.674122,0.740816,0.896719,,,,,0,0
6,Q8NFQ8,TOR1AIP2 IFRG15 LULL1,1.0,1.0,0.0,1.0,0.0,0,0,0,0.0,0.0,0.0,,,,,0,0
7,Q9NXE4,SMPD4 KIAA1418 SKNY,1.0,1.0,1.0,1.0,1.0,0,0,0,0.9327,0.895236,0.983445,"Endoplasmic reticulum membrane, Golgi apparatu...",ECO:0000269,ECO:0000269,"16517606, 18505924, 31495489, 31495489",1,0
8,A0AV96,RBM47,1.0,1.0,0.0,0.0,0.0,1,Nucleoplasm: Supported;Cytosol: Supported,Supported,0.0,0.0,0.0,Nucleus,,,,0,0
9,Q7LBC6,KDM3B C5orf7 JHDM2B JMJD1B KIAA1082,1.0,1.0,0.0,0.0,0.0,1,Nucleoplasm: Validated,Validated,0.0,0.0,0.0,Nucleus,,,,0,0


In [228]:
df.to_csv('./Output/Merged_Proteome-HPA-UP_011023.csv', index=False)

### Narrow to Tier1 and Tier2

In [18]:
df = pd.read_csv('./Output/Merged_Proteome-HPA-UP_010423.csv')

In [4]:
df['Proteome_score'] = df['Schirmer_2003'] + df['Korfali_2010'] + df['Wilkie_2010'] + df['Korfali_2012'] + df['Cheng_2019']

In [19]:
a = df['Uniprot_Subcell_loc'].unique().tolist()

In [None]:
ER_or_NE = ['Nucleus envelope', 'Nucleus inner membrane', 'Nucleus lamina', 'Nucleus membrane', 'Nucleus outer membrane', 'Endoplasmic reticulum']

In [153]:
df.loc[(df['HPA'] != 0)&(df['Uniprot_Subcell_loc'] != 'Not_found'), 'UP-HPA_score'] = 2
df.loc[(df['HPA'] == 0)&(df['Uniprot_Subcell_loc'] == 'Not_found'), 'UP-HPA_score'] = 0
df.loc[(df['UP-HPA_score'].isnull()), 'UP-HPA_score'] = 1

In [159]:
df.loc[(df['Proteome_score'] > 1), 'Tier'] = 1
df.loc[(df['UP-HPA_score'] != 0)&(df['Proteome_score'] == 1), 'Tier'] = 2
df.loc[(df['Tier'].isnull()), 'Tier'] = 3

In [161]:
df_tier1_2 = df[df.Tier != 3]

In [164]:
df_tier1_2.head()

Unnamed: 0,UniprotID,GeneName,Schirmer_2003,Korfali_2010,Wilkie_2010,Korfali_2012,Cheng_2019,HPA,HPA_reliability,NE Enrich Score in U,NE Enrich Score in A,NE Enrich Score in M,Uniprot_Subcell_loc,Uniprot_Subcell_evi,Proteome_score,UP-HPA_score,Tier
0,Q92604,LPGAT1 FAM34A KIAA0205,1.0,1.0,1.0,1.0,0.0,0,0,0.0,0.0,0.0,Endoplasmic reticulum membrane,ECO:0000269,4.0,1.0,1.0
1,Q9Y3T9,NOC2L NIR,1.0,1.0,1.0,0.0,0.0,Nucleoli: Validated,Validated,0.0,0.0,0.0,Not_found,Not_found,3.0,1.0,1.0
2,Q9Y3T9,Noc2l,1.0,1.0,1.0,0.0,0.0,Nucleoli: Validated,Validated,0.0,0.0,0.0,Not_found,Not_found,3.0,1.0,1.0
3,Q8NBX0,SCCPDH CGI-49,1.0,1.0,1.0,1.0,0.0,0,0,0.0,0.0,0.0,Not_found,Not_found,4.0,0.0,1.0
5,Q5VTL8,PRPF38B,1.0,1.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,Nucleus,ECO:0000305,2.0,1.0,1.0


## Crossing with MemBrain result

In [173]:
# MemBrain
df_MB = pd.read_csv('./Output/Results_step_3.csv')

In [184]:
_df = df_MB.merge(df_tier1_2, how='outer', left_on='Entry_Hs', right_on='UniprotID')

In [185]:
print(df_tier1_2.shape, df_MB.shape, _df.shape)

(383, 17) (2733, 10) (4450, 27)


In [182]:
_df = _df.drop_duplicates()
_df.head(n=23)

AttributeError: 'tuple' object has no attribute 'drop_duplicates'

In [99]:
_df.tail()

Unnamed: 0,Entry_original,Organism,Gene_name,Protein_name,AH_or_Not,AA_sequence,Prediction,SubCell_Uniprot,Entry_Hs,Entry_Mm,...,Schirmer_2003,Korfali_2010,Wilkie_2010,Korfali_2012,Cheng_2019,HPA,HPA_reliability,NE Enrich Score in U,NE Enrich Score in A,NE Enrich Score in M
7422,,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,Nucleoplasm: Supported;Cytosol: Supported,Supported,0.0,0.0,0.0
7423,,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,Nucleus: Validated,Validated,0.0,0.0,0.0
7424,,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,Nucleus: Supported,Supported,0.0,0.0,0.0
7425,,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,Nucleus: Validated;Nucleoli: Supported,Validated,0.0,0.0,0.0
7426,,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,Nucleoli: Supported;Vesicles: Approved,Supported,0.0,0.0,0.0


In [49]:
_df_HPA = _df[_df.HPA != 0]

In [50]:
_df_HPA = _df_HPA[_df_HPA.HPA.str.contains('Nucle')]

In [52]:
_df_HPA.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 85 entries, 2242 to 2492
Data columns (total 22 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Entry_original        85 non-null     object 
 1   Organism              85 non-null     object 
 2   Gene_name             85 non-null     object 
 3   Protein_name          85 non-null     object 
 4   AH_or_Not             85 non-null     object 
 5   AA_sequence           85 non-null     object 
 6   Prediction            85 non-null     object 
 7   SubCell_Uniprot       85 non-null     object 
 8   Entry_Hs              85 non-null     object 
 9   Entry_Mm              85 non-null     object 
 10  UniprotID             85 non-null     object 
 11  GeneName              85 non-null     object 
 12  Schirmer_2003         85 non-null     float64
 13  Korfali_2010          85 non-null     float64
 14  Wilkie_2010           85 non-null     float64
 15  Korfali_2012        