In [66]:
import os, glob, json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from time import sleep

In [57]:
from bioservices import UniProt

### 1. Data import

In [2]:
# protein list from HPA
# downloaded 3/14/22
data_NE = 'subcell_location_Nuclear.json'
# data_nucleoli = 'subcell_location_Nucleoli_Nucleoli.json' # Skipped cause Uniprot ID not present
data_nucleoplasm = 'subcell_location_Nucleoplasm_Nuclear.json'
data_ER = 'subcell_location_Endoplasmic.json'

In [42]:
# Empty dataFrame
df = pd.DataFrame()

#### Extract wanted data pieces from each json and put them into a dataframe

In [43]:
for filename in [data_NE, data_nucleoplasm, data_ER]:
    with open('./SourceData/HPA/' + filename, 'r') as f:
        data = json.load(f)

    print(filename, len(data))
    
    # Prepare empty lists
    gene_name_list = []
    uniprot_ID_list = []
    subcell_loc_list = []
    evidence_IF_list = []
    
    # For each set of gene information in json, extract wanted pieces: gene name, uniprotID, subcell_loc and its data reliability
    for gene in data:
        gene_name_list.append(gene['Gene'])
        uniprot_id = gene['Uniprot']
        uniprot_ID_list.append(uniprot_id)
        subcell_loc_list.append(gene['Subcellular main location'])
        evidence_IF_list.append(gene['Reliability (IF)'])
    
    # Prepare a temporary data frame to which the above items are stored
    _df = pd.DataFrame()
    _df['Gene_name'] = gene_name_list
    _df['Uniprot_id'] = uniprot_ID_list
    _df['Subcellular_loc_HPA'] = subcell_loc_list
    _df['EvidenceLevel_HPA'] = evidence_IF_list
    _df['Filename'] = filename
    
    # Temporary dataframe is merged to the main dataframe
    df = pd.concat([df, _df], axis=0)

subcell_location_Nuclear.json 278
subcell_location_Nucleoplasm_Nuclear.json 6784
subcell_location_Endoplasmic.json 523


In [44]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7585 entries, 0 to 522
Data columns (total 5 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Gene_name            7585 non-null   object
 1   Uniprot_id           7585 non-null   object
 2   Subcellular_loc_HPA  7585 non-null   object
 3   EvidenceLevel_HPA    7585 non-null   object
 4   Filename             7585 non-null   object
dtypes: object(5)
memory usage: 355.5+ KB


In [45]:
# Put "Subcellular location" column based on the filenames
df.loc[df['Filename'].str.contains(data_NE),'Subcellular_location'] = 'Nuclear_membrane'
df.loc[df['Filename'].str.contains(data_nucleoplasm),'Subcellular_location'] = 'Nucleoplasm'
df.loc[df['Filename'].str.contains(data_ER),'Subcellular_location'] = 'ER'

In [46]:
# Clean up
df = df.drop(['Filename'], axis=1)
df = df.reset_index(drop=True)

In [47]:
# df.head()
df.tail()

Unnamed: 0,Gene_name,Uniprot_id,Subcellular_loc_HPA,EvidenceLevel_HPA,Subcellular_location
7580,ZNF33B,[Q06732],"[Nucleoli fibrillar center, Endoplasmic reticu...",Approved,ER
7581,ZNF501,[Q96CX3],[Endoplasmic reticulum],Approved,ER
7582,ZNF804A,[Q7Z570],[Endoplasmic reticulum],Approved,ER
7583,ZSCAN21,[Q9Y5A6],[Endoplasmic reticulum],Approved,ER
7584,ZW10,[O43264],"[Endoplasmic reticulum, Cytosol]",Supported,ER


### 2. Tidy Uniprot ID

#### Apparently there are genes with no or multiple uniprot IDs assigned
#### Thus need to retrive a uniprot ID by gene name from Uniprot,
#### Or split multiple IDs to separate rows

In [162]:
# for splitting the df based on the number of IDs per gene
df['Length of ID'] = df['Uniprot_id'].apply(len)

#### 2-1. Genes with no IDs assigned

In [79]:
df_no_id = df[df['Length of ID'] == 0]

In [80]:
df_no_id

Unnamed: 0,Gene_name,Uniprot_id,Subcellular_loc_HPA,EvidenceLevel_HPA,Subcellular_location,Length of ID
262,ZNF487,[],[Nuclear membrane],Approved,Nuclear_membrane,0
263,FXYD6-FXYD2,[],"[Nuclear membrane, Vesicles]",Approved,Nuclear_membrane,0
265,AC010422.5,[],"[Nuclear membrane, Endoplasmic reticulum]",Approved,Nuclear_membrane,0
277,AC010319.2,[],[Nuclear membrane],Uncertain,Nuclear_membrane,0
318,AC003006.1,[],[Nucleoplasm],Approved,Nucleoplasm,0
...,...,...,...,...,...,...
7095,AP001781.2,[],[Endoplasmic reticulum],Approved,ER,0
7236,GIMAP1-GIMAP5,[],[Endoplasmic reticulum],Supported,ER,0
7410,RPL17-C18orf32,[],"[Endoplasmic reticulum, Cytosol]",Supported,ER,0
7426,RPL36A-HNRNPH2,[],"[Endoplasmic reticulum, Cytosol]",Approved,ER,0


##### Retrieve uniprot IDs from uniprot based on the gene name and organism ID wherever possible

In [81]:
u = UniProt()

for i in range(len(df_no_id)):
    # get a gene name from df_no_id
    gene = df_no_id.iloc[i, 0]
    
    # search Uniprot with organism being H sapiens 9606
    res = u.search(gene +'+AND+organism:9606', frmt='tab', limit=1, columns='id')
    
    # split the tab fmt to a list
    res_split = res.split('\n')
    
    # if res contains ID, the split generates a list with 3 items 'Entry', id, and a space
    if len(res_split) == 3:
        print(res)
        uniprot_id = res_split[1]
    else:
        print('ID not found')
        uniprot_id = 'Not_found'
    
    sleep(3)
    
    # replace the Uniprot ID cell with the ID found
    df_no_id.iloc[i, 1] = uniprot_id

In [158]:
df_no_id = df_no_id.drop(['Length of ID'], axis=1)

#### 2-2. Genes with multiple IDs

In [141]:
df_multi_id = df[df['Length of ID'] >= 2]

In [142]:
df_multi_id = df_multi_id.reset_index(drop=True)

##### Generate a dictionary to which additional IDs are stored, tied with its gene name

In [143]:
_dict = {'Gene_name': [], 'Uniprot_id': []}
for i in range(len(df_multi_id)):
    id_list = df_multi_id.iloc[i, 1]
    print(id_list)
    first_id = id_list[0]
    df_multi_id.iloc[i, 1] = first_id
    for j in range(1, len(id_list)):
        additional_id = id_list[j]
        _dict['Gene_name'].append(df_multi_id.iloc[i, 0])
        _dict['Uniprot_id'].append(additional_id)

['P42166', 'P42167']
['Q5XKL5', 'Q9UPP5']
['P39880', 'Q13948']
['P0DP91', 'Q03468']
['O95467', 'P63092', 'P84996', 'Q5JWF2']
['O96007', 'O96033']
['Q8IXS6', 'Q9Y2D5']
['P0DPB5', 'P0DPB6']
['P0CAP2', 'Q6EEV4']
['O43236', 'Q8NEP4']
['L0R6Q1', 'Q96G79']
['P35544', 'P62861']
['Q8NFQ8', 'Q9H496']
['Q5JU69', 'Q8N2E6']


##### This is concatenated as to the main df_multi_id

In [144]:
_df = pd.DataFrame(_dict)

df_multi_id = pd.concat([df_multi_id, _df], axis=0)

##### Then sorted by Length of ID as well as Gene name so filled rows come to top of each gene and ffill can be applied

In [145]:
df_multi_id = df_multi_id.sort_values(by=['Gene_name', 'Length of ID'])

df_multi_id = df_multi_id.fillna(method='ffill')

##### Cleanup

In [146]:
df_multi_id = df_multi_id.drop(['Length of ID'], axis=1)

In [147]:
df_multi_id

Unnamed: 0,Gene_name,Uniprot_id,Subcellular_loc_HPA,EvidenceLevel_HPA,Subcellular_location
1,BTBD8,Q5XKL5,[Nucleoplasm],Supported,Nucleoplasm
1,BTBD8,Q9UPP5,[Nucleoplasm],Supported,Nucleoplasm
2,CUX1,P39880,"[Nucleoplasm, Golgi apparatus]",Supported,Nucleoplasm
2,CUX1,Q13948,"[Nucleoplasm, Golgi apparatus]",Supported,Nucleoplasm
3,ERCC6,P0DP91,[Nucleoplasm],Approved,Nucleoplasm
3,ERCC6,Q03468,[Nucleoplasm],Approved,Nucleoplasm
11,FAU,P35544,"[Endoplasmic reticulum, Cytosol]",Enhanced,ER
13,FAU,P62861,"[Endoplasmic reticulum, Cytosol]",Enhanced,ER
4,GNAS,O95467,[Nucleoplasm],Approved,Nucleoplasm
4,GNAS,P63092,[Nucleoplasm],Approved,Nucleoplasm


### 3. Concatanate all dfs of zero or multiple IDs to a df of single ID

In [148]:
df.head()

Unnamed: 0,Gene_name,Uniprot_id,Subcellular_loc_HPA,EvidenceLevel_HPA,Subcellular_location,Length of ID
0,TPR,[P12270],[Nuclear membrane],Enhanced,Nuclear_membrane,1
1,XPO1,[O14980],"[Nucleoplasm, Nuclear membrane]",Enhanced,Nuclear_membrane,1
2,NUP50,[Q9UKX7],"[Nucleoplasm, Nuclear membrane]",Enhanced,Nuclear_membrane,1
3,RANGAP1,[P46060],[Nuclear membrane],Enhanced,Nuclear_membrane,1
4,EMD,[P50402],[Nuclear membrane],Enhanced,Nuclear_membrane,1


In [149]:
df_single_id = df[df['Length of ID'] == 1]

##### Turn the list fmt to string fmt in Uniprot_id

In [150]:
def list_to_str(_list):
    _str = _list[0]
    return _str

In [151]:
df_single_id['Uniprot_id_str'] = df_single_id['Uniprot_id'].apply(list_to_str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_single_id['Uniprot_id_str'] = df_single_id['Uniprot_id'].apply(list_to_str)


In [152]:
df_single_id.head()

Unnamed: 0,Gene_name,Uniprot_id,Subcellular_loc_HPA,EvidenceLevel_HPA,Subcellular_location,Length of ID,Uniprot_id_str
0,TPR,[P12270],[Nuclear membrane],Enhanced,Nuclear_membrane,1,P12270
1,XPO1,[O14980],"[Nucleoplasm, Nuclear membrane]",Enhanced,Nuclear_membrane,1,O14980
2,NUP50,[Q9UKX7],"[Nucleoplasm, Nuclear membrane]",Enhanced,Nuclear_membrane,1,Q9UKX7
3,RANGAP1,[P46060],[Nuclear membrane],Enhanced,Nuclear_membrane,1,P46060
4,EMD,[P50402],[Nuclear membrane],Enhanced,Nuclear_membrane,1,P50402


In [153]:
df_single_id = df_single_id.drop(['Uniprot_id'], axis=1)
df_single_id = df_single_id.rename(columns={'Uniprot_id_str': 'Uniprot_id'})

In [154]:
df_single_id.head()

Unnamed: 0,Gene_name,Subcellular_loc_HPA,EvidenceLevel_HPA,Subcellular_location,Length of ID,Uniprot_id
0,TPR,[Nuclear membrane],Enhanced,Nuclear_membrane,1,P12270
1,XPO1,"[Nucleoplasm, Nuclear membrane]",Enhanced,Nuclear_membrane,1,O14980
2,NUP50,"[Nucleoplasm, Nuclear membrane]",Enhanced,Nuclear_membrane,1,Q9UKX7
3,RANGAP1,[Nuclear membrane],Enhanced,Nuclear_membrane,1,P46060
4,EMD,[Nuclear membrane],Enhanced,Nuclear_membrane,1,P50402


##### Clean up before concat

In [156]:
df_single_id = df_single_id.drop(['Length of ID'], axis=1)

##### Concat three dfs

In [159]:
df_clean = pd.concat([df_single_id, df_multi_id, df_no_id])

In [160]:
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7601 entries, 0 to 7455
Data columns (total 5 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Gene_name             7601 non-null   object
 1   Subcellular_loc_HPA   7601 non-null   object
 2   EvidenceLevel_HPA     7601 non-null   object
 3   Subcellular_location  7601 non-null   object
 4   Uniprot_id            7601 non-null   object
dtypes: object(5)
memory usage: 356.3+ KB


### 4. Export

In [161]:
df_clean.to_csv('./Output/HPA_NE-NP-ER_Hs.csv')