In [23]:
import os, re
import pandas as pd
from bioservices import UniProt

In [3]:
# Instantiate the bioservices UniProt client; later cells issue their searches through `u`.
u = UniProt()

Creating directory /Users/ShokenLEE/Library/Caches/bioservices 
Welcome to Bioservices
It looks like you do not have a configuration file.
We are creating one with default values in /Users/ShokenLEE/Library/Application Support/bioservices/bioservices.cfg .
Done


In [24]:
# Organism name -> identifier string used in the `organism:` filter of the
# UniProt queries further down.
organism_ids = dict(
    Human='9606',
    Mouse='10090',
    Rat='10116',
    Chicken='9031',
    Frog='8355',
    Zebrafish='7955',
)

### 1. Import the data, extract the necessary cells, and rename the columns

In [25]:
# Table S4 had been saved into its own workbook ahead of time; load it as-is.
s4_workbook = './SourceData/Papers/Korfali2012/2012NUCLEUS0047R-SupTables_S4.xlsx'
df = pd.read_excel(s4_workbook)

In [26]:
# Row 1 of the raw sheet carries the real header labels; pick them out for the
# six columns that are kept later on.
header_positions = [0, 1, 2, 3, 17, 18]
columns = df.iloc[1, header_positions]
columns

Table S4. Summary of NETs directly analyzed                  tissue
Unnamed: 1                                                gene name
Unnamed: 2                                          alternate names
Unnamed: 3                                        accession numbers
Unnamed: 17                                    NE:MM ratio by dNSAF
Unnamed: 18                                               reference
Name: 1, dtype: object

In [56]:
# df.head()
# df.tail(n=30)

In [27]:
# Keep only the data rows (positions 2 through 135) and the six columns of interest.
data_rows = slice(2, 136)
kept_columns = [0, 1, 2, 3, 17, 18]
df = df.iloc[data_rows, kept_columns]

In [28]:
# rename the columns
# `columns` holds the labels read from row 1 of the raw sheet, in the same
# positional order as the six columns kept in `df`.
df.columns = columns
# df.head()

In [29]:
# Peek at the last rows to check the slice and the new column labels.
df.tail()

1,tissue,gene name,alternate names,accession numbers,NE:MM ratio by dNSAF,reference
131,,UNC84B,"SUN2, unc-84 homolog B",gi|109482575|ref|XP_001076724.1|;gi|34996501|r...,74.71,"Hodzic, D.M., et al. (2004) J. Biol. Chem. 279..."
132,,SYNE1,"syne-1, nesprin 1 isoform longer, C6orf98",gi|119120865|ref|NP_001073154.1|;gi|23097308|r...,inf,"Apel, E.D., et al. (2000) J. Biol. Chem. 275, ..."
133,,SYNE2,"syne-2, spectrin repeat containing, nuclear en...",gi|33624848|ref|NP_055995.3|;gi|109479539|ref|...,inf,"Zhang, Q., et al. (2001) J. Cell Sci. 114, 44..."
134,,POM121C,"POM121, nuclear pore membrane protein 121",gi|26051278|ref|NP_742017.1|;gi|16758424|ref|N...,inf,"Soderqvist, H., et al. (1997) Eur. J. Biochem...."
135,,NUP210,"gp210, nucleoporin 210",gi|16758020|ref|NP_445774.1|;gi|9055314|ref|NP...,inf,"Gerace, L., et al. (1982) J. Cell Biol. 95, 82..."


In [30]:
# Fill in tissue data
# Rows without a tissue label inherit the most recent label above them.
# Series.ffill() is used instead of fillna(method='ffill'), which is deprecated
# in pandas >= 2.1 and emits a FutureWarning.
df['tissue'] = df['tissue'].ffill()
# Drop every row that still has a missing value in any column.
df = df.dropna(axis=0)

In [32]:
# df.head()
# Re-check the tail: 'tissue' should now be populated on every remaining row.
df.tail()

1,tissue,gene name,alternate names,accession numbers,NE:MM ratio by dNSAF,reference
131,Original NETs (pre-MudPIT proteomics),UNC84B,"SUN2, unc-84 homolog B",gi|109482575|ref|XP_001076724.1|;gi|34996501|r...,74.71,"Hodzic, D.M., et al. (2004) J. Biol. Chem. 279..."
132,Original NETs (pre-MudPIT proteomics),SYNE1,"syne-1, nesprin 1 isoform longer, C6orf98",gi|119120865|ref|NP_001073154.1|;gi|23097308|r...,inf,"Apel, E.D., et al. (2000) J. Biol. Chem. 275, ..."
133,Original NETs (pre-MudPIT proteomics),SYNE2,"syne-2, spectrin repeat containing, nuclear en...",gi|33624848|ref|NP_055995.3|;gi|109479539|ref|...,inf,"Zhang, Q., et al. (2001) J. Cell Sci. 114, 44..."
134,Original NETs (pre-MudPIT proteomics),POM121C,"POM121, nuclear pore membrane protein 121",gi|26051278|ref|NP_742017.1|;gi|16758424|ref|N...,inf,"Soderqvist, H., et al. (1997) Eur. J. Biochem...."
135,Original NETs (pre-MudPIT proteomics),NUP210,"gp210, nucleoporin 210",gi|16758020|ref|NP_445774.1|;gi|9055314|ref|NP...,inf,"Gerace, L., et al. (1982) J. Cell Biol. 95, 82..."


#### Rename tissue

In [33]:
# Distinct tissue labels present in the table after the fill step.
tissue_labels = df['tissue']
tissues = tissue_labels.unique()
tissues

array(['liver enriched', 'muscle enriched', 'blood enriched',
       'liver and blood', 'liver and muscle', 'blood and muscle',
       'ALL tissues', 'Original NETs (pre-MudPIT proteomics)'],
      dtype=object)

#### Do the proteins in 'Original NETs' have a nuclear localization in UniProt?

In [37]:
# Gene names of the rows labelled as the pre-MudPIT "Original NETs" group.
is_original_net = df['tissue'] == 'Original NETs (pre-MudPIT proteomics)'
original_NETs = list(df.loc[is_original_net, 'gene name'])
original_NETs

['TMEM43',
 'LBR',
 'TOR1AIP1',
 'TMPO',
 'EMD',
 'NRM',
 'LEMD3',
 'UNC84A',
 'UNC84B',
 'SYNE1',
 'SYNE2',
 'POM121C',
 'NUP210']

In [45]:
# Print the UniProt subcellular-location annotation of the first human hit for
# each original NET.
human_id = organism_ids['Human']
for protein in original_NETs:
    query = protein + '+AND+organism:' + human_id
    res = u.search(
        query,
        frmt='tab',
        columns='comment(SUBCELLULAR LOCATION)',
        limit=1,
    )
    print(res)

Subcellular location [CC]
SUBCELLULAR LOCATION: Endoplasmic reticulum membrane {ECO:0000269|PubMed:32614325}. Nucleus inner membrane; Multi-pass membrane protein. Note=Retained in the inner nuclear membrane through interaction with EMD and A- and B-lamins. The N- and C-termini are oriented towards the nucleoplasm. The majority of the hydrophilic domain resides in the endoplasmic reticulum lumen (By similarity). {ECO:0000250}.

Subcellular location [CC]
SUBCELLULAR LOCATION: Nucleus inner membrane {ECO:0000269|PubMed:8157662}; Multi-pass membrane protein {ECO:0000255}. Endoplasmic reticulum membrane {ECO:0000269|PubMed:21327084}. Cytoplasm {ECO:0000269|PubMed:21327084}. Nucleus {ECO:0000269|PubMed:21327084}. Note=Nucleus; nuclear rim. {ECO:0000269|PubMed:21327084}.

Subcellular location [CC]
SUBCELLULAR LOCATION: Nucleus inner membrane {ECO:0000269|PubMed:12061773, ECO:0000269|PubMed:24275647}; Single-pass membrane protein {ECO:0000269|PubMed:12061773, ECO:0000269|PubMed:24275647}.

Sub

#### Thus only TMEM43 is considered an ER protein; all the others are nuclear

#### Let's keep them

### 2. Link the NCBI accession numbers to UniProt IDs

#### First, in human

#### The gene name at row position 18 contains two names, and only the latter (VMA21) is needed,
#### so the name is replaced manually

In [82]:
# Overwrite the gene-name cell at row position 18 (column position 1) with the
# single name that is wanted. Positions are 0-based and refer to the current
# row order of `df`, not to its index labels.
df.iat[18, 1] = 'VMA21'

In [83]:
# Confirm that the manual replacement took effect.
df.iloc[18, 1]

'VMA21'

In [75]:
# Look up one human UniProt ID per distinct gene name.
#
# The tab-formatted response is scanned with `regex`: group 2 is the first run
# of word characters that sits alone between two newlines, i.e. the first line
# after the header that consists only of an identifier.
regex = re.compile(r'(\n)(\w+|\d+)(\n)')
gene_name_list = list(df['gene name'].unique())
uniprot_id_list_human = []

# NOTE(review): some names appear to carry a footnote marker (e.g. '‡SLC22A24'
# in the printed output) — confirm whether these should be stripped before
# querying.
for gene in gene_name_list:
    res = u.search(gene + '+AND+organism:' + organism_ids['Human'],
         frmt='tab', columns='id', limit=1)

    # u.search may hand back a non-string (bioservices appears to return an
    # HTTP status code for a failed request); treat that as "no hit" instead of
    # letting regex.search raise TypeError and abort the whole loop.
    mo = regex.search(res) if isinstance(res, str) else None
    uniprot_id = mo.group(2) if mo is not None else 'Not_found'

    # Appended in the same order as gene_name_list so the two lists stay aligned.
    uniprot_id_list_human.append(uniprot_id)
    print(gene, uniprot_id)

‡SLC22A24
TMEM38A
WFS1
POPDC2
LOC203547/VMA21
KLHL31
