In [21]:
import os, re, openpyxl
import pandas as pd
from time import sleep
from bioservices import UniProt

In [17]:
# Shared bioservices UniProt client; every `u.search(...)` call in this notebook goes through it.
u = UniProt()

In [19]:
# Organism label -> identifier string that is appended to the `organism:`
# filter of the UniProt search queries built further down.
organism_ids = dict(
    Human='9606',
    Mouse='10090',
    Rat='10116',
    Chicken='9031',
    Frog='8355',
    Zebrafish='7955',
)

### 1. Import the data, extract the necessary cells, and rename the columns

In [5]:
# Load Table S4 of the Korfali 2012 supplementary material. The sheet was
# extracted beforehand to a single workbook because the whole xlsx is
# extremely heavy.
df = pd.read_excel('./SourceData/Papers/Korfali2012/2012NUCLEUS0047R-SupTables_S4.xlsx')

In [6]:
# Prepare column names: positional row 1 of the raw sheet carries the real
# header labels ('tissue', 'gene name', ...). Only positional columns 0-3, 17
# and 18 are kept; the data rows are sliced with the same column positions.
columns = df.iloc[1,[0,1,2,3,17,18]]
columns

Table S4. Summary of NETs directly analyzed                  tissue
Unnamed: 1                                                gene name
Unnamed: 2                                          alternate names
Unnamed: 3                                        accession numbers
Unnamed: 17                                    NE:MM ratio by dNSAF
Unnamed: 18                                               reference
Name: 1, dtype: object

In [56]:
# df.head()
# df.tail(n=30)

In [7]:
# Extract necessary cells: positional rows 2-135 (everything after the header
# row) and the six positional columns that were also used for `columns`.
# NOTE(review): this overwrites `df`, so re-running the cell without first
# re-reading the workbook slices the already-sliced frame.
df = df.iloc[2:136,[0,1,2,3,17,18]]
# df.head()
# df.tail()

In [8]:
# Rename the columns: swap the raw headers (mostly 'Unnamed: N') for the
# readable labels captured in `columns`.
df.columns = columns
# df.head()

In [9]:
# Inspect the last rows; 'tissue' is still empty on these rows at this point.
df.tail()

1,tissue,gene name,alternate names,accession numbers,NE:MM ratio by dNSAF,reference
131,,UNC84B,"SUN2, unc-84 homolog B",gi|109482575|ref|XP_001076724.1|;gi|34996501|r...,74.71,"Hodzic, D.M., et al. (2004) J. Biol. Chem. 279..."
132,,SYNE1,"syne-1, nesprin 1 isoform longer, C6orf98",gi|119120865|ref|NP_001073154.1|;gi|23097308|r...,inf,"Apel, E.D., et al. (2000) J. Biol. Chem. 275, ..."
133,,SYNE2,"syne-2, spectrin repeat containing, nuclear en...",gi|33624848|ref|NP_055995.3|;gi|109479539|ref|...,inf,"Zhang, Q., et al. (2001) J. Cell Sci. 114, 44..."
134,,POM121C,"POM121, nuclear pore membrane protein 121",gi|26051278|ref|NP_742017.1|;gi|16758424|ref|N...,inf,"Soderqvist, H., et al. (1997) Eur. J. Biochem...."
135,,NUP210,"gp210, nucleoporin 210",gi|16758020|ref|NP_445774.1|;gi|9055314|ref|NP...,inf,"Gerace, L., et al. (1982) J. Cell Biol. 95, 82..."


In [10]:
# Fill in tissue data: propagate the last seen 'tissue' label downwards so
# that rows with an empty 'tissue' cell inherit it (presumably the sheet
# states each group label only once — confirm against the workbook).
# `Series.ffill()` replaces `fillna(method='ffill')`, whose `method` argument
# is deprecated in recent pandas releases.
df['tissue'] = df['tissue'].ffill()
# Discard every row that still has a missing value in any column.
df = df.dropna(axis=0)

In [11]:
# df.head()
# Check that the 'tissue' column is now populated on the last rows.
df.tail()

1,tissue,gene name,alternate names,accession numbers,NE:MM ratio by dNSAF,reference
131,Original NETs (pre-MudPIT proteomics),UNC84B,"SUN2, unc-84 homolog B",gi|109482575|ref|XP_001076724.1|;gi|34996501|r...,74.71,"Hodzic, D.M., et al. (2004) J. Biol. Chem. 279..."
132,Original NETs (pre-MudPIT proteomics),SYNE1,"syne-1, nesprin 1 isoform longer, C6orf98",gi|119120865|ref|NP_001073154.1|;gi|23097308|r...,inf,"Apel, E.D., et al. (2000) J. Biol. Chem. 275, ..."
133,Original NETs (pre-MudPIT proteomics),SYNE2,"syne-2, spectrin repeat containing, nuclear en...",gi|33624848|ref|NP_055995.3|;gi|109479539|ref|...,inf,"Zhang, Q., et al. (2001) J. Cell Sci. 114, 44..."
134,Original NETs (pre-MudPIT proteomics),POM121C,"POM121, nuclear pore membrane protein 121",gi|26051278|ref|NP_742017.1|;gi|16758424|ref|N...,inf,"Soderqvist, H., et al. (1997) Eur. J. Biochem...."
135,Original NETs (pre-MudPIT proteomics),NUP210,"gp210, nucleoporin 210",gi|16758020|ref|NP_445774.1|;gi|9055314|ref|NP...,inf,"Gerace, L., et al. (1982) J. Cell Biol. 95, 82..."


#### Do the proteins in 'Original NETs' have a nucleus localization in UniProt?

In [12]:
# Gene names of every row in the 'Original NETs' group, as a plain Python list.
original_NETs = df.loc[
    df['tissue'] == 'Original NETs (pre-MudPIT proteomics)',
    'gene name',
].tolist()
original_NETs

['TMEM43',
 'LBR',
 'TOR1AIP1',
 'TMPO',
 'EMD',
 'NRM',
 'LEMD3',
 'UNC84A',
 'UNC84B',
 'SYNE1',
 'SYNE2',
 'POM121C',
 'NUP210']

#### Do not rerun the cell below unless clearly necessary (it queries the UniProt web service)

In [45]:
# For each original NET, request the subcellular-location comment of the
# first human UniProt hit and print it for manual inspection.
for protein in original_NETs:
    query = protein + '+AND+organism:' + organism_ids['Human']
    res = u.search(query, frmt='tab', limit=1,
                   columns='comment(SUBCELLULAR LOCATION)')
    print(res)

Subcellular location [CC]
SUBCELLULAR LOCATION: Endoplasmic reticulum membrane {ECO:0000269|PubMed:32614325}. Nucleus inner membrane; Multi-pass membrane protein. Note=Retained in the inner nuclear membrane through interaction with EMD and A- and B-lamins. The N- and C-termini are oriented towards the nucleoplasm. The majority of the hydrophilic domain resides in the endoplasmic reticulum lumen (By similarity). {ECO:0000250}.

Subcellular location [CC]
SUBCELLULAR LOCATION: Nucleus inner membrane {ECO:0000269|PubMed:8157662}; Multi-pass membrane protein {ECO:0000255}. Endoplasmic reticulum membrane {ECO:0000269|PubMed:21327084}. Cytoplasm {ECO:0000269|PubMed:21327084}. Nucleus {ECO:0000269|PubMed:21327084}. Note=Nucleus; nuclear rim. {ECO:0000269|PubMed:21327084}.

Subcellular location [CC]
SUBCELLULAR LOCATION: Nucleus inner membrane {ECO:0000269|PubMed:12061773, ECO:0000269|PubMed:24275647}; Single-pass membrane protein {ECO:0000269|PubMed:12061773, ECO:0000269|PubMed:24275647}.

Sub

#### Thus only TMEM43 is considered an ER protein; all the others are nuclear

#### Let's keep them

### 2. Link the NCBI accession numbers to UniProt IDs

#### Firstly in human

#### The gene name in row #18 contains two names, of which only the latter, VMA21, is needed,
#### so the name is replaced manually

In [13]:
# Manual fix: overwrite the cell at positional row 18, positional column 1
# ('gene name') with 'VMA21'.
# NOTE(review): the position is hard-coded and only valid for the row order
# left by the preceding cleaning cells — re-check it if those cells change.
df.iloc[18, 1] = 'VMA21'

In [15]:
# Summarise the cleaned frame (entry count, non-null counts, dtypes).
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 119 entries, 3 to 135
Data columns (total 6 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   tissue                119 non-null    object
 1   gene name             119 non-null    object
 2   alternate names       119 non-null    object
 3   accession numbers     119 non-null    object
 4   NE:MM ratio by dNSAF  119 non-null    object
 5   reference             119 non-null    object
dtypes: object(6)
memory usage: 6.5+ KB


In [27]:
# Look up one UniProt accession per unique gene name, restricted to human.
# The pattern picks the first word-only line enclosed by two newlines in the
# reply (presumably the accession row right under the header line).
regex = re.compile(r'(\n)(\w+|\d+)(\n)')
gene_name_list = list(df['gene name'].unique())
uniprot_id_list_human = []
for gene in gene_name_list:
    res = u.search(gene + '+AND+organism:' + organism_ids['Human'],
                   frmt='tab', columns='id', limit=1)
    # Guard against a non-text reply (e.g. None or a status code after a failed
    # request — TODO confirm what this bioservices version returns): matching
    # it would raise TypeError and throw away every result collected so far.
    mo = regex.search(res) if isinstance(res, str) else None
    # Keep the list aligned with gene_name_list by recording a placeholder
    # whenever no accession could be extracted.
    uniprot_id = mo.group(2) if mo is not None else 'Not_found'

    uniprot_id_list_human.append(uniprot_id)
    print(gene, uniprot_id)

    # Pause between consecutive requests.
    sleep(5)

TMEM53 Q6P2H8
TMEM120A Q9BXJ8
SCARA5 Q6ZMJ2
TMEM74 Q96NL1
PPAPDC3 Q8NBV4
EGFR P00533
SLC39A14 Q15043
SCCPDH Q8NBX0
WDR33 Q9C0J8
TMEM209 Q96SK2
KIAA1967 Q8N163
TM7SF2 O76062
APH1B Q8WW43
MCAT Q8IVS2
‡SLC22A24 Q8N4F4
TMEM38A Q9H6F2
WFS1 O76024
POPDC2 Q9HBU9
VMA21 Q3ZAQ7
KLHL31 Q9H511
CGRRF1 Q99675
SAMD8 Q96LT4
ATP1B4 Q9UN42
DNAJC16 Q9Y2G8
TMEM57 Q8N5G2
DTNA Q9Y4J8
ATP1B1 P05026
RYR1 P21817
HVCN1 Q96D96
TMEM63A O94886
C9orf46 Q9HBL7
SLC38A10  Q9HBR0
TMEM160 Q9NX00
C18orf19 Q96ND0
CNNM4 Q6P4Q7
MS4A1 P11836
LRRC8A Q8IWT6
FAM3C Q92520
ABCB1 P08183
SLC9A1 P19634
ITPR2 Q14571
TMEM173 Q86WV6
TMEM126A Q9H061
FAM62A Q9BSJ8
C17orf62 Q9BQA9
C20orf30 Q96A57
SEC11C Q9BY50
MARCHV Not_found
FAM105A Q9NUU6
DHRS7 Q9Y394
SLC25A22 Q9H936
DAK Q3LXA3
NCLN Q969V3
SQSTM1 Q13501
IAG2 Q9H0U3
TAPBPL Q9BX59
TMUB1 Q9BVT8
AYTL1 Q7L5N7
C20orf3 Q9HDC9
C17orf32 Q8N511
C14orf1 Q9UKR5
TOR1AIP2 Q8NFQ8
ALG2 Q9H553
KIAA1161 Q6NSJ0
TMTC3 Q6ZXV5
LRRC37A A6NMS7
MOSPD3 O75425
ALG10B Q5I7T1
CRELD1 Q96HD1
TMEM161A Q9NX61
TMEM70 Q

In [28]:
# Lookup table pairing each queried gene name with the accession recorded for
# it; both lists were filled in the same order by the lookup loop.
_df = pd.DataFrame({
    'gene name': gene_name_list,
    'Uniprot_id': uniprot_id_list_human,
})

#### The only gene whose ID was not found, MARCHV, is actually MARCH5.
#### This gene has an entry in UniProt, where its ID was found,
#### so the ID is filled in manually

In [31]:
# Row labels of the genes whose lookup produced the 'Not_found' placeholder.
_df.loc[_df['Uniprot_id'].eq('Not_found')].index

Int64Index([47], dtype='int64')

In [33]:
# Show the stored ID of row 47 before the manual correction.
_df.loc[47, 'Uniprot_id'] 

'Not_found'

In [34]:
# Manual correction for MARCHV: store the accession that was looked up by hand.
# The row is selected by gene name rather than by the hard-coded label 47, so
# the patch cannot land on another gene if the lookup order ever changes.
is_marchv = _df['gene name'] == 'MARCHV'
assert is_marchv.sum() == 1, "expected exactly one 'MARCHV' row in _df"
_df.loc[is_marchv, 'Uniprot_id'] = 'Q9NX47'

In [35]:
# Show the stored ID of row 47 again to confirm the manual correction.
_df.loc[47, 'Uniprot_id'] 

'Q9NX47'

### 3. Merge of main df with _df

In [36]:
# Attach the UniProt accession to every row of the main table, joining the two
# frames on their shared 'gene name' column.
df_output_hs = pd.merge(df, _df, on='gene name', how='outer')

In [37]:
# Check the merged frame: entry count and non-null count of the new 'Uniprot_id' column.
df_output_hs.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 119 entries, 0 to 118
Data columns (total 7 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   tissue                119 non-null    object
 1   gene name             119 non-null    object
 2   alternate names       119 non-null    object
 3   accession numbers     119 non-null    object
 4   NE:MM ratio by dNSAF  119 non-null    object
 5   reference             119 non-null    object
 6   Uniprot_id            119 non-null    object
dtypes: object(7)
memory usage: 7.4+ KB


In [38]:
# Write the annotated human table to disk. The target folder is created first
# because `to_csv` does not create missing directories and would otherwise
# fail on a fresh checkout.
os.makedirs('./Output', exist_ok=True)
df_output_hs.to_csv('./Output/Korfali2012_Hs.csv')