In [1]:
import os, re, openpyxl
import pandas as pd
from time import sleep
from bioservices import UniProt

In [2]:
u = UniProt()

In [3]:
organism_ids = {'Human': '9606', 'Mouse': '10090', 
                'Rat': '10116', 'Cattle': '9913',
                'Chicken': '9031', 'Frog': '8355', 
                'Zebrafish': '7955', 'Fly': '7227', 'Worm': '6239'}

### 1. Data import, extract data necessary cells, and rename columns

In [4]:
# S4 was extracted beforehand to a single file because the whole xlsx is extremely heavy
df = pd.read_excel('./SourceData/Papers/Korfali2012/2012NUCLEUS0047R-SupTables_S4.xlsx')

In [5]:
# prepare column names
columns = df.iloc[1,[0,1,2,3,17,18]]
columns

Table S4. Summary of NETs directly analyzed                  tissue
Unnamed: 1                                                gene name
Unnamed: 2                                          alternate names
Unnamed: 3                                        accession numbers
Unnamed: 17                                    NE:MM ratio by dNSAF
Unnamed: 18                                               reference
Name: 1, dtype: object

In [6]:
# df.head()
# df.tail(n=30)

In [7]:
# Extract necessary cells
df = df.iloc[2:136,[0,1,2,3,17,18]]
# df.head()
# df.tail()

In [8]:
# rename the columns
df.columns = columns
# df.head()

In [9]:
df.tail()

1,tissue,gene name,alternate names,accession numbers,NE:MM ratio by dNSAF,reference
131,,UNC84B,"SUN2, unc-84 homolog B",gi|109482575|ref|XP_001076724.1|;gi|34996501|r...,74.71,"Hodzic, D.M., et al. (2004) J. Biol. Chem. 279..."
132,,SYNE1,"syne-1, nesprin 1 isoform longer, C6orf98",gi|119120865|ref|NP_001073154.1|;gi|23097308|r...,inf,"Apel, E.D., et al. (2000) J. Biol. Chem. 275, ..."
133,,SYNE2,"syne-2, spectrin repeat containing, nuclear en...",gi|33624848|ref|NP_055995.3|;gi|109479539|ref|...,inf,"Zhang, Q., et al. (2001) J. Cell Sci. 114, 44..."
134,,POM121C,"POM121, nuclear pore membrane protein 121",gi|26051278|ref|NP_742017.1|;gi|16758424|ref|N...,inf,"Soderqvist, H., et al. (1997) Eur. J. Biochem...."
135,,NUP210,"gp210, nucleoporin 210",gi|16758020|ref|NP_445774.1|;gi|9055314|ref|NP...,inf,"Gerace, L., et al. (1982) J. Cell Biol. 95, 82..."


In [10]:
# Fill in tissue data
df['tissue'] = df['tissue'].fillna(method='ffill')
df = df.dropna(axis=0)

In [11]:
# df.head()
df.tail()

1,tissue,gene name,alternate names,accession numbers,NE:MM ratio by dNSAF,reference
131,Original NETs (pre-MudPIT proteomics),UNC84B,"SUN2, unc-84 homolog B",gi|109482575|ref|XP_001076724.1|;gi|34996501|r...,74.71,"Hodzic, D.M., et al. (2004) J. Biol. Chem. 279..."
132,Original NETs (pre-MudPIT proteomics),SYNE1,"syne-1, nesprin 1 isoform longer, C6orf98",gi|119120865|ref|NP_001073154.1|;gi|23097308|r...,inf,"Apel, E.D., et al. (2000) J. Biol. Chem. 275, ..."
133,Original NETs (pre-MudPIT proteomics),SYNE2,"syne-2, spectrin repeat containing, nuclear en...",gi|33624848|ref|NP_055995.3|;gi|109479539|ref|...,inf,"Zhang, Q., et al. (2001) J. Cell Sci. 114, 44..."
134,Original NETs (pre-MudPIT proteomics),POM121C,"POM121, nuclear pore membrane protein 121",gi|26051278|ref|NP_742017.1|;gi|16758424|ref|N...,inf,"Soderqvist, H., et al. (1997) Eur. J. Biochem...."
135,Original NETs (pre-MudPIT proteomics),NUP210,"gp210, nucleoporin 210",gi|16758020|ref|NP_445774.1|;gi|9055314|ref|NP...,inf,"Gerace, L., et al. (1982) J. Cell Biol. 95, 82..."


#### How many genes?

In [12]:
len(list(df['gene name'].unique()))

119

#### Do proteins in 'Original NETs' have Nucleus locacalization in Uniprot?

In [12]:
original_NETs = df[df['tissue']=='Original NETs (pre-MudPIT proteomics)']['gene name']
original_NETs = list(original_NETs)
original_NETs

['TMEM43',
 'LBR',
 'TOR1AIP1',
 'TMPO',
 'EMD',
 'NRM',
 'LEMD3',
 'UNC84A',
 'UNC84B',
 'SYNE1',
 'SYNE2',
 'POM121C',
 'NUP210']

#### Do not rerun the below w/o obvious necessity

In [45]:
# for protein in original_NETs:
#     res = u.search(protein+'+AND+organism:' + organism_ids['Human'], 
#          frmt='tab', columns='comment(SUBCELLULAR LOCATION)', limit=1)
#     print(res)

Subcellular location [CC]
SUBCELLULAR LOCATION: Endoplasmic reticulum membrane {ECO:0000269|PubMed:32614325}. Nucleus inner membrane; Multi-pass membrane protein. Note=Retained in the inner nuclear membrane through interaction with EMD and A- and B-lamins. The N- and C-termini are oriented towards the nucleoplasm. The majority of the hydrophilic domain resides in the endoplasmic reticulum lumen (By similarity). {ECO:0000250}.

Subcellular location [CC]
SUBCELLULAR LOCATION: Nucleus inner membrane {ECO:0000269|PubMed:8157662}; Multi-pass membrane protein {ECO:0000255}. Endoplasmic reticulum membrane {ECO:0000269|PubMed:21327084}. Cytoplasm {ECO:0000269|PubMed:21327084}. Nucleus {ECO:0000269|PubMed:21327084}. Note=Nucleus; nuclear rim. {ECO:0000269|PubMed:21327084}.

Subcellular location [CC]
SUBCELLULAR LOCATION: Nucleus inner membrane {ECO:0000269|PubMed:12061773, ECO:0000269|PubMed:24275647}; Single-pass membrane protein {ECO:0000269|PubMed:12061773, ECO:0000269|PubMed:24275647}.

Sub

#### Thus only TMEM43 is considered an ER protein, and others are all nucleus

#### Let's keep them

### 2. Link the NCBI accession numbers to Uniprot ID

#### For now human only

#### Realized that the gene #18 contains two names and only the latter VMA21 is needed
#### thus manually replace the name

In [13]:
df.iloc[18, 1] = 'VMA21'

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 119 entries, 3 to 135
Data columns (total 6 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   tissue                119 non-null    object
 1   gene name             119 non-null    object
 2   alternate names       119 non-null    object
 3   accession numbers     119 non-null    object
 4   NE:MM ratio by dNSAF  119 non-null    object
 5   reference             119 non-null    object
dtypes: object(6)
memory usage: 6.5+ KB


#### Retrieve Uniprot ID of human genes by using uniprot module

In [27]:
regex = re.compile(r'(\n)(\w+|\d+)(\n)')
gene_name_list = list(df['gene name'].unique())
uniprot_id_list_human = []
for gene in gene_name_list:
    res = u.search(gene + '+AND+organism:' + organism_ids['Human'], 
         frmt='tab', columns='id', limit=1)
    mo = regex.search(res)
    if mo is not None:
        uniprot_id = mo.group(2)
    else:
        uniprot_id = 'Not_found'

    uniprot_id_list_human.append(uniprot_id)
    print(gene, uniprot_id)
    
    sleep(5)

TMEM53 Q6P2H8
TMEM120A Q9BXJ8
SCARA5 Q6ZMJ2
TMEM74 Q96NL1
PPAPDC3 Q8NBV4
EGFR P00533
SLC39A14 Q15043
SCCPDH Q8NBX0
WDR33 Q9C0J8
TMEM209 Q96SK2
KIAA1967 Q8N163
TM7SF2 O76062
APH1B Q8WW43
MCAT Q8IVS2
‡SLC22A24 Q8N4F4
TMEM38A Q9H6F2
WFS1 O76024
POPDC2 Q9HBU9
VMA21 Q3ZAQ7
KLHL31 Q9H511
CGRRF1 Q99675
SAMD8 Q96LT4
ATP1B4 Q9UN42
DNAJC16 Q9Y2G8
TMEM57 Q8N5G2
DTNA Q9Y4J8
ATP1B1 P05026
RYR1 P21817
HVCN1 Q96D96
TMEM63A O94886
C9orf46 Q9HBL7
SLC38A10  Q9HBR0
TMEM160 Q9NX00
C18orf19 Q96ND0
CNNM4 Q6P4Q7
MS4A1 P11836
LRRC8A Q8IWT6
FAM3C Q92520
ABCB1 P08183
SLC9A1 P19634
ITPR2 Q14571
TMEM173 Q86WV6
TMEM126A Q9H061
FAM62A Q9BSJ8
C17orf62 Q9BQA9
C20orf30 Q96A57
SEC11C Q9BY50
MARCHV Not_found
FAM105A Q9NUU6
DHRS7 Q9Y394
SLC25A22 Q9H936
DAK Q3LXA3
NCLN Q969V3
SQSTM1 Q13501
IAG2 Q9H0U3
TAPBPL Q9BX59
TMUB1 Q9BVT8
AYTL1 Q7L5N7
C20orf3 Q9HDC9
C17orf32 Q8N511
C14orf1 Q9UKR5
TOR1AIP2 Q8NFQ8
ALG2 Q9H553
KIAA1161 Q6NSJ0
TMTC3 Q6ZXV5
LRRC37A A6NMS7
MOSPD3 O75425
ALG10B Q5I7T1
CRELD1 Q96HD1
TMEM161A Q9NX61
TMEM70 Q

In [28]:
_df = pd.DataFrame()
_df['gene name'] = gene_name_list
_df['Uniprot_id'] = uniprot_id_list_human

#### Realized that the only gene with ID not found, MARCHV, is actually MARCH5
#### this gene has a entry on uniprot and its ID was found
#### Thus replaced

In [31]:
_df[_df['Uniprot_id'] == 'Not_found'].index

Int64Index([47], dtype='int64')

In [33]:
_df.loc[47, 'Uniprot_id'] 

'Not_found'

In [34]:
_df.loc[47, 'Uniprot_id'] = 'Q9NX47'

In [35]:
_df.loc[47, 'Uniprot_id'] 

'Q9NX47'

### 3. Merge of main df with _df

In [36]:
df_output_hs = df.merge(_df, how='outer', on='gene name')

In [37]:
df_output_hs.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 119 entries, 0 to 118
Data columns (total 7 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   tissue                119 non-null    object
 1   gene name             119 non-null    object
 2   alternate names       119 non-null    object
 3   accession numbers     119 non-null    object
 4   NE:MM ratio by dNSAF  119 non-null    object
 5   reference             119 non-null    object
 6   Uniprot_id            119 non-null    object
dtypes: object(7)
memory usage: 7.4+ KB


In [38]:
df_output_hs.to_csv('./Output/Korfali2012_Hs.csv')

### 4. Conversion of human uniprot ID to mouse and rat

In [15]:
df_new = pd.read_csv('./Output/Korfali2012_Hs.csv')

In [16]:
df_new.head()

Unnamed: 0.1,Unnamed: 0,tissue,gene name,alternate names,accession numbers,NE:MM ratio by dNSAF,reference,Uniprot_id
0,0,liver enriched,TMEM53,"NET4, transmembrane protein 53",ref|NP_081113.1|,2.57,"This study and Schirmer, E.C., et al. (2003). ...",Q6P2H8
1,1,liver enriched,TMEM120A,"NET29, transmembrane protein induced by tumor ...",ref|NP_766129.1|,inf,"This study and Malik, P., et al. (2010) Cell M...",Q9BXJ8
2,2,liver enriched,SCARA5,"NET33, PREDICTED: similar to protease, serine,...",gi|109502608|ref|XP_001066668.1|,0.1,"This study and Malik, P., et al. (2010) Cell M...",Q6ZMJ2
3,3,liver enriched,TMEM74,"NET36, PREDICTED: hypothetical protein [Rattus...",ref|XP_001063530.1|,3.36,"This study and Malik, P., et al. (2010) Cell M...",Q96NL1
4,4,liver enriched,PPAPDC3,"NET39, phosphatidic acid phosphatase type 2 do...",gi|59891419|ref|NP_001012349.1|;gi|34147436|re...,4.42,"This study and Schirmer, E.C., et al. (2003). ...",Q8NBV4


In [17]:
def get_uniprot_id(gene, organism):
    regex = re.compile(r'(\n)(\w+|\d+)(\n)')
    
    res = u.search('gene:' + gene + '+AND+organism:' + organism_ids[organism], 
             frmt='tab', columns='id', limit=1)
    mo = regex.search(res)
    if mo is not None:
        uniprot_id = mo.group(2)
    else:
        uniprot_id = 'Not_found'
    
    sleep(5)

    return uniprot_id

#### Rat

In [32]:
for i, gene in enumerate(df_new['gene name']):
    uniprot_id = get_uniprot_id(gene, 'Rat')
    df_new.loc[i, 'Uniprot_id_Rn'] = uniprot_id
    print(gene, uniprot_id)

TMEM53 D3ZPB8
TMEM120A Q5HZE2
SCARA5 D4A213
TMEM74 D3ZR33
PPAPDC3 Q5FVJ3
EGFR G3V6K6
SLC39A14 D3ZZM0
SCCPDH Q6AY30
WDR33 F1LT09
TMEM209 Q68FR5
KIAA1967 Not_found
TM7SF2 Q5BK21
APH1B Q0PY50
MCAT D3ZPF2
‡SLC22A24 Q76M99
TMEM38A A6ZIQ8
WFS1 E9PT53
POPDC2 Q6P722
VMA21 D3ZM03
KLHL31 D3ZLT6
CGRRF1 P97587
SAMD8 Q641X0
ATP1B4 Q9R193
DNAJC16 Q5FVM7
TMEM57 Q4V7D3
DTNA D4A772
ATP1B1 P07340
RYR1 F1LMY4
HVCN1 D3ZIU8
TMEM63A D4A2I3
C9orf46 Not_found
SLC38A10  E9PT23
TMEM160 D3ZZU4
C18orf19 Not_found
CNNM4 P0C588
MS4A1 D4A4Y3
LRRC8A Q4V8I7
FAM3C Q810F4
ABCB1 P43245
SLC9A1 P26431
ITPR2 P29995
TMEM173 F1M391
TMEM126A Q5HZA9
FAM62A Q9Z1X1
C17orf62 Not_found
C20orf30 Not_found
SEC11C Q9WTR7
MARCHV Not_found
FAM105A Q3B7D8
DHRS7 Q5RJY4
SLC25A22 Q5FVG4
DAK Q4KLZ6
NCLN Q5XIA1
SQSTM1 O08623
IAG2 O35777
TAPBPL D4A6L1
TMUB1 Q53AQ4
AYTL1 P0C1Q3
C20orf3 Not_found
C17orf32 Not_found
C14orf1 Not_found
TOR1AIP2 Q6P752
ALG2 G3V7W1
KIAA1161 Not_found
TMTC3 D3ZUJ8
LRRC37A F1M5B9
MOSPD3 Q4KLG1
ALG10B O88788
CRELD1 Q4V7

#### Mouse

In [33]:
for i, gene in enumerate(df_new['gene name']):
    uniprot_id = get_uniprot_id(gene, 'Mouse')
    df_new.loc[i, 'Uniprot_id_Mm'] = uniprot_id
    print(gene, uniprot_id)

TMEM53 Q9D0Z3
TMEM120A Q8C1E7
SCARA5 Q8K299
TMEM74 Q8BQU7
PPAPDC3 Q91WB2
EGFR Q01279
SLC39A14 Q75N73
SCCPDH Q8R127
WDR33 Q8K4P0
TMEM209 Q8BRG8
KIAA1967 Not_found
TM7SF2 Q71KT5
APH1B Q8C7N7
MCAT Q8R3F5
‡SLC22A24 Not_found
TMEM38A Q3TMP8
WFS1 P56695
POPDC2 Q9ES82
VMA21 Q78T54
KLHL31 Q8BWA5
CGRRF1 Q8BMJ7
SAMD8 Q9DA37
ATP1B4 Q99ME6
DNAJC16 Q80TN4
TMEM57 Q7TQE6
DTNA Q9D2N4
ATP1B1 P14094
RYR1 E9PZQ0
HVCN1 Q3U2S8
TMEM63A Q91YT8
C9orf46 Not_found
SLC38A10  Q5I012
TMEM160 Q9D938
C18orf19 Not_found
CNNM4 Q69ZF7
MS4A1 P20490
LRRC8A Q80WG5
FAM3C Q91VU0
ABCB1 P06795
SLC9A1 Q61165
ITPR2 Q9Z329
TMEM173 Q3TBT3
TMEM126A Q9D8Y1
FAM62A Q3U7R1
C17orf62 Not_found
C20orf30 Not_found
SEC11C Q9D8V7
MARCHV Not_found
FAM105A Q3TVP5
DHRS7 Q99J47
SLC25A22 Q9D6M3
DAK Q8VC30
NCLN Q8VCM8
SQSTM1 Q64337
IAG2 Q9CQY5
TAPBPL Q8VD31
TMUB1 Q9JMG3
AYTL1 Q8BYI6
C20orf3 Not_found
C17orf32 Not_found
C14orf1 Not_found
TOR1AIP2 Q8BYU6
ALG2 P12815
KIAA1161 Q69ZQ1
TMTC3 Q8BRH0
LRRC37A B1AWG4
MOSPD3 Q8BGG6
ALG10B Q3UGP8
CRELD1 Q91X

In [40]:
df_new.to_csv('./Output/Korfali2012_HsMmRn.csv', index=False)

In [43]:
df_new[df_new['Uniprot_id_Hs'].str.contains('330')]

Unnamed: 0.1,Unnamed: 0,tissue,gene name,alternate names,accession numbers,NE:MM ratio by dNSAF,reference,Uniprot_id_Hs,Uniprot_id_Rn,Uniprot_id_Mm
87,87,ALL tissues,C9ORF5,"hypothetical protein LOC23731, mNET*",gi|14042923|ref|NP_114401.1|;gi|109476539|ref|...,inf,This study and GSW and ECS unpublished results,Q9H330,Not_found,Not_found


#### Cattle

In [18]:
for i, gene in enumerate(df_new['gene name']):
    uniprot_id = get_uniprot_id(gene, 'Cattle')
    df_new.loc[i, 'Uniprot_id_Rn'] = uniprot_id
    print(gene, uniprot_id)

TMEM53 Q2TBP5
TMEM120A Q05B45
SCARA5 A5PJQ2
TMEM74 F1N044
PPAPDC3 Q08DF9
EGFR A0A3Q1MHB0
SLC39A14 A5D7L5
SCCPDH Q3T067
WDR33 E1BCT7
TMEM209 E1BBC2
KIAA1967 Q0V8K2
TM7SF2 Q8WMV1
APH1B A0A3Q1LNM1
MCAT E1BPG0
‡SLC22A24 Not_found
TMEM38A A4FV75
WFS1 G3N348
POPDC2 A5PJG2
VMA21 A2VDK9
KLHL31 G3N1Y0
CGRRF1 E1BIW8
SAMD8 F1MYS1
ATP1B4 A7MB71
DNAJC16 A7MBJ0
TMEM57 Q2TLZ3
DTNA A0A3Q1NLH2
ATP1B1 G3MWR4
RYR1 A0A3Q1MUQ4
HVCN1 A0A3Q1NF89
TMEM63A A7MB88
C9orf46 Not_found
SLC38A10  A0A3Q1LQW5
TMEM160 Q24JY6
C18orf19 Not_found
CNNM4 F1MK24
MS4A1 F1MKC6
LRRC8A Q08E42
FAM3C A5PKI3
ABCB1 A0A3Q1MB23
SLC9A1 Q28036
ITPR2 Q8WN96
TMEM173 Q2KI99
TMEM126A Q32L86
FAM62A A0JN43
C17orf62 Not_found
C20orf30 Not_found
SEC11C Q2KI36
MARCHV Not_found
FAM105A A6QLS5
DHRS7 Q1RMJ5
SLC25A22 Q08DK4
DAK Q58DK4
NCLN A0A3Q1NCG7
SQSTM1 A0A3Q1LVE4
IAG2 Not_found
TAPBPL F1N3G5
TMUB1 Q3ZBI9
AYTL1 Not_found
C20orf3 Not_found
C17orf32 Not_found
C14orf1 Not_found
TOR1AIP2 A5PKH0
ALG2 A4FUG6
KIAA1161 Not_found
TMTC3 E1BG63
LRRC37A Not_

In [19]:
df_new = df_new.rename(columns={'Uniprot_id_Rn': 'Uniprot_id_Bt'})

In [21]:
df_new_Bt = df_new.loc[:, ['gene name', 'Uniprot_id_Bt']]

In [24]:
df_new_Bt.to_csv('./Output/Korfali2012_Bt.csv', index=False)