In [38]:
import os, re, requests, json, openpyxl, warnings, importlib
import numpy as np
import pandas as pd
from urllib.request import urlopen
from time import sleep

from my_utils import *

# Suppress every warning for the rest of the session.
# Note that this also hides deprecation warnings raised by later cells.
warnings.filterwarnings('ignore')

# Wilkie et al 2010: Rat skeletal muscle

In [43]:
# Excel import
wb = openpyxl.load_workbook('./SourceData/Wilkie_2010/mcp.M110.003129-6.xlsx')
# `sheetnames` replaces the deprecated `get_sheet_names()`
print(wb.sheetnames)

# sheet selected; `wb[name]` replaces the deprecated `get_sheet_by_name()`
sheet = wb['Extracted']
_df_wilkie = pd.DataFrame(sheet.values)

# convert to df: keep the second column, from the fourth row onward, as the gene names
df_wilkie = pd.DataFrame(_df_wilkie.iloc[3:, 1])
df_wilkie.columns = ['Gene_name']
df_wilkie = df_wilkie.reset_index(drop=True)
df_wilkie.head()

['Table 1', 'Extracted']


Unnamed: 0,Gene_name
0,UNC84B
1,TOR1AIP1
2,LBR
3,UNC84A
4,NUP210


In [44]:
# Look up the UniProt entry and the UniProt-registered gene name for each gene.
# get_UniProtEntry is not defined in this notebook (presumably provided by my_utils).
gene_symbols = df_wilkie.iloc[:, 0].tolist()

for row_label, gene_symbol in enumerate(gene_symbols):
    uniprot_entry, uniprot_gene = get_UniProtEntry(gene_symbol)

    df_wilkie.loc[row_label, 'Entry_Wilkie'] = uniprot_entry
    df_wilkie.loc[row_label, 'Gene_name_obtained'] = uniprot_gene

    # wait one second before the next lookup
    sleep(1)

In [48]:
# Clean and export: remove the source gene-name column, then write the CSV.
# NOTE(review): this file goes to ./IntermediateProducts/ whereas the Korfali
# exports go to ./IntermediateProducts/ProteomePapers/ — confirm this is intended.
df_wilkie = df_wilkie.drop(columns=['Gene_name'])
df_wilkie.to_csv('./IntermediateProducts/Wilkie_2010.csv', index=False)

# Schirmer et al 2003: mouse liver

### See Schirmer2003.ipynb
### Output is Schirmer2003/Output.csv

# Korfali et al 2010: Mouse leukocytes

In [46]:
# Load the Korfali 2010 spreadsheet
korfali_2010_sheet = pd.read_excel('./SourceData/Korfali_2010/rk1-139_fromTableS5.xlsx')

# Keep the second column (skipping the first row) as a one-column frame of gene names
gene_column = korfali_2010_sheet.iloc[1:, 1]
df_Korfali_2010 = gene_column.to_frame(name='Gene_name').reset_index(drop=True)

In [47]:
# Manual inspection and correction of gene names
# (row position in df_Korfali_2010 -> corrected gene name)
korfali_2010_fixes = {
    15: 'DHRS7',
    39: 'NOC2L',
    58: 'METTL7A',
    104: 'TMEM189',
}
for row_pos, fixed_name in korfali_2010_fixes.items():
    df_Korfali_2010.iloc[row_pos, 0] = fixed_name

In [48]:
# Look up the UniProt entry and the UniProt-registered gene name for each gene.
# get_UniProtEntry is not defined in this notebook (presumably provided by my_utils).
gene_symbols = df_Korfali_2010.iloc[:, 0].tolist()

for row_label, gene_symbol in enumerate(gene_symbols):
    uniprot_entry, uniprot_gene = get_UniProtEntry(gene_symbol)

    df_Korfali_2010.loc[row_label, 'Entry_Korfali_2010'] = uniprot_entry
    df_Korfali_2010.loc[row_label, 'Gene_name_obtained'] = uniprot_gene

    # progress report on every 40th row
    if row_label % 40 == 0:
        print(row_label, uniprot_entry, uniprot_gene)

    # wait one second before the next lookup
    sleep(1)

0 Q9UH99 SUN2
40 Q3LXA3 TKFC
80 Q6NXT6 TAPT1
120 Q9Y2W6 TDRKH


In [16]:
# Clean and export: remove the source gene-name column, then write the CSV
df_Korfali_2010 = df_Korfali_2010.drop(columns=['Gene_name'])
df_Korfali_2010.to_csv('./IntermediateProducts/ProteomePapers/Korfali_2010.csv', index=False)

# Korfali et al 2012

In [57]:
# Table S4 was extracted beforehand into its own file because the full xlsx is extremely heavy
korfali_2012_s4_path = './SourceData/Korfali_2012/2012NUCLEUS0047R-SupTables_S4.xlsx'
df_Korfali_2012 = pd.read_excel(korfali_2012_s4_path)

In [58]:
# Prepare column names: row 1 of the sheet holds the header labels;
# take them for the six column positions used in the extraction below
header_positions = [0, 1, 2, 3, 17, 18]
columns = df_Korfali_2012.iloc[1, header_positions]
columns

Table S4. Summary of NETs directly analyzed                  tissue
Unnamed: 1                                                gene name
Unnamed: 2                                          alternate names
Unnamed: 3                                        accession numbers
Unnamed: 17                                    NE:MM ratio by dNSAF
Unnamed: 18                                               reference
Name: 1, dtype: object

In [53]:
# Extract necessary cells: row positions 2-135 and the six columns labelled by `columns`
df_Korfali_2012 = df_Korfali_2012.iloc[2:136, [0, 1, 2, 3, 17, 18]]

# rename the columns
df_Korfali_2012.columns = columns

# Fill in tissue data: carry the last non-missing tissue label down to the rows below it.
# `.ffill()` replaces `fillna(method='ffill')`, which is deprecated in recent pandas.
df_Korfali_2012['tissue'] = df_Korfali_2012['tissue'].ffill()
# Drop every row that still has a missing value in any of the kept columns
df_Korfali_2012 = df_Korfali_2012.dropna(axis=0)

# Reindex
df_Korfali_2012 = df_Korfali_2012.reset_index(drop=True)

In [55]:
# Count the distinct values in the 'gene name' column
distinct_gene_names = df_Korfali_2012['gene name'].unique()
print("The number of genes: ", len(distinct_gene_names))

The number of genes:  119


In [27]:
# Manual inspection and correction of gene names
# (row position in df_Korfali_2012 -> corrected name, written to column position 1)
# NOTE(review): the original notes refer to entries "#15", "#18" and "#47", yet
# position 14 is used for the first and 18 / 47 for the others — confirm that
# none of these positions is off by one.
korfali_2012_fixes = {
    14: 'SLC22A24',  # "#15 should be SLC22A24"
    18: 'VMA21',     # cell contains two names and only the latter, VMA21, is needed
    47: 'MARCH5',    # "#47 should be MARCH5"
    97: 'ATL3',
}
for row_pos, fixed_name in korfali_2012_fixes.items():
    df_Korfali_2012.iloc[row_pos, 1] = fixed_name

In [28]:
# Keep only the columns needed downstream
kept_columns = ['tissue', 'gene name', 'NE:MM ratio by dNSAF']
df_Korfali_2012 = df_Korfali_2012[kept_columns]

In [29]:
# Look up the UniProt entry and the UniProt-registered gene name for each gene.
# The gene name is read from column position 1.
# get_UniProtEntry is not defined in this notebook (presumably provided by my_utils).
gene_symbols = df_Korfali_2012.iloc[:, 1].tolist()

for row_label, gene_symbol in enumerate(gene_symbols):
    uniprot_entry, uniprot_gene = get_UniProtEntry(gene_symbol)

    df_Korfali_2012.loc[row_label, 'Entry_Korfali_2012'] = uniprot_entry
    df_Korfali_2012.loc[row_label, 'Gene_name_obtained'] = uniprot_gene

    # wait one second before the next lookup
    sleep(1)

In [32]:
# Clean and export
# The original cell operated on a bare `df`, which is never defined in this
# notebook and therefore fails on a fresh kernel; the frame built in the
# preceding cells is `df_Korfali_2012`.
# NOTE(review): the other sections drop the source gene-name column and keep
# 'Gene_name_obtained', whereas this one drops 'Gene_name_obtained' — confirm
# which column is meant to be exported here.
df_Korfali_2012 = df_Korfali_2012.drop(columns=['Gene_name_obtained'])
df_Korfali_2012.to_csv('./IntermediateProducts/ProteomePapers/Korfali_2012.csv', index=False)