In [1]:
import os, re, requests, json, openpyxl, warnings
import numpy as np
import pandas as pd
from urllib.request import urlopen
from time import sleep

warnings.filterwarnings('ignore')

In [2]:
def get_url(url, **kwargs):
    '''
    Obatin a response from a given url
    '''
    response = requests.get(url, **kwargs);

    if not response.ok:
        print(response.text)
        response.raise_for_status()
        sys.exit()

    return response

# Wilkie et al 2010: Rat skeletal muscle

In [20]:
# Excel import
wb = openpyxl.load_workbook('./SourceData/Wilkie_2010/mcp.M110.003129-6.xlsx')
print(wb.get_sheet_names())

# sheet selected
sheet = wb.get_sheet_by_name('Extracted')
_df_wilkie = pd.DataFrame(sheet.values)

# convert to df
df_wilkie = pd.DataFrame(_df_wilkie.iloc[3:, 1])
df_wilkie.columns = ['Gene_name']
df_wilkie = df_wilkie.reset_index(drop=True)
df_wilkie

['Table 1', 'Extracted']


  print(wb.get_sheet_names())
  sheet = wb.get_sheet_by_name('Extracted')


In [42]:
# uniprot API URL
WEBSITE_API = "https://rest.uniprot.org/uniprotkb/"

# organism id
organism_id_list = {'Homo sapiens': '9606', 'Mus musculus': '10090'}
organism_id = organism_id_list['Homo sapiens']

# find Uniprot Entry and gene names for each gene
for i in range(len(df_wilkie)):
    
    # gene name
    gene = df_wilkie.iloc[i, 0]

    try:
        # get response with gene name the query
        r = get_url(f'{WEBSITE_API}/search?query=gene:{gene}+AND+organism_id:{organism_id}&fields=accession,gene_names')
        result = r.json()['results'][0]
        
        ## get human or mouse entry
        entry_converted = result['primaryAccession']
        ## get gene name and check the names match between human's or mouse's and the given organism's
        gene_obtained = result['genes'][0]['geneName']['value']
    
    except:
        entry_converted = 'Not_found'
    
    # Put the obtained Entry and gene name
    df_wilkie.loc[i, 'Entry_Wilkie'] = entry_converted
    df_wilkie.loc[i, 'Gene_name_obtained'] = gene_obtained
    
    # take a break and go next
    sleep(1)

In [46]:
# Manually check the result
# df_wilkie.head(n=50)
# df_wilkie.tail(n=50)

In [47]:
df_wilkie = df_wilkie.drop(['Gene_name'], axis=1)

In [48]:
df_wilkie.to_csv('./IntermediateProducts/Wilkie_2010.csv', index=False)

# Schirmer et al 2003: mouse liver

### See Schirmer2003.ipynb
### Output is Schirmer2003/Output.csv

# Korfali et al 2010: Mouse leukocytes

In [21]:
# Excel import
df_Korfali_2010 = pd.read_excel('./SourceData/Korfali_2010/rk1-139_fromTableS5.xlsx')

# convert to df
df_Korfali_2010= pd.DataFrame(df_Korfali_2010.iloc[1:, 1])
df_Korfali_2010.columns = ['Gene_name']
df_Korfali_2010 = df_Korfali_2010.reset_index(drop=True)

In [22]:
# Manual inspection and correction
df_Korfali_2010.iloc[15, 0] = 'DHRS7'
df_Korfali_2010.iloc[39, 0] = 'NOC2L'

In [23]:
# uniprot API URL
WEBSITE_API = "https://rest.uniprot.org/uniprotkb/"

# organism id
organism_id_list = {'Homo sapiens': '9606', 'Mus musculus': '10090'}
organism_id = organism_id_list['Homo sapiens']

# find Uniprot Entry and gene names for each gene
for i in range(len(df_Korfali_2010)):
    
    # gene name
    gene = df_Korfali_2010.iloc[i, 0]

    try:
        # get response with gene name the query
        r = get_url(f'{WEBSITE_API}/search?query=gene:{gene}+AND+organism_id:{organism_id}&fields=accession,gene_names')
        result = r.json()['results'][0]
        
        ## get human or mouse entry
        entry_converted = result['primaryAccession']
        ## get gene name and check the names match between human's or mouse's and the given organism's
        gene_obtained = result['genes'][0]['geneName']['value']
    
    except:
        entry_converted = 'Not_found'
    
    # Put the obtained Entry and gene name
    df_Korfali_2010.loc[i, 'Entry_Korfali_2010'] = entry_converted
    df_Korfali_2010.loc[i, 'Gene_name_obtained'] = gene_obtained
    
    if i % 40 == 0: print(i, entry_converted, gene_obtained)
    
    # take a break and go next
    sleep(1)

0 Q9UH99 SUN2
40 Q3LXA3 TKFC
80 Q6NXT6 TAPT1
120 Q9Y2W6 TDRKH


In [28]:
# # Manually check the result
# df_Korfali_2010.head(n=50)
# # df_Korfali_2010.tail(n=50)

In [26]:
df_Korfali_2010 = df_Korfali_2010.drop(['Gene_name'], axis=1)

In [27]:
df_Korfali_2010.to_csv('./IntermediateProducts/Korfali_2010.csv', index=False)

# Korfali et al 2012: 

In [22]:
# S4 was extracted beforehand to a single file because the whole xlsx is extremely heavy
df = pd.read_excel('./SourceData/Korfali_2012/2012NUCLEUS0047R-SupTables_S4.xlsx')

In [23]:
# prepare column names
columns = df.iloc[1,[0,1,2,3,17,18]]
columns

Table S4. Summary of NETs directly analyzed                  tissue
Unnamed: 1                                                gene name
Unnamed: 2                                          alternate names
Unnamed: 3                                        accession numbers
Unnamed: 17                                    NE:MM ratio by dNSAF
Unnamed: 18                                               reference
Name: 1, dtype: object

In [24]:
# Extract necessary cells
df = df.iloc[2:136,[0,1,2,3,17,18]]

In [25]:
# rename the columns
df.columns = columns

# Fill in tissue data
df['tissue'] = df['tissue'].fillna(method='ffill')
df = df.dropna(axis=0)

# Reindex
df = df.reset_index(drop=True)

#### How many genes?

In [26]:
len(list(df['gene name'].unique()))

119

In [27]:
# #15 should be SLC22A24
# #18 contains two names and only the latter VMA21 is needed
# #47 should be MARCH5
df.iloc[14, 1] = 'SLC22A24'
df.iloc[18, 1] = 'VMA21'
df.iloc[47, 1] = 'MARCH5'
df.iloc[97, 1] = 'ATL3'

In [28]:
# print(df.columns)

# select columns
df = df[['tissue', 'gene name', 'NE:MM ratio by dNSAF']]

In [29]:
# uniprot API URL
WEBSITE_API = "https://rest.uniprot.org/uniprotkb/"

# organism id
organism_id_list = {'Homo sapiens': '9606', 'Mus musculus': '10090'}
organism_id = organism_id_list['Homo sapiens']

# find Uniprot Entry and gene names for each gene
for i in range(len(df)):
    
    # gene name
    gene = df.iloc[i, 1]

    try:
        # get response with gene name the query
        r = get_url(f'{WEBSITE_API}/search?query=gene:{gene}+AND+organism_id:{organism_id}&fields=accession,gene_names')
        result = r.json()['results'][0]
        
        ## get human or mouse entry
        entry_converted = result['primaryAccession']
        ## get gene name and check the names match between human's or mouse's and the given organism's
        gene_obtained = result['genes'][0]['geneName']['value']
    
    except:
        entry_converted = 'Not_found'
    
    # Put the obtained Entry and gene name
    df.loc[i, 'Entry_Korfali_2012'] = entry_converted
    df.loc[i, 'Gene_name_obtained'] = gene_obtained
    
    # take a break and go next
    sleep(1)

In [31]:
# drop gene name obtained
df = df.drop(columns=['Gene_name_obtained'], axis=1)

In [32]:
# Export
df.to_csv('./IntermediateProducts/ProteomePapers/Korfali_2012.csv', index=False)