In [3]:
import os, re, requests, json, openpyxl
import numpy as np
import pandas as pd
from urllib.request import urlopen
from time import sleep

In [2]:
def get_url(url, **kwargs):
    '''
    Obatin a response from a given url
    '''
    response = requests.get(url, **kwargs);

    if not response.ok:
        print(response.text)
        response.raise_for_status()
        sys.exit()

    return response

# Wilkie et al 2010: Rat skeletal muscle

In [20]:
# Excel import
wb = openpyxl.load_workbook('./SourceData/Wilkie_2010/mcp.M110.003129-6.xlsx')
print(wb.get_sheet_names())

# sheet selected
sheet = wb.get_sheet_by_name('Extracted')
_df_wilkie = pd.DataFrame(sheet.values)

# convert to df
df_wilkie = pd.DataFrame(_df_wilkie.iloc[3:, 1])
df_wilkie.columns = ['Gene_name']
df_wilkie = df_wilkie.reset_index(drop=True)
df_wilkie

['Table 1', 'Extracted']


  print(wb.get_sheet_names())
  sheet = wb.get_sheet_by_name('Extracted')


In [42]:
# uniprot API URL
WEBSITE_API = "https://rest.uniprot.org/uniprotkb/"

# organism id
organism_id_list = {'Homo sapiens': '9606', 'Mus musculus': '10090'}
organism_id = organism_id_list['Homo sapiens']

# find Uniprot Entry and gene names for each gene
for i in range(len(df_wilkie)):
    
    # gene name
    gene = df_wilkie.iloc[i, 0]

    try:
        # get response with gene name the query
        r = get_url(f'{WEBSITE_API}/search?query=gene:{gene}+AND+organism_id:{organism_id}&fields=accession,gene_names')
        result = r.json()['results'][0]
        
        ## get human or mouse entry
        entry_converted = result['primaryAccession']
        ## get gene name and check the names match between human's or mouse's and the given organism's
        gene_obtained = result['genes'][0]['geneName']['value']
    
    except:
        entry_converted = 'Not_found'
    
    # Put the obtained Entry and gene name
    df_wilkie.loc[i, 'Entry_Wilkie'] = entry_converted
    df_wilkie.loc[i, 'Gene_name_obtained'] = gene_obtained
    
    # take a break and go next
    sleep(1)

In [46]:
# Manually check the result
# df_wilkie.head(n=50)
# df_wilkie.tail(n=50)

In [47]:
df_wilkie = df_wilkie.drop(['Gene_name'], axis=1)

In [48]:
df_wilkie.to_csv('./IntermediateProducts/Wilkie_2010.csv', index=False)

# Schirmer et al 2003: mouse liver

### See Schirmer2003.ipynb
### Output is Schirmer2003/Output.csv

# Korfali et al 2012: 

In [56]:
# S4 was extracted beforehand to a single file because the whole xlsx is extremely heavy
df = pd.read_excel('./SourceData/Korfali_2012/2012NUCLEUS0047R-SupTables_S4.xlsx')

In [57]:
# prepare column names
columns = df.iloc[1,[0,1,2,3,17,18]]
columns

Table S4. Summary of NETs directly analyzed                  tissue
Unnamed: 1                                                gene name
Unnamed: 2                                          alternate names
Unnamed: 3                                        accession numbers
Unnamed: 17                                    NE:MM ratio by dNSAF
Unnamed: 18                                               reference
Name: 1, dtype: object

In [58]:
# Extract necessary cells
df = df.iloc[2:136,[0,1,2,3,17,18]]

In [59]:
# rename the columns
df.columns = columns
# df.head()

In [60]:
# Fill in tissue data
df['tissue'] = df['tissue'].fillna(method='ffill')
df = df.dropna(axis=0)

#### How many genes?

In [61]:
len(list(df['gene name'].unique()))

119

#### Realized that the gene #18 contains two names and only the latter VMA21 is needed
#### thus manually replace the name

In [62]:
df.iloc[18, 1] = 'VMA21'

In [64]:
df.head(n=50)

1,tissue,gene name,alternate names,accession numbers,NE:MM ratio by dNSAF,reference
3,liver enriched,TMEM53,"NET4, transmembrane protein 53",ref|NP_081113.1|,2.57,"This study and Schirmer, E.C., et al. (2003). ..."
4,liver enriched,TMEM120A,"NET29, transmembrane protein induced by tumor ...",ref|NP_766129.1|,inf,"This study and Malik, P., et al. (2010) Cell M..."
5,liver enriched,SCARA5,"NET33, PREDICTED: similar to protease, serine,...",gi|109502608|ref|XP_001066668.1|,0.1,"This study and Malik, P., et al. (2010) Cell M..."
6,liver enriched,TMEM74,"NET36, PREDICTED: hypothetical protein [Rattus...",ref|XP_001063530.1|,3.36,"This study and Malik, P., et al. (2010) Cell M..."
7,liver enriched,PPAPDC3,"NET39, phosphatidic acid phosphatase type 2 do...",gi|59891419|ref|NP_001012349.1|;gi|34147436|re...,4.42,"This study and Schirmer, E.C., et al. (2003). ..."
8,liver enriched,EGFR,epidermal growth factor receptor,ref|NP_113695.1|,0.14,"Klein, C., et al., (2004) Biochemistry 43, 158..."
9,liver enriched,SLC39A14,"NET34, solute carrier family 39 (zinc transpor...",ref|XP_001070144.1,0.15,"Malik, P., et al. (2010) Cell Mol Life Sci 67,..."
10,liver enriched,SCCPDH,"NET11, saccharopine dehydrogenase (putative)",gi|55770836|ref|NP_057086.2|;gi|62078699|ref|N...,0.16,"This study and Malik, P., et al. (2010) Cell M..."
11,liver enriched,WDR33,"NET14, WD repeat domain 33",gi|109507061|ref|XP_001058355.1|,inf,"This study and Malik, P., et al. (2010) Cell M..."
12,liver enriched,TMEM209,"NET31, hypothetical protein LOC84928",gi|66348165|ref|NP_116231.2|;gi|40254356|ref|N...,inf,"This study and Schirmer, E.C., et al. (2003). ..."


In [None]:
# MarchV and ‡SLC22A24 should be corrected