## Korfali et al. 2012
### Rat liver and leukocytes and human muscle cells

In [1]:
import re
import pandas as pd
from time import sleep
from modules.my_utils import get_url, find_duplicate
import modules.my_config as my_config

### 1. Import the data, extract the necessary cells, and rename the columns

In [7]:
# Table S4 is read from its own workbook: it was exported beforehand because
# the complete supplementary xlsx is extremely heavy.
df = pd.read_excel(
    io='./SourceData/Papers/Korfali2012/2012NUCLEUS0047R-SupTables_S4.xlsx',
)

In [8]:
# Build the column names from the sheet's header row (positional row 1),
# keeping only the six columns that are used later on.
columns = df.iloc[1].iloc[[0, 1, 2, 3, 17, 18]]
print(columns)

Table S4. Summary of NETs directly analyzed                  tissue
Unnamed: 1                                                gene name
Unnamed: 2                                          alternate names
Unnamed: 3                                        accession numbers
Unnamed: 17                                    NE:MM ratio by dNSAF
Unnamed: 18                                               reference
Name: 1, dtype: object


In [9]:
# Extract the data rows (positional rows 2-135) and the six columns of interest.
# An explicit copy makes sure the assignments below never target a view of the
# raw sheet.
df = df.iloc[2:136, [0, 1, 2, 3, 17, 18]].copy()
# Rename the columns with the labels taken from the header row
df.columns = columns
# Fill in tissue data: propagate the last non-missing tissue value downwards.
# `Series.ffill()` replaces `fillna(method='ffill')`, which is deprecated in
# recent pandas releases.
df['tissue'] = df['tissue'].ffill()
# Drop every row that still has a missing value in any of the kept columns
df = df.dropna(axis=0)

#### How many genes?

In [10]:
# Report how many distinct gene names the cleaned table contains
print(df['gene name'].unique().size, 'genes in the dataset')

119 genes in the dataset


### 2. Link the genes to UniProt IDs (queried by gene name)

#### First, let's get the IDs in human
#### The gene at position 18 contains two names, and only the latter (VMA21) is needed
#### The name was therefore replaced manually

In [11]:
# Overwrite the gene name at positional row 18, column 1 with 'VMA21'
# (scalar positional setter; relies on the row order produced by the cells above)
df.iat[18, 1] = 'VMA21'

#### Retrieve Uniprot ID of human genes by using uniprot module

In [12]:
# Look up every gene name of `df` (second column) through the configured web
# API, restricted to Homo sapiens, and collect one record per row of `df`.
# The records are turned into `df_retrieved` once, after the loop, instead of
# growing a DataFrame row by row. Record i corresponds to positional row i of
# `df`, so the resulting index is 0..len(df)-1.
records = []

for i, gene in enumerate(df.iloc[:, 1]):
    # default values, kept whenever the lookup fails or returns nothing
    entry_converted = 'Not_found'
    gene_obtained = 'Not_found'

    try:
        # Query the gene name to get the Uniprot ID and the Uniprot-registered gene name
        params = {
            "query": f'gene:{gene} AND organism_id:{my_config.organism_id_list["Homo sapiens"]}',
            "fields": "accession, gene_names",
            "format": "json",
        }

        r = get_url(my_config.WEBSITE_API, params=params)
        results = r.json()['results']

        if results:
            # Only the first hit of the search is used
            result = results[0]
            entry_converted = result['primaryAccession']
            # An entry without gene information keeps the 'Not_found' default
            genes = result.get('genes') or [{}]
            gene_obtained = genes[0].get("geneName", {}).get('value', 'Not_found')
        else:
            # Report an empty result set explicitly rather than through an
            # opaque "list index out of range" error
            print(f'No entry found for {gene} in index {i}')

    except Exception as e:
        # Best effort: report the problem and keep the defaults so that one
        # failing lookup does not abort the whole run
        print(f'Error in fetching {gene} in index {i}: {e}')

    # Store the obtained Entry and gene name
    records.append({
        'Original_Gene_Name': gene,
        'Gene_Name': gene_obtained,
        'Entry_Korfali2012': entry_converted,
    })

    # Log every tenth gene
    if i % 10 == 0:
        print(i, gene, entry_converted, gene_obtained)

    # take a break between requests and go next
    sleep(1)

df_retrieved = pd.DataFrame(
    records,
    columns=['Original_Gene_Name', 'Gene_Name', 'Entry_Korfali2012'],
)

0 TMEM53 Q6P2H8 TMEM53
10 KIAA1967 Q8N163 CCAR2
20 CGRRF1 Q99675 CGRRF1
30 C9orf46 Q9HBL7 PLGRKT
40 ITPR2 Q14571 ITPR2
Error in fetching MARCHV in index 47: list index out of range
50 SLC25A22 Q9H936 SLC25A22
60 C14orf1 Q9UKR5 ERG28
70 TMEM70 Q9BUB7 TMEM70
80 CISD2 Q8N5K1 CISD2
90 METTL7A Q9H8H3 TMT1A
Error in fetching ATLA3 in index 97: list index out of range
100 TMEM214 Q6NUQ4 TMEM214
110 EMD P50402 EMD


#### Let's find out why two genes were not found by checking UniProt manually
##### MARCHV has to be MARCH5
##### ATLA3 has to be ATL3
#### UniProt IDs were found for both genes
#### It is impressive that all 119 genes are still valid, isn't it?

In [16]:
# Manual corrections for the two genes the automatic lookup could not resolve
# (rows 47 and 97 of df_retrieved). Gene_Name holds the corrected symbol, not
# the unresolved alias from the paper: MARCHV -> MARCH5, ATLA3 -> ATL3.
# NOTE(review): UniProt may list a newer primary symbol for Q9NX47 — confirm.
df_retrieved.loc[47, 'Entry_Korfali2012'] = 'Q9NX47'
df_retrieved.loc[47, 'Gene_Name'] = 'MARCH5'
df_retrieved.loc[97, 'Entry_Korfali2012'] = 'Q6DD88'
df_retrieved.loc[97, 'Gene_Name'] = 'ATL3'

#### Merge with main df to get NE:MM ratio

In [17]:
# Attach the retrieved IDs to the source rows.
# df_retrieved was built row by row from df, so the two frames are joined by
# position. A merge on the gene name would be many-to-many and would multiply
# the rows of any gene that is listed more than once (e.g. under several tissues).
df_output = pd.concat(
    [df_retrieved.reset_index(drop=True), df.reset_index(drop=True)],
    axis=1,
)
# Guard against the two frames having drifted out of alignment
assert (df_output['Original_Gene_Name'] == df_output['gene name']).all(), \
    'df_retrieved and df are not row-aligned'

In [18]:
# Display the column labels of the merged frame before choosing which to keep.
# NOTE(review): the saved output lists a 'UniprotID' column that no visible
# cell creates — possibly stale output; confirm by re-running from a fresh kernel.
df_output.columns

Index(['Original_Gene_Name', 'Gene_Name', 'Entry_Korfali2012', 'UniprotID',
       'tissue', 'gene name', 'alternate names', 'accession numbers',
       'NE:MM ratio by dNSAF', 'reference'],
      dtype='object')

In [19]:
# Keep the four columns needed for the export and give the two source-table
# columns their output names in the same step.
df_output = (
    df_output[['Gene_Name', 'Entry_Korfali2012', 'tissue', 'NE:MM ratio by dNSAF']]
    .rename(columns={'tissue': 'Tissue', 'NE:MM ratio by dNSAF': 'NE_MM_ratio'})
)

In [20]:
# Report the number of distinct values in the Entry_Korfali2012 column
print("Number of unique genes in the dataset:", df_output['Entry_Korfali2012'].unique().size)

Number of unique genes in the dataset: 119


In [21]:
# Write the final table to disk without the row index
df_output.to_csv(
    path_or_buf='./Output/Korfali2012_Hs.csv',
    index=False,
)