## Korfali et al. 2010
### Human leukocytes

In [3]:
import pandas as pd
from time import sleep
from my_utils import get_url, find_duplicate
import my_config

In [9]:
# Table S5 of Korfali et al. 2010 is only available as a PDF, so its gene names were
# copied out by hand into the Excel workbook that is loaded here.
df_Korfali_2010_raw = pd.read_excel('./SourceData/Papers/Korfali_2010/rk1-139_fromTableS5.xlsx')

# Keep only the second column, skip its first row, and expose the result as a
# single-column frame with a fresh 0-based index.
df_Korfali_2010 = (
    df_Korfali_2010_raw
    .iloc[1:, 1]
    .rename('Original_Gene_Name')
    .reset_index(drop=True)
    .to_frame()
)

In [17]:
# Number of gene names read from the table (one row per gene)
print('How many genes?', len(df_Korfali_2010))

How many genes? 139


In [11]:
# find Uniprot Entry and gene names for each gene
#
# For every extracted gene symbol, query the UniProt endpoint configured in my_config
# (restricted to Homo sapiens) and store the first hit's accession and primary gene name in
# 'Entry_Korfali_2010' / 'Gene_Name'. Rows without a hit keep the 'Not_Found' sentinel.

# The organism id is the same for every query, so look it up once.
organism_id = my_config.organism_id_list["Homo sapiens"]

# Row labels whose lookup failed for a reason OTHER than an empty result list
# (e.g. a request or response-parsing error). These rows also end up as 'Not_Found',
# so they are collected here to keep them from being mistaken for genuine misses.
lookup_failures = []

# Iterate by row label so that reading the gene and writing the results address the same row.
for i, gene in df_Korfali_2010['Original_Gene_Name'].items():
    # default values
    entry_converted = 'Not_Found'
    gene_obtained = 'Not_Found'

    try:
        # Query the gene name to get the Uniprot ID and the Uniprot-registered gene name
        params = {
            "query": f'gene:{gene} AND organism_id:{organism_id}',
            "fields": "accession, gene_names",
            "format": "json"
        }

        r = get_url(my_config.WEBSITE_API, params=params)
        # An empty result list raises IndexError here -> handled below as "no hit"
        result = r.json()['results'][0]
        entry_converted = result['primaryAccession']
        # A hit without gene information still keeps its accession; only the name stays 'Not_Found'
        genes = result.get('genes') or [{}]
        gene_obtained = genes[0].get('geneName', {}).get('value', 'Not_Found')

    except IndexError as e:
        # No hit for this gene symbol
        print(f'Error in fetching {gene} in index {i}: {e}')

    except Exception as e:
        # Anything else is not a "no hit": remember the row so it can be retried or inspected
        print(f'Lookup failed for {gene} in index {i}: {e!r}')
        lookup_failures.append(i)

    # Put the obtained Entry and gene name
    df_Korfali_2010.loc[i, 'Entry_Korfali_2010'] = entry_converted
    df_Korfali_2010.loc[i, 'Gene_Name'] = gene_obtained

    # Log
    if i % 10 == 0: print(i, gene, entry_converted, gene_obtained)

    # take a break and go next
    sleep(1)

# Make failed lookups visible before any 'Not_Found' rows are discarded downstream
if lookup_failures:
    print(f'{len(lookup_failures)} lookup(s) failed for reasons other than an empty result: {lookup_failures}')

0 UNC84B Q9UH99 SUN2
10 LEMD3 Q9Y2U8 LEMD3
Error in fetching NDHRS7 in index 15: list index out of range
20 NCLN Q969V3 NCLN
30 SCCPDH Q8NBX0 SCCPDH
Error in fetching NNOC2L in index 39: list index out of range
40 DAK Q3LXA3 TKFC
50 ZMIZ2 Q8NF64 ZMIZ2
Error in fetching NMTTL7A in index 58: list index out of range
60 CKAP4 Q07065 CKAP4
70 C9orf46 Q9HBL7 PLGRKT
80 TAPT1 Q6NXT6 TAPT1
Error in fetching LOC100130633 in index 88: list index out of range
90 C15orf48 Q9C002 NMES1
100 C16orf54 Q6UWD8 C16orf54
Error in fetching TMPP in index 102: list index out of range
Error in fetching NTMM189 in index 104: list index out of range
110 TMEM179B Q7Z7N9 TMEM179B
120 TDRKH Q9Y2W6 TDRKH
130 TMEM93 Q9BV81 EMC6
Error in fetching MGC3196 in index 132: list index out of range


#### Which genes are missing?
#### Manual checking revealed that the genes below were misspelled during the PDF extraction process. They were looked up manually in UniProt and the table was corrected accordingly.

In [12]:
# Hand-curated corrections for symbols that were garbled during the PDF extraction:
# row label -> (corrected gene name, UniProt entry found by manual search)
manual_fixes = {
    15: ('DHRS7', 'Q9Y394'),
    39: ('NOC2L', 'Q9Y3T9'),
    58: ('METTL7A', 'Q9H8H3'),
    104: ('TMEM189', 'A5PLL7'),
}

for row_label, (symbol, accession) in manual_fixes.items():
    df_Korfali_2010.loc[row_label, 'Gene_Name'] = symbol
    df_Korfali_2010.loc[row_label, 'Entry_Korfali_2010'] = accession

#### The other three genes that were not found (LOC100130633, TMPP, MGC3196) have likely been removed from the UniProt records. Even in the original paper they were listed as hypothetical proteins.
#### Thus it is OK to remove those three proteins.

In [13]:
# Keep only the rows that ended up with a UniProt entry
has_entry = df_Korfali_2010['Entry_Korfali_2010'].ne('Not_Found')
df_Korfali_2010_cleaned = df_Korfali_2010.loc[has_entry]

In [14]:
# Number of rows that survived the 'Not_Found' filter
print('How many genes found?', len(df_Korfali_2010_cleaned))

How many genes found? 136


In [15]:
# The PDF-extracted spelling is not part of the exported table.
# errors='ignore' keeps this cell re-runnable: the frame is reassigned from itself, so on a
# second run the column is already gone and a plain drop would raise KeyError.
df_Korfali_2010_cleaned = df_Korfali_2010_cleaned.drop(columns=['Original_Gene_Name'], errors='ignore')

# Duplicate check
print('Duplicate: ', find_duplicate(df_Korfali_2010_cleaned.Entry_Korfali_2010.to_list()))

Duplicate:  []


In [16]:
# Write the cleaned table to disk without the row index
output_csv = './Output/Korfali_2010.csv'
df_Korfali_2010_cleaned.to_csv(output_csv, index=False)