## Cheng et al. 2019
### 243 proteins from mesenchymal cell line

In [1]:
import numpy as np
import pandas as pd
from time import sleep
from modules.my_utils import get_url, find_duplicate
import modules.my_config as my_config

In [2]:
# Load Supplementary Table S3 of Cheng et al. 2019 with pandas' default
# header handling; the real column labels are extracted in a later cell.
cheng2019_xlsx = './SourceData/Papers/Cheng2019/Supplementary Table S3_clustering & peptide seq.xlsx'
df_Cheng2019 = pd.read_excel(cheng2019_xlsx)

  for idx, row in parser.parse():
  for idx, row in parser.parse():


In [3]:
# column names
# The first data row holds the real column labels: lift it out as the header
# and keep only the rows beneath it.
header_labels = df_Cheng2019.iloc[0].tolist()
df_Cheng2019 = df_Cheng2019.iloc[1:].set_axis(header_labels, axis=1)

In [4]:
# Keep the accession, the protein label and the three NE enrichment scores,
# and give the accession column a dataset-specific name.
score_columns = ['NE Enrich Score in U', 'NE Enrich Score in A', 'NE Enrich Score in M']
df_Cheng2019 = (
    df_Cheng2019
    .loc[:, ['Accession No.', 'Protein', *score_columns]]
    .rename(columns={'Accession No.': 'Entry_Cheng2019'})
)

## Convert the mouse entries to humans

In [11]:
# Seconds to pause after every API request.
REQUEST_PAUSE_S = 1


def search_entries(query, fields):
    """Send one search request to ``my_config.WEBSITE_API`` and return its hits.

    Parameters
    ----------
    query : str
        Query string placed in the ``query`` request parameter.
    fields : str
        Comma-separated field list placed in the ``fields`` request parameter.

    Returns
    -------
    list
        The ``'results'`` list of the JSON response.

    Notes
    -----
    The pause sits in a ``finally`` clause so it also runs when the request
    or the JSON access raises. Previously the pause was skipped on any
    failure, so runs of failing entries were sent without throttling.
    """
    params = {
        "query": query,
        "fields": fields,
        "format": "json",
    }
    try:
        response = get_url(my_config.WEBSITE_API, params=params)
        return response.json()['results']
    finally:
        sleep(REQUEST_PAUSE_S)


# One list per output column; consumed as pd.DataFrame(my_dict) downstream.
my_dict = {'entry_mr': [], 'gene_name': [], 'entry_h': []}

for entry in df_Cheng2019.Entry_Cheng2019.to_list():
    # Sentinels kept for any lookup step that fails.
    gene_name = entry_hs = 'Not_found'

    try:
        # Step 1: gene name recorded for the source accession.
        source_hits = search_entries(f'accession:{entry}', "gene_names")
        gene_name = source_hits[0]['genes'][0]['geneName']['value']

        # Step 2: first human hit for that gene name. If this step fails the
        # gene name from step 1 is kept and only entry_hs stays 'Not_found'.
        human_hits = search_entries(
            f'gene:{gene_name} AND organism_id:{my_config.organism_id_list["Homo sapiens"]}',
            "accession, gene_names",
        )
        entry_hs = human_hits[0]['primaryAccession']

    except Exception as e:
        # Deliberately best-effort: report the entry and carry on, so a single
        # missing gene name or empty result does not abort the whole run.
        print(f'Error in fetching {entry}: {e}')

    my_dict['entry_mr'].append(entry)
    my_dict['gene_name'].append(gene_name)
    my_dict['entry_h'].append(entry_hs)

    # Lightweight progress report every 40 entries.
    if len(my_dict['entry_mr']) % 40 == 0:
        print(entry, gene_name, entry_hs)

Q9CY27 Tecr Q9NZ01
Error in fetching Q8R3Y5: 'genes'
Error in fetching Q9CYI0: 'genes'
Error in fetching Q9CXL3: 'genes'
Error in fetching Q8C3W1: 'genes'
Q8R0Y8 Slc25a42 Q86VD7
Error in fetching P06339: list index out of range
Q8BX90 Fndc3a Q9Y2H6
Q8BM55 Tmem214 Q6NUQ4
Q9CXY9 Pigk Q92643
Q6ZPR5 Smpd4 Q9NXE4


In [12]:
# Lookup table built from the API results. An accession that occurs more than
# once in the source table was queried (and appended) once per occurrence, so
# collapse repeats first; otherwise the left merge would multiply those rows.
_df = pd.DataFrame(my_dict).drop_duplicates(subset='entry_mr')

# validate='many_to_one' makes pandas raise if the lookup side is not unique,
# which guarantees the merge keeps one output row per input row.
df = df_Cheng2019.merge(
    _df,
    left_on='Entry_Cheng2019',
    right_on='entry_mr',
    how='left',
    validate='many_to_one',
)

# 1 when the protein label in the table equals the fetched gene name, else 0
# (missing labels compare unequal and therefore give 0).
df['geneName_match'] = np.where((df.Protein == df.gene_name), 1, 0)

## Check the unmatched genes

In [13]:
# Rows whose table label differs from the fetched gene name.
df.loc[df['geneName_match'] == 0]

Unnamed: 0,Entry_Cheng2019,Protein,NE Enrich Score in U,NE Enrich Score in A,NE Enrich Score in M,entry_mr,gene_name,entry_h,geneName_match
49,Q6ZQM8,Ugt1a7c,0.547455,0.544253,0.29506,Q6ZQM8,Ugt1a7,Q9HAW7,0
72,Q3TYS2,,0.547859,0.720219,0.219828,Q3TYS2,Cybc1,Q9BQA9,0
73,Q8R3Y5,,0.187178,0.573158,0.600026,Q8R3Y5,Not_found,Not_found,0
74,Q9CYI0,,0.104929,0.615445,0.41298,Q9CYI0,Not_found,Not_found,0
75,Q9CXL3,,0.084613,0.0,0.671932,Q9CXL3,Not_found,Not_found,0
76,Q8C3W1,,0.059845,1.0,0.572675,Q8C3W1,Not_found,Not_found,0
77,Q9CQE8,,0.0,0.544111,0.60397,Q9CQE8,RTRAF,Q9Y224,0
174,Q3KNM2,42799,0.214956,0.275087,0.561878,Q3KNM2,Marchf5,Q9NX47,0


### Ugt1a7 is the correct name for the row 49 - so this is ok

### March5 was auto-converted to the date serial 42799 by MS Excel, so this mismatch is also fine

### For the "Not_found" entries, the gene name and human accession were filled in manually from a UniProt search

In [14]:
# Manually curated corrections for entries the automated lookup left as
# 'Not_found': accession in the table -> (gene name, human accession).
# Rows are selected by accession rather than by hard-coded row label, so the
# corrections still hit the right rows if the row order or count changes.
manual_human_orthologs = {
    'Q8R3Y5': ('C19orf47', 'Q8N9M1'),
    'Q9CYI0': ('C17orf75', 'Q9HAS0'),
    'Q9CXL3': ('C7orf50', 'Q9BRJ6'),
    'Q8C3W1': ('C1orf198', 'Q9H425'),
}

for source_acc, (human_gene, human_acc) in manual_human_orthologs.items():
    row_mask = df['Entry_Cheng2019'] == source_acc
    # Fail loudly instead of silently correcting nothing.
    assert row_mask.any(), f'{source_acc} not found in Entry_Cheng2019'
    df.loc[row_mask, 'gene_name'] = human_gene
    df.loc[row_mask, 'entry_h'] = human_acc

## Duplicate check

In [15]:
# Pass all human accessions to the project helper; presumably it returns the
# values that occur more than once (verify against modules.my_utils).
find_duplicate(df['entry_h'].tolist())

['P42166']

In [16]:
# Inspect every row that was mapped to the duplicated human accession.
df.loc[df['entry_h'] == 'P42166']

Unnamed: 0,Entry_Cheng2019,Protein,NE Enrich Score in U,NE Enrich Score in A,NE Enrich Score in M,entry_mr,gene_name,entry_h,geneName_match
187,Q61029,Tmpo,0.863934,0.595488,0.68432,Q61029,Tmpo,P42166,1
190,Q61033,Tmpo,0.785527,0.522138,0.525115,Q61033,Tmpo,P42166,1


#### Q61029 is Lap2b and Q61033 is Lap2a in mouse, but they share the gene name Tmpo because they are isoforms
#### In fact, human Lap2b is P42167 rather than P42166
#### So let's correct P42166 in row 187 to P42167

In [18]:
# Q61029 was mapped to P42166 by the gene-name lookup; point it at P42167
# instead. The row is selected by its accession rather than the hard-coded
# row label 187, so the fix does not depend on the current row order.
df.loc[df['Entry_Cheng2019'] == 'Q61029', 'entry_h'] = 'P42167'

## Final cleanup

In [19]:
# Keep only the gene name, human accession and enrichment scores, and drop
# the rows for which no human accession was found.
output_columns = ['gene_name', 'entry_h',
                  'NE Enrich Score in U', 'NE Enrich Score in A', 'NE Enrich Score in M']
has_human_entry = df['entry_h'] != 'Not_found'
df = df.loc[has_human_entry, output_columns]

## Export

In [20]:
# Write the cleaned table to CSV without the index column.
cheng2019_csv = './Output/Cheng2019.csv'
df.to_csv(cheng2019_csv, index=False)