## Cheng et al. 2019
### 243 proteins from mesenchymal cell line

In [1]:
import numpy as np
import pandas as pd
from time import sleep
from modules.my_utils import get_url, find_duplicate
import modules.my_config as my_config

In [13]:
# Read the supplementary table (S3) of Cheng et al. 2019 into a DataFrame.
# The sheet's real column labels sit in the first data row and are promoted
# to the header in a later step.
cheng2019_xlsx = './SourceData/Papers/Cheng2019/Supplementary Table S3_clustering & peptide seq.xlsx'
df_Cheng2019 = pd.read_excel(cheng2019_xlsx)

  for idx, row in parser.parse():
  for idx, row in parser.parse():


In [14]:
# The first row of the raw sheet holds the real column labels.
header_row = df_Cheng2019.iloc[0]
columns = header_row.tolist()
print(columns)

# Drop that label row from the data, then install the labels as the header.
# NOTE: this cell is not idempotent; re-running it consumes another row.
df_Cheng2019 = df_Cheng2019.iloc[1:]
df_Cheng2019.columns = columns

['Accession No.', 'Protein', 'Description', 'Known NE', 'TM region', 'Total SPC in U', 'Total SPC in A', 'Total SPC in M', 'NE Enrich Score in U', 'NE Enrich Score in A', 'NE Enrich Score in M', 'Cluster #']


In [15]:
# Keep the accession, the protein name and the three NE enrichment scores,
# then tag the accession column with the source study.
score_columns = ['NE Enrich Score in U', 'NE Enrich Score in A', 'NE Enrich Score in M']
df_Cheng2019 = df_Cheng2019[['Accession No.', 'Protein'] + score_columns]
df_Cheng2019 = df_Cheng2019.rename(columns={'Accession No.': 'Entry_Cheng2019'})

n_unique_entries = df_Cheng2019['Entry_Cheng2019'].nunique()
print("Number of proteins in Cheng2019: ", n_unique_entries)

Number of proteins in Cheng2019:  243


### 1. Convert the mouse entries to their human counterparts

In [20]:
# Map every mouse accession to its gene name, then to a human accession that
# carries the same gene name. Results are collected column-wise so they can be
# turned into a DataFrame afterwards. Any step that fails leaves 'Not_found'.
my_dict = {'entry_mr': [], 'gene_name': [], 'entry_h': []}

for i, entry in enumerate(df_Cheng2019.Entry_Cheng2019.to_list()):
    # default values, kept whenever a lookup step fails
    gene_name = 'Not_found'
    entry_hs = 'Not_found'

    try:
        # get the gene name for the entry
        params = {
            "query": f'accession:{entry}',
            "fields": "gene_names",
            "format": "json"
        }
        r = get_url(my_config.WEBSITE_API, params=params)
        sleep(1)  # pause after every request, including ones whose parsing fails

        results = r.json()['results']
        if not results:
            raise LookupError(f'no record returned for accession {entry}')
        # 'genes' may be absent or empty for entries without gene annotation
        genes = results[0].get('genes') or [{}]
        gene_name = genes[0].get('geneName', {}).get('value', 'Not_found')
        if gene_name == 'Not_found':
            # Never search human entries for the literal placeholder 'Not_found'
            raise LookupError(f'no gene name recorded for accession {entry}')

        # find human version of the entry and gene name
        params = {
            "query": f'gene:{gene_name} AND organism_id:{my_config.organism_id_list["Homo sapiens"]}',
            "fields": "accession, gene_names",
            "format": "json"
        }
        r2 = get_url(my_config.WEBSITE_API, params=params)
        sleep(1)

        human_hits = r2.json()['results']
        if not human_hits:
            raise LookupError(f'no human entry found for gene {gene_name}')
        # The first hit is taken as the human counterpart; isoform-level
        # ambiguities must be reviewed manually afterwards.
        entry_hs = human_hits[0]['primaryAccession']

    except Exception as e:
        # Best effort: report the failure and keep the 'Not_found' defaults so
        # the remaining entries are still processed.
        print(f'Error in fetching {entry} in index {i}: {e}')

    my_dict['entry_mr'].append(entry)
    my_dict['gene_name'].append(gene_name)
    my_dict['entry_h'].append(entry_hs)

    # Log
    if i % 30 == 0:
        print(i, entry, gene_name, entry_hs)

0 Q9CQU3 Rer1 O15258
30 O35083 Agpat1 Q99943
60 Q8R2Y3 Dolk Q9UPQ8
Error in fetching Q8R3Y5 in index 73: list index out of range
Error in fetching Q9CYI0 in index 74: list index out of range
Error in fetching Q8C3W1 in index 76: list index out of range
90 Q9JI99 Sgpp1 Q9BX95
Error in fetching P06339 in index 116: list index out of range
120 P70227 Itpr3 Q14573
150 Q64435 Ugt1a6 P19224
180 Q8K0C4 Cyp51a1 Q16850
210 Q6GQT9 Nomo1 Q15155
240 Q8C0L0 Tmx4 Q9H1E5


In [50]:
# Turn the column-wise lookup results (mouse accession, gene name, human
# accession) into a DataFrame; the default integer index matches loop order.
_df = pd.DataFrame(data=my_dict)

### 2. Error checking and correction
#### What proteins were not found?
#### Manual checking with UniProt found gene names and IDs for the three entries below
#### P06339 in index 116 seems to be specific to mouse. Keep it "Not_found"

In [51]:
# Manual corrections (from UniProt lookups) for mouse entries that the
# automated search could not resolve. They are keyed by mouse accession rather
# than by row position so they remain valid if the input order changes.
manual_human_matches = {
    'Q8R3Y5': ('C19orf47', 'Q8N9M1'),
    'Q9CYI0': ('C17orf75', 'Q9HAS0'),
    'Q8C3W1': ('C1orf198', 'Q9H425'),
}

for mouse_acc, (gene_name_fix, entry_h_fix) in manual_human_matches.items():
    is_target = _df['entry_mr'] == mouse_acc
    # Fail loudly instead of silently skipping a correction.
    assert is_target.any(), f'{mouse_acc} not found in _df.entry_mr'
    _df.loc[is_target, 'gene_name'] = gene_name_fix
    _df.loc[is_target, 'entry_h'] = entry_h_fix

In [52]:
# Attach the looked-up gene name and human accession to each Cheng2019 row.
# A left join keeps every original row, matched on the mouse accession.
df = df_Cheng2019.merge(
    _df,
    how='left',
    left_on='Entry_Cheng2019',
    right_on='entry_mr',
)

#### Check the entries where the mouse protein name does not match the obtained gene name

In [54]:
# Flag rows where the protein name from the paper equals the retrieved gene
# name (1 = match, 0 = mismatch), then show the mismatches for review.
names_agree = df['Protein'] == df['gene_name']
df['geneName_match'] = np.where(names_agree, 1, 0)

is_mismatch = df['geneName_match'] == 0
df.loc[is_mismatch, ['Protein', 'gene_name', 'entry_mr', 'entry_h']]

Unnamed: 0,Protein,gene_name,entry_mr,entry_h
49,Ugt1a7c,Ugt1a7,Q6ZQM8,Q9HAW7
72,,Cybc1,Q3TYS2,Q9BQA9
73,,C19orf47,Q8R3Y5,Q8N9M1
74,,C17orf75,Q9CYI0,Q9HAS0
75,,C7orf50,Q9CXL3,Q9BRJ6
76,,C1orf198,Q8C3W1,Q9H425
77,,RTRAF,Q9CQE8,Q9Y224
174,42799,Marchf5,Q3KNM2,Q9NX47


#### Ugt1a7 is the correct name for the protein in index 49 - so this is ok
#### March5 was auto-converted by MS Excel to the date serial number 42799 (5 March), so that mismatch is also fine

#### Finally, check duplicated entries

In [28]:
# List human accessions that were assigned to more than one row.
human_accessions = df['entry_h'].to_list()
find_duplicate(human_accessions)

['P42166']

In [55]:
# Show the rows that share the duplicated human accession P42166.
shares_p42166 = df['entry_h'] == 'P42166'
df.loc[shares_p42166, ['Entry_Cheng2019', 'Protein', 'gene_name', 'entry_h']]

Unnamed: 0,Entry_Cheng2019,Protein,gene_name,entry_h
187,Q61029,Tmpo,Tmpo,P42166
190,Q61033,Tmpo,Tmpo,P42166


#### Q61029 is Lap2b and Q61033 is Lap2a in mouse, but both have the same gene name Tmpo because they are isoforms
#### In fact, human Lap2b is P42167 rather than P42166
#### So let's correct P42166 in row 187 to P42167

In [56]:
# Mouse Q61029 (Lap2b) and Q61033 (Lap2a) share the gene name Tmpo, so both
# were mapped to human P42166. Human Lap2b is P42167, so the Q61029 row is
# corrected. The row is selected by accession instead of a hard-coded index
# so the fix stays valid if the row order changes.
is_mouse_lap2b = df['Entry_Cheng2019'] == 'Q61029'
assert is_mouse_lap2b.any(), 'Q61029 not found in df.Entry_Cheng2019'
df.loc[is_mouse_lap2b, 'entry_h'] = 'P42167'

#### Final cleanup

In [57]:
# Keep only the gene name, the human accession and the three enrichment
# scores, and drop entries for which no human accession was found.
output_columns = [
    'gene_name',
    'entry_h',
    'NE Enrich Score in U',
    'NE Enrich Score in A',
    'NE Enrich Score in M',
]
df = df[output_columns]
has_human_entry = df['entry_h'] != 'Not_found'
df = df[has_human_entry]

In [58]:
# Report how many distinct human accessions remain after cleanup.
n_human_entries = df['entry_h'].nunique()
print("Number of proteins in Cheng2019: ", n_human_entries)

Number of proteins in Cheng2019:  242


### 3. Export

In [59]:
# Write the cleaned table to disk without the DataFrame index.
output_csv = './Output/Cheng2019.csv'
df.to_csv(output_csv, index=False)