## Cheng et al. 2019
### 243 proteins from mesenchymal cell line

In [17]:
import requests, json
import numpy as np
import pandas as pd
from time import sleep

In [2]:
# Load Supplementary Table S3 of Cheng et al. 2019 (clustering & peptide sequences)
cheng2019_xlsx = './SourceData/Papers/Cheng2019/Supplementary Table S3_clustering & peptide seq.xlsx'
df_Cheng2019 = pd.read_excel(cheng2019_xlsx)

  warn(msg)
  warn(msg)


In [3]:
# The first row of the loaded sheet holds the real column labels.
header_labels = df_Cheng2019.iloc[0].tolist()

# Keep only the rows below that header row ...
df_Cheng2019 = df_Cheng2019.iloc[1:]

# ... and label the columns with the values taken from it.
df_Cheng2019.columns = header_labels

In [4]:
# Display the column labels currently set on df_Cheng2019
df_Cheng2019.columns

Index(['Accession No.', 'Protein', 'Description', 'Known NE', 'TM region',
       'Total SPC in U', 'Total SPC in A', 'Total SPC in M',
       'NE Enrich Score in U', 'NE Enrich Score in A', 'NE Enrich Score in M',
       'Cluster #'],
      dtype='object')

In [5]:
# Keep the accession, the protein label and the three NE enrichment scores,
# then rename the accession column so its source paper is explicit.
df_Cheng2019 = (
    df_Cheng2019
    .loc[:, ['Accession No.', 'Protein',
             'NE Enrich Score in U', 'NE Enrich Score in A', 'NE Enrich Score in M']]
    .rename(columns={'Accession No.': 'Entry_Cheng2019'})
)

In [6]:
# Summarise row count, non-null counts and dtypes of the trimmed table
df_Cheng2019.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 243 entries, 1 to 243
Data columns (total 5 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Entry_Cheng2019       243 non-null    object
 1   Protein               237 non-null    object
 2   NE Enrich Score in U  243 non-null    object
 3   NE Enrich Score in A  243 non-null    object
 4   NE Enrich Score in M  243 non-null    object
dtypes: object(5)
memory usage: 9.6+ KB


## Convert the mouse entries to their human counterparts

In [7]:
def get_url(url, **kwargs):
    '''
    Obtain a response from a given url.

    Parameters
    ----------
    url : str
        Address passed to ``requests.get``.
    **kwargs
        Forwarded unchanged to ``requests.get``.

    Returns
    -------
    The response object returned by ``requests.get`` when ``response.ok`` is true.

    Raises
    ------
    Whatever ``response.raise_for_status()`` raises when ``response.ok`` is false;
    the response body is printed first to help diagnose the failure.
    '''
    response = requests.get(url, **kwargs)

    if not response.ok:
        print(response.text)
        # raise_for_status() raises for every non-ok response, so nothing after it
        # can run; the former `sys.exit()` was dead code (and `sys` was never imported).
        response.raise_for_status()

    return response

In [22]:
# UniProtKB REST endpoint used for both lookups below
WEBSITE_API = "https://rest.uniprot.org/uniprotkb"

# organism_id values used in the UniProt query; the human one is the search target
organism_id_list = {'Homo sapiens': '9606', 'Mus musculus': '10090'}
organism_id = organism_id_list['Homo sapiens']

# One list per output column; position i in every list describes the i-th accession
my_dict = {'entry_mr': [], 'gene_name': [], 'entry_h': []}

for entry in df_Cheng2019.Entry_Cheng2019:
    try:
        # Step 1: look up the source accession and read its primary gene name
        r = get_url(f'{WEBSITE_API}/search?query=accession:{entry}&fields=gene_names')
        gene_name = r.json()['results'][0]['genes'][0]['geneName']['value']
        sleep(1)  # pause between requests

        # Step 2: search human entries by that gene name and take the first hit
        # NOTE(review): the first search result is assumed to be the right human
        # counterpart - confirm for ambiguous gene names.
        r2 = get_url(f'{WEBSITE_API}/search?query=gene:{gene_name}+AND+organism_id:{organism_id}&fields=accession,gene_names')
        entry_hs = r2.json()['results'][0]['primaryAccession']
        sleep(1)

    except Exception:
        # Best-effort lookup: any request/parsing failure in either step marks the
        # whole row as not found. `Exception` (instead of a bare `except:`) keeps
        # KeyboardInterrupt/SystemExit working so the loop can still be stopped.
        gene_name = 'Not_found'
        entry_hs = 'Not_found'

    my_dict['entry_mr'].append(entry)
    my_dict['gene_name'].append(gene_name)
    my_dict['entry_h'].append(entry_hs)

    # progress report every 20 accessions
    if len(my_dict['entry_mr'])%20 == 0:
        print(entry, gene_name, entry_hs)

P41216 Acsl1 P33121
Q9CY27 Tecr Q9NZ01
Q9DCK3 Tspan4 O14817
Q8R0Y8 Slc25a42 Q86VD7
Q8K358 Pigu Q9H490
Q8BX90 Fndc3a Q9Y2H6
Q78IS1 Tmed3 Q9Y3Q3
Q8BM55 Tmem214 Q6NUQ4
Q8CHT3 Ints5 Q6P9B9
Q9CXY9 Pigk Q92643
Q8BSY0 Asph Q12797
Q6ZPR5 Smpd4 Q9NXE4


In [23]:
# Turn the lookup results into a table and attach them to the Cheng 2019 rows
_df = pd.DataFrame.from_dict(my_dict)
df = pd.merge(
    df_Cheng2019,
    _df,
    how='left',
    left_on='Entry_Cheng2019',
    right_on='entry_mr',
)

# 1 where the paper's protein label equals the looked-up gene name, else 0
df['geneName_match'] = (df['Protein'] == df['gene_name']).astype(int)

In [30]:
# Rows whose protein label differs from the looked-up gene name
df.loc[df['geneName_match'] == 0]

Unnamed: 0,Entry_Cheng2019,Protein,NE Enrich Score in U,NE Enrich Score in A,NE Enrich Score in M,entry_mr,gene_name,entry_h,geneName_match
49,Q6ZQM8,Ugt1a7c,0.547455,0.544253,0.29506,Q6ZQM8,Ugt1a7,Q9HAW7,0
72,Q3TYS2,,0.547859,0.720219,0.219828,Q3TYS2,Cybc1,Q9BQA9,0
73,Q8R3Y5,,0.187178,0.573158,0.600026,Q8R3Y5,C19orf47,Q8N9M1,0
74,Q9CYI0,,0.104929,0.615445,0.41298,Q9CYI0,C17orf75,Q9HAS0,0
75,Q9CXL3,,0.084613,0.0,0.671932,Q9CXL3,C7orf50,Q9BRJ6,0
76,Q8C3W1,,0.059845,1.0,0.572675,Q8C3W1,C1orf198,Q9H425,0
77,Q9CQE8,,0.0,0.544111,0.60397,Q9CQE8,RTRAF,Q9Y224,0
116,P06339,H2-T23,0.219917,0.215039,0.524884,P06339,Not_found,Not_found,0
174,Q3KNM2,42799,0.214956,0.275087,0.561878,Q3KNM2,Marchf5,Q9NX47,0


#### Manual assertion
#### The only protein not found in human was H2-T23,
#### thus the total becomes 242 proteins

In [29]:
# Manually curated human gene name / accession for entries that needed correction.
# Keyed by the source accession rather than by row label, so the fixes still hit
# the right rows if the row order changes. The former duplicate assignment for
# Q9CYI0 (row 74) has been dropped.
manual_human_mapping = {
    'Q8R3Y5': ('C19orf47', 'Q8N9M1'),
    'Q9CYI0': ('C17orf75', 'Q9HAS0'),
    'Q9CXL3': ('C7orf50', 'Q9BRJ6'),
    'Q8C3W1': ('C1orf198', 'Q9H425'),
}

for accession, (human_gene, human_entry) in manual_human_mapping.items():
    rows = df['Entry_Cheng2019'] == accession
    # fail loudly instead of silently skipping a curated entry
    assert rows.any(), f'{accession} not present in df'
    df.loc[rows, 'gene_name'] = human_gene
    df.loc[rows, 'entry_h'] = human_entry

In [31]:
# Display the column labels of the merged table
df.columns

Index(['Entry_Cheng2019', 'Protein', 'NE Enrich Score in U',
       'NE Enrich Score in A', 'NE Enrich Score in M', 'entry_mr', 'gene_name',
       'entry_h', 'geneName_match'],
      dtype='object')

In [33]:
# Retain the human gene/accession pair plus the three enrichment scores,
# discarding rows whose lookup was flagged 'Not_found'.
kept_columns = ['gene_name', 'entry_h',
                'NE Enrich Score in U', 'NE Enrich Score in A', 'NE Enrich Score in M']
df = df.loc[df['gene_name'] != 'Not_found', kept_columns]

In [34]:
# Summarise row count, non-null counts and dtypes of the final table
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 242 entries, 0 to 242
Data columns (total 5 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   gene_name             242 non-null    object
 1   entry_h               242 non-null    object
 2   NE Enrich Score in U  242 non-null    object
 3   NE Enrich Score in A  242 non-null    object
 4   NE Enrich Score in M  242 non-null    object
dtypes: object(5)
memory usage: 11.3+ KB


In [35]:
# Write the final table to disk without the row index
output_csv = './Output/Cheng2019.csv'
df.to_csv(output_csv, index=False)