# CPTAC ID mapping

The BCM CPTAC phosphoproteomics data identifies each phosphosite by Ensembl ID + site, but many web tools identify sites by gene name + site. We therefore map each Ensembl ID to the gene name so that the site IDs match those used by the web tools.

## Setup

In [None]:
from katlas.core import *

from tqdm import tqdm

import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt

In [None]:
class CPTAC_ID:
    """Build phosphosite ID lookup tables for the BCM CPTAC phosphoproteomics data.

    The abundance tables key each site by Ensembl gene ID; the methods here
    attach the gene name and protein ID from the matching mapping file so the
    sites can be addressed as ``<gene_name>_<site>`` or ``<ENSP>_<site>``.
    """

    @staticmethod
    def list_cancer():
        """Return the list of cancer cohort abbreviations (a new list on each call)."""
        return ['HNSCC', 'GBM', 'COAD', 'CCRCC', 'LSCC',
                'BRCA', 'UCEC', 'LUAD', 'PDAC', 'OV']

    @staticmethod
    def get_id(cancer, is_Tumor=True):
        """Download one cohort's phosphosite table and attach gene / protein IDs.

        Parameters
        ----------
        cancer : str
            Cohort abbreviation. It is lower-cased for the mapping-file URL and
            upper-cased for the abundance-table URL.
        is_Tumor : bool, default True
            Read the ``..._Tumor.txt`` abundance table when truthy, otherwise
            the ``..._Normal.txt`` one.

        Returns
        -------
        pandas.DataFrame or None
            Columns ``gene``, ``site``, ``site_seq``, ``protein``,
            ``gene_name``, ``gene_site`` and ``ENSP_site``.
            ``None`` is returned (after printing the error) when the abundance
            table cannot be read.

        Notes
        -----
        A failure while reading the mapping file is not caught and propagates
        to the caller.
        """
        tissue = 'Tumor' if is_Tumor else 'Normal'

        mapping_url = f"https://zenodo.org/records/8196130/files/bcm-{cancer.lower()}-mapping-gencode.v34.basic.annotation-mapping.txt.gz"
        cohort = cancer.upper()
        data_url = (
            "https://cptac-pancancer-data.s3.us-west-2.amazonaws.com/data_freeze_v1.2_reorganized/"
            f"{cohort}/{cohort}_phospho_site_abundance_log2_reference_intensity_normalized_{tissue}.txt"
        )

        # Mapping table: one row per distinct (protein, gene, gene_name) triple.
        mapping = pd.read_csv(mapping_url, compression='gzip', sep='\t')
        ref = (
            mapping[['protein', 'gene', 'gene_name']]
            .drop_duplicates()
            .reset_index(drop=True)
        )

        # Abundance table: best effort -- report the problem and give up on
        # this cohort instead of raising.
        try:
            raw = pd.read_csv(data_url, sep='\t')
        except Exception as e:
            print(f'{cancer}{e}')
            return None

        # `idx` is a '|'-delimited key; fields 0, 2 and 3 are used here.
        # NOTE(review): field 1 is presumably the protein ID and is unused -- confirm.
        fields = raw.idx.str.split('|')
        info = pd.DataFrame({
            'gene': fields.str[0],
            'site': fields.str[2],
            'site_seq': fields.str[3],
        })

        print(f'the {cancer} dataset length is: {info.shape[0]}')

        # Left-join on the only shared column ('gene').
        # NOTE(review): a gene listed with several proteins in the mapping file
        # produces one output row per protein, so the row count can grow here.
        info = info.merge(ref, how='left')
        print(f'after id mapping, the length is {info.shape[0]}')

        n_unmapped = info['gene_name'].isna().sum()
        print(f'{n_unmapped} sites does not have a mapped gene name')

        # LinkedOmics-style key: <gene name>_<site>
        info['gene_site'] = info['gene_name'] + '_' + info['site']

        # LinkedOmicsKB-style key: <protein ID without version suffix>_<site>
        protein_no_version = info['protein'].str.split('.').str[0]
        info['ENSP_site'] = protein_no_version + '_' + info['site']

        return info

In [None]:
# Cohort abbreviations to process in the cells below.
cancer_list = CPTAC_ID.list_cancer()

In [None]:
# Display the cohort list.
cancer_list

['HNSCC', 'GBM', 'COAD', 'CCRCC', 'LSCC', 'BRCA', 'UCEC', 'LUAD', 'PDAC', 'OV']

In [None]:
# Build the ID table for every cohort: all tumor tables first, then all normal tables.
tumor = [CPTAC_ID.get_id(name, is_Tumor=True) for name in cancer_list]
normal = [CPTAC_ID.get_id(name, is_Tumor=False) for name in cancer_list]

# One flat list: tumor entries followed by normal entries.
all_list = [*tumor, *normal]

the HNSCC dataset length is: 55270
after id mapping, the length is 214151
0 sites does not have a mapped gene name
the GBM dataset length is: 63410
after id mapping, the length is 261115
0 sites does not have a mapped gene name
the COAD dataset length is: 35487
after id mapping, the length is 130147
0 sites does not have a mapped gene name
the CCRCC dataset length is: 54238
after id mapping, the length is 213737
0 sites does not have a mapped gene name
the LSCC dataset length is: 65481
after id mapping, the length is 249575
0 sites does not have a mapped gene name
the BRCA dataset length is: 49871
after id mapping, the length is 175637
0 sites does not have a mapped gene name
the UCEC dataset length is: 64977
after id mapping, the length is 250006
0 sites does not have a mapped gene name
the LUAD dataset length is: 61705
after id mapping, the length is 236430
0 sites does not have a mapped gene name
the PDAC dataset length is: 50220
after id mapping, the length is 195218
0 sites does n

In [None]:
# Stack every cohort table into one frame with a fresh 0..n-1 index.
# get_id returns None for a cohort whose abundance table could not be read;
# per the pandas documentation, pd.concat silently drops None entries
# (and raises ValueError only if every entry is None).
all_df = pd.concat(all_list,ignore_index=True)

In [None]:
# Keep the first row seen for each distinct <gene name>_<site> key.
linkedomics = all_df.drop_duplicates(subset='gene_site').reset_index(drop=True)

# Keep the first row seen for each distinct <protein ID>_<site> key.
linkedomicsKB = all_df.drop_duplicates(subset='ENSP_site').reset_index(drop=True)

In [None]:
# Preview the first three rows of the gene_site-deduplicated table.
linkedomics[:3]

Unnamed: 0,gene,site,site_seq,protein,gene_name,gene_site,ENSP_site
0,ENSG00000003056.8,S267,DDQLGEESEERDDHL,ENSP00000000412.3,M6PR,M6PR_S267,ENSP00000000412_S267
1,ENSG00000048028.11,S1053,PPTIRPNSPYDLCSR,ENSP00000003302.4,USP28,USP28_S1053,ENSP00000003302_S1053
2,ENSG00000004776.13,S16,PSWLRRASAPLPGLS,ENSP00000004982.3,HSPB6,HSPB6_S16,ENSP00000004982_S16


In [None]:
# linkedomics.to_parquet('linkedomics_ID.parquet')
# linkedomicsKB.to_parquet('linkedomicsKB_ID.parquet')

In [None]:
# (rows, columns) of the table deduplicated on gene_site.
linkedomics.shape

(126602, 7)

In [None]:
# (rows, columns) of the table deduplicated on ENSP_site.
linkedomicsKB.shape

(489994, 7)