In [1]:
### This notebook analyzes figure 5 - synonymous candidate results
# Author: Yiyun

import pandas as pd
import os,pickle
import numpy as np
from collections import Counter
from multiprocessing import Pool

***
### Synonymous candidates in CGC/PCAWG - TSG, oncogenes...

In [2]:
### Load the synonymous candidate table that was written to the figure 4 directory
dir_out_fig4 = './figure4/'
feature_type = 'histology'
syn_nsyn = 'syn'
run = 'cohort_090821'
df_syn = pd.read_csv(
    os.path.join(dir_out_fig4, f'{feature_type}.syn.df_all_forheatmap.{run}.csv'),
    index_col=0,
)

### Load the census gene file and key it by gene symbol
dir_anno = '../anno_ref/anlyze-manuscript'
df_census = pd.read_csv(os.path.join(dir_anno, 'Census_all.csv')).set_index('Gene Symbol')

### Load the driver compendium table; keep rows with Element_type 'cds' and
### Category 'both' or 'discovery_unique', then collect the distinct gene names
df_pcawg = pd.read_csv(os.path.join(dir_anno, 'TableS1_compendium_mutational_drivers.csv'))
df_pcawg = df_pcawg[
    (df_pcawg['Element_type'] == 'cds')
    & df_pcawg['Category'].isin(['both', 'discovery_unique'])
]
lpg = set(df_pcawg["Gene"].unique())

In [3]:
### Find overlap between synonymous candidate and CGC genes
# Build df_incgc: one column per synonymous candidate gene that is present in the
# Cancer Gene Census table (rows are the census annotation fields).
# Candidate genes that are also in the `lpg` gene set are printed as they are met.
census_genes = set(df_census.index)  # hoisted: constant-time membership test per gene
cgc_rows = []  # census rows for matching genes, in candidate order
for gene in df_syn['gene']:
    if gene in census_genes:
        cgc_rows.append(df_census.loc[gene])
    if gene in lpg:
        print(gene)

# Concatenate once instead of growing the frame inside the loop; fall back to an
# empty frame when no candidate gene is in the census.
df_incgc = pd.concat(cgc_rows, axis=1) if cgc_rows else pd.DataFrame()

In [4]:
# Display the census annotations collected for candidate genes (one column per gene)
df_incgc

Unnamed: 0,BCL2,SRSF2,CALR,NACA
Name,B-cell CLL/lymphoma 2,serine/arginine-rich splicing factor 2,calreticulin,nascent-polypeptide-associated complex alpha p...
Entrez GeneId,596,6427,811,4666
Genome Location,18:63123346-63320128,17:76734115-76737333,19:12938607-12944489,12:56712433-56725299
Tier,1,1,1,2
Hallmark,,,Yes,
Chr Band,21.33,25.1,13.13,13.3
Somatic,yes,yes,yes,yes
Germline,,,,
Tumour Types(Somatic),"NHL, CLL","MDS, CLL","MPN, MDS",NHL
Tumour Types(Germline),,,,


***
### Analyze synonymous mutations in the candidate gene list

In [6]:
# Directory path for figure 5 — presumably where this section's outputs are written;
# its use is not shown in this part of the notebook (TODO confirm)
out_dir = './figure5'

In [7]:
### Settings for re-reading the synonymous candidate table
cohorts = ['histology', 'organ', 'origin', 'system', 'pancancer']
dir_out = './figure4/'
dir_maf = '../maf_out/maf_cohorts_060121'

feature_type = 'histology'
syn_nsyn = 'syn'
# NOTE(review): this run label differs from the one used when df_syn was first loaded
# earlier in the notebook — confirm that is intended
run = 'cohort_072221'
threshold = 1

# Read the significant gene dataframe after FDR calculation and index it by gene
df_syn = pd.read_csv(
    os.path.join(dir_out, f'{feature_type}.syn.df_all_forheatmap.{run}.{threshold}.csv'),
    index_col=0,
).set_index('gene')

In [11]:
# Display the gene-indexed synonymous candidate table loaded above
df_syn

Unnamed: 0_level_0,X,p,q,feature,exp.nonexp,FDR
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
BCL2,0,0.0,0.0,Lymph-BNHL,exp,1.518254e-18
ITLN1,54504,0.0,0.0,CNS-PiloAstro,exp,1.518254e-18
XIRP2,109020,0.0,0.0,Skin-Melanoma,exp,1.518254e-18
SOX18,272521,0.0,0.0,Eso-AdenoCA,exp,1.518254e-18
SIGLEC15,345195,0.0,0.0,Panc-AdenoCA,exp,1.518254e-18
TP53I3,345196,0.0,0.0,Panc-AdenoCA,exp,1.518254e-18
CALR,399696,0.0,0.0,Uterus-AdenoCA,exp,1.518254e-18
PPWD1,417864,0.0,0.0,Head-SCC,exp,1.518254e-18
PURA,563208,0.0,0.0,Breast-AdenoCA,exp,1.518254e-18
HIST1H2BK,1,2e-06,0.014953,Lymph-BNHL,exp,5.211509e-18


**Analyze individual genes**
1. ITLN1: Cancer gene, but no literature supports its role in CNS-PiloAstro. The synonymous mutation is also found in TCGA
    - High load, the same histology
2. XIRP2: Surrogate of UV exposure. Could this explain so many synonymous mutations? However, patients with mutations in these genes seem to live longer. Still to check: the ranking of the other 9 affected genes, and the patient disease subtype
    - Same, highload
3. SOX18: Has been found upregulated in other cancer types and related to poor prognosis. The HMG domain binds transcription factors.
    - Same
    - Highload, retrotransposon
4. SIGLEC15: Related to the immune system; high expression seems to be associated with a lower survival rate -- new star
    - One of the major
    - Not high load
5. TP53I3: downstream of P53? It seems it may have a role, but it has not been fully studied
    - One of major, Highload
6. CALR: protein upregulated in subtypes? 
7. PPWD1: shallow work from other cancer types
    - Same histology
    - The syn patient does not have a high load, but there is only one patient
8. HIST1H2BK: potential role in other tumor types
    - Diffuse large B, follicular,
    - High load
9. AKAP2: R binding domain, RGS domain, interacts with Galpha. 
    - All 3 synonymous patients are primary tumors. 
    - Mutation load of syn patients on a higher end. Have high retrotransposon event signal
10. NOL9: DLBCL, subtype, copy number variation
    - Primary, Diffuse largeB, follicular
    - One syn patient, higher than normal mutation load and CNA event. 
11. NACA: downregulated in melanoma
    - High load, one histology
12. DMRTB1: Maybe testis?
    - Several histology types
    - High load
13. ACTRT2: nothing significant
    - Mutation load of syn patients on a higher end
14. ITPR2: upregulated in AML
    - High load, one histology
15. ACVRL1: ALK1, TGF receptor related. Found high mutation rate in prostate cancer. Kinase domain. TGF-beta expression related, so it is an interesting gene in prostate cancer
    - All 3 synonymous patients are primary tumors. 
    - Mutation load of syn patients on a higher end.
16. TPM2: Loss associated with RHOA activation in colorectal cancer. Isoforms found in breast cancer cell lines. downregulation of TPM is a sign of transformed cells.
    - Lobular carcinoma, minor
    - Not high load but only one
17. SH3BGR:
    - Different types
    - Not highload but retrotransposon
18. XFP69:
    - Same, not highload but retrotransposon

**Other questions to consider**  
1. Is the mutation in an important protein domain?
2. The histology subtype
3. If the patients have other mutations, cnv, methylation
4. If mutation in other cancer patient dataset