In [1]:
### This notebook prepares the data for Figure 3 -- non-synonymous results
import pandas as pd
import os, pickle
import numpy as np

***
### Get significant genes for non-synonymous result -- heatmap

In [8]:
# Directory holding the per-run path-list pickles that get_sign() loads.
dir_anlyze = '../mutsig_out/anlyze'
# Output directory for this section.
# NOTE(review): the Venn section further down reassigns dir_out to './figure3/'
# -- confirm which figure directory is intended here.
dir_out = './figure4/'
# Cohort groupings.
cohort = [
    'histology',
    'organ',
    'origin',
    'system',
    'pancancer',
]

In [12]:
def get_sign(feature_type,run, syn_nsyn):
    """Collect the significant genes (q < 0.1) from every result table of one run.

    The list of result-table paths is loaded from the pickle file
    ``<dir_anlyze>/<feature_type>.<syn_nsyn>.sig_genes.pathlist.<run>.pkl``
    (``dir_anlyze`` is read from the notebook's global namespace).

    Parameters
    ----------
    feature_type : str
        First component of the pickle file name, e.g. ``'histology'``.
    run : str
        Run identifier embedded in the pickle file name, e.g. ``'cohort_072221'``.
    syn_nsyn : str
        Second component of the pickle file name, e.g. ``'nsyn'``.

    Returns
    -------
    pandas.DataFrame
        Rows of all tab-separated result tables whose ``gene`` does not start
        with ``'PCDH'`` and whose ``q`` is below 0.1, with an added ``feature``
        column (the part of the file name before the first dot). The index is
        renumbered. An empty DataFrame is returned when the path list is empty.
    """
    pathlist_file = os.path.join(
        dir_anlyze, feature_type +'.'+syn_nsyn+'.sig_genes.pathlist.'+run+'.pkl')
    # SECURITY: pickle.load can execute arbitrary code -- only load path lists
    # produced by this project.
    with open(pathlist_file, 'rb') as fh:
        sig_pathlist = pickle.load(fh)

    # Collect the per-feature frames and concatenate once at the end instead of
    # growing a DataFrame inside the loop.
    sig_frames = []
    for fpath in sig_pathlist:
        df_res_feat = pd.read_csv(fpath,sep = '\t')
        df_res_feat['feature'] = fpath.split('/')[-1].split('.')[0]
        # Filter genes proto-cadherin genes and find significant genes
        df_res_feat = df_res_feat[~df_res_feat['gene'].str.startswith('PCDH')]
        sig_frames.append(df_res_feat[df_res_feat['q']<0.1])

    # pd.concat raises on an empty list, so fall back to an empty frame.
    df_all = pd.concat(sig_frames, ignore_index = True) if sig_frames else pd.DataFrame()

    # Save dataframe
#     df_all.to_csv(os.path.join(dir_out,feature_type+'.'+syn_nsyn+'.df_all_forheatmap.'+run+'.csv'))
    return df_all

In [13]:
# Significant non-synonymous genes for the 'cohort_072221' run (histology features),
# and the list of distinct gene names among them.
df_all_new = get_sign('histology', 'cohort_072221', 'nsyn')
lnsyn_new = df_all_new['gene'].unique().tolist()

In [None]:
# Same extraction for the earlier '062121' run.
# NOTE(review): this cell has no execution count (never run in the saved notebook)
# and lnsyn_old is not used in the cells shown here -- confirm it is still needed.
df_all_old = get_sign('histology', '062121', 'nsyn')
lnsyn_old = df_all_old['gene'].unique().tolist()

***
### Venn Diagram

In [14]:
### Read PCAWG file and get df of driver genes, including PCAWG unique and both
dir_refs = '../anno_ref/anlyze-manuscript/'
df_pcawg = pd.read_csv(os.path.join(dir_refs, 'TableS1_compendium_mutational_drivers.csv'))
# Keep rows whose Element_type is 'cds', then the 'both' / 'discovery_unique' categories.
df_pcawg = df_pcawg[df_pcawg['Element_type'] == 'cds']
df_pcawg = df_pcawg[df_pcawg['Category'].isin(['both','discovery_unique'])]
lpg = set(df_pcawg["Gene"].unique())
# If PCAWG exclusive driver list
lpg_only = set(df_pcawg[df_pcawg['Category'] =='discovery_unique']['Gene'].tolist())

### Read the census gene file and get cancer census gene list
# Assumes the first comma-separated field of each row is the (unquoted) gene
# symbol and that the first line is a header -- TODO confirm against the file.
lcgc = []
seen_genes = set()  # O(1) membership test; lcgc keeps first-seen order
census_file = os.path.join(dir_refs, 'Census_all.csv')
with open(census_file,"r") as census_info:
    next(census_info, None)  # skip the header; tolerate an empty file
    for line in census_info:
        gene = line.split(',', 1)[0].strip()
        # Skip blank rows so that an empty string / newline is never counted as a gene.
        if gene and gene not in seen_genes:
            seen_genes.add(gene)
            lcgc.append(gene)

### Read nsyn candidate gene list
cohorts = ['histology','organ','origin','system','pancancer']
dir_out = './figure3/'
dir_anlyze = '../mutsig_out/anlyze'
feature_type = 'histology'
syn_nsyn = 'nsyn'
run = 'cohort_072221'

# NOTE(review): the path list below is loaded for the hard-coded run '062121',
# while `run` is 'cohort_072221' -- confirm this mismatch is intentional.
# SECURITY: pickle.load can execute arbitrary code -- only load trusted files.
with open(os.path.join(dir_anlyze, feature_type +'.'+syn_nsyn+'.sig_genes.pathlist.062121.pkl'),'rb') as pathlist_fh:
    sig_pathlist = pickle.load(pathlist_fh)
# Feature name = file name up to the first dot.
lfeat = [i.split('/')[-1].split('.')[0] for i in sig_pathlist]
# Table of significant genes, read back from the CSV in dir_out.
df_nsyn = pd.read_csv(os.path.join(dir_out,feature_type+'.'+syn_nsyn+'.df_all_forheatmap.'+run+'.csv'),index_col = 0)
lnsyn = df_nsyn['gene'].unique().tolist()

In [16]:
def calc_venn_size(lpcawg, census_list, lsig_nsyn):
    """Compute the seven region sizes of a three-set Venn diagram and the
    breakdown of the significant genes by which reference lists contain them.

    All three inputs are deduplicated before counting, and every overlap is
    computed from the arguments only (no notebook globals are read).

    Parameters
    ----------
    lpcawg : iterable of str
        PCAWG driver genes.
    census_list : iterable of str
        Cancer Gene Census genes.
    lsig_nsyn : iterable of str
        Significant non-synonymous genes.

    Returns
    -------
    venn_subset : tuple of int
        ``(CGC only, nsyn only, CGC&nsyn only, PCAWG only, CGC&PCAWG only,
        nsyn&PCAWG only, all three)``.
        NOTE(review): this looks like the subset order a 3-set Venn plotting
        helper expects for sets (CGC, nsyn, PCAWG) -- confirm with the plot code.
    s2 : set
        Genes present in all three lists.
    pcawg_only : set
        Significant genes in PCAWG but not in all three lists.
    lcgc_only : set
        Significant genes in CGC but not in all three lists.
    lnovel : set
        Significant genes in neither PCAWG nor CGC.
    """
    pcawg_set = set(lpcawg)
    cgc_set = set(census_list)
    nsyn_set = set(lsig_nsyn)

    n_nsyn = len(nsyn_set); n_cgc = len(cgc_set); n_pe = len(pcawg_set)
    print(f'MutSigCVsyn nonsynonymous has {n_nsyn} genes')
    print(f'Concensus list has {n_cgc} genes')
    print(f'pcawg exclusive and both drivers has {n_pe} genes')

    ### Intersection of one with another
    nsyn_in_cgc = nsyn_set & cgc_set
    nsyn_in_pcawg = nsyn_set & pcawg_set
    ovp_cgc_nsyn = len(nsyn_in_cgc)
    ovp_cgc_pe = len(cgc_set & pcawg_set)
    ovp_pe_nsyn = len(nsyn_in_pcawg)
    print(f'MutSigCVsyn nonsynonymous and CGC {ovp_cgc_nsyn}')
    print(f'MutSigCVsyn nonsynonymous and PCAWG both&exclusive {ovp_pe_nsyn}')
    print(f'PCAWG both&exclusive and CGC {ovp_cgc_pe}')
    ### Intersection of all
    s2 = cgc_set & pcawg_set & nsyn_set
    inter_all = len(s2)
    print(f'Intersection of all is {inter_all}' )

    ### Calculate venn size
    # Pairwise-only regions: pairwise overlap minus the triple overlap.
    s3_cgc_nsyn = ovp_cgc_nsyn - inter_all
    s5_cgc_pe = ovp_cgc_pe - inter_all
    s6_nsyn_pe = ovp_pe_nsyn - inter_all
    # Single-set regions: set size minus every region shared with another set.
    s1_cgc = n_cgc - inter_all - s3_cgc_nsyn - s5_cgc_pe
    s2_nsyn = n_nsyn - inter_all - s3_cgc_nsyn - s6_nsyn_pe
    s4_pe = n_pe - inter_all - s5_cgc_pe - s6_nsyn_pe

    venn_subset = (s1_cgc, s2_nsyn, s3_cgc_nsyn, s4_pe,s5_cgc_pe, s6_nsyn_pe, inter_all)

    # Split the significant genes by which reference lists contain them.
    pcawg_only = nsyn_in_pcawg - s2
    lcgc_only = nsyn_in_cgc - s2
    lnovel = nsyn_set - pcawg_only - lcgc_only - s2

    return venn_subset, s2 , pcawg_only, lcgc_only, lnovel

In [20]:
### PCAWG driver
# Region sizes plus the gene sets behind them, computed against the PCAWG list (lpg).
venn_size, cgc_pg, pg_only, cgc_only, novel = calc_venn_size(lpg, lcgc, lnsyn)

# Bundle everything needed to draw / annotate the Venn diagram.
dict_venn = dict(
    size=venn_size,
    cgc_pcawg=cgc_pg,
    pcawg_only=pg_only,
    cgc_only=cgc_only,
    novel=novel,
    # Significant genes that are also in the PCAWG 'discovery_unique' list.
    pcawg_exclusive=set(lnsyn) & set(lpg_only),
)
# pickle.dump(dict_venn, open(os.path.join(dir_out, 'dict_venn_072221.pkl'), 'wb'))

MutSigCVsyn nonsynonymous has 144 genes
Concensus list has 723 genes
pcawg exclusive and both drivers has 150 genes
MutSigCVsyn nonsynonymous and CGC 84
MutSigCVsyn nonsynonymous and PCAWG both&exclusive 91
PCAWG both&exclusive and CGC 108
Intersection of all is 72


In [31]:
# Show the 7-tuple of Venn region sizes returned by calc_venn_size.
# NOTE(review): the recorded tuple below ends in 69, while the printed
# 'Intersection of all' above is 72 -- the saved outputs come from different
# executions; re-run the notebook top to bottom before using these numbers.
venn_size

(606, 44, 9, 22, 39, 20, 69)

In [19]:
### Exclusive driver overlap
print(f'pcawg exclusive drivers has {len(set(lpg_only))} genes')
print(set(lnsyn).intersection(set(lpg_only)))

pcawg exclusive drivers has 15 genes
{'PRKCD', 'KLHL6', 'GRB2', 'RRAGC', 'PLK1', 'TMEM30A'}
