In [1]:
### This script plot data for figure 3 -- non-synonymous result
import pandas as pd
import os, pickle
import numpy as np

***
### Get significant genes for non-synonymous result -- heatmap

In [25]:
dir_anlyze = '../mutsig_out/anlyze'
dir_out = './figure3/'
dir_res = '../mutsig_out/'
cohort = ['histology','organ','origin','system','pancancer']

# output directory
dir_out_figure = '../manuscript/figures_data/'
dir_out_table = '../manuscript/tables/'

if not os.path.exists(dir_out_figure):
    os.makedirs(dir_out_figure)
if not os.path.exists(dir_out_table):
    os.makedirs(dir_out_table)

In [3]:
### For feature type, get the number of successfully run 
def get_res_path(feature_type, run,  syn_nsyn = None):
    dir_res_feat = os.path.join(dir_res,syn_nsyn,run)
    # Append the path of significant gene files into a list
    lsig_f = []
    for feat in os.listdir(dir_res_feat):
        fsig = [i for i in os.listdir(os.path.join(dir_res_feat,feat)) \
                if i.endswith('sig_genes.txt')]
        if len(fsig) > 0: 
            fsig_path = os.path.join(dir_res_feat, feat, fsig[0])
            lsig_f.append(fsig_path)
    # pickle.dump(lsig_f, open(os.path.join(dir_anlyze, \
    #         feature_type+'.'+syn_nsyn+'.sig_genes.pathlist.'+run+'.pkl'),'wb'))

In [4]:
get_res_path('histology', 'cohort_new_nohypermutator', 'nsyn')

In [2]:
def get_sign(feature_type,run, syn_nsyn):
    df_all = pd.DataFrame()

    sig_pathlist = pickle.load(open(os.path.join(dir_anlyze, feature_type +'.'+syn_nsyn+'.sig_genes.pathlist.'+run+'.pkl'),'rb'))
    for fpath in sig_pathlist:
        df_res_feat = pd.read_csv(fpath,sep = '\t')
        df_res_feat['feature'] = fpath.split('/')[-1].split('.')[0]
        # Filter genes proto-cadherin genes and find significant genes
        df_res_feat = df_res_feat[~df_res_feat['gene'].str.startswith('PCDH')]
        df_res_sig = df_res_feat[df_res_feat['q']<0.1]        
        df_all = pd.concat([df_all,df_res_sig], ignore_index = True)
    
    # Save dataframe
    # df_all.to_csv(os.path.join(dir_out_figure,run+'.nsyn_forheatmap.csv'))
    return df_all

In [5]:
df_all_new = get_sign('histology', 'cohort_new_nohypermutator', 'nsyn')
lnsyn_new = df_all_new['gene'].unique().tolist()

In [6]:
len(lnsyn_new) # How many significant non-synonymous genes

133

In [9]:
len(df_all_new['feature'].unique().tolist()) # how many unique cohorts

29

In [13]:
len(df_all_new[df_all_new['gene'] == 'TP53']) # TP53 is significant across how many cohorts

21

***
### Venn Diagram

In [18]:
### Read PCAWG file and get df of driver genes, including PCAWG unique and both
dir_refs = '../data/anno_refs/'
df_pcawg = pd.read_csv(os.path.join(dir_refs, 'TableS1_compendium_mutational_drivers.csv'))
df_pcawg = df_pcawg[df_pcawg['Element_type'] == 'cds']
df_pcawg = df_pcawg[df_pcawg['Category'].isin(['both','discovery_unique'])]
lpg = set(df_pcawg["Gene"].unique())
# If PCAWG exclusive driver list
lpg_only = set(df_pcawg[df_pcawg['Category'] =='discovery_unique']['Gene'].tolist())

### Read the census gene file and get cancer census gene list
lcgc = []
census_file = os.path.join(dir_refs, 'Census_all.csv')
census_info = open(census_file,"r")
next(census_info)
for lines in census_info:
    line_split = lines.split(',')
    gene = line_split[0]
    if gene not in lcgc:
        lcgc.append(gene)

### Read nsyn candidate gene list
dir_anlyze = '../mutsig_out/anlyze'
feature_type = 'histology';syn_nsyn = 'nsyn'
run = 'cohort_new_nohypermutator'

sig_pathlist = pickle.load(open(os.path.join(dir_anlyze, feature_type +'.'+syn_nsyn+'.sig_genes.pathlist.'+run+'.pkl'),'rb'))
lfeat = [i.split('/')[-1].split('.')[0] for i in sig_pathlist]
df_nsyn = pd.read_csv(os.path.join(dir_out_figure,run+'.nsyn_forheatmap.csv'),index_col = 0)
lnsyn = df_nsyn['gene'].unique().tolist()

In [12]:
def calc_venn_size(lpcawg, census_list, lsig_nsyn):
    n_nsyn = len(set(lsig_nsyn)); n_cgc = len(set(census_list)); n_pe = len(lpcawg)
    print(f'MutSigCVsyn nonsynonymous has {len(set(lsig_nsyn))} genes')
    print(f'Concensus list has {len(set(census_list))} genes')
    print(f'pcawg exclusive and both drivers has {len(set(lpcawg))} genes')
    
    ### Intersection of one with another
    ovp_cgc_nsyn = len(set(lsig_nsyn).intersection(set(census_list)))
    ovp_cgc_pe = len(set(census_list).intersection(set(lpcawg)))
    ovp_pe_nsyn = len(set(lsig_nsyn).intersection(set(lpcawg)))
    print(f'MutSigCVsyn nonsynonymous and CGC {len(set(lsig_nsyn).intersection(set(census_list)))}')
    print(f'MutSigCVsyn nonsynonymous and PCAWG both&exclusive {len(set(lsig_nsyn).intersection(set(df_pcawg["Gene"].unique())))}')
    print(f'PCAWG both&exclusive and CGC {len(set(census_list).intersection(set(df_pcawg["Gene"].unique())))}')
    ### Intersection of all
    s1 = set(census_list).intersection(set(df_pcawg['Gene'].unique()))
    s2 = set(s1).intersection(set(lsig_nsyn))
    inter_all = len(s2)
    print(f'Intersection of all is {len(s2)}' )
    
    ### Calculate venn size
    s3_cgc_nsyn = ovp_cgc_nsyn -inter_all
    s5_cgc_pe = ovp_cgc_pe - inter_all
    s6_nsyn_pe = ovp_pe_nsyn - inter_all
    s1_cgc = n_cgc-inter_all - s3_cgc_nsyn - s5_cgc_pe
    s2_nsyn = n_nsyn - inter_all - s3_cgc_nsyn - s6_nsyn_pe
    s4_pe = n_pe -inter_all -s5_cgc_pe - s6_nsyn_pe

    venn_subset = (s1_cgc, s2_nsyn, s3_cgc_nsyn, s4_pe,s5_cgc_pe, s6_nsyn_pe, inter_all)
    
    s2 = s2 # CGC and PCAWG
    pcawg_only = set(lsig_nsyn).intersection(set(lpcawg)) - s2
    lcgc_only = set(lsig_nsyn).intersection(set(census_list))-s2
    lnovel = set(lsig_nsyn) - pcawg_only - lcgc_only - s2
    
    return venn_subset, s2 , pcawg_only, lcgc_only, lnovel

In [13]:
### PCAWG driver
venn_size, cgc_pg, pg_only, cgc_only, novel = calc_venn_size(lpg, lcgc,lnsyn)

MutSigCVsyn nonsynonymous has 133 genes
Concensus list has 723 genes
pcawg exclusive and both drivers has 150 genes
MutSigCVsyn nonsynonymous and CGC 78
MutSigCVsyn nonsynonymous and PCAWG both&exclusive 88
PCAWG both&exclusive and CGC 108
Intersection of all is 69


In [14]:
venn_size

(606, 36, 9, 23, 39, 19, 69)

In [15]:
### Exclusive driver overlap
print(f'pcawg exclusive drivers has {len(set(lpg_only))} genes')
print(set(lnsyn).intersection(set(lpg_only)))

pcawg exclusive drivers has 15 genes
{'KLHL6', 'PLK1', 'GRB2', 'PRKCD', 'RRAGC', 'TMEM30A'}


In [26]:
# Save the PCAWG exclusive driver
df_pcawg[df_pcawg['Category'].isin(['discovery_unique'])][['Gene','Element_type','Category']].to_csv("%s/tables2-pcawg_exclusive.csv" % dir_out_table)

In [27]:
df_pcawg[df_pcawg['Category'].isin(['discovery_unique'])]

Unnamed: 0,Element,Gene,Ensembl,Element_type,Category,MoF,Tissue
572,gc19_pc.cds::gencode::TMEM30A::ENSG00000112697.11,TMEM30A,ENSG00000112697.11,cds,discovery_unique,LoF,Lymph-BNHL
573,gc19_pc.cds::gencode::PLK1::ENSG00000166851.10,PLK1,ENSG00000166851.10,cds,discovery_unique,unknown,Stomach-AdenoCA
574,gc19_pc.cds::gencode::PA2G4::ENSG00000170515.9,PA2G4,ENSG00000170515.9,cds,discovery_unique,unknown,Head-SCC
575,gc19_pc.cds::gencode::SRSF7::ENSG00000115875.14,SRSF7,ENSG00000115875.14,cds,discovery_unique,unknown,Lymph-BNHL
576,gc19_pc.cds::gencode::CAMK1::ENSG00000134072.6,CAMK1,ENSG00000134072.6,cds,discovery_unique,unknown,Liver-HCC
577,gc19_pc.cds::gencode::TMSB4X::ENSG00000205542.6,TMSB4X,ENSG00000205542.6,cds,discovery_unique,LoF,Lymph-BNHL
578,gc19_pc.cds::gencode::KLHL6::ENSG00000172578.7,KLHL6,ENSG00000172578.7,cds,discovery_unique,unknown,Lymph-BNHL
579,gc19_pc.cds::gencode::RRAGC::ENSG00000116954.7,RRAGC,ENSG00000116954.7,cds,discovery_unique,Act,Lymph-BNHL
580,gc19_pc.cds::gencode::GRB2::ENSG00000177885.9,GRB2,ENSG00000177885.9,cds,discovery_unique,unknown,Lymph-BNHL
581,gc19_pc.cds::gencode::DYRK1A::ENSG00000157540.15,DYRK1A,ENSG00000157540.15,cds,discovery_unique,unknown,Liver-HCC


***Get Seperate pivot tables***

In [16]:
idx0 = df_nsyn.loc[df_nsyn['q'] == 0, 'q'].index
df_nsyn.loc[idx0, 'q'] = 1E-30
df_nsyn['q'] = -np.log10(df_nsyn['q'])

In [17]:
df_pivot_nsyn = df_nsyn.pivot_table('q', index='gene', columns='feature')
for feat in lfeat:
    if feat not in df_pivot_nsyn.columns:
#         df_pivot_syn[feat] = float('NaN')
        print(f'No significant genes: {feat}')
df_pivot_nsyn['n_sig_gene'] = df_pivot_nsyn.notnull().sum(axis=1)
df_pivot_gene_nsyn = df_pivot_nsyn.sort_values(by = 'n_sig_gene', ascending = False)
df_pivot_gene_nsyn = df_pivot_gene_nsyn.iloc[:, 0:-1]

No significant genes: SoftTissue-Liposarc
No significant genes: Cervix-SCC
No significant genes: Myeloid-MPN


In [18]:
dict_venn = {
    'size':venn_size,
    'cgc_pcawg':cgc_pg,
    'pcawg_only': pg_only,
    'cgc_only': cgc_only,
    'novel': novel,
    'pcawg_exclusive':set(lnsyn).intersection(set(lpg_only))
}

In [19]:
df_pivot_cgc_pcawg = df_pivot_gene_nsyn.loc[dict_venn['cgc_pcawg']]
df_pivot_pcawg_only = df_pivot_gene_nsyn.loc[dict_venn['pcawg_only']]
df_pivot_cgc_only = df_pivot_gene_nsyn.loc[dict_venn['cgc_only']]
df_pivot_novel = df_pivot_gene_nsyn.loc[dict_venn['novel']]
df_pivot_pcawg_exclusive = df_pivot_gene_nsyn.loc[dict_venn['pcawg_exclusive']]
# dict_venn['cgc_only'].transpose().dropna(axis=1, how='all')

AttributeError: 'set' object has no attribute 'transpose'

In [20]:
dict_venn = {
    'size':venn_size,
    'cgc_pcawg':df_pivot_cgc_pcawg,
    'pcawg_only': df_pivot_pcawg_only,
    'cgc_only': df_pivot_cgc_only,
    'novel': df_pivot_novel,
    'pcawg_exclusive':df_pivot_pcawg_exclusive
}

In [22]:
pickle.dump(dict_venn, open(os.path.join(dir_out_figure, 'dict_venn.pkl'), 'wb'))

In [23]:
pickle.dump(df_pivot_gene_nsyn, open(os.path.join(dir_out_figure, 'df_nsyn_pivot.pkl'), 'wb'))