In [26]:
# This notebook visualizes the p-values across all tumors
# This notebook starts from the non-expressed gene mapping in the expression file
import pandas as pd
import os, pickle, shutil, time
from tqdm import tqdm
from collections import Counter
import numpy as np

In [2]:
### Build the label -> row-count lookup, ordered from the rarest label to the most common
dir_cohort = '../anno_ref/cohorts'
feature_type = 'histology'
df_feat = pd.read_csv(os.path.join(dir_cohort, feature_type + '.csv'))
# Tally how often each label occurs in the feature column.
label_counts = Counter(df_feat[feature_type])
# sorted() is stable, so labels with equal counts keep their first-seen order.
dict_feat_np = dict(sorted(label_counts.items(), key=lambda pair: pair[1]))
# Labels in the same ascending-count order as the dictionary keys.
lfeat = [label for label in dict_feat_np]

In [3]:
# Output directory for figure data
dir_out_figure = '../manuscript/figures_data/'

# exist_ok=True keeps the cell safe to re-run and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(dir_out_figure, exist_ok=True)

***
### Save sig_genes.txt paths

In [4]:
### For a feature type, collect the sig_genes.txt path of every feature directory that has one
dir_res = '../mutsig_out/'
dir_anlyze = '../mutsig_out/anlyze'
def get_res_path(feature_type, run,  syn_nsyn = None, save = False):
    """Collect one ``*sig_genes.txt`` path per feature sub-directory.

    Scans ``dir_res/<syn_nsyn>/<run>/<feature_type>/``. Every sub-directory
    that contains at least one file ending in ``sig_genes.txt`` contributes
    the first such file (alphabetically). Sub-directories without one are
    skipped (presumably runs that did not finish -- confirm), as are stray
    plain files sitting next to the feature directories.

    Parameters
    ----------
    feature_type : str
        Name of the cohort grouping directory, e.g. ``'histology'``.
    run : str
        Name of the run directory, e.g. ``'cohort_090821'``.
    syn_nsyn : str
        Name of the mutation-class directory, e.g. ``'syn'`` or ``'nsyn'``.
        Required despite the ``None`` default kept for signature compatibility.
    save : bool, default False
        When True, also pickle the list to
        ``dir_anlyze/<feature_type>.<syn_nsyn>.sig_genes.pathlist.<run>.pkl``.

    Returns
    -------
    list of str
        The collected paths, ordered by feature directory name.

    Raises
    ------
    TypeError
        If ``syn_nsyn`` is not given.
    """
    if syn_nsyn is None:
        # os.path.join(None) would raise an opaque TypeError; say what is missing.
        raise TypeError("get_res_path() requires syn_nsyn (e.g. 'syn' or 'nsyn')")
    dir_res_feat = os.path.join(dir_res, syn_nsyn, run, feature_type)

    # Append the path of significant gene files into a list.
    # os.listdir() order is arbitrary, so sort for a reproducible result.
    lsig_f = []
    for feat in sorted(os.listdir(dir_res_feat)):
        dir_feat = os.path.join(dir_res_feat, feat)
        if not os.path.isdir(dir_feat):
            continue
        fsig = sorted(i for i in os.listdir(dir_feat) if i.endswith('sig_genes.txt'))
        if fsig:
            lsig_f.append(os.path.join(dir_feat, fsig[0]))

    if save:
        os.makedirs(dir_anlyze, exist_ok=True)
        fout = os.path.join(dir_anlyze,
                            feature_type+'.'+syn_nsyn+'.sig_genes.pathlist.'+run+'.pkl')
        with open(fout, 'wb') as fh:
            pickle.dump(lsig_f, fh)
    return lsig_f

In [5]:
# features = ['histology','organ','origin','system','pancancer']
# Scan the histology results of run cohort_090821 for both mutation classes.
for mutation_class in ('syn', 'nsyn'):
    get_res_path('histology', 'cohort_090821', mutation_class)

***
### Find and save common non-exp genes in all tumor types

In [7]:
### Expression input location
exp_dir = '../anno_ref/ICGC/pcawg_rnaseq/'
gene_tophat = 'tophat_star_fpkm.v2.aliquot_gl.tsv'

### Distinct histology names, in order of first appearance in the cohort table
feature_type = 'histology'
cohort_csv = os.path.join(dir_cohort, feature_type + '.csv')
df_feat = pd.read_csv(cohort_csv)
histology_column = df_feat['histology']
lfeat = histology_column.unique().tolist()

# Output directory used by the cells below
dir_out_fig4 = './figure4/'

In [8]:
# Metadata table (tab-separated); later cells read its aliquot ids,
# tumor/normal flag and histology abbreviation.
metadata_path = os.path.join(exp_dir, 'rnaseq.metadata.tsv')
df_exp_info = pd.read_csv(metadata_path, sep='\t')

# Expression table (tab-separated); its first column becomes the row index.
expression_path = os.path.join(exp_dir, gene_tophat)
df_exp = pd.read_csv(expression_path, sep='\t', index_col=0)

In [9]:
# Keep only the metadata rows flagged as tumor, then list their distinct histologies.
is_tumor = df_exp_info['tumor.normal'] == 'tumor'
df_tumor_exp_info = df_exp_info.loc[is_tumor]
histologies = df_tumor_exp_info['histology_abbreviation'].unique().tolist()
n_histologies = len(histologies)
print(f'{n_histologies} have expression information, other cohorts use the common non-expression genes(intersection) from these cohorts.')

27 have expression information, other cohorts use the common non-expression genes(intersection) from these cohorts.


In [10]:
### Per-histology non-expressed genes: rows whose value is below 1 in every tumor sample
dict_nexp = {}
lnexp = []
for his in histologies:
    in_histology = df_tumor_exp_info['histology_abbreviation'] == his
    df_his_info = df_tumor_exp_info.loc[in_histology]
    ltumor_id = df_his_info['aliquot_id'].unique().tolist()
    df_exp_tumor = df_exp[ltumor_id]
    # A row qualifies only when all of its values are < 1; a missing value
    # compares False and therefore disqualifies the row.
    all_below = (df_exp_tumor < 1).all(axis=1)
    nonexp_genes = df_exp_tumor.index[all_below.to_numpy()].tolist()
    dict_nexp[his] = nonexp_genes
    lnexp.append(nonexp_genes)

# Rows that are non-expressed in every histology
common_seed = set(lnexp[0])
result = common_seed.intersection(*lnexp[1:])

In [14]:
# Write the common non-expressed gene IDs, one per line and without a header.
# The directory is created first because no visible cell creates it, and the
# IDs are sorted so the file is identical from run to run (set order is not).
os.makedirs(dir_out_fig4, exist_ok=True)
with open(os.path.join(dir_out_fig4,'common_nonexp_ids.csv'), 'w') as f:
    # f.write('gene'+'\n')
    f.writelines(gene_id + '\n' for gene_id in sorted(result))

### Get p-values from all features -- for FDR

In [19]:
### Load the non-expressed gene name table and keep its 'genes' column
nonexp_names_path = os.path.join(dir_out_fig4, 'common_nonexp_names.csv')
df_ne_gene = pd.read_csv(nonexp_names_path)
lne = df_ne_gene.loc[:, 'genes']

In [20]:
# Read the list of all gene names.
# NOTE: pickle can execute arbitrary code while loading; only open trusted files.
# The with-block closes the file handle, which pickle.load(open(...)) left open.
with open('../anno_ref/proc_refs/gene_name_list_062121.pkl','rb') as f_genes:
    lgene = pickle.load(f_genes)
n_nonexp = len(set(lgene).intersection(set(lne)))
print(f'There are total {n_nonexp} non-expressed genes out of {len(lgene)} genes')

cohort = ['histology','organ','origin','system','pancancer']
dir_anlyze = '../mutsig_out/anlyze'

There are total 1057 non-expressed genes out of 19225 genes


In [24]:
### Get pvals -- input for FDR calculation
def get_allpvals(feature_type,run, syn_nsyn):
    """Gather gene-level p/q values from every listed result file into one table.

    Loads the pickled path list
    ``dir_anlyze/<feature_type>.<syn_nsyn>.sig_genes.pathlist.<run>.pkl``,
    reads each listed tab-separated file, tags its rows with the feature name
    (the file name up to its first ``.``) and with ``'nonexp'`` or ``'exp'``
    depending on whether ``gene`` is in the notebook-level ``lne``. The
    combined table is written to
    ``dir_out_fig4/<feature_type>.<syn_nsyn>.df_all_forFDR.<run>.csv``.

    Reads the notebook-level names ``lne``, ``dir_anlyze`` and ``dir_out_fig4``.

    Parameters
    ----------
    feature_type : str
        Cohort grouping, e.g. ``'histology'``.
    run : str
        Run name, e.g. ``'cohort_090821'``.
    syn_nsyn : str
        Mutation class used in the file names, e.g. ``'syn'`` or ``'nsyn'``.

    Returns
    -------
    pandas.DataFrame
        Columns ``gene, p, q, feature, exp/nonexp``; all ``'exp'`` rows come
        first (in file order) followed by all ``'nonexp'`` rows. An empty
        frame is returned when the path list is empty.
    """
    fpkl = os.path.join(dir_anlyze, feature_type +'.'+syn_nsyn+'.sig_genes.pathlist.'+run+'.pkl')
    # NOTE: pickle can execute arbitrary code while loading; only open trusted files.
    with open(fpkl, 'rb') as fh:
        sig_pathlist = pickle.load(fh)

    keep_cols = ['gene','p','q','feature']
    lexp = []; lnonexp = []
    for fpath in sig_pathlist:
        df_res_feat = pd.read_csv(fpath,sep = '\t')
        # basename() instead of split('/') so the name survives other path separators
        df_res_feat['feature'] = os.path.basename(fpath).split('.')[0]
        is_nonexp = df_res_feat['gene'].isin(lne)
        lnonexp.append(df_res_feat.loc[is_nonexp, keep_cols].assign(**{'exp/nonexp': 'nonexp'}))
        lexp.append(df_res_feat.loc[~is_nonexp, keep_cols].assign(**{'exp/nonexp': 'exp'}))

    # Concatenate once: growing a frame inside the loop copies all earlier rows
    # each time, and seeding it with an empty DataFrame is unnecessary.
    frames = lexp + lnonexp
    df_all = pd.concat(frames, ignore_index = True) if frames else pd.DataFrame()

    # Save all p-val dataframe (create the directory if it is not there yet)
    os.makedirs(dir_out_fig4, exist_ok=True)
    df_all.to_csv(os.path.join(dir_out_fig4,feature_type+'.'+syn_nsyn+'.df_all_forFDR.'+run+'.csv'))

    return df_all
# Collect the synonymous p-values of the histology cohorts from run cohort_090821.
df = get_allpvals(feature_type='histology', run='cohort_090821', syn_nsyn='syn')

***After FDR calculation*** -- save processed dataframe for figures.py

In [27]:
# Load the significant-gene table written after the FDR calculation
feature_type = 'histology'
syn_nsyn = 'syn'
run = 'cohort_090821'
heatmap_csv = os.path.join(dir_out_fig4, feature_type+'.syn.df_all_forheatmap.'+run+'.csv')
df_syn = pd.read_csv(heatmap_csv, index_col=0)
# Replace the FDR column by -log10(FDR)
df_syn = df_syn.assign(FDR=-np.log10(df_syn['FDR']))

In [28]:
# Gene x feature matrix of the -log10(FDR) values
df_pivot_syn = df_syn.pivot_table(values='FDR', index='gene', columns='feature')
# Report every feature that has no column in the matrix (it is not added).
present_features = df_pivot_syn.columns
for feat in lfeat:
    if feat in present_features:
        continue
    print(f'No significant genes: {feat}')
# Count the non-null cells per gene, order genes by that count (highest first),
# then drop the helper column, which is the last one, from the ordered copy.
df_pivot_syn['n_sig_gene'] = df_pivot_syn.notnull().sum(axis=1)
df_pivot_gene_syn = df_pivot_syn.sort_values(by='n_sig_gene', ascending=False).iloc[:, :-1]

No significant genes: Ovary-AdenoCA
No significant genes: Liver-HCC
No significant genes: CNS-Oligo
No significant genes: Kidney-RCC
No significant genes: Thy-AdenoCA
No significant genes: ColoRect-AdenoCA
No significant genes: Lung-AdenoCA
No significant genes: CNS-Medullo
No significant genes: CNS-GBM
No significant genes: Kidney-ChRCC
No significant genes: Stomach-AdenoCA
No significant genes: Lung-SCC
No significant genes: Bladder-TCC
No significant genes: Myeloid-AML
No significant genes: Biliary-AdenoCA
No significant genes: Cervix-AdenoCA
No significant genes: Bone-Osteosarc
No significant genes: Breast-DCIS
No significant genes: Myeloid-MPN
No significant genes: Myeloid-MDS
No significant genes: Bone-Cart
No significant genes: Bone-Osteoblast
No significant genes: Bone-Epith
No significant genes: Bone-Benign


In [29]:
# df_pivot_gene_syn.to_csv(os.path.join(dir_out_figure, 'syn_forheatmap.csv'))

***
### Candidate number between synonymous and non-synonymous

In [9]:
cohorts = ['histology','organ','origin','system','pancancer']
dir_out = './figure4/'
dir_anlyze = '../mutsig_out/anlyze'

# NOTE: pickle can execute arbitrary code while loading; only open trusted files.
# Both path lists are read inside with-blocks so the file handles are closed.

#Load synonymous result
feature_type = 'histology';syn_nsyn = 'syn';run = 'cohort_072221';threshold =1;
with open(os.path.join(dir_anlyze, feature_type+'.'+syn_nsyn+'.sig_genes.pathlist.'+run+'.pkl'),'rb') as f_pkl:
    syn_pathlist = pickle.load(f_pkl)
# Feature name = file name up to its first '.'; basename() handles any path separator
lfeat_syn = [os.path.basename(i).split('.')[0] for i in syn_pathlist]
df_syn = pd.read_csv(os.path.join(dir_out,feature_type+'.syn.df_all_forheatmap.'+run+'.'+str(threshold)+'.csv'),\
                     index_col = 0)

#Load nonsynonymous result
feature_type = 'histology';syn_nsyn = 'nsyn';run = 'cohort_072221'
with open(os.path.join(dir_anlyze, feature_type+'.'+syn_nsyn+'.sig_genes.pathlist.'+run+'.pkl'),'rb') as f_pkl:
    nsyn_pathlist = pickle.load(f_pkl)
lfeat = [os.path.basename(i).split('.')[0] for i in nsyn_pathlist]
df_nsyn = pd.read_csv(os.path.join(dir_out,feature_type+'.nsyn.df_all_forheatmap.'+run+'.csv'),\
                      index_col = 0)

In [18]:
### Build dataframe for histology types | syn candidates | nonsyn candidates
# Count the rows per feature once with value_counts() instead of filtering the
# whole frame for every feature; reindex() aligns the counts to lfeat and gives
# features without rows a 0. The counts are integers (the original filled an
# object-dtype frame cell by cell and carried an unused 'np' column).
nsig_syn = df_syn['feature'].value_counts().reindex(lfeat, fill_value=0)
nsig_nsyn = df_nsyn['feature'].value_counts().reindex(lfeat, fill_value=0)
df = pd.DataFrame({'syn': nsig_syn.to_numpy(), 'nsyn': nsig_nsyn.to_numpy()}, index=lfeat)
# Long format: one row per (feature, syn/nsyn) pair
df = pd.melt(df.reset_index(), id_vars='index', var_name="syn.nsyn", value_name="nsig")
# Prefix each feature label with its count from dict_feat_np (the patient number);
# a feature missing from dict_feat_np raises KeyError, as before.
df['index'] = ['(n='+str(dict_feat_np[feat])+') '+feat for feat in df['index']]
# Name the label column after the feature type
df.columns = [feature_type, 'syn.nsyn','nsig']
# df.to_csv(os.path.join(dir_out, feature_type+'.syn_nsyn.'+run+'.nsig.csv'))