In [1]:
# This notebook visualizes the p-values across all tumors
# This notebook starts from the non-expressed gene mapping in the expression file
import pandas as pd
import os, pickle, shutil, time
from tqdm import tqdm
from collections import Counter

In [12]:
### Generate the label dictionary
dir_cohort = '../anno_ref/cohorts'
feature_type = 'histology'
df_feat = pd.read_csv(os.path.join(dir_cohort, f'{feature_type}.csv'))
# Number of rows per label, ordered from the smallest count to the largest
# (the sort is stable, so labels with equal counts keep their first-seen order).
dict_feat_np = dict(
    sorted(Counter(df_feat[feature_type]).items(), key=lambda pair: pair[1])
)
# Labels in that same ascending order
lfeat = list(dict_feat_np)

***
### Save sig_genes.txt paths

In [10]:
### For a feature type, collect the sig_genes.txt paths of the cohorts that ran successfully
dir_res = '../mutsig_out/'
dir_cohort = '../anno_ref/cohorts'
dir_anlyze = '../mutsig_out/anlyze'
def get_res_path(feature_type, run,  syn_nsyn = None, save = False):
    """Collect the ``*sig_genes.txt`` file of every cohort that produced one.

    Scans ``dir_res/<syn_nsyn>/<run>/<feature_type>/<cohort>/`` and keeps, per
    cohort directory, the first file (in sorted order) whose name ends with
    ``sig_genes.txt``. Cohort directories without such a file are skipped.

    Parameters
    ----------
    feature_type : str
        Sub-directory of the run to scan (e.g. ``'histology'``).
    run : str
        Run directory name (e.g. ``'cohort_072221'``).
    syn_nsyn : str
        Result sub-directory (e.g. ``'syn'`` or ``'nsyn'``). Required, although
        the signature keeps its historical ``None`` default.
    save : bool, default False
        When True, also pickle the list to
        ``dir_anlyze/<feature_type>.<syn_nsyn>.sig_genes.pathlist.<run>.pkl``.

    Returns
    -------
    list of str
        Paths of the files found, ordered by cohort directory name.

    Raises
    ------
    ValueError
        If ``syn_nsyn`` is not given.
    """
    if syn_nsyn is None:
        raise ValueError("syn_nsyn must be given (e.g. 'syn' or 'nsyn')")
    dir_res_feat = os.path.join(dir_res, syn_nsyn, run, feature_type)

    # Append the path of significant gene files into a list
    lsig_f = []
    # Sorted so the result does not depend on the file-system listing order
    for feat in sorted(os.listdir(dir_res_feat)):
        feat_dir = os.path.join(dir_res_feat, feat)
        if not os.path.isdir(feat_dir):
            # Stray files next to the cohort directories are ignored
            continue
        fsig = sorted(i for i in os.listdir(feat_dir) if i.endswith('sig_genes.txt'))
        if fsig:
            lsig_f.append(os.path.join(feat_dir, fsig[0]))

    if save:
        out_path = os.path.join(
            dir_anlyze, feature_type+'.'+syn_nsyn+'.sig_genes.pathlist.'+run+'.pkl')
        with open(out_path, 'wb') as fh:
            pickle.dump(lsig_f, fh)
    return lsig_f

In [17]:
# features = ['histology','organ','origin','system','pancancer']
# Scan the 'syn' results of run 'cohort_072221' for the histology cohorts.
get_res_path('histology', 'cohort_072221', 'syn')

***
### Get failed cohort

In [18]:
### path
# Root directory of the result folders scanned in this notebook
dir_res = '../mutsig_out/'
# Directory holding one '<feature_type>.csv' cohort table per feature type
dir_cohort = '../anno_ref/cohorts'
# Directory holding the pickled sig_genes path lists
dir_anlyze = '../mutsig_out/anlyze'

In [19]:
def get_failed_cohort(feature_type,run, syn_nsyn):
    """Report the cohorts of a feature type that have no sig_genes result.

    Reads ``dir_cohort/<feature_type>.csv`` and the pickled path list
    ``dir_anlyze/<feature_type>.<syn_nsyn>.sig_genes.pathlist.<run>.pkl``.
    A cohort counts as calculated when some path in the list has a file name
    whose part before the first ``'.'`` equals the cohort label.

    Parameters
    ----------
    feature_type : str
        Column of the cohort table and stem of its file name (e.g. ``'histology'``).
    run : str
        Run name used in the path-list file name.
    syn_nsyn : str
        Result kind used in the path-list file name (e.g. ``'syn'``).

    Returns
    -------
    list
        Cohort labels from the table that have no matching result file, in
        order of first appearance. The list is also printed.
    """
    # Read feat info df
    df_feat = pd.read_csv(os.path.join(dir_cohort, feature_type+'.csv'))

    # Read sig file path list; the context manager closes the file after loading.
    # NOTE: pickle must only be used on files produced by this project.
    pathlist_file = os.path.join(
        dir_anlyze, feature_type+'.'+syn_nsyn+'.sig_genes.pathlist.'+run+'.pkl')
    with open(pathlist_file, 'rb') as fh:
        pathlist = pickle.load(fh)

    # Calculated features (set -> constant-time membership checks below)
    feat_calced = {os.path.basename(i).split('.')[0] for i in pathlist}

    # Features without a result; presumably they failed because of too few
    # patients (as stated by the original author) -- not verified here.
    feat_failed = [i for i in df_feat[feature_type].unique() if i not in feat_calced]
    print(f'The below feats failed{feat_failed}')
    return feat_failed
    
# NOTE(review): the commented-out calls below pass two arguments, but
# get_failed_cohort takes (feature_type, run, syn_nsyn); add the run name before re-enabling them.
# features = ['histology','organ','origin','system','pancancer']
# for feature in features:
#     get_failed_cohort(feature, 'nsyn')
# get_failed_cohort('histology','nsyn')
get_failed_cohort('histology','cohort_072221','syn')

The below feats failed['Cervix-AdenoCA', 'Breast-DCIS', 'Myeloid-MDS', 'Bone-Cart', 'Bone-Osteoblast', 'Bone-Benign']


### Find badx genes - should print nothing

In [20]:
dir_res = '../mutsig_out/'
def get_badx_genes(feature_type, run, syn_nsyn = None):
    """Print the content of every non-empty ``*sig_genes.txt.badx.txt`` file.

    Scans ``dir_res/<syn_nsyn>/<run>/<feature_type>/<cohort>/``. For each cohort
    directory the first file (in sorted order) ending with
    ``sig_genes.txt.badx.txt`` is inspected; when it is not empty, a header line
    naming the cohort is printed, followed by the file's lines.

    Parameters
    ----------
    feature_type : str
        Sub-directory of the run to scan (e.g. ``'histology'``).
    run : str
        Run directory name.
    syn_nsyn : str
        Result sub-directory (e.g. ``'syn'``). Required, although the signature
        keeps its historical ``None`` default.

    Returns
    -------
    None
        Output is printed only; nothing is printed when every file is empty.

    Raises
    ------
    ValueError
        If ``syn_nsyn`` is not given.
    """
    if syn_nsyn is None:
        raise ValueError("syn_nsyn must be given (e.g. 'syn' or 'nsyn')")
    dir_res_feat = os.path.join(dir_res, syn_nsyn, run, feature_type)

    # Sorted so the report order does not depend on the file-system listing order
    for feat in sorted(os.listdir(dir_res_feat)):
        feat_dir = os.path.join(dir_res_feat, feat)
        if not os.path.isdir(feat_dir):
            # Stray files next to the cohort directories are ignored
            continue
        fbadx = sorted(i for i in os.listdir(feat_dir)
                       if i.endswith('sig_genes.txt.badx.txt'))
        if not fbadx:
            continue
        fbadx_path = os.path.join(feat_dir, fbadx[0])
        if os.stat(fbadx_path).st_size != 0:
            print(f'{feat} has low quality genes')
            with open(fbadx_path, 'r') as f:
                for line in f:
                    # Strip the file's own newline so lines are not double-spaced
                    print(line.rstrip('\n'))

# Check the 'syn' results of run 'cohort_072221' for the histology cohorts
# (per the section heading, this is expected to print nothing).
get_badx_genes('histology','cohort_072221','syn')

***
### Find and save common non-exp genes in all tumor types
### Modify to tumor-specific

In [2]:
### Filepath
exp_dir = '../anno_ref/ICGC/pcawg_rnaseq/'
gene_tophat = 'tophat_star_fpkm.v2.aliquot_gl.tsv'
gene_tophatuq = 'tophat_star_fpkm_uq.v2_aliquot_gl.tsv'

# Out dir 
dir_out = './figure4/'

In [6]:
# Aliquot id information (tab-separated metadata table)
df_exp_info = pd.read_csv(
    os.path.join(exp_dir, 'rnaseq.metadata.tsv'),
    sep='\t',
)

# Expression information (tab-separated; the first column becomes the index)
# df_exp_uq = pd.read_csv(os.path.join(exp_dir,gene_tophatuq),sep = '\t', index_col = 0)
df_exp = pd.read_csv(
    os.path.join(exp_dir, gene_tophat),
    sep='\t',
    index_col=0,
)

In [27]:
# Metadata rows flagged as tumor samples
df_tumor_exp_info = df_exp_info.loc[df_exp_info['tumor.normal'].eq('tumor')]
# Distinct histology labels among them, in order of first appearance
histologies = df_tumor_exp_info['histology_abbreviation'].unique().tolist()
print(f'{len(histologies)} have expression information, other cohorts use the common non-expression genes(intersection) from these cohorts.')

27 have expression information, other cohorts use the common non-expression genes(intersection) from these cohorts.


In [46]:
### Build the histology -> non-expressed genes dictionary (threshold: 1).
### A gene is kept when its value is below 1 in every tumor aliquot of the histology.
dict_nexp = {}
lnexp = []
for his in histologies:
    # Tumor aliquots annotated with the current histology
    df_his_info = df_tumor_exp_info.loc[df_tumor_exp_info['histology_abbreviation'].eq(his)]
    ltumor_id = df_his_info['aliquot_id'].unique().tolist()
    df_exp_tumor = df_exp[ltumor_id]
    # Row-wise test; a missing value compares as False, so such rows are excluded too
    all_below = (df_exp_tumor < 1).all(axis=1)
    nonexp_genes = df_exp_tumor.loc[all_below].index.tolist()
    print(len(nonexp_genes))
    dict_nexp[his] = nonexp_genes
    lnexp.append(nonexp_genes)

36812
39337
31324
39416
37797
43318
35295
37252
36650
29812
31786
34411
30949
29003
40040
35966
29609
36221
34587
31464
38451
35111
43395
34858
34233
39379
32568


In [37]:
# One row per histology (orient='index'); gene lists of different lengths are padded with missing values
df_nexp = pd.DataFrame.from_dict(dict_nexp, orient='index')

In [44]:
# Genes present in every per-histology list (intersection across all of them);
# requires at least one list in lnexp.
result = set(lnexp[0]).intersection(*lnexp[1:])

In [55]:
# Write the Lymph-BNHL non-expressed gene ids: a 'gene' header, then one id per line
with open(os.path.join(dir_out,'nonexp_id_to_name','nonexp-ids_Lymph-BNHL.csv'), 'w') as f:
    f.write('gene'+'\n')
    f.writelines(gene_id + '\n' for gene_id in dict_nexp['Lymph-BNHL'])

In [39]:
# Display with one column per histology
df_nexp.transpose()

Unnamed: 0,CNS-GBM,CNS-Oligo,Breast-AdenoCA,Breast-LobularCA,Cervix-SCC,Cervix-AdenoCA,ColoRect-AdenoCA,Head-SCC,Thy-AdenoCA,Kidney-RCC,...,Bladder-TCC,Lymph-CLL,Lymph-BNHL,Kidney-ChRCC,Biliary-AdenoCA,Lymph-NOS,Bone-Leiomyo,Stomach-AdenoCA,Eso-AdenoCA,Panc-AdenoCA
0,ENSG00000001626.10,ENSG00000000005.5,ENSG00000002079.8,ENSG00000000005.5,ENSG00000000005.5,ENSG00000000005.5,ENSG00000002079.8,ENSG00000000005.5,ENSG00000000005.5,ENSG00000002079.8,...,ENSG00000000005.5,ENSG00000000003.10,ENSG00000002079.8,ENSG00000000460.12,ENSG00000000005.5,ENSG00000000005.5,ENSG00000001626.10,ENSG00000000005.5,ENSG00000000005.5,ENSG00000000005.5
1,ENSG00000002079.8,ENSG00000001626.10,ENSG00000002745.8,ENSG00000001626.10,ENSG00000002745.8,ENSG00000001626.10,ENSG00000002746.10,ENSG00000001626.10,ENSG00000001626.10,ENSG00000002745.8,...,ENSG00000002079.8,ENSG00000000005.5,ENSG00000003987.9,ENSG00000002079.8,ENSG00000002745.8,ENSG00000001626.10,ENSG00000002079.8,ENSG00000002745.8,ENSG00000002079.8,ENSG00000002745.8
2,ENSG00000002726.15,ENSG00000002079.8,ENSG00000004846.12,ENSG00000002079.8,ENSG00000002746.10,ENSG00000002079.8,ENSG00000003987.9,ENSG00000002746.10,ENSG00000002079.8,ENSG00000004809.9,...,ENSG00000002746.10,ENSG00000000971.11,ENSG00000004846.12,ENSG00000002746.10,ENSG00000002746.10,ENSG00000002079.8,ENSG00000002726.15,ENSG00000002746.10,ENSG00000002746.10,ENSG00000004809.9
3,ENSG00000004809.9,ENSG00000002726.15,ENSG00000004939.9,ENSG00000002745.8,ENSG00000003987.9,ENSG00000002745.8,ENSG00000004809.9,ENSG00000004846.12,ENSG00000004809.9,ENSG00000006059.3,...,ENSG00000003987.9,ENSG00000001617.7,ENSG00000004948.9,ENSG00000004468.8,ENSG00000003987.9,ENSG00000002745.8,ENSG00000002746.10,ENSG00000003987.9,ENSG00000003096.9,ENSG00000004846.12
4,ENSG00000004846.12,ENSG00000004809.9,ENSG00000005421.4,ENSG00000002746.10,ENSG00000004139.9,ENSG00000002746.10,ENSG00000004846.12,ENSG00000004848.6,ENSG00000004846.12,ENSG00000006116.3,...,ENSG00000004809.9,ENSG00000001626.10,ENSG00000005001.5,ENSG00000004809.9,ENSG00000004809.9,ENSG00000002746.10,ENSG00000004939.9,ENSG00000004809.9,ENSG00000003987.9,ENSG00000004939.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43390,,,,,,,,,,,...,,,,,,ENSGR0000264819.1,,,,
43391,,,,,,,,,,,...,,,,,,ENSGR0000265350.1,,,,
43392,,,,,,,,,,,...,,,,,,ENSGR0000265658.1,,,,
43393,,,,,,,,,,,...,,,,,,ENSGR0000266731.1,,,,


In [7]:
### Find genes not expressed in any tumor sample (value < threshold in every tumor aliquot)
def get_nexp_genes(run = None, threshold = None):
    """Return the genes whose expression is below ``threshold`` in all tumor aliquots.

    Uses the notebook globals ``df_exp_info`` (metadata with the columns
    ``'tumor.normal'`` and ``'aliquot_id'``) and ``df_exp`` (expression table,
    genes in the index, aliquot ids as columns). In this notebook ``df_exp`` is
    loaded from the FPKM table, not from the FPKM-UQ one.

    Parameters
    ----------
    run : str, optional
        Not used by the computation; kept so existing calls keep working
        (it only appears in the commented-out save code).
    threshold : number
        Exclusive upper bound. Required, although the signature keeps its
        historical ``None`` default.

    Returns
    -------
    list
        Index labels of ``df_exp`` whose values are below ``threshold`` for
        every tumor aliquot. Rows with a missing value are never returned.

    Raises
    ------
    ValueError
        If ``threshold`` is not given.
    """
    if threshold is None:
        raise ValueError('threshold must be a number (e.g. threshold=1)')

    # Only tumor aliquots take part in the filter; ids are de-duplicated so a
    # repeated metadata row does not select the same column twice.
    df_tumor_exp_info = df_exp_info[df_exp_info['tumor.normal']=='tumor']
    ltumor_id = df_tumor_exp_info['aliquot_id'].unique().tolist()
    df_exp_tumor = df_exp[ltumor_id]

    # A missing value compares as False, so such rows are excluded
    all_below = (df_exp_tumor < threshold).all(axis=1)
    nonexp_genes = df_exp_tumor.loc[all_below].index.tolist()

    return nonexp_genes
    
    # Save the gene list to convert gene id to gene name
#     with open(os.path.join(dir_out,'nonexp_id_to_name','nonexp-ids_'+run+'_'+str(threshold)+'.csv'), 'w') as f:
#         f.write('gene'+'\n')
#         for genes in nonexp_genes:
#             f.write(genes+'\n')

In [8]:
# Genes below the threshold of 1 in all tumor aliquots
genes = get_nexp_genes(run = '072221', threshold = 1)

In [16]:
# Print the index entries of the expression table that start with ENSG00000163092
for gene in df_exp.index:
    if not gene.startswith('ENSG00000163092'):
        continue
    print(gene)

ENSG00000163092.15


In [5]:
# Same call as above without assignment, so the notebook displays the returned list
get_nexp_genes(run = '072221', threshold = 1)

The gene IDs of the non-expressed genes were converted to gene names in R.  
The R conversion is currently kept in a Dropbox folder and will be uploaded later.

**Test tumor specific**

In [56]:
dir_name = './figure4/nonexp_id_to_name/out'
df_ne_gene = pd.read_csv(os.path.join(dir_name,'nonexp-names_Lymph-BNHL.csv'))
lne = df_ne_gene['genes']

# Read all gene list; the context manager closes the pickle file after loading
with open('../anno_ref/proc_refs/gene_name_list_062121.pkl','rb') as fh:
    lgene = pickle.load(fh)
print(f'There are total {len(set(lgene).intersection(set(lne)))} non-expressed genes out of {len(lgene)} genes')

There are total 3841 non-expressed genes out of 19225 genes


In [57]:
# Feature types referenced by the (commented-out) loops in this notebook
cohort = ['histology','organ','origin','system','pancancer']
# Directory holding the pickled sig_genes path lists
dir_anlyze = '../mutsig_out/anlyze'
# Output directory for the tables written below
dir_out = './figure4/'

In [58]:
### Get pvals -- input for FDR calculation
def get_allpvals(feature_type,run, syn_nsyn, threshold):
    """Gather p/q values of all cohorts, labelled as expressed or non-expressed.

    Uses the notebook globals ``dir_anlyze`` (location of the pickled path
    list), ``lne`` (names of the non-expressed genes) and ``dir_out`` (output
    directory). Each file in the path list is read as a tab-separated table
    that must contain the columns ``gene``, ``p`` and ``q``; the cohort label is
    the part of the file name before the first ``'.'``.

    Parameters
    ----------
    feature_type, run, syn_nsyn : str
        Select the path list
        ``<feature_type>.<syn_nsyn>.sig_genes.pathlist.<run>.pkl``.
    threshold
        Accepted for signature compatibility; not used by this variant, which
        takes the non-expressed genes from the global ``lne``.

    Returns
    -------
    pandas.DataFrame
        Columns ``gene, p, q, feature, exp/nonexp``. Rows of expressed genes
        come first (cohort by cohort), followed by the non-expressed ones. The
        frame is also written to
        ``dir_out/<feature_type>.<syn_nsyn>.df_all_forFDR.Lymph-BNHL.csv``.
    """
    cols = ['gene','p','q','feature']

    pathlist_file = os.path.join(
        dir_anlyze, feature_type +'.'+syn_nsyn+'.sig_genes.pathlist.'+run+'.pkl')
    with open(pathlist_file, 'rb') as fh:
        sig_pathlist = pickle.load(fh)

    # Collect the pieces and concatenate once instead of growing a frame per file
    exp_frames = []
    ne_frames = []
    for fpath in sig_pathlist:
        df_res_feat = pd.read_csv(fpath,sep = '\t')
        df_res_feat['feature'] = os.path.basename(fpath).split('.')[0]
        is_nonexp = df_res_feat['gene'].isin(lne)
        ne_frames.append(df_res_feat.loc[is_nonexp, cols].assign(**{'exp/nonexp': 'nonexp'}))
        exp_frames.append(df_res_feat.loc[~is_nonexp, cols].assign(**{'exp/nonexp': 'exp'}))

    frames = exp_frames + ne_frames
    if frames:
        df_all = pd.concat(frames, ignore_index = True)
    else:
        # Empty path list: return an empty frame that still has the expected columns
        df_all = pd.DataFrame(columns = cols + ['exp/nonexp'])

#     Save all p-val dataframe
    df_all.to_csv(os.path.join(dir_out,feature_type+'.'+syn_nsyn+'.df_all_forFDR.Lymph-BNHL.csv'))
    return df_all

# NOTE(review): the commented-out loop passes two arguments, but get_allpvals
# takes (feature_type, run, syn_nsyn, threshold).
# for feat in cohort:
#     get_allpvals(feat,'syn')
# The last argument ('1') is not used by the definition in this cell.
df = get_allpvals('histology','cohort_072221','syn', '1')

**End Lymph-BNHL test**

### Get p-val's from all feats -- For FDR  

In [17]:
### Read non-expressed gene name file
threshold = 1
dir_name = './figure4/nonexp_id_to_name/out'
df_ne_gene = pd.read_csv(os.path.join(dir_name,'nonexp-names_072221_'+str(threshold)+'.csv'))
lne = df_ne_gene['genes']

# Read all gene list; the context manager closes the pickle file after loading
with open('../anno_ref/proc_refs/gene_name_list_062121.pkl','rb') as fh:
    lgene = pickle.load(fh)
print(f'There are total {len(set(lgene).intersection(set(lne)))} non-expressed genes out of {len(lgene)} genes')

cohort = ['histology','organ','origin','system','pancancer']
dir_anlyze = '../mutsig_out/anlyze'
dir_out = './figure4/'

There are total 1057 non-expressed genes out of 19225 genes


In [19]:
### Get pvals -- input for FDR calculation
def get_allpvals(feature_type,run, syn_nsyn, threshold, save = False):
    """Gather p/q values of all cohorts, labelled as expressed or non-expressed.

    The non-expressed gene names are read from
    ``./figure4/nonexp_id_to_name/out/nonexp-names_072221_<threshold>.csv``
    (column ``genes``); note that the ``072221`` part of that name is fixed and
    does not follow ``run``. The pickled path list is read from the notebook
    global ``dir_anlyze``. Each listed file is read as a tab-separated table
    that must contain the columns ``gene``, ``p`` and ``q``; the cohort label is
    the part of the file name before the first ``'.'``.

    Parameters
    ----------
    feature_type, run, syn_nsyn : str
        Select the path list
        ``<feature_type>.<syn_nsyn>.sig_genes.pathlist.<run>.pkl``.
    threshold : str or number
        Suffix of the non-expressed gene file to use.
    save : bool, default False
        When True, write the result to
        ``dir_out/<feature_type>.<syn_nsyn>.df_all_forFDR.<run>.<threshold>.csv``
        (``dir_out`` is a notebook global).

    Returns
    -------
    pandas.DataFrame
        Columns ``gene, p, q, feature, exp/nonexp``. Rows of expressed genes
        come first (cohort by cohort), followed by the non-expressed ones.
    """
    cols = ['gene','p','q','feature']

    # Read nonexpressed gene file
    dir_name = './figure4/nonexp_id_to_name/out'
    df_ne_gene = pd.read_csv(os.path.join(dir_name,'nonexp-names_072221_'+str(threshold)+'.csv'))
    lne = set(df_ne_gene['genes'])

    pathlist_file = os.path.join(
        dir_anlyze, feature_type +'.'+syn_nsyn+'.sig_genes.pathlist.'+run+'.pkl')
    with open(pathlist_file, 'rb') as fh:
        sig_pathlist = pickle.load(fh)

    # Collect the pieces and concatenate once instead of growing a frame per file
    exp_frames = []
    ne_frames = []
    for fpath in sig_pathlist:
        df_res_feat = pd.read_csv(fpath,sep = '\t')
        df_res_feat['feature'] = os.path.basename(fpath).split('.')[0]
        is_nonexp = df_res_feat['gene'].isin(lne)
        ne_frames.append(df_res_feat.loc[is_nonexp, cols].assign(**{'exp/nonexp': 'nonexp'}))
        exp_frames.append(df_res_feat.loc[~is_nonexp, cols].assign(**{'exp/nonexp': 'exp'}))

    frames = exp_frames + ne_frames
    if frames:
        df_all = pd.concat(frames, ignore_index = True)
    else:
        # Empty path list: return an empty frame that still has the expected columns
        df_all = pd.DataFrame(columns = cols + ['exp/nonexp'])

    # Save all p-val dataframe (opt-in); str() keeps numeric thresholds working
    if save:
        df_all.to_csv(os.path.join(
            dir_out, feature_type+'.'+syn_nsyn+'.df_all_forFDR.'+run+'.'+str(threshold)+'.csv'))
    return df_all

# NOTE(review): the commented-out loop passes two arguments, but get_allpvals
# takes (feature_type, run, syn_nsyn, threshold).
# for feat in cohort:
#     get_allpvals(feat,'syn')
# NOTE(review): the second call overwrites `df`, so only the threshold-'1' frame is kept.
df = get_allpvals('histology','cohort_072221','syn', '5')
df = get_allpvals('histology','cohort_072221','syn', '1')

***
### FDR calculation

***
### Get nsyn q-value for heatmap -- in figure3-anlyze

***
### Candidate number between synonymous and non-synonymous

In [9]:
cohorts = ['histology','organ','origin','system','pancancer']
dir_out = './figure4/'
dir_anlyze = '../mutsig_out/anlyze'

# Settings shared by both loads below
feature_type = 'histology'
run = 'cohort_072221'
threshold = 1

#Load synonymous result
syn_nsyn = 'syn'
# The context manager closes the pickle file after loading
with open(os.path.join(dir_anlyze, feature_type+'.'+syn_nsyn+'.sig_genes.pathlist.'+run+'.pkl'),'rb') as fh:
    syn_pathlist = pickle.load(fh)
# Cohort label = part of the file name before the first '.'
lfeat_syn = [os.path.basename(i).split('.')[0] for i in syn_pathlist]
df_syn = pd.read_csv(os.path.join(dir_out,feature_type+'.syn.df_all_forheatmap.'+run+'.'+str(threshold)+'.csv'),
                     index_col = 0)

#Load nonsynonymous result
syn_nsyn = 'nsyn'
with open(os.path.join(dir_anlyze, feature_type+'.'+syn_nsyn+'.sig_genes.pathlist.'+run+'.pkl'),'rb') as fh:
    nsyn_pathlist = pickle.load(fh)
lfeat = [os.path.basename(i).split('.')[0] for i in nsyn_pathlist]
# Unlike the synonymous table, this file name carries no threshold suffix
df_nsyn = pd.read_csv(os.path.join(dir_out,feature_type+'.nsyn.df_all_forheatmap.'+run+'.csv'),
                      index_col = 0)

In [18]:
###  Build dataframe for histology types |syn candidates |nonsyn candidates
# Count the rows per cohort once instead of filtering the frames for every cohort
syn_counts = df_syn['feature'].value_counts()
nsyn_counts = df_nsyn['feature'].value_counts()

# Long format: all 'syn' rows first, then all 'nsyn' rows, each in lfeat order.
# The cohort label is prefixed with its patient number taken from dict_feat_np.
records = []
for kind, counts in (('syn', syn_counts), ('nsyn', nsyn_counts)):
    for feat in lfeat:
        records.append({
            feature_type: f'(n={dict_feat_np[feat]}) {feat}',
            'syn.nsyn': kind,
            # Cohorts absent from a table have zero candidates
            'nsig': int(counts.get(feat, 0)),
        })
df = pd.DataFrame(records, columns = [feature_type, 'syn.nsyn', 'nsig'])
# df.to_csv(os.path.join(dir_out, feature_type+'.syn_nsyn.'+run+'.nsig.csv'))