In [1]:
# This notebook visualizes the p-values across all tumors.
# It starts from the mapping of non-expressed genes in the expression file.
import pandas as pd
import os, pickle, shutil, time
from tqdm import tqdm
from collections import Counter

In [2]:
### Build the label dictionary: label -> number of rows carrying that label,
### ordered by ascending count
dir_cohort = '../anno_ref/cohorts'
feature_type = 'histology'
df_feat = pd.read_csv(os.path.join(dir_cohort, f'{feature_type}.csv'))
# sorted() is stable, so labels with equal counts keep their first-seen order
dict_feat_np = dict(sorted(Counter(df_feat[feature_type]).items(), key=lambda kv: kv[1]))
# Labels in the same (ascending count) order
lfeat = [*dict_feat_np]

***
### Save sig_genes.txt paths

In [10]:
### For a feature type, collect the sig_genes.txt path of every cohort that produced one
dir_res = '../mutsig_out/'
dir_cohort = '../anno_ref/cohorts'
dir_anlyze = '../mutsig_out/anlyze'
def get_res_path(feature_type, run,  syn_nsyn = None, save = False):
    """Return the paths of the '*sig_genes.txt' files found for one run.

    The directory ``dir_res/syn_nsyn/run/feature_type`` is expected to hold one
    sub-directory per cohort. For every sub-directory that contains a file whose
    name ends with 'sig_genes.txt', the path of that file is collected.

    Parameters
    ----------
    feature_type : str
        Name of the feature-type directory (e.g. 'histology').
    run : str
        Name of the run directory (e.g. 'cohort_072221').
    syn_nsyn : str
        Name of the directory level directly below ``dir_res`` (e.g. 'syn').
        Required despite the ``None`` default.
    save : bool, default False
        When True, also pickle the list to
        ``dir_anlyze/<feature_type>.<syn_nsyn>.sig_genes.pathlist.<run>.pkl``,
        the file name the other functions in this notebook read.

    Returns
    -------
    list of str
        Paths in sorted cohort-directory order.

    Raises
    ------
    TypeError
        If ``syn_nsyn`` is not given.
    """
    if syn_nsyn is None:
        raise TypeError("syn_nsyn is required, e.g. 'syn' or 'nsyn'")
    dir_res_feat = os.path.join(dir_res,syn_nsyn,run,feature_type)

    # Append the path of significant gene files into a list.
    # Listings are sorted so the result does not depend on file-system order.
    lsig_f = []
    for feat in sorted(os.listdir(dir_res_feat)):
        dir_feat = os.path.join(dir_res_feat, feat)
        # Stray files next to the cohort directories would make os.listdir fail
        if not os.path.isdir(dir_feat):
            continue
        fsig = sorted(i for i in os.listdir(dir_feat) if i.endswith('sig_genes.txt'))
        if fsig:
            lsig_f.append(os.path.join(dir_feat, fsig[0]))

    if save:
        fpkl = os.path.join(dir_anlyze, \
                feature_type+'.'+syn_nsyn+'.sig_genes.pathlist.'+run+'.pkl')
        with open(fpkl, 'wb') as f:
            pickle.dump(lsig_f, f)
    return lsig_f

In [17]:
# Scan the result directories of the 'syn' / 'cohort_072221' run for the 'histology' feature type.
# Candidate feature types, kept for reference:
# features = ['histology','organ','origin','system','pancancer']
get_res_path('histology', 'cohort_072221', 'syn')

***
### Get failed cohort

In [18]:
### Paths used by the cells below
# Root of the result tree; the functions in this notebook join it with syn_nsyn / run / feature_type
dir_res = '../mutsig_out/'
# Directory holding one <feature_type>.csv cohort table per feature type
dir_cohort = '../anno_ref/cohorts'
# Directory the '<feature_type>.<syn_nsyn>.sig_genes.pathlist.<run>.pkl' files are read from
dir_anlyze = '../mutsig_out/anlyze'

In [19]:
def get_failed_cohort(feature_type,run, syn_nsyn):
    """Print and return the cohort labels that have no sig_genes file for a run.

    Reads the cohort table ``dir_cohort/<feature_type>.csv`` and the pickled
    path list ``dir_anlyze/<feature_type>.<syn_nsyn>.sig_genes.pathlist.<run>.pkl``.
    A label counts as calculated when some path in the list has a file name
    whose part before the first '.' equals the label.

    Parameters
    ----------
    feature_type : str
        Name of the cohort table and of its label column (e.g. 'histology').
    run : str
        Run name used in the pickle file name (e.g. 'cohort_072221').
    syn_nsyn : str
        Mutation-class part of the pickle file name (e.g. 'syn').

    Returns
    -------
    list
        Labels of the cohort table that are missing from the path list, in
        first-seen order.
    """
    # Read feat info df
    df_feat = pd.read_csv(os.path.join(dir_cohort, feature_type+'.csv'))

    # Read sig file path list.
    # NOTE: pickle can execute arbitrary code; only load files written by this project.
    fpkl = os.path.join(dir_anlyze, feature_type+'.'+syn_nsyn+'.sig_genes.pathlist.'+run+'.pkl')
    with open(fpkl, 'rb') as f:
        pathlist = pickle.load(f)

    # Calculated features (a set makes the membership test below O(1))
    feat_calced = {os.path.basename(i).split('.')[0] for i in pathlist}

    # The original author attributes these failures to too few patients
    feat_failed = [i for i in df_feat[feature_type].unique() if i not in feat_calced ]
    print(f'The below feats failed{feat_failed}')
    return feat_failed

# features = ['histology','organ','origin','system','pancancer']
# for feature in features:
#     get_failed_cohort(feature, 'cohort_072221', 'nsyn')
# Assigned so the returned list is not echoed a second time below the printed line
feat_failed = get_failed_cohort('histology','cohort_072221','syn')

The below feats failed['Cervix-AdenoCA', 'Breast-DCIS', 'Myeloid-MDS', 'Bone-Cart', 'Bone-Osteoblast', 'Bone-Benign']


### Find badx genes - should print nothing

In [20]:
dir_res = '../mutsig_out/'
def get_badx_genes(feature_type, run, syn_nsyn = None):
    """Print the content of every non-empty '*sig_genes.txt.badx.txt' file of a run.

    The directory ``dir_res/syn_nsyn/run/feature_type`` is expected to hold one
    sub-directory per cohort. For each sub-directory with a non-empty file whose
    name ends with 'sig_genes.txt.badx.txt', a header line naming the cohort is
    printed followed by the lines of that file. Nothing is printed when all such
    files are empty or absent.

    Parameters
    ----------
    feature_type : str
        Name of the feature-type directory (e.g. 'histology').
    run : str
        Name of the run directory (e.g. 'cohort_072221').
    syn_nsyn : str
        Name of the directory level directly below ``dir_res`` (e.g. 'syn').
        Required despite the ``None`` default.

    Returns
    -------
    None

    Raises
    ------
    TypeError
        If ``syn_nsyn`` is not given.
    """
    if syn_nsyn is None:
        raise TypeError("syn_nsyn is required, e.g. 'syn' or 'nsyn'")
    dir_res_feat = os.path.join(dir_res,syn_nsyn,run,feature_type)

    # Visit the cohort directories in sorted order so the report is reproducible
    for feat in sorted(os.listdir(dir_res_feat)):
        dir_feat = os.path.join(dir_res_feat, feat)
        # Stray files next to the cohort directories would make os.listdir fail
        if not os.path.isdir(dir_feat):
            continue
        fbadx = sorted(i for i in os.listdir(dir_feat) if i.endswith('sig_genes.txt.badx.txt'))
        if not fbadx:
            continue
        fbadx_path = os.path.join(dir_feat, fbadx[0])
        if os.stat(fbadx_path).st_size != 0:
            print(f'{feat} has low quality genes')
            with open(fbadx_path, 'r') as f:
                for line in f:
                    # The line already ends with a newline; strip it to avoid blank lines
                    print(line.rstrip('\n'))

# Per the section heading, a clean run is expected to print nothing here
get_badx_genes('histology','cohort_072221','syn')

***
### Find and save common non-exp genes in all tumor types
### Modify to tumor-specific

In [6]:
### Filepath
# Directory and file names of the expression tables; only gene_tophat is read in this notebook,
# the reader for gene_tophatuq is commented out
exp_dir = '../anno_ref/ICGC/pcawg_rnaseq/'
gene_tophat = 'tophat_star_fpkm.v2.aliquot_gl.tsv'
gene_tophatuq = 'tophat_star_fpkm_uq.v2_aliquot_gl.tsv'

# Cohort table of the chosen feature type and its distinct labels (first-seen order)
dir_cohort = '../anno_ref/cohorts'
feature_type = 'histology'
df_feat = pd.read_csv(os.path.join(dir_cohort,feature_type+'.csv'))
# Use feature_type for the column too, so file and column cannot drift apart
lfeat = df_feat[feature_type].unique().tolist()

# Out dir 
dir_out = './figure4/'

In [29]:
# Read aliquot id information
# (tab-separated metadata; other cells read its 'tumor.normal', 'histology_abbreviation'
#  and 'aliquot_id' columns)
df_exp_info = pd.read_csv(os.path.join(exp_dir,'rnaseq.metadata.tsv'), sep = '\t')

# Read expression information
# (tab-separated; the first column becomes the index, and other cells select columns by
#  aliquot id and read gene ids from the index)
# df_exp_uq = pd.read_csv(os.path.join(exp_dir,gene_tophatuq),sep = '\t', index_col = 0)
df_exp = pd.read_csv(os.path.join(exp_dir,gene_tophat),sep = '\t', index_col = 0)

In [30]:
# Keep only the metadata rows flagged as tumor, then list the distinct histologies
# (first-seen order) that those rows cover
df_tumor_exp_info = df_exp_info.loc[df_exp_info['tumor.normal'] == 'tumor']
histologies = df_tumor_exp_info.loc[:, 'histology_abbreviation'].unique().tolist()
print(f'{len(histologies)} have expression information, other cohorts use the common non-expression genes(intersection) from these cohorts.')

27 have expression information, other cohorts use the common non-expression genes(intersection) from these cohorts.


In [54]:
### make histology-nonexpressed genes dictionary
# A gene is non-expressed in a histology when its value is below the cutoff in every
# tumor sample of that histology.
nexp_cutoff = 1
dict_nexp = {}; lnexp = []
for his in histologies:
    df_his_info = df_tumor_exp_info[df_tumor_exp_info['histology_abbreviation'] == his]
    ltumor_id = df_his_info['aliquot_id'].unique().tolist()
    df_exp_tumor = df_exp[ltumor_id]
    # Selects the same rows as df_exp_tumor[df_exp_tumor<cutoff].dropna(): a row survives
    # only if every value passes (NaN never passes), without building a masked copy
    all_below = (df_exp_tumor < nexp_cutoff).all(axis=1)
    nonexp_genes = df_exp_tumor.index[all_below.to_numpy()].tolist()
    dict_nexp[his] = nonexp_genes
    lnexp.append(nonexp_genes)
# Genes non-expressed in every histology with expression data (empty when there is none)
result = set(lnexp[0]).intersection(*lnexp[1:]) if lnexp else set()
# Set iteration order of strings changes between kernel sessions; sort for reproducible lists
common_nexp = sorted(result)
# For features that don't have expression information, use the intersection of nonexpressed genes
for feat in lfeat:
    if feat not in dict_nexp:
        dict_nexp[feat] = list(common_nexp)

In [55]:
# for keys in dict_nexp: 
#     with open(os.path.join(dir_out,'nonexp_id_to_name','histology-specific_1',keys+'.csv'), 'w') as f:
#         f.write('gene'+'\n')
#         for genes in dict_nexp[keys]:
#             f.write(genes+'\n')

In [37]:
# Number of genes recorded as non-expressed for CNS-GBM
len(dict_nexp['CNS-GBM'])

32531

Old approach: non-expressed genes common to all tumor samples

In [7]:
### Find genes not expressed in all tumor samples (value in df_exp below `threshold`)
def get_nexp_genes(run = None, threshold = None):
    """Return the genes whose value is below ``threshold`` in every tumor sample.

    Reads two notebook-level tables: ``df_exp_info`` (rows with
    'tumor.normal' == 'tumor' provide the 'aliquot_id' values) and ``df_exp``
    (one column per aliquot id, genes in the index).

    Parameters
    ----------
    run : str, optional
        Not used by the computation; only the commented-out export below used it.
    threshold : number
        Strict upper bound a gene must stay under in all tumor columns.

    Returns
    -------
    list
        Index labels of ``df_exp`` for the genes that qualify.
    """
    is_tumor = df_exp_info['tumor.normal'] == 'tumor'
    tumor_ids = df_exp_info.loc[is_tumor, 'aliquot_id'].tolist()
    tumor_exp = df_exp[tumor_ids]
    # Values at or above the threshold become NaN; dropna() then removes every gene
    # with at least one such value
    masked = tumor_exp.where(tumor_exp < threshold)

    # Save the gene list to convert gene id to gene name
#     with open(os.path.join(dir_out,'nonexp_id_to_name','nonexp-ids_'+run+'_'+str(threshold)+'.csv'), 'w') as f:
#         f.write('gene'+'\n')
#         for genes in nonexp_genes:
#             f.write(genes+'\n')
    return masked.dropna().index.tolist()

In [8]:
# Genes below threshold 1 in every tumor sample; `run` is not used by the function's computation
genes = get_nexp_genes(run = '072221', threshold = 1)

In [16]:
# Print every id in the expression index that starts with this Ensembl gene id
# (the printed match shows that index entries carry a '.<number>' suffix)
for gene in df_exp.index:
    if gene.startswith('ENSG00000163092'):
        print(gene)

ENSG00000163092.15


In [5]:
# Same arguments as the `genes = ...` cell; as the cell's last expression the whole returned list is displayed
get_nexp_genes(run = '072221', threshold = 1)

Converted gene name of nonexp genes in R  
Currently in Dropbox folder, will upload later

**Test tumor specific**

In [2]:

# Read all gene list
# NOTE: pickle can execute arbitrary code; only load files written by this project.
# The with-block closes the file handle that pickle.load(open(...)) left open.
with open('../anno_ref/proc_refs/gene_name_list_062121.pkl','rb') as f:
    lgene = pickle.load(f)

In [3]:
# Names of the feature types (not referenced by the executed code shown in this notebook)
cohort = ['histology','organ','origin','system','pancancer']
# Directory the '<feature_type>.<syn_nsyn>.sig_genes.pathlist.<run>.pkl' files are read from
dir_anlyze = '../mutsig_out/anlyze'
# Directory of this notebook's tables
dir_out = './figure4/'
# Directory with one '<feature>.csv' per feature; get_allpvals reads the 'genes' column of each
dir_nexp = './figure4/nonexp_id_to_name/out/histology'

In [4]:
### Get pvals -- input for FDR calculation (feature-specific non-expressed gene lists)
def get_allpvals(feature_type,run, syn_nsyn, threshold):
    """Collect gene p/q-values of all cohorts, tagged as expressed or non-expressed.

    For every path in the pickled list
    ``dir_anlyze/<feature_type>.<syn_nsyn>.sig_genes.pathlist.<run>.pkl`` the
    tab-separated result file is read, and its genes are split by membership in
    the 'genes' column of ``dir_nexp/<feature>.csv``, where ``<feature>`` is the
    result file name up to its first '.'. The notebook-level ``lgene`` list is
    only used for the printed summary line.

    NOTE: a later cell of this notebook defines another ``get_allpvals`` (one
    common gene list for all cohorts); whichever cell ran last is the one in effect.

    Parameters
    ----------
    feature_type, run, syn_nsyn : str
        Parts of the pickle file name.
    threshold :
        Not used by this version; kept so both versions share one signature.

    Returns
    -------
    pandas.DataFrame
        Columns 'gene', 'p', 'q', 'feature', 'exp/nonexp'. All 'exp' rows come
        first (cohorts in path-list order), then all 'nonexp' rows, with a
        fresh 0..n-1 index. An empty frame when the path list is empty.
    """
    # NOTE: pickle can execute arbitrary code; only load files written by this project.
    fpkl = os.path.join(dir_anlyze, feature_type +'.'+syn_nsyn+'.sig_genes.pathlist.'+run+'.pkl')
    with open(fpkl, 'rb') as f:
        sig_pathlist = pickle.load(f)

    cols = ['gene','p','q','feature']
    # Per-cohort pieces are collected and concatenated once; growing a frame inside
    # the loop copies all earlier rows on every iteration
    lexp_parts = []; lne_parts = []
    for fpath in sig_pathlist:
        df_res_feat = pd.read_csv(fpath,sep = '\t')
        feature = os.path.basename(fpath).split('.')[0]

        # read nonexpressed gene
        print(feature)
        df_ne_gene = pd.read_csv(os.path.join(dir_nexp,feature+'.csv'))
        lne = df_ne_gene['genes']
        print(f'There are total {len(set(lgene).intersection(set(lne)))} non-expressed genes out of {len(lgene)} genes')

        # add expressed and nonexpressed into the dataframe
        df_res_feat['feature'] = feature
        is_nonexp = df_res_feat['gene'].isin(lne)
        df_nonexp_p = df_res_feat.loc[is_nonexp, cols].reset_index(drop = True)
        df_nonexp_p['exp/nonexp'] = 'nonexp'
        df_exp_p = df_res_feat.loc[~is_nonexp, cols].reset_index(drop = True)
        df_exp_p['exp/nonexp'] = 'exp'

        lexp_parts.append(df_exp_p)
        lne_parts.append(df_nonexp_p)

    if not lexp_parts:
        return pd.DataFrame()
    df_all = pd.concat(lexp_parts + lne_parts, ignore_index = True)

#     Save all p-val dataframe
#     df_all.to_csv(os.path.join(dir_out,feature_type+'.'+syn_nsyn+'.df_all_forFDR.histology-specific.csv'))
    return df_all

# Build the p-value table of the 'syn' / 'cohort_072221' run for the histology cohorts.
# The last argument (threshold, here the string '1') is not used by the function defined in this cell.
df = get_allpvals('histology','cohort_072221','syn', '1')

Lymph-BNHL
There are total 2975 non-expressed genes out of 19225 genes
Bone-Osteosarc
There are total 618 non-expressed genes out of 19225 genes
Panc-Endocrine
There are total 618 non-expressed genes out of 19225 genes
CNS-PiloAstro
There are total 618 non-expressed genes out of 19225 genes
CNS-Oligo
There are total 4369 non-expressed genes out of 19225 genes
Myeloid-AML
There are total 618 non-expressed genes out of 19225 genes
Skin-Melanoma
There are total 3337 non-expressed genes out of 19225 genes
CNS-Medullo
There are total 618 non-expressed genes out of 19225 genes
SoftTissue-Liposarc
There are total 618 non-expressed genes out of 19225 genes
SoftTissue-Leiomyo
There are total 618 non-expressed genes out of 19225 genes
Biliary-AdenoCA
There are total 3405 non-expressed genes out of 19225 genes
Kidney-ChRCC
There are total 4077 non-expressed genes out of 19225 genes
Lymph-CLL
There are total 5720 non-expressed genes out of 19225 genes
Lung-SCC
There are total 1914 non-expressed ge

In [83]:
# Keep rows with q below 0.1, then display them without the genes whose name starts with 'PCDH'
df_1 = df[df['q']<0.1]
df_1[~df_1['gene'].str.startswith('PCDH')]

Unnamed: 0,gene,p,q,feature,exp/nonexp
0,BCL2,0.0,0.0,Lymph-BNHL,exp
1,HIST1H2BK,2e-06,0.014953,Lymph-BNHL,exp
2,NOL9,8e-06,0.051304,Lymph-BNHL,exp
51720,ITLN1,0.0,0.0,CNS-PiloAstro,exp
102020,NACA,1.7e-05,0.016218,Skin-Melanoma,exp
243167,SOX18,0.0,0.0,Eso-AdenoCA,exp
305832,SIGLEC15,0.0,0.0,Panc-AdenoCA,exp
305833,TP53I3,0.0,0.0,Panc-AdenoCA,exp
352731,CALR,0.0,0.0,Uterus-AdenoCA,exp
368713,PPWD1,0.0,0.0,Head-SCC,exp


In [24]:
# Show the label -> count dictionary (sorted by ascending count where it is built)
dict_feat_np

{'Bone-Benign': 1,
 'Cervix-AdenoCA': 2,
 'Myeloid-MDS': 2,
 'Breast-DCIS': 3,
 'Bone-Osteoblast': 5,
 'Bone-Cart': 9,
 'Bone-Epith': 10,
 'Myeloid-AML': 13,
 'Breast-LobularCA': 13,
 'SoftTissue-Leiomyo': 15,
 'CNS-Oligo': 18,
 'Cervix-SCC': 18,
 'SoftTissue-Liposarc': 19,
 'Bladder-TCC': 23,
 'Myeloid-MPN': 23,
 'Biliary-AdenoCA': 33,
 'Bone-Osteosarc': 35,
 'Lung-AdenoCA': 36,
 'CNS-GBM': 39,
 'Uterus-AdenoCA': 43,
 'Kidney-ChRCC': 43,
 'Lung-SCC': 47,
 'Thy-AdenoCA': 48,
 'ColoRect-AdenoCA': 52,
 'Head-SCC': 56,
 'Stomach-AdenoCA': 68,
 'Panc-Endocrine': 81,
 'CNS-PiloAstro': 89,
 'Lymph-CLL': 90,
 'Eso-AdenoCA': 97,
 'Skin-Melanoma': 106,
 'Lymph-BNHL': 107,
 'Ovary-AdenoCA': 109,
 'CNS-Medullo': 141,
 'Kidney-RCC': 143,
 'Breast-AdenoCA': 193,
 'Prost-AdenoCA': 199,
 'Panc-AdenoCA': 230,
 'Liver-HCC': 312}

In [34]:
# Show the histologies found among the tumor rows of the expression metadata
histologies

['CNS-GBM',
 'CNS-Oligo',
 'Breast-AdenoCA',
 'Breast-LobularCA',
 'Cervix-SCC',
 'Cervix-AdenoCA',
 'ColoRect-AdenoCA',
 'Head-SCC',
 'Thy-AdenoCA',
 'Kidney-RCC',
 'Liver-HCC',
 'Lung-AdenoCA',
 'Lung-SCC',
 'Ovary-AdenoCA',
 'Prost-AdenoCA',
 'Skin-Melanoma',
 'Uterus-AdenoCA',
 'Bladder-TCC',
 'Lymph-CLL',
 'Lymph-BNHL',
 'Kidney-ChRCC',
 'Biliary-AdenoCA',
 'Lymph-NOS',
 'Bone-Leiomyo',
 'Stomach-AdenoCA',
 'Eso-AdenoCA',
 'Panc-AdenoCA']

In [9]:
# Per feature, keep the genes whose p-value is below the smallest p-value among that
# feature's non-expressed genes and also below a fixed cap.
p_cap = 0.0005
lsig = []
for feat in lfeat:
    # Separate name: the original reused `df_feat` and thereby overwrote the cohort table
    df_feat_p = df[df['feature'] == feat].sort_values('p')
    # NaN when the feature has no 'nonexp' rows; the comparison below then selects nothing
    p_threshold = df_feat_p.loc[df_feat_p['exp/nonexp'] == 'nonexp', 'p'].min()
    df_sig = df_feat_p[(df_feat_p['p'] < p_threshold) & (df_feat_p['p'] < p_cap)]
#     df_sig = df_feat_p[df_feat_p['p']<0.000005]
    lsig.append(df_sig)

# Concatenate once (original row labels are kept); growing a frame inside the loop
# copies all earlier rows on every iteration
df_sigs = pd.concat(lsig) if lsig else pd.DataFrame()

In [10]:
# Display the selected genes, leaving out those whose name starts with 'PCDH'
df_sigs[~df_sigs['gene'].str.startswith('PCDH')]

Unnamed: 0,gene,p,q,feature,exp/nonexp
482729,PTH,0.000245,1.0,Ovary-AdenoCA,exp
482730,GTSF1L,0.000407,1.0,Ovary-AdenoCA,exp
53464,ITLN1,0.0,0.0,CNS-PiloAstro,exp
352289,HTRA3,0.000453,1.0,Liver-HCC,exp
34857,SH3BGR,6.1e-05,1.0,Panc-Endocrine,exp
287367,RDH8,0.000151,1.0,Kidney-RCC,exp
287369,VCAN,0.000328,1.0,Kidney-RCC,exp
532468,ACVRL1,4.5e-05,0.435663,Prost-AdenoCA,exp
304029,TMEM129,0.000124,1.0,Thy-AdenoCA,exp
0,BCL2,0.0,0.0,Lymph-BNHL,exp


**End CNS-GBM test**

### Get p-val's from all feats -- For FDR  

In [17]:
### Read non-expressed gene name file
threshold = 1
dir_name = './figure4/nonexp_id_to_name/out'
df_ne_gene = pd.read_csv(os.path.join(dir_name,'nonexp-names_072221_'+str(threshold)+'.csv'))
lne = df_ne_gene['genes']

# Read all gene list
# NOTE: pickle can execute arbitrary code; only load files written by this project.
# The with-block closes the file handle that pickle.load(open(...)) left open.
with open('../anno_ref/proc_refs/gene_name_list_062121.pkl','rb') as f:
    lgene = pickle.load(f)
# Number of names that occur both in the full gene list and in the non-expressed list
n_nonexp = len(set(lgene).intersection(set(lne)))
print(f'There are total {n_nonexp} non-expressed genes out of {len(lgene)} genes')

cohort = ['histology','organ','origin','system','pancancer']
dir_anlyze = '../mutsig_out/anlyze'
dir_out = './figure4/'

There are total 1057 non-expressed genes out of 19225 genes


In [19]:
### Get pvals -- input for FDR calculation (one common non-expressed gene list)
def get_allpvals(feature_type,run, syn_nsyn, threshold):
    """Collect gene p/q-values of all cohorts, tagged as expressed or non-expressed.

    The non-expressed genes are the 'genes' column of
    './figure4/nonexp_id_to_name/out/nonexp-names_072221_<threshold>.csv'; the
    same list is applied to every cohort. The result files to read come from the
    pickled list ``dir_anlyze/<feature_type>.<syn_nsyn>.sig_genes.pathlist.<run>.pkl``.

    NOTE: this definition replaces the earlier ``get_allpvals`` of this notebook
    (per-feature gene lists) once this cell has run.

    Parameters
    ----------
    feature_type, run, syn_nsyn : str
        Parts of the pickle file name.
    threshold :
        Converted with str() and used in the name of the gene-list file.

    Returns
    -------
    pandas.DataFrame
        Columns 'gene', 'p', 'q', 'feature', 'exp/nonexp'. All 'exp' rows come
        first (cohorts in path-list order), then all 'nonexp' rows, with a
        fresh 0..n-1 index. An empty frame when the path list is empty.
    """
    # Read nonexpressed gene file
    dir_name = './figure4/nonexp_id_to_name/out'
    df_ne_gene = pd.read_csv(os.path.join(dir_name,'nonexp-names_072221_'+str(threshold)+'.csv'))
    lne = df_ne_gene['genes']

    # NOTE: pickle can execute arbitrary code; only load files written by this project.
    fpkl = os.path.join(dir_anlyze, feature_type +'.'+syn_nsyn+'.sig_genes.pathlist.'+run+'.pkl')
    with open(fpkl, 'rb') as f:
        sig_pathlist = pickle.load(f)

    cols = ['gene','p','q','feature']
    # Per-cohort pieces are collected and concatenated once; growing a frame inside
    # the loop copies all earlier rows on every iteration
    lexp_parts = []; lne_parts = []
    for fpath in sig_pathlist:
        df_res_feat = pd.read_csv(fpath,sep = '\t')
        # Cohort name = result file name up to its first '.'
        df_res_feat['feature'] = os.path.basename(fpath).split('.')[0]
        is_nonexp = df_res_feat['gene'].isin(lne)
        df_nonexp_p = df_res_feat.loc[is_nonexp, cols].reset_index(drop = True)
        df_nonexp_p['exp/nonexp'] = 'nonexp'
        df_exp_p = df_res_feat.loc[~is_nonexp, cols].reset_index(drop = True)
        df_exp_p['exp/nonexp'] = 'exp'

        lexp_parts.append(df_exp_p)
        lne_parts.append(df_nonexp_p)

    if not lexp_parts:
        return pd.DataFrame()
    df_all = pd.concat(lexp_parts + lne_parts, ignore_index = True)

#     Save all p-val dataframe
#     df_all.to_csv(os.path.join(dir_out,feature_type+'.'+syn_nsyn+'.df_all_forFDR.'+run+'.'+threshold+'.csv'))
    return df_all

# for feat in cohort:
#     get_allpvals(feat,'syn')
# NOTE(review): the result of the first call (threshold '5') is overwritten by the second
# call right away, so only the threshold '1' table remains in `df`.
df = get_allpvals('histology','cohort_072221','syn', '5')
df = get_allpvals('histology','cohort_072221','syn', '1')

***
### FDR calculation

***
### Get nsyn q-value for heatmap -- in figure3-anlyze

***
### Candidate number between synonymous and non-synonymous

In [9]:
cohorts = ['histology','organ','origin','system','pancancer']
dir_out = './figure4/'
dir_anlyze = '../mutsig_out/anlyze'

# NOTE: pickle can execute arbitrary code; only load files written by this project.
# The with-blocks close the file handles that pickle.load(open(...)) left open.

#Load synonymous result
feature_type = 'histology';syn_nsyn = 'syn';run = 'cohort_072221';threshold =1;
with open(os.path.join(dir_anlyze, feature_type+'.'+syn_nsyn+'.sig_genes.pathlist.'+run+'.pkl'),'rb') as f:
    syn_pathlist = pickle.load(f)
# Cohort name = result file name up to its first '.'
lfeat_syn = [os.path.basename(i).split('.')[0] for i in syn_pathlist]
df_syn = pd.read_csv(os.path.join(dir_out,feature_type+'.syn.df_all_forheatmap.'+run+'.'+str(threshold)+'.csv'),\
                     index_col = 0)

#Load nonsynonymous result
feature_type = 'histology';syn_nsyn = 'nsyn';run = 'cohort_072221'
with open(os.path.join(dir_anlyze, feature_type+'.'+syn_nsyn+'.sig_genes.pathlist.'+run+'.pkl'),'rb') as f:
    nsyn_pathlist = pickle.load(f)
# NOTE: from here on `lfeat` holds the cohorts of the nsyn path list
lfeat = [os.path.basename(i).split('.')[0] for i in nsyn_pathlist]
# Unlike the syn table, this file name carries no threshold part
df_nsyn = pd.read_csv(os.path.join(dir_out,feature_type+'.nsyn.df_all_forheatmap.'+run+'.csv'),\
                      index_col = 0)

In [18]:
###  Build dataframe for histology types |syn candidates |nonsyn candidates
# Rows per feature in each table, counted in one pass instead of one scan per feature
syn_counts = df_syn['feature'].value_counts()
nsyn_counts = df_nsyn['feature'].value_counts()
# Append patient number to each histology types, e.g. '(n=39) CNS-GBM'
feat_labels = {feat: '(n='+str(dict_feat_np[feat])+') '+feat for feat in lfeat}
# Long format in the order the melted table had: all 'syn' rows first, then all 'nsyn'
# rows, features in lfeat order. Counts are plain integers, so 'nsig' is a numeric
# column rather than object dtype.
rows = [(feat_labels[feat], kind, int(counts.get(feat, 0)))
        for kind, counts in (('syn', syn_counts), ('nsyn', nsyn_counts))
        for feat in lfeat]
df = pd.DataFrame(rows, columns = [feature_type, 'syn.nsyn','nsig'])
# Save file
# df.to_csv(os.path.join(dir_out, feature_type+'.syn_nsyn.'+run+'.nsig.csv'))