In [14]:
### This is a gene specific analysis notebook
import pickle, os
import pandas as pd
import numpy as np
from scipy.stats import mannwhitneyu

In [15]:
# output directory
dir_out_figure = '../manuscript/figures_data/'

# exist_ok=True creates the directory (with any missing parents) and is a
# no-op when it already exists, so there is no window between an
# "does it exist?" check and the creation in which another process could
# create it and make makedirs() fail.
os.makedirs(dir_out_figure, exist_ok=True)

In [16]:
#-------------Read Data------------------
# Input locations: processed references and the PCAWG RNA-seq folder
dir_refs = '../anno_ref/proc_refs'
exp_dir = '../anno_ref/ICGC/pcawg_rnaseq/'
gene_tophat = 'tophat_star_fpkm.v2.aliquot_gl.tsv'

# Aliquot id information (tab-separated metadata table)
df_exp_info = pd.read_csv(
    os.path.join(exp_dir, 'rnaseq.metadata.tsv'),
    sep='\t',
)

# Expression table (tab-separated); its first column becomes the index
df_exp = pd.read_csv(
    os.path.join(exp_dir, gene_tophat),
    sep='\t',
    index_col=0,
)

In [17]:
# Gene analysed in this notebook; it is the argument given to get_expression().
gene_name = 'PURA'

In [18]:
# Read the synonymous candidate dataframe
dir_out_fig4 = './figure4/'
feature_type = 'histology'
syn_nsyn = 'syn'
run = 'cohort_090821'

# The first CSV column is read as the index and then replaced by the 'gene'
# column, so rows can be looked up by gene name.
df_syn = pd.read_csv(
    os.path.join(dir_out_fig4, f'{feature_type}.syn.df_all_forheatmap.{run}.csv'),
    index_col=0,
).set_index('gene')

# Read the significant gene-id dataframe; its two columns are relabelled
# 'name' and 'id'.
fsig_name = 'sig_gene_name_id.csv'
df_nsig = pd.read_csv(os.path.join(dir_out_fig4, fsig_name))
df_nsig.columns = ['name', 'id']

# Folder holding the per-cohort MAF tables
dir_maf = '../maf_out/maf_cohorts_060121'

In [19]:
#-------------Syn Patient Expression------------------
def get_gene_exp(gene_name, df_expression):
    """Return the expression rows that belong to one gene.

    The gene name is translated to a gene id through the module-level
    ``df_nsig`` table (columns ``name`` and ``id``); every row of
    ``df_expression`` whose index label contains that id is returned.

    Parameters
    ----------
    gene_name : str
        Gene name to look up in ``df_nsig['name']``.
    df_expression : pandas.DataFrame
        Expression table whose index labels are strings containing gene ids.

    Returns
    -------
    pandas.DataFrame
        The matching rows of ``df_expression`` (all columns).

    Raises
    ------
    IndexError
        If ``gene_name`` does not occur in ``df_nsig['name']``.
    """
    gene_ids = df_nsig.loc[df_nsig['name'] == gene_name, 'id'].values
    if len(gene_ids) == 0:
        # Same exception type as the former bare ``.values[0]``, but with a
        # message that says which gene was missing.
        raise IndexError(f'gene {gene_name!r} not found in df_nsig')
    gene_id = gene_ids[0]

    # regex=False: the id is matched as a literal substring. Treated as a
    # regular expression, a '.' inside the id would match any character and
    # could select rows of a different gene.
    is_gene_row = df_expression.index.str.contains(str(gene_id), regex=False)
    return df_expression.loc[is_gene_row, :]

def get_syn_mut(gene_name):
    """Collect the 'Silent' MAF records of a gene and the donors carrying them.

    The cohort name is the ``feature`` value of ``gene_name`` in the
    module-level ``df_syn``; the tab-separated table
    ``<dir_maf>/<feature_type>/<cohort>.csv`` is read and indexed by
    (Hugo_Symbol, Variant_Classification, Donor_ID).

    Returns
    -------
    df_silent : pandas.DataFrame
        Rows whose first two index levels are ``(gene_name, 'Silent')``.
    patient : list
        Unique ``Donor_ID`` values found in ``df_silent``.
    """
    cohort = df_syn.loc[gene_name, 'feature']
    maf_path = os.path.join(dir_maf, feature_type, cohort + '.csv')
    index_levels = ['Hugo_Symbol', 'Variant_Classification', 'Donor_ID']
    maf_indexed = pd.read_csv(maf_path, sep='\t').set_index(index_levels)

    # Fix the first two levels and keep every donor on the third one
    selector = (gene_name, 'Silent', slice(None))
    df_silent = maf_indexed.loc[selector, :]

    donor_level = df_silent.index.get_level_values('Donor_ID')
    patient = donor_level.unique().tolist()

    return df_silent, patient

def get_patient_id(gene_name, patients):
    """Split the aliquots of a gene's cohort into three groups.

    The cohort is the ``feature`` value of ``gene_name`` in the module-level
    ``df_syn``; rows of the module-level ``df_exp_info`` with that
    ``histology_abbreviation`` are partitioned using their ``tumor.normal``
    and ``icgc_donor_id`` columns.

    Parameters
    ----------
    gene_name : str
        Gene whose cohort is looked up in ``df_syn``.
    patients : list-like
        Donor ids compared against ``icgc_donor_id``.

    Returns
    -------
    normal_id, tumor_syn_id, tumor_other_id : pandas.Series
        ``aliquot_id`` values of, respectively, the 'normal' rows, the 'tumor'
        rows whose donor is in ``patients``, and the remaining 'tumor' rows.
    """
    cohort_name = df_syn.loc[gene_name, 'feature']
    cohort = df_exp_info[df_exp_info['histology_abbreviation'] == cohort_name]

    # Row masks over the cohort; the donor mask is built once and reused
    is_normal = cohort['tumor.normal'] == 'normal'
    is_tumor = cohort['tumor.normal'] == 'tumor'
    in_patients = cohort['icgc_donor_id'].isin(patients)

    normal_id = cohort.loc[is_normal, 'aliquot_id']
    tumor_syn_id = cohort.loc[is_tumor & in_patients, 'aliquot_id']
    tumor_other_id = cohort.loc[is_tumor & ~in_patients, 'aliquot_id']

    return normal_id, tumor_syn_id, tumor_other_id

def get_patient_exp(ids, df_gene_exp, tissue_type = None):
    """Pull the expression values of the given columns in two layouts.

    Parameters
    ----------
    ids : list-like
        Column labels to select from ``df_gene_exp``.
    df_gene_exp : pandas.DataFrame
        Expression rows with one column per id. The relabelling below
        expects exactly one row (three columns after the reset).
    tissue_type : optional
        Value written into the ``tumor.normal`` column of the first result.

    Returns
    -------
    df : pandas.DataFrame
        Columns ``id``, ``exp`` and ``tumor.normal``, one row per selected id.
    df_for_test : pandas.DataFrame
        The transposed selection, still indexed by id.
    """
    # Select and transpose once; the long table is derived from that result
    df_for_test = df_gene_exp[ids].T

    df = df_for_test.reset_index()
    df['tumor.normal'] = tissue_type
    df.columns = ['id', 'exp', 'tumor.normal']

    return df, df_for_test

In [20]:
def get_expression(gene):
    """Assemble the expression of ``gene`` for the three sample groups.

    Uses the module-level ``df_syn`` and ``df_exp`` together with
    ``get_gene_exp``, ``get_syn_mut``, ``get_patient_id`` and
    ``get_patient_exp``, and prints the number of distinct ids per group.

    Returns
    -------
    df_all : pandas.DataFrame
        Concatenation of the normal, 'tumor_syn' and 'tumor_other' tables
        (columns ``id``, ``exp``, ``tumor.normal``).
    normal_test, syn_test, other_test : pandas.DataFrame
        Per-group frames with a single column named ``expression``.
    """
    # Cohort lookup kept as the very first step so that a gene missing from
    # df_syn fails here, before any other table or file is touched.
    _ = df_syn.loc[gene, 'feature']

    gene_exp = get_gene_exp(gene, df_exp)
    _, syn_donors = get_syn_mut(gene)
    normal_ids, syn_ids, other_ids = get_patient_id(gene, syn_donors)

    df_normal, normal_test = get_patient_exp(normal_ids, gene_exp, 'normal')
    df_tsyn, syn_test = get_patient_exp(syn_ids, gene_exp, 'tumor_syn')
    df_tother, other_test = get_patient_exp(other_ids, gene_exp, 'tumor_other')
    df_all = pd.concat([df_normal, df_tsyn, df_tother])

    # Report how many distinct ids ended up in each group
    nnorm = len(df_normal['id'].unique())
    nsyn = len(df_tsyn['id'].unique())
    nother = len(df_tother['id'].unique())
    print(f'Number of normal patient: {nnorm}')
    print(f'Number of synonymous patients: {nsyn}')
    print(f'Number of other tumor patients:{nother}')

    # Give the three single-column frames a common column label
    for group_frame in (normal_test, syn_test, other_test):
        group_frame.columns = ['expression']

    return df_all, normal_test, syn_test, other_test

In [21]:
# Build the combined expression table for gene_name plus the three per-group
# frames (normal, tumor with a silent mutation, other tumor) used by the tests.
df_exp_pura, test_normal, test_syn,test_other = get_expression(gene_name)

  exec(code_obj, self.user_global_ns, self.user_ns)


Number of normal patient: 6
Number of synonymous patients: 0
Number of other tumor patients:85


In [22]:
# Mann-Whitney U test: normal samples vs. tumours without a silent mutation.
# The 'expression' columns are passed as 1-D vectors so that the statistic and
# p-value come back as scalars; with single-column DataFrames, scipy >= 1.7
# works along axis 0 and returns length-1 arrays instead.
# `alternative` is spelled out because its default differs between scipy
# releases, which would otherwise make the stored p-value version-dependent.
# NOTE(review): confirm that two-sided is the alternative intended for the
# manuscript.
testStat, pval = mannwhitneyu(
    test_normal['expression'],
    test_other['expression'],
    alternative='two-sided',
)

In [23]:
# Bundle the expression table and the test result for the figure scripts
dict_pura = {'df_exp':df_exp_pura,'testStat':testStat,'pval':pval}

# The context manager closes (and therefore flushes) the file even if
# pickling raises; a bare open() left the handle open until garbage collection.
with open(os.path.join(dir_out_figure, 'dict_pura.pkl'), 'wb') as pkl_file:
    pickle.dump(dict_pura, pkl_file)

-----------------------
Permutation test (one-sided): is mean expression in the other tumors lower than in the normal samples?

In [80]:
# Seed NumPy's global random state, which permutation_test() draws from via
# np.random.permutation(). Any other draw from the global state between this
# cell and the permutation loop changes the shuffles, so run this cell
# immediately before the loop to reproduce a result.
np.random.seed(500)

In [58]:
# Expression values of the two groups being compared
x = np.array(test_other['expression'].tolist())
y = np.array(test_normal['expression'].tolist())

# Observed statistic: mean of the other tumors minus mean of the normals
obs_mean_diff = x.mean() - y.mean()

# Both groups stacked into one pool that the permutation test reshuffles
pooled = np.hstack([x, y])

In [81]:
def permutation_test(pool,xsize,ysize):
    """Return the difference in group means for one random relabelling.

    ``pool`` is shuffled with ``np.random.permutation`` (NumPy's global random
    state); the first ``xsize`` shuffled values form one group and the last
    ``ysize`` values the other.

    Parameters
    ----------
    pool : array-like
        All observations of both groups.
    xsize, ysize : int
        Sizes of the two groups; each must be at least 1 and together they
        must not exceed the number of pooled values.

    Returns
    -------
    float
        Mean of the first group minus mean of the second group.

    Raises
    ------
    ValueError
        If a group size is smaller than 1 or the groups would overlap.
    """
    permuted = np.random.permutation(pool)
    n_pool = permuted.shape[0]

    if xsize < 1 or ysize < 1:
        # With ysize == 0 the slice permuted[-0:] is the *whole* array, so the
        # function used to return a number that looked valid but was not.
        raise ValueError('xsize and ysize must both be at least 1')
    if xsize + ysize > n_pool:
        # Otherwise the head and tail slices would share observations.
        raise ValueError('xsize + ysize exceeds the number of pooled values')

    starX = permuted[:xsize]
    # Explicit start index; selects the same elements as permuted[-ysize:]
    # for every valid ysize.
    starY = permuted[n_pool - ysize:]
    return starX.mean() - starY.mean()

In [91]:
# Number of random relabellings used to build the null distribution
npermute = 10000

# One mean difference per relabelling of the pooled values
lpermute = [
    permutation_test(pooled, x.size, y.size)
    for _ in range(npermute)
]

In [92]:
# Number of permutations whose mean difference is at or below the observed one.
# The list is converted to an array *before* the comparison. Comparing the
# Python list itself with the scalar only worked through NumPy's reflected
# operator and raises TypeError if obs_mean_diff is a plain Python float.
diffCount = np.count_nonzero(np.asarray(lpermute) <= obs_mean_diff)

In [93]:
# Empirical one-sided p-value: share of the permutations whose mean difference
# is at or below the observed difference.
float(diffCount)/float(npermute)

0.0059