In [23]:
import pandas as pd
import scipy.stats

In [2]:
# Identifier of the promoter definition; it is interpolated into the path of
# the promoter-mutation table defined in the paths cell.
promoter_suffix = "2000_symm"

In [3]:
# Expression results table that the rest of the notebook analyses
# (activation results).
results_csv = (
    '/s/project/mll/sergey/effect_prediction/outrider/input_data/activation/'
    'res_filter_out_all.csv.gz'
)

In [4]:
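# Alternative input (disabled): uncomment to analyse the OUTRIDER results table instead of the activation results.
#results_csv = '/s/project/mll/sergey/effect_prediction/outrider/input_data/outrider/outrider_all.csv.gz'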

In [5]:
# Locations of the input tables read by the cells below.

# promoter mutations after filtering; the directory depends on `promoter_suffix`
promoter_mutations = (
    '/s/project/mll/sergey/effect_prediction/promoter_mutations/'
    f'{promoter_suffix}/mutations.tsv'
)

# mapping between VCF names and MLL array_id
vcf_matching = (
    '/s/project/mll/sergey/effect_prediction/promoter_mutations/'
    'analysed_samples/vcf_matching.tsv'
)

# per sample-gene tables of other variant classes
splicing_tsv = '/s/project/mll/sergey/MLL_data/processed/' 'significant_splicing.tsv'
structural_tsv = '/s/project/mll/sergey/MLL_data/processed/' 'vale.svs.filtered.tsv.gz'
copy_number_tsv = '/s/project/mll/sergey/MLL_data/processed/' 'cnv_mll.tsv'

# variants already reported by MLL (used to exclude explained sample-gene pairs)
aml_variants_tsv = (
    '/s/project/mll/sergey/effect_prediction/outrider/input_data/'
    '20221207_paper_AML_variants.txt'
)

In [6]:
# Activation results converted from Rds; only the columns at positions
# 0, 1 and 3 of the file are loaded.
results_df = pd.read_csv(
    results_csv,
    usecols=[0, 1, 3],
)

In [7]:
# Use the same sample identifier name (`array_id`) as the other tables so the
# later merges can join on it. Reassigning instead of `inplace=True` keeps the
# step a plain expression that produces a new frame.
results_df = results_df.rename(columns={'sampleID': 'array_id'})

In [8]:
# Promoter mutations after filtering: headerless TSV, one mutation per row;
# column names are assigned here.
mutations_df = pd.read_csv(
    promoter_mutations,
    sep='\t',
    header=None,
    names=['chrom', 'pos', 'id', 'ref', 'alt', 'geneName', 'vcf'],
)

In [9]:
# Drop sample-gene pairs whose gene carries a mutation already explained by MLL.

mutations_mll = pd.read_csv(aml_variants_tsv, sep='\t')

# Lookup table built from the first two columns of the file (its own header
# row is skipped and replaced by the names given here).
matching_genes = pd.read_csv(
    '/s/project/mll/sergey/effect_prediction/promoter_mutations/ensemble_to_HGNC_GRCh38.tsv.gz',
    sep='\t',
    skiprows=1,
    header=None,
    usecols=[0, 1],
    names=['geneName', 'symbol'],
)

# keep only rows that actually have a symbol
matching_genes = matching_genes[matching_genes.symbol.notna()]

# attach `geneName` to every MLL variant via the shared `symbol` column
mutations_mll = mutations_mll[['array_id', 'symbol']].merge(matching_genes)

# Anti-join: after a left merge with an indicator, rows marked 'left_only'
# are the pairs that have no match in the MLL variant list.
results_df = (
    results_df
    .merge(mutations_mll[['array_id', 'geneName']], how='left', indicator=True)
    .loc[lambda merged: merged['_merge'].eq('left_only')]
    .drop(columns='_merge')
)

In [10]:
# Lookup Series: index = VCF name, values = MLL array_id.
# The column is selected by name instead of calling `.squeeze()`: on a table
# with a single row `squeeze()` collapses the frame to a scalar, which cannot
# be used as a mapping by `Series.map`.
matching_df = (
    pd.read_csv(vcf_matching, header=None, sep='\t', names=['array_id', 'vcf'])
    .set_index('vcf')['array_id']
)

In [11]:
# Translate each mutation's VCF name into the MLL array_id through the lookup
# Series; VCF names missing from the lookup yield NaN.
mutations_df['array_id'] = mutations_df['vcf'].map(matching_df)

In [12]:
# Turn comma-separated gene lists into Python lists so each mutation can be
# exploded to one row per gene. `.str.split` passes missing gene names through
# as NaN, whereas testing `',' in x` on a float NaN raises a TypeError.
# Single-gene entries become one-element lists, which explode to the same rows.
mutations_df['geneName'] = mutations_df['geneName'].str.split(',')

In [13]:
# One row per (mutation, gene): mutations annotated with several genes are
# repeated once for each gene.
mutations_df = mutations_df.explode(column='geneName')

In [14]:
# Number of promoter mutations for every (array_id, geneName) pair.
mutations_counts = (
    mutations_df
    .groupby(['array_id', 'geneName'])
    .size()
    .rename('promoter_mutations')
    .reset_index()
)

In [15]:
# Attach the promoter-mutation counts; pairs without any mutation get NaN here.
results_df = pd.merge(results_df, mutations_counts, how='left')

In [16]:
# A missing count means the pair had no promoter mutation -> 0.
results_df['promoter_mutations'] = results_df['promoter_mutations'].fillna(0)

In [17]:
# Flag sample-gene pairs that have a significant splicing variant.
# No rows are removed: the merge indicator is stored in `splicing_variant`
# ('both' = the pair is listed in the splicing table, 'left_only' = it is not).

splicing_df = pd.read_csv(splicing_tsv, sep='\t')

# The key pairs are de-duplicated before merging. Otherwise a pair listed
# several times in the splicing table would be repeated in `results_df` and
# counted more than once in the contingency tables built later.
# (The former `!= 'right_only'` filter was dropped: a left merge never yields
# 'right_only' rows.)
results_df = results_df.merge(
    splicing_df[['geneName', 'array_id']].drop_duplicates(),
    indicator=True,
    how='left',
).rename(columns={'_merge': 'splicing_variant'})

In [18]:
# Flag sample-gene pairs that have a structural variant.
# No rows are removed: the merge indicator is stored in `structural_variant`
# ('both' = the pair is listed in the SV table, 'left_only' = it is not).

sv_df = pd.read_csv(structural_tsv, sep='\t')

# One row per (geneName, array_id) on the right side. Otherwise a pair with
# several SVs would be repeated in `results_df` and counted more than once in
# the contingency tables built later. When a pair has several SVs, only the
# first listed `svtype` is carried over.
# (The former `!= 'right_only'` filter was dropped: a left merge never yields
# 'right_only' rows.)
results_df = results_df.merge(
    sv_df[['geneName', 'array_id', 'svtype']].drop_duplicates(subset=['geneName', 'array_id']),
    indicator=True,
    how='left',
).rename(columns={'_merge': 'structural_variant'})

In [19]:
# Flag sample-gene pairs that have a copy number variant.
# No rows are removed: the merge indicator is stored in `copy_number_variant`
# ('both' = the pair is listed in the CNV table, 'left_only' = it is not).

cnv_df = pd.read_csv(copy_number_tsv, sep='\t')

# One row per (geneName, array_id) on the right side. Otherwise a pair with
# several CNV records would be repeated in `results_df` and counted more than
# once in the contingency tables built later. When a pair has several records,
# only the first listed `call` is carried over.
# (The former `!= 'right_only'` filter was dropped: a left merge never yields
# 'right_only' rows.)
results_df = results_df.merge(
    cnv_df[['geneName', 'array_id', 'call']].drop_duplicates(subset=['geneName', 'array_id']),
    indicator=True,
    how='left',
).rename(columns={'_merge': 'copy_number_variant'})

In [20]:
# A sample-gene pair counts as abnormally expressed when its adjusted
# p-value is below 0.05.
results_df['abnormal'] = results_df['padjust'].lt(0.05)

In [21]:
# Boolean "pair has this kind of variant" flags.
results_df['promoter_variant'] = results_df['promoter_mutations'] > 0

# The three columns below hold merge indicators; 'both' means the pair was
# found in the corresponding variant table. The dtype guard makes the cell
# safe to re-run: without it a second run would compare booleans with 'both'
# and silently set every flag to False.
for indicator_col in ('structural_variant', 'splicing_variant', 'copy_number_variant'):
    if not pd.api.types.is_bool_dtype(results_df[indicator_col]):
        results_df[indicator_col] = results_df[indicator_col] == 'both'

In [None]:
# For every variant class, compare how often it occurs among abnormally vs.
# normally expressed sample-gene pairs and test the association with
# Fisher's exact test.
#
# conf=True  : all pairs are used (other variant classes may be present).
# conf=False : pairs carrying any of the *other* variant classes are excluded
#              first, so only the class under test can differ.
reasons = ['promoter_variant', 'structural_variant', 'splicing_variant', 'copy_number_variant']

res = []

for conf in (True, False):

    for reason in reasons:

        print(conf, reason)

        if conf:
            subset = results_df
        else:
            other_reasons = [x for x in reasons if x != reason]
            flt = ~results_df[other_reasons].any(axis=1)
            subset = results_df.loc[flt]

        # Always a full 2x2 table, even when a category is absent from the data
        # (e.g. no pair carries the variant): missing rows/columns are filled
        # with zeros instead of causing a KeyError on the `True` label.
        contig_table = (
            pd.crosstab(subset[reason], subset['abnormal'])
            .reindex(index=[False, True], columns=[False, True], fill_value=0)
        )

        # rows: variant absent / present; columns: normal / abnormal expression
        counts = contig_table.to_numpy()
        n_normal, n_abnormal = counts.sum(axis=0)

        # fraction of pairs with the variant among abnormally / normally
        # expressed pairs (NaN when the respective group is empty)
        mutated_activ_prop = counts[1, 1] / n_abnormal
        nonmutated_activ_prop = counts[1, 0] / n_normal

        p_value = scipy.stats.fisher_exact(counts)[1]

        res.append((conf, reason, mutated_activ_prop, nonmutated_activ_prop, p_value))

True promoter_variant
True structural_variant
True splicing_variant


In [None]:
# Collect the per-test tuples into one summary table, one row per
# (confounder setting, variant class).
res = pd.DataFrame(
    data=res,
    columns=[
        'Confounders',
        'Mutation',
        'Frac. in anomal expr.',
        'Frac. in normal expr.',
        'p-value',
    ],
)