In [1]:
import pandas as pd
import os
import re

In [2]:
# Input root directory: every entry below it is probed for a `results.tsv`
# file by the loading loop in the next cell.
data_dir = '/s/project/mll/sergey/effect_prediction/outrider/input_data/fraser2/'

In [3]:
# Collect one table per sub-directory of `data_dir` that contains a `results.tsv`.
splicing_df = []

# os.listdir() yields entries in arbitrary order; sorting them keeps the row
# order of the combined table (and of the file exported from it) reproducible.
for leukemia_type in sorted(os.listdir(data_dir)):
    # os.path.join works whether or not `data_dir` ends with a path separator.
    file_name = os.path.join(data_dir, leukemia_type, 'results.tsv')
    # Entries without a results file are skipped silently.
    if os.path.isfile(file_name):
        df = pd.read_csv(file_name, sep='\t')
        splicing_df.append(df)

In [4]:
# Stack the per-directory tables into a single frame. Every file was read with
# its own 0..n row index, so ignore_index=True is used to avoid duplicate row
# labels in the combined frame (they carry no meaning here and make later
# label-aligned operations such as explode/assign fragile).
splicing_df = pd.concat(splicing_df, ignore_index=True)
# Keep only rows that have a sample identifier.
splicing_df = splicing_df[splicing_df.sampleID.notna()]

In [5]:
# Rename the symbol and sample columns to the names used by the gene lookup
# table (`geneHGNC`) and by the exported file (`array_id`).
splicing_df = splicing_df.rename(
    columns={'hgncSymbol': 'geneHGNC', 'sampleID': 'array_id'},
)

In [6]:
# Gene ID <-> HGNC symbol lookup table. Only the first two columns are read;
# the file's first line (presumably its header) is skipped and the column
# names below are used instead.
matching_genes = pd.read_csv(
    '/s/project/mll/sergey/effect_prediction/promoter_mutations/ensemble_to_HGNC_GRCh38.tsv.gz',
    sep='\t',
    header=None,
    names=['geneName', 'geneHGNC'],
    usecols=[0, 1],
    skiprows=1,
)

# Discard entries that have no HGNC symbol.
matching_genes = matching_genes[matching_genes.geneHGNC.notna()]

# Distinct symbols (NumPy array); is_multigene() tests membership against it.
known_HGNC = matching_genes.geneHGNC.unique()

In [7]:
def is_multigene(x):
    """Tell whether a gene label is a hyphen-joined combination of several genes.

    Parameters
    ----------
    x : str
        Gene label to classify. Non-string values (e.g. NaN or None for a
        missing label) are accepted and classified as not multi-gene.

    Returns
    -------
    bool
        False for non-strings, for labels without a hyphen, for two-part (or
        four-or-more-part) labels found in the module-level ``known_HGNC``,
        and for the literal ``'IGHV3-30-2'``. True for every other
        hyphenated label, including all other three-part labels.
    """
    # A missing label cannot be split; treat it as a single (unknown) gene
    # instead of failing with AttributeError on ``.split``.
    if not isinstance(x, str):
        return False

    # Number of hyphen-separated parts, i.e. number of hyphens + 1.
    n_parts = len(x.split('-'))

    # Three-part labels are always treated as combinations, with one
    # hard-coded exception; ``known_HGNC`` is deliberately not consulted here.
    if n_parts == 3:
        return x != 'IGHV3-30-2'

    # No hyphen at all, or a hyphenated label that is itself a known symbol.
    if n_parts == 1 or x in known_HGNC:
        return False

    return True

In [8]:
# Turn every multi-gene label into the list of its hyphen-separated parts
# (single-gene labels stay plain strings), then give each list element its
# own row; all other columns are repeated for the new rows.
splicing_df = splicing_df.assign(
    geneHGNC=splicing_df['geneHGNC'].apply(
        lambda symbol: symbol.split('-') if is_multigene(symbol) else symbol
    )
).explode('geneHGNC')

In [9]:
# Left join against the lookup table on the columns both frames share (no
# explicit key is given). Rows without a match are kept with a missing
# `geneName`; a symbol listed with several IDs produces one row per ID.
splicing_df = pd.merge(splicing_df, matching_genes, how='left')

In [10]:
# Write only the sample ID, gene symbol and gene ID columns as a
# tab-separated file, without the row index.
(
    splicing_df[['array_id', 'geneHGNC', 'geneName']]
    .to_csv(
        '/s/project/mll/sergey/MLL_data/processed/significant_splicing.tsv',
        sep='\t',
        index=False,
    )
)

In [11]:
# Display the final merged table; the notebook renders a truncated preview
# (first and last rows, elided columns) rather than the full frame.
splicing_df

Unnamed: 0,array_id,seqnames,start,end,width,strand,geneHGNC,type,pValue,padjust,...,COUNT_OVERLAPS,GENE_COUNTS_FILE,ANNOTATION,isExternal,potentialImpact,causesFrameshift,UTR_overlap,blacklist,distNearestGene,geneName
0,MLL_126369,chr2,127451524,127453521,1998,+,GYPC,jaccard,0.0,0.000003,...,True,,,False,annotatedIntron_reducedUsage,unlikely,no,False,,ENSG00000136732
1,MLL_126369,chr5,179044112,179044514,403,-,HNRNPH1,jaccard,0.0,0.000055,...,True,,,False,annotatedIntron_increasedUsage,unlikely,no,False,,ENSG00000284254
2,MLL_126369,chr5,179044112,179044514,403,-,HNRNPH1,jaccard,0.0,0.000055,...,True,,,False,annotatedIntron_increasedUsage,unlikely,no,False,,ENSG00000169045
3,MLL_126369,chr17,7126557,7126801,245,+,ACADVL,jaccard,0.0,0.002315,...,True,,,False,exonElongation,likely,no,False,,ENSG00000072778
4,MLL_126369,chrX,117738475,117739200,726,+,DOCK11,jaccard,0.0,0.004604,...,True,,,False,exonElongation,unlikely,no,False,,ENSG00000147251
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45902,MUC_00705,chr19,861957,863091,1135,+,CFD,jaccard,0.0,0.035717,...,True,,,False,annotatedIntron_reducedUsage,unlikely,no,False,,ENSG00000197766
45903,MUC_00723,chr12,57926594,57926766,173,-,DCTN2,jaccard,0.0,0.0,...,True,,,False,annotatedIntron_reducedUsage,unlikely,no,False,,ENSG00000175203
45904,MUC_00723,chr21,36253011,36259139,6129,-,RUNX1,jaccard,0.0,0.0,...,True,,,False,annotatedIntron_reducedUsage,unlikely,no,False,,ENSG00000159216
45905,MUC_00723,chr19,16254585,16259533,4949,+,HSH2D,jaccard,0.0,0.000015,...,True,,,False,annotatedIntron_reducedUsage,unlikely,5'-UTR,False,,ENSG00000196684
