In [1]:
import pandas as pd
import numpy as np

In [2]:
# Input files
# -----------
# CNV calls (tab-separated table)
cnv_tsv = '/s/project/mll/rawdata/res_from_mll/cnv_calls.txt'

# Gene coordinates: biomart export with HGNC symbols
gene_coords = '/s/project/mll/sergey/effect_prediction/promoter_mutations/genes_GRCh37.tsv.gz'

# Alternative gene table in BED format (currently not used):
#gene_coords = '/s/project/mll/sergey/effect_prediction/promoter_mutations/genes_GRCh37/genes_GRCh37.bed'

In [3]:
# Load the CNV calls: five columns are picked by position and named here, and
# skiprows=1 drops the file's first line so that these names replace it.
# chrom is pinned to str: left to type inference, a column that mixes numeric
# labels with 'X'/'Y' can come back with some values parsed as integers, and
# those rows would never equal the string chromosome labels of the gene table.
cnv_df = pd.read_csv(cnv_tsv, sep='\t',
                    usecols=[0,1,2,5,6], names=['chrom','start','end','call','array_id'], skiprows=1,
                    dtype={'chrom': str})

In [4]:
# Discard records that have no chromosome label
cnv_df = cnv_df.loc[~cnv_df['chrom'].isna()]

In [5]:
#gene_df = pd.read_csv(gene_coords, sep='\t', comment='#',
#                      names=['chrom','start','end','geneName'])

In [6]:
# Load gene coordinates (four columns picked by position). chrom is pinned to
# str so that every label has one type: with inferred types a numeric label
# could be parsed as an integer, which breaks the 'chr' stripping below and
# splits one chromosome into two groupby keys.
gene_df = pd.read_csv(gene_coords, sep='\t', comment='#',usecols=[0,2,3,8],
                      names=['chrom','start','end','geneHGNC'],
                      dtype={'chrom': str})

# Collapse to one row per (gene, chromosome): the span runs from the smallest
# start to the largest end among that gene's rows.
gene_groups = gene_df.groupby(['geneHGNC','chrom'])
tx_start = gene_groups.start.min()
tx_end = gene_groups.end.max()

gene_df = pd.concat([tx_start,tx_end],axis=1).reset_index()

# Remove any 'chr' prefix so the labels compare equal to the CNV table's chrom
gene_df.chrom = gene_df.chrom.apply(lambda x: x.replace('chr',''))

In [7]:
# Order genes by chromosome, then by start coordinate
gene_df = gene_df.sort_values(by=['chrom', 'start'])

In [8]:
# Keep only the records whose call is not the string '0'
cnv_df = cnv_df.loc[cnv_df['call'] != '0']

In [9]:
# Distinct (chrom, start, end) intervals found among the CNV records
intervals_df = cnv_df.loc[:, ['chrom', 'start', 'end']].drop_duplicates()

# Placeholder gene column, held as object dtype so that a Python list
# can be stored in each cell
intervals_df['geneHGNC'] = None
intervals_df['geneHGNC'] = intervals_df['geneHGNC'].astype(object)

In [10]:
def getOverlap(a, b):
    '''
    Length of the intersection of two intervals.

    a, b: indexable pairs (start, end).
    Returns min(end) - max(start), or 0 when that difference is negative,
    i.e. when the intervals do not intersect.
    '''
    left = max(a[0], b[0])    # later of the two starts
    right = min(a[1], b[1])   # earlier of the two ends
    return max(0, right - left)

In [11]:
# A CNV interval is assigned to a gene only when their overlap is strictly
# greater than this value (same units as the start/end coordinates).
MIN_OVERLAP = 1000

In [12]:
# Loop over copy number intervals and record which genes each one overlaps.
#
# The overlap of an interval with every gene on its chromosome is computed in a
# single vectorised NumPy step (same formula as getOverlap; a negative value
# means the two do not intersect, so it can never exceed MIN_OVERLAP). This
# avoids re-sorting a slice of the gene table and running a row-wise apply for
# every interval, and it does not rely on gene_df being sorted beforehand.

for chrom in cnv_df.chrom.unique():

    print(chrom)

    #select given chromosome
    intervals_chrom_df = intervals_df[intervals_df.chrom==chrom]
    #gene coordinates in given chromosome, ordered by gene end so that each
    #interval's gene list comes out in that order
    gene_chrom_df = gene_df[gene_df.chrom==chrom].sort_values(by='end')

    gene_starts = gene_chrom_df.start.values
    gene_ends = gene_chrom_df.end.values
    gene_names = gene_chrom_df.geneHGNC.values

    for idx, int_start, int_end in zip(intervals_chrom_df.index,
                                       intervals_chrom_df.start,
                                       intervals_chrom_df.end):
        #overlap of this interval with every gene on the chromosome
        overlap = np.minimum(gene_ends, int_end) - np.maximum(gene_starts, int_start)

        #add the list of genes for given interval (empty list if none qualifies)
        intervals_df.at[idx,'geneHGNC'] = gene_names[overlap > MIN_OVERLAP].tolist()

1
2
3
4
5
6
7
8
9
11
12
13
14
15
16
17
18
19
20
21
22
X
10
Y


In [13]:
# A list holding exactly one gene is replaced by that gene name;
# every other value (longer or empty list, non-list) is kept unchanged
intervals_df['geneHGNC'] = intervals_df['geneHGNC'].apply(
    lambda genes: genes[0] if type(genes) == list and len(genes) == 1 else genes
)

In [14]:
# Attach each interval's gene annotation to every CNV record on that interval;
# the join keys are the columns the two tables share
cnv_df = cnv_df.merge(intervals_df, on=['chrom', 'start', 'end'], how='left')

In [15]:
# One row per (CNV record, gene): list entries in geneHGNC are spread over rows
cnv_df = cnv_df.explode(column='geneHGNC')

In [16]:
# Remove the rows that ended up without a gene
cnv_df = cnv_df.loc[~cnv_df['geneHGNC'].isna()]

In [17]:
# Gene identifier / HGNC symbol table: the first two columns are read and
# named here, and the file's first line is skipped
matching_genes = pd.read_csv(
    '/s/project/mll/sergey/effect_prediction/promoter_mutations/ensemble_to_HGNC_GRCh38.tsv.gz',
    sep='\t',
    skiprows=1,
    header=None,
    usecols=[0,1],
    names=['geneName', 'geneHGNC'],
)

# Entries without an HGNC symbol cannot be joined on, so drop them
matching_genes = matching_genes.loc[~matching_genes['geneHGNC'].isna()]

In [22]:
# Left join on the HGNC symbol (the only shared column) to add geneName;
# symbols absent from matching_genes keep a missing geneName.
# NOTE(review): a symbol listed more than once in matching_genes repeats the
# CNV row once per match — confirm this is intended.
cnv_df = cnv_df.merge(matching_genes, on='geneHGNC', how='left')

In [23]:
# Display the annotated CNV table (one row per CNV record and gene)
cnv_df

Unnamed: 0,chrom,start,end,call,array_id,geneHGNC,geneName
0,1,25602001,25664000,-,MLL_16348,RHD,ENSG00000187010
1,1,25602001,25664000,-,MLL_16348,RSRP1,ENSG00000117616
2,1,53593001,53596000,-,MLL_16348,SLC1A7,ENSG00000162383
3,1,142540001,142838000,+,MLL_16348,ANKRD20A12P,
4,1,149037001,149433000,-,MLL_16348,LOC105369140,
...,...,...,...,...,...,...,...
7136769,22,16051001,16864000,+,MLL_62407,DUXAP8,ENSG00000206195
7136770,22,16051001,16864000,+,MLL_62407,PSLNR,ENSG00000225255
7136771,22,16051001,16864000,+,MLL_62407,POTEH-AS1,ENSG00000236666
7136772,22,16051001,16864000,+,MLL_62407,POTEH,ENSG00000198062


In [31]:
#take a subsample to check overlaps

# Unique (interval, gene) pairs. At most 10000 of them are drawn, so the cell
# also works when fewer pairs exist, and the seed is fixed so that the check
# gives the same result on every run.
short_cnv = cnv_df[['chrom','start','end','geneHGNC']].drop_duplicates()
short_cnv = short_cnv.sample(n=min(10000, len(short_cnv)), random_state=0)

In [32]:
# Gene spans indexed by (gene symbol, chromosome) for direct lookup
df = gene_df.set_index(keys=['geneHGNC', 'chrom'])

In [34]:
#check overlaps

# For every sampled (interval, gene) pair, look up the gene span in df and flag
# whether its overlap with the interval exceeds MIN_OVERLAP
overlaps = short_cnv.groupby('geneHGNC').apply(
    lambda gene_rows: gene_rows.apply(
        lambda rec: getOverlap(
            (rec.start, rec.end),
            df.loc[(gene_rows.name, rec.chrom)].values.tolist()
        ) > MIN_OVERLAP,
        axis=1
    )
)


In [35]:
# Fraction of sampled pairs that pass the overlap threshold
# (1.0 means every sampled pair passed)
overlaps.mean()

1.0

In [36]:
# Write the annotated CNV table as tab-separated text, without the row index
cnv_df.to_csv(
    '/s/project/mll/sergey/MLL_data/processed/cnv_mll.tsv',
    sep='\t',
    index=False,
)