Find the corresponding human 3'UTR region for each oligo variant from the Griesemer et al. paper:

https://www.sciencedirect.com/science/article/pii/S0092867421009995

* consider only SNPs
* apparently, Griesemer et al. did not consider only protein-coding genes, so we lose some variants

In [1]:
import pandas as pd
import numpy as np

In [2]:
# base directory of the Griesemer MPRA data; every input and output path in this notebook is built relative to it
datadir = '/lustre/groups/epigenereg01/workspace/projects/vale/mlm/mpra/griesemer_2021/'

In [22]:
# Load the cleaned human 3'UTR table (see GRCh38_3_prime_UTR_clean.ipynb).
# Column names are supplied explicitly when reading the BED file.

utr_bed_columns = ['chrom','human_UTR_start','human_UTR_end','UTR_ID',
                   'score','strand','transcript_ID','canonical','HGNC_Symbol','UTR_len']

# columns that are not needed for matching oligos to UTRs
unused_utr_columns = ['score','canonical','HGNC_Symbol','transcript_ID','UTR_len']

human_utr_df = pd.read_csv(datadir + '../../UTR_coords/GRCh38_3_prime_UTR_clean.bed',
                           sep='\t', names=utr_bed_columns)

human_utr_df = human_utr_df.drop(columns=unused_utr_columns)

In [23]:
# preview the first rows of the UTR table to check the remaining columns
human_utr_df.head()

Unnamed: 0,chrom,human_UTR_start,human_UTR_end,UTR_ID,strand
0,chr1,67092164,67093004,ENST00000684719.1_utr3_7_0_chr1_67092165_r,-
1,chr1,8352403,8355086,ENST00000400908.7_utr3_22_0_chr1_8352404_r,-
2,chr1,75202128,75203726,ENST00000370859.8_utr3_23_0_chr1_75202129_r,-
3,chr1,83865023,83869961,ENST00000260505.13_utr3_20_0_chr1_83865024_r,-
4,chr1,92246401,92246529,ENST00000370360.8_utr3_18_0_chr1_92246402_r,-


In [18]:
# Load the oligo/variant table from the paper supplementary and bring it
# into the layout used in the rest of the notebook.

oligo_df = pd.read_csv(datadir + 'griesemer_supplementary/Oligo_Variant_Info.txt',
                       sep='\t', dtype={'chrom':'object'})

# a comma in oligo_starts means the 3'UTR spans non-contiguous regions: remove these cases
# (rows with a missing oligo_starts value are kept)
oligo_df = oligo_df[~oligo_df.oligo_starts.str.contains(',', na=False)]

# these columns will be replaced by GRCh38 columns
replaced_by_GRCh38 = ['oligo_starts', 'oligo_ends', 'var_start', 'var_end',
                      'genes', 'transcripts', 'gene_symbols', 'strand']

oligo_df = (oligo_df
            .rename(columns={'ref_allele':'ref', 'alt_allele':'alt'})
            .drop(columns=replaced_by_GRCh38))

# prefix chromosome names with 'chr'
oligo_df['chrom'] = oligo_df['chrom'].apply(lambda name: 'chr' + name)

In [19]:
# Attach GRCh38 coordinates: positions from the original table, lifted over to GRCh38

varpos_columns = ['chrom','var_start','var_end','ref','alt','score','variant_id']
oligopos_columns = ['chrom','oligo_start','oligo_end','oligo_id']

# variant positions (supplementary info to the paper); the score column is not needed
varpos_GRCh38_bed = pd.read_csv(datadir + 'preprocessing/liftover/varpos_GRCh38.bed',
                                sep='\t', names=varpos_columns)
varpos_GRCh38_bed = varpos_GRCh38_bed.drop(columns='score')

# oligo positions
oligopos_GRCh38_bed = pd.read_csv(datadir + 'preprocessing/liftover/oligopos_GRCh38.bed',
                                  sep='\t', names=oligopos_columns)

# left merges on the columns shared with oligo_df, so rows without a lifted position are kept
oligo_df = (oligo_df
            .merge(varpos_GRCh38_bed, how='left')
            .merge(oligopos_GRCh38_bed, how='left'))

In [20]:
# Oligos with an alternative background get the oligo_start and oligo_end
# coordinates of the corresponding normal oligo.

# True for rows that have another variant in the oligo window
alt_background = oligo_df.other_var_in_oligo_window.notna()

# coordinates of the normal oligos, keyed by the oligo_id with the '_2' suffix appended
varID_to_oligo_coords = (oligo_df.loc[~alt_background, ['oligo_id', 'oligo_start', 'oligo_end']]
                         .drop_duplicates()
                         .assign(oligo_id=lambda coords: coords.oligo_id + '_2'))

# the left merge keeps the row order of the alternative-background oligos,
# so the looked-up coordinates can be written back by position
merge_df = oligo_df.loc[alt_background, ['oligo_id']].merge(varID_to_oligo_coords, how='left')

oligo_df.loc[alt_background, 'oligo_start'] = merge_df['oligo_start'].values
oligo_df.loc[alt_background, 'oligo_end'] = merge_df['oligo_end'].values

In [21]:
# keep only rows where both the variant start and the oligo start are present
# (a missing position means the liftover failed)
oligo_df = oligo_df.dropna(subset=['var_start', 'oligo_start'])

In [None]:
# IMPORTANT: the searchsorted lookup below needs the UTR table ordered by chromosome and start
human_utr_df = human_utr_df.sort_values(['chrom', 'human_UTR_start'])

In [30]:
# For each oligo, find a human UTR region containing this oligo, i.e. with
# human_UTR_start <= oligo_start and oligo_end <= human_UTR_end.
# Matched rows get a UTR_ID field and are collected in res;
# oligos that are not contained in any UTR are left out.

res = []

for chrom in oligo_df.chrom.unique():
    # sort locally so that searchsorted is valid even if human_utr_df was not sorted beforehand;
    # the stable sort leaves an already sorted table in the same order
    chrom_utr_df = human_utr_df[human_utr_df.chrom==chrom].sort_values(by='human_UTR_start', kind='stable')
    if chrom_utr_df.empty:
        continue #no UTRs on this chromosome, so none of its oligos can be matched

    # plain arrays, extracted once per chromosome instead of using iloc for every oligo
    utr_starts = chrom_utr_df.human_UTR_start.values
    utr_ends = chrom_utr_df.human_UTR_end.values
    utr_ids = chrom_utr_df.UTR_ID.values

    for _, row in oligo_df[oligo_df.chrom==chrom].iterrows():
        oligo_start, oligo_end = row.oligo_start, row.oligo_end
        #UTRs [0, n_candidates) are exactly those starting at or before the oligo start
        n_candidates = np.searchsorted(utr_starts, oligo_start, 'right')
        #UTRs can overlap, so every candidate has to be checked, not only the one with the closest start
        containing = np.flatnonzero(utr_ends[:n_candidates]>=oligo_end)
        if len(containing)>0:
            #among the containing UTRs take the one with the largest start
            row['UTR_ID'] = utr_ids[containing[-1]]
            res.append(row)

In [31]:
# build a table from the matched oligo rows, then add the UTR columns
# with a left merge on the columns shared with human_utr_df
matched_oligos_df = pd.DataFrame(res)
utr_variants = matched_oligos_df.merge(human_utr_df, how='left')

In [33]:
# write the matched variants as a tab-separated file, without the index column
utr_variants.to_csv(
    datadir + 'preprocessing/GRCh38_UTR_variants.tsv',
    sep='\t',
    index=False,
)