Find corresponding human 3'UTR regions for each Oligo variant from Griesemer et al. paper:

https://www.sciencedirect.com/science/article/pii/S0092867421009995

* consider only SNPs
* apparently, in Griesemer not only protein-coding genes were considered, so we loose some variants

In [1]:
import pandas as pd
import numpy as np

In [2]:
datadir = '/s/project/mll/sergey/effect_prediction/MLM/'

In [3]:
#clean human 3'UTR, see GRCh38_3_prime_UTR_clean.ipynb

human_utr_df = pd.read_csv(datadir + 'UTR_coords/GRCh38_3_prime_UTR_clean.bed', sep='\t', 
                       names = ['chrom','human_UTR_start','human_UTR_end','UTR_ID',
                               'score','strand','transcript_ID','canonical','HGNC_Symbol','UTR_len'])

human_utr_df.drop(columns=['score','canonical','HGNC_Symbol','transcript_ID','UTR_len'], inplace=True)

In [4]:
human_utr_df.sort_values(by=['chrom','human_UTR_start'], inplace=True) #IMPORTANT to use searchsorted function below

human_utr_df.head()

Unnamed: 0,chrom,human_UTR_start,human_UTR_end,UTR_ID,strand
564,chr1,70008,71585,ENST00000641515.2_utr3_2_0_chr1_70009_f,+
565,chr1,944153,944574,ENST00000616016.5_utr3_13_0_chr1_944154_f,+
566,chr1,944202,944693,ENST00000327044.7_utr3_18_0_chr1_944203_r,-
567,chr1,965191,965719,ENST00000338591.8_utr3_11_0_chr1_965192_f,+
568,chr1,974575,975865,ENST00000379410.8_utr3_15_0_chr1_974576_f,+


In [5]:
# original table from paper supplementary

oligo_df = pd.read_csv(datadir + 'griesemer/paper_supplementary/Oligo_Variant_Info.txt', sep = '\t', dtype={'chrom':'object'})
oligo_df = oligo_df[oligo_df.oligo_starts.str.contains(',')!=True] #remove cases when 3'UTR spans non-contiguous regions
oligo_df.rename(columns={'ref_allele':'ref', 'alt_allele':'alt'}, inplace=True)
oligo_df.drop(columns=['oligo_starts', 'oligo_ends', 'var_start', 'var_end', 'genes', 'transcripts', 'gene_symbols', 'strand'], inplace = True) #will be replaced by GRCh38 columns
oligo_df.chrom = oligo_df.chrom.apply(lambda x:'chr'+x)

In [6]:
#positions from the original table, lifted over to GRCh38 

varpos_GRCh38_bed = pd.read_csv(datadir + 'griesemer/liftover/varpos_GRCh38.bed', 
                            sep='\t',names=['chrom','var_start','var_end','ref','alt','score','variant_id']).drop(columns='score') #supplementary info to the paper

oligopos_GRCh38_bed = pd.read_csv(datadir + 'griesemer/liftover/oligopos_GRCh38.bed', 
                            sep='\t',names=['chrom','oligo_start','oligo_end','oligo_id']) #supplementary info to the paper

oligo_df = oligo_df.merge(varpos_GRCh38_bed, how='left')
oligo_df = oligo_df.merge(oligopos_GRCh38_bed, how='left')

In [7]:
#assign oligos with alternative background oligo_start and oligo_end coordinates of the normal oligo

alt_background = ~oligo_df.other_var_in_oligo_window.isna()

varID_to_oligo_coords = oligo_df.loc[~alt_background,['mpra_variant_id', 'oligo_start','oligo_end']].drop_duplicates()
varID_to_oligo_coords.mpra_variant_id = varID_to_oligo_coords.mpra_variant_id + '_2'

merge_df = oligo_df.loc[alt_background,['mpra_variant_id']].merge(varID_to_oligo_coords, how = 'left')

oligo_df.loc[alt_background,'oligo_start'] =  merge_df.oligo_start.values
oligo_df.loc[alt_background,'oligo_end'] =  merge_df.oligo_end.values

In [8]:
oligo_df = oligo_df[(~oligo_df.var_start.isna()) & (~oligo_df.oligo_start.isna())] #remove variants with unmapped start positions (failed liftover)

In [9]:
# for each oligo, find human UTR region containing this oligo

res = []

for chrom in oligo_df.chrom.unique():
    chrom_utr_df = human_utr_df[human_utr_df.chrom==chrom]
    for _, row in oligo_df[oligo_df.chrom==chrom].iterrows():
        oligo_start, oligo_end = row.oligo_start, row.oligo_end
        utr_idx = np.searchsorted(chrom_utr_df.human_UTR_start,oligo_start,'right')-1
        while utr_idx<len(chrom_utr_df) and oligo_start>=chrom_utr_df.iloc[utr_idx].human_UTR_start:
            if oligo_end<=chrom_utr_df.iloc[utr_idx].human_UTR_end:
                row['UTR_ID'] = chrom_utr_df.iloc[utr_idx].UTR_ID
                res.append(row)
                break
            utr_idx+=1

        #else:
        #    print(row.chrom, row.var_start)

In [10]:
utr_variants = pd.DataFrame(res).merge(human_utr_df, how='left')

In [11]:
#utr_variants.to_csv(datadir + 'griesemer/GRCh38_UTR_variants.tsv', index=None, sep='\t')

In [74]:
stop_codon_dist = utr_variants.apply(lambda x: x.var_start-x.human_UTR_start if x.strand=='+' else
                  x.human_UTR_end - x.var_start, axis=1)

In [78]:
stop_codon_dist.describe()

count    18376.000000
mean      1690.222464
std       1960.815881
min          0.000000
25%        413.000000
50%       1046.000000
75%       2213.000000
max      21917.000000
dtype: float64

In [79]:
utr_variants

Unnamed: 0,mpra_variant_id,tag,oligo_id,variant_id,chrom,ref,alt,other_var_in_oligo_window,var_start,var_end,oligo_start,oligo_end,UTR_ID,human_UTR_start,human_UTR_end,strand,stop_codon_dist
0,1_114240320_AC_5'_End,alt,1_114240320_AC_5'_End_alt,1_114240320_AC,chr1,A,AC,,113697697.0,113697698.0,113697604.0,113697704.0,ENST00000369604.6_utr3_18_0_chr1_113696831_r,113696830,113697704,-,7.0
1,1_114240320_AC_5'_End,ref,1_114240320_AC_5'_End_ref,1_114240320_AC,chr1,A,AC,,113697697.0,113697698.0,113697604.0,113697704.0,ENST00000369604.6_utr3_18_0_chr1_113696831_r,113696830,113697704,-,7.0
2,1_1403972_CG,ref,1_1403972_CG_ref,1_1403972_CG,chr1,C,CG,,1468591.0,1468592.0,1468542.0,1468642.0,ENST00000378785.7_utr3_11_0_chr1_1468531_f,1468530,1470163,+,61.0
3,1_1403972_CG,alt,1_1403972_CG_alt,1_1403972_CG,chr1,C,CG,,1468591.0,1468592.0,1468542.0,1468642.0,ENST00000378785.7_utr3_11_0_chr1_1468531_f,1468530,1470163,+,61.0
4,1_156436092_CA,alt,1_156436092_CA_alt,1_156436092_CA,chr1,C,CA,,156466299.0,156466300.0,156466250.0,156466350.0,ENST00000348159.9_utr3_11_0_chr1_156463727_r,156463726,156467644,-,1345.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18375,rs964617,alt,rs964617_alt,rs964617,chr13,C,T,,113099214.0,113099215.0,113099164.0,113099265.0,ENST00000535094.7_utr3_29_0_chr13_113096860_f,113096859,113099742,+,2355.0
18376,rs9652173,alt,rs9652173_alt,rs9652173,chr13,C,T,,75525873.0,75525874.0,75525823.0,75525924.0,ENST00000682242.1_utr3_3_0_chr13_75525214_r,75525213,75526588,-,715.0
18377,rs9652173,ref,rs9652173_ref,rs9652173,chr13,C,T,,75525873.0,75525874.0,75525823.0,75525924.0,ENST00000682242.1_utr3_3_0_chr13_75525214_r,75525213,75526588,-,715.0
18378,rs9943,alt,rs9943_alt,rs9943,chr13,A,G,,39752144.0,39752145.0,39752094.0,39752195.0,ENST00000455146.8_utr3_18_0_chr13_39751094_f,39751093,39752628,+,1051.0
