### Modules

In [None]:
import os, pybedtools
import numpy as np, pandas as pd
from Bio import SeqIO
from collections import OrderedDict
from Bio.Seq import IUPACData
from itertools import product

### Transcript coordinates to genome coordinates

In [None]:
### input paths: replace the /PATH/TO/... placeholders before running
tx_file = '/PATH/TO/TX/TABLE/enst_hg38Tables.tsv' # transcript annotation table (tab-separated); available in the m6ATM repository: data/enst_hg38Tables.tar.gz
ref_tx = '/PATH/TO/TX/REF/GRCh38_rna_ensembl.fa' # reference transcriptome FASTA from Ensembl
ref_gn = '/PATH/TO/GENOME/REF/hg38.fa' # reference genome FASTA

In [None]:
def tx_to_gn(results, tx_df, ref_dict_gn):
    """Project transcript-level predictions onto genome sites and merge them.

    Parameters
    ----------
    results : pandas.DataFrame
        One row per transcript site. The columns 'transcript', 'position',
        'motif', 'probability', 'ratio', 'm6a' and 'coverage' are used.
    tx_df : pandas.DataFrame
        Transcript annotation joined on its 'name' column. 'chrom', 'strand',
        'exonStarts', 'exonEnds' and 'name2' are used downstream.
    ref_dict_gn : dict
        Chromosome name -> genome sequence.

    Returns
    -------
    pandas.DataFrame
        One row per genome site ('gn_site' = '<chrom>_<1-based position>').
        'tx_count' holds the number of transcript rows merged into the site.
    """

    ### settings
    n_kmer = 5
    margin = int((n_kmer-1)*0.5) # bases on each side of the centre of a 5-mer
    shift_range = range(-2, 3) # offsets tried when the genome k-mer disagrees

    ### attach the annotation, keep only chromosomes present in the reference
    annotated = results.merge(tx_df, how = 'left', left_on = 'transcript', right_on = 'name')
    known_chroms = list(ref_dict_gn.keys())
    annotated = annotated[annotated.chrom.isin(known_chroms)]

    ### genome conversion: get_gn_pos returns '<index>_<kmer>' per row
    results_gn = annotated.copy()
    results_gn['gn'] = results_gn.apply(get_gn_pos, ref_dict_gn = ref_dict_gn, margin = margin, axis = 1)
    gn_fields = [encoded.split('_') for encoded in results_gn['gn']]
    results_gn['gn_pos'] = [int(fields[0]) for fields in gn_fields]
    results_gn['gn_motif'] = [fields[1] for fields in gn_fields]

    ### rows whose genome k-mer disagrees get a second chance with small shifts
    motif_ok = results_gn['motif'] == results_gn['gn_motif']
    results_true = results_gn.loc[motif_ok]
    results_false = results_gn.loc[~motif_ok]

    results_fixed = try_shift(results_false, ref_dict_gn, shift_range = shift_range, margin = margin)
    if len(results_fixed)>0:
        results_gn = pd.concat([results_true, results_fixed], axis = 0)
    else:
        results_gn = results_true

    ### add the 1-based position and the site identifier
    results_gn = results_gn.reset_index(drop = True)
    results_gn['gn_pos_1'] = [zero_based+1 for zero_based in results_gn['gn_pos']]
    results_gn['gn_site'] = [f'{chrom}_{one_based}'
                             for chrom, one_based in zip(results_gn['chrom'], results_gn['gn_pos_1'])]

    ### collapse every transcript row that landed on the same genome site
    def join_csv(values):
        return ','.join(values)

    site_agg = {
        'transcript': join_csv,
        'position': merge_int,
        'motif': 'first',
        'probability': 'mean',
        'ratio': 'mean',
        'm6a': join_csv,
        'name2': 'first',
        'gn_pos': 'first',
        'gn_pos_1': 'first',
        'chrom': 'first',
        'strand': 'first',
        'coverage': 'sum',
        'gn_motif': 'count', # number of merged rows, renamed to tx_count below
    }
    merged = results_gn.groupby('gn_site', as_index = False).agg(site_agg)
    merged = merged.rename(columns = {'gn_motif': 'tx_count'})

    return merged

def get_gn_pos(row, ref_dict_gn, margin):
    """Map a transcript position to a genome index and fetch the k-mer there.

    Exons are walked in transcription order (reversed for the '-' strand)
    while exon lengths are accumulated, until the exon containing the
    position is reached. Exons are treated as half-open [start, end)
    intervals.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row)
        Must provide 'chrom', 'strand', 'position' (0-based transcript
        position), 'exonStarts' and 'exonEnds' (comma-separated integers; a
        trailing comma is accepted but not required).
    ref_dict_gn : dict
        Chromosome name -> genome sequence. For '-' strand rows the sliced
        sequence must provide reverse_complement().
    margin : int
        Number of bases taken on each side of the mapped index.

    Returns
    -------
    str
        '<genome index>_<k-mer>', the k-mer being upper-case and in
        transcript orientation.

    Raises
    ------
    ValueError
        If the strand is neither '+' nor '-', or the position lies beyond
        the total exon length of the transcript.
    """

    chrom = row['chrom']
    strand = row['strand']
    pos = row['position'] # based on transcriptome: 0-start

    if strand not in ('+', '-'):
        raise ValueError(f'unsupported strand {strand!r} on {chrom}')

    ### exon info (empty fields, e.g. after a trailing comma, are skipped)
    exons_start = [int(i) for i in row['exonStarts'].split(',') if i.strip()]
    exons_end = [int(i) for i in row['exonEnds'].split(',') if i.strip()]
    exons = list(zip(exons_start, exons_end))
    if strand == '-':
        exons.reverse() # transcription runs from the last exon to the first

    ### tx to gn conversion
    exon_len = 0
    for x, y in exons:
        exon_len_last = exon_len
        exon_len += (y-x)
        if (pos+1)<=exon_len:
            offset = pos-exon_len_last # distance into this exon
            gn_index = x+offset if strand == '+' else y-offset-1
            break
    else:
        raise ValueError(f'transcript position {pos} is beyond the total exon length ({exon_len}) on {chrom}')

    kmer = ref_dict_gn[chrom][gn_index-margin:gn_index+margin+1]
    if strand == '-':
        kmer = kmer.reverse_complement()
    kmer = ''.join(kmer.upper())

    return str(gn_index)+'_'+kmer

def try_shift(false_table, ref_dict_gn, shift_range = range(-2,3), margin = 2):
    """Try to rescue rows whose genome k-mer disagrees with their motif.

    For every offset in shift_range, 'gn_pos' is shifted by that offset and
    the genome k-mer around the new position is re-read. Rows whose re-read
    k-mer equals 'motif' are kept, with 'gn_pos' and 'gn_motif' updated. A row
    matching at several offsets appears once per matching offset.

    The k-mer is reverse-complemented for rows whose 'strand' is '-', so it
    is compared in transcript orientation, the same as get_gn_pos does.
    Tables without a 'strand' column are treated as forward strand.

    Parameters
    ----------
    false_table : pandas.DataFrame
        Needs the columns 'chrom', 'gn_pos' and 'motif'; 'strand' is optional.
    ref_dict_gn : dict
        Chromosome name -> genome sequence.
    shift_range : iterable of int
        Offsets added to 'gn_pos'.
    margin : int
        Number of bases taken on each side of the shifted position.

    Returns
    -------
    pandas.DataFrame or list
        The rescued rows, or an empty list when no row could be rescued
        (kept as a list for backward compatibility; check it with len()).
    """

    complement = str.maketrans('ACGTacgt', 'TGCAtgca')
    has_strand = 'strand' in false_table.columns

    def genome_kmer(chrom, pos, strand):
        # read the window in transcript orientation
        window = ref_dict_gn[chrom][int(pos)-margin:int(pos)+margin+1]
        if strand == '-':
            if hasattr(window, 'reverse_complement'):
                window = window.reverse_complement()
            else:
                # plain strings: reverse, then complement base by base
                window = str(window)[::-1].translate(complement)
        return str(window).upper()

    t_table = []
    for shift in shift_range:

        table = false_table.copy()
        table['gn_pos'] = [int(i+shift) for i in table['gn_pos'].tolist()]
        strands = table['strand'] if has_strand else ['+']*len(table)
        table['gn_motif'] = [genome_kmer(chrom, pos, strand)
                             for chrom, pos, strand in zip(table['chrom'], table['gn_pos'], strands)]

        # keep the rows that now agree with the transcript motif
        t_idx = [x == y for x, y in zip(table['motif'], table['gn_motif'])]
        t_row = table.loc[t_idx,:]

        if t_row.shape[0]>0:
            t_table.append(t_row)

    if len(t_table)>0:
        t_table = pd.concat(t_table, axis = 0)

    return t_table

def merge_int(x):
    """Return the items of x converted to strings and joined with commas."""

    return ','.join(str(item) for item in x)

def get_ref_dict(ref_path):
    """Read a FASTA file into an ordered mapping of record id -> sequence.

    Records keep the order in which they appear in the file; when an id
    occurs more than once, the last sequence read is the one stored.
    """

    return OrderedDict(
        (entry.id, entry.seq) for entry in SeqIO.parse(ref_path, 'fasta')
    )

def add_kmer(row):
    """Return the 5-mer around a row's site, read from the module-level ref_dict_tx.

    1 is subtracted from row['position'] before slicing, and two bases are
    taken on each side of the resulting index in the sequence of
    row['transcript'].
    """

    center = row['position']-1
    transcript_seq = ref_dict_tx[row['transcript']]
    window = transcript_seq[center-2:center+3]

    return ''.join(window)

def extend_ambiguous(seq):
    """Expand a sequence of IUPAC ambiguity codes into every concrete sequence.

    Each symbol is looked up in IUPACData.ambiguous_dna_values and the
    Cartesian product of the per-position alternatives is returned as a list
    of strings.
    """

    code_table = IUPACData.ambiguous_dna_values
    alternatives = [code_table.get(symbol) for symbol in seq]

    return [''.join(combo) for combo in product(*alternatives)]

In [None]:
### load reference files
# transcript annotation table (tab-separated)
tx_df = pd.read_csv(tx_file, sep = '\t')
# keep only the part of each transcript name before the first '.'
# NOTE(review): presumably this drops the version suffix so names match the stripped 'transcript' IDs merged in tx_to_gn; confirm
tx_df['name'] = [i.split('.')[0] for i in tx_df['name']]
# FASTA records keyed by record id; ref_dict_tx is read as a global by add_kmer
ref_dict_tx = get_ref_dict(ref_tx)
ref_dict_gn = get_ref_dict(ref_gn)
# every concrete 5-mer covered by the IUPAC pattern 'DRACH'
DRACH = extend_ambiguous('DRACH')

In [None]:
### m6ATM
# the CSV is passed to tx_to_gn as-is, so it must already carry the columns tx_to_gn uses
results = pd.read_csv('../data/hek293tx_m6atm.csv')
results_gn = tx_to_gn(results, tx_df, ref_dict_gn)

### m6Anet
pred_m6anet = pd.read_csv('../data/hek293tx_m6anet.csv')
# rename the columns to the names tx_to_gn works with
# NOTE(review): assumes the CSV has exactly these six columns in this order; confirm
pred_m6anet.columns = ['transcript', 'position', 'coverage', 'probability', 'motif', 'ratio']
# placeholder for the 'm6a' column that tx_to_gn aggregates
pred_m6anet['m6a'] = '-'
# keep only the part of the transcript ID before the first '.'
pred_m6anet['transcript'] = [i.split('.')[0] for i in pred_m6anet['transcript']]

pred_m6anet_gn = tx_to_gn(pred_m6anet, tx_df, ref_dict_gn)

### m6aBasecaller
pred_m6abase = pd.read_csv('../data/hek293tx_m6abase.tsv', sep = '\t')
pred_m6abase.columns = ['transcript', 'position', 'ref_base', 'strand', 'm6a', 'coverage', 'base_accuracy', 'ratio', 'probability']
# drop 'strand' and 'base_accuracy'; tx_to_gn takes 'strand' from the annotation table
pred_m6abase = pred_m6abase.loc[:,['transcript', 'position', 'coverage', 'probability', 'ratio', 'm6a', 'ref_base']]
# keep only sites whose reference base is A
pred_m6abase = pred_m6abase[pred_m6abase['ref_base'] == 'A']
# missing probabilities become 0
pred_m6abase['probability'] = pred_m6abase['probability'].replace(np.nan, 0)
# 5-mer around each site, read from ref_dict_tx using the unmodified transcript IDs
pred_m6abase['motif'] = pred_m6abase.apply(add_kmer, axis = 1)
# keep only sites whose 5-mer is one of the DRACH expansions
pred_m6abase = pred_m6abase[pred_m6abase['motif'].isin(DRACH)]
# keep only the part of the transcript ID before the first '.' (done after add_kmer, which uses the full ID)
pred_m6abase['transcript'] = [i.split('.')[0] for i in pred_m6abase['transcript']]

pred_m6abase_gn = tx_to_gn(pred_m6abase, tx_df, ref_dict_gn)

### Tombo
# headerless tab-separated file; column names are assigned by hand
pred_tombo = pd.read_csv('../data/hek293tx_tombo.bed', sep = '\t', header = None)
pred_tombo.columns = ['transcript', 'start', 'position', 'id', 'probability']
# placeholders for the 'ratio' and 'm6a' columns that tx_to_gn aggregates
pred_tombo['ratio'] = 0
pred_tombo['m6a'] = '-'
# 5-mer around each site, read from ref_dict_tx using the unmodified transcript IDs
pred_tombo['motif'] = pred_tombo.apply(add_kmer, axis = 1)

# attach coverage by intersecting the predictions with the coverage bedgraph
pred_tombo_bed = pybedtools.BedTool.from_dataframe(df = pred_tombo)
coverage_bed = pybedtools.BedTool('../data/hek293_tombo.coverage.plus.bedgraph')
pred_tombo_bed = pred_tombo_bed.intersect(coverage_bed, wb = True, wa = True)
pred_tombo_bed = pred_tombo_bed.to_dataframe()

# NOTE(review): 'blockStarts' looks like the default name of the 12th column, which here would be
# the bedgraph value (8 prediction columns + 4 bedgraph columns); confirm
# NOTE(review): this assignment aligns on the row index, so it assumes the intersect returns exactly
# one row per prediction, in the original order; TODO confirm
pred_tombo['coverage'] = pred_tombo_bed['blockStarts']
# keep only the part of the transcript ID before the first '.' (done after add_kmer, which uses the full ID)
pred_tombo['transcript'] = [i.split('.')[0] for i in pred_tombo['transcript']]

pred_tombo_gn = tx_to_gn(pred_tombo, tx_df, ref_dict_gn)

### MINES
# headerless tab-separated file; column names are assigned by hand
# NOTE(review): the strand column is named '_strand', presumably so it does not clash with the
# 'strand' column merged in from the annotation table by tx_to_gn; confirm
pred_mines = pd.read_csv('../data/hek293tx_mines.bed', sep = '\t', header = None)
pred_mines.columns = ['transcript', 'start', 'position', 'motif', 'id', '_strand', 'probability', 'coverage']
# '.' entries become 0 so the column can be cast to float
pred_mines['probability'] = pred_mines['probability'].replace('.', 0)
pred_mines['probability'] = pred_mines['probability'].astype('float')
# placeholders for the 'ratio' and 'm6a' columns that tx_to_gn aggregates
pred_mines['ratio'] = 0
pred_mines['m6a'] = '-'
# keep only the part of the transcript ID before the first '.'
pred_mines['transcript'] = [i.split('.')[0] for i in pred_mines['transcript']]

pred_mines_gn = tx_to_gn(pred_mines, tx_df, ref_dict_gn)