Analyse MAF alignment of stop codons for each transcript

For each transcript:

* take species whose stop codon is contiguous and matches the human stop codon
* if the MAF alignment strand (species strand) is positive, the 3'UTR lies to the right of the stop codon for a positive-strand gene or to the left of it for a negative-strand gene. In the latter case, the extracted sequence should be reverse complemented
* if the MAF alignment strand is negative, take the original sequence for negative-strand genes and the reverse complement for positive-strand genes

In [1]:
import pandas as pd
import os
from collections import defaultdict
from pathlib import Path

In [17]:
#clean human 3'UTR annotation, see GRCh38_3_prime_UTR_clean.ipynb
#column names are passed explicitly, so every line of the BED file is read as data

human_utr_df = pd.read_csv(
    '/s/project/mll/sergey/MLM/UTR_coords/GRCh38_3_prime_UTR_clean.bed',
    sep='\t',
    names=[
        'chrom', 'human_UTR_start', 'human_UTR_end', 'UTR_ID', 'score',
        'strand', 'transcript_ID', 'canonical', 'HGNC_Symbol', 'UTR_len',
    ],
)

#rows are looked up by UTR_ID later on
human_utr_df = human_utr_df.set_index('UTR_ID')

In [18]:
#stop codon alignment for each transcript, see hal_to_maf_array.sh
data_dir = '/s/project/mll/sergey/MLM/exp/600_way_stop_codon/maf/'

#every *.maf file found recursively below data_dir
stop_codon_mafs = list(Path(data_dir).rglob("*.maf"))

In [19]:
def extract_msa(maf_file):
    
    '''
    Get sequences with contiguous alignment around human stop codon for a given transcript

    Only contigs that occur in the first alignment block of the file are tracked.
    A sequence from a later block is appended to a tracked contig only if it lies
    on the same strand and starts exactly where the previous fragment ended;
    otherwise the contig keeps the sequence collected so far.

    Parameters:
        maf_file: path to the MAF file (anything accepted by open())

    Returns:
        msa: defaultdict(dict), species_contig -> dict with keys
             'seq' (concatenated aligned sequence, as written in the file),
             'next_pos' (start + size of the last appended fragment),
             'strand' (strand field of the first block)
        contig_lenghts: dict, species_contig -> total contig length (int)
    '''

    msa = defaultdict(dict)

    contig_lenghts = dict() #total lengths of contigs for different species

    first_block = True #current MSA block is the first in the file

    with open(maf_file, 'r') as f:
        for line in f:
            if line.startswith('s'):
                #sequence line: s <species.contig> <start> <size> <strand> <contig length> <sequence>
                _, species_contig, start, seq_len, strand, contig_length, seq = line.split()
                start, seq_len = int(start), int(seq_len)
                if first_block:
                    msa[species_contig]['seq'] = seq
                    msa[species_contig]['next_pos'] = start+seq_len #expected start position in the next alignment block
                    msa[species_contig]['strand'] = strand
                    contig_lenghts[species_contig] = int(contig_length)
                elif species_contig in msa:
                    entry = msa[species_contig]
                    #extend sequence only if the same contig is already seen in the 1st block
                    #and the fragment continues it on the same strand, i.e. alignment is contiguous.
                    #Positions on different strands are not comparable, so a strand switch
                    #is never treated as a continuation even if the numbers happen to match
                    if entry['strand'] == strand and entry['next_pos'] == start:
                        entry['seq'] += seq
                        entry['next_pos'] = start+seq_len
            elif len(msa)>0:
                #first non-sequence line after some sequences were read closes the first block
                first_block = False
            
    return msa,contig_lenghts

In [20]:
def get_utr_coords(UTR_ID,msa,contig_lenghts):
    
    '''
    Get 3'UTR coordinates for all species whose stop codon equals the human one

    The UTR length is the human UTR length, clipped to the contig boundaries;
    start (end) of UTR corresponds to end (start) of the stop codon.
    Positions of '-' strand alignments are converted with contig_length - position.

    Parameters:
        UTR_ID: ID of the human 3'UTR, looked up in the index of the global human_utr_df
        msa: species_contig -> {'seq','next_pos','strand'}, as returned by extract_msa
        contig_lenghts: species_contig -> total contig length

    Returns:
        list of tuples
        (HGNC_Symbol, UTR_ID, UTR_len, gene_strand, species, contig, utr_start, utr_end, species_strand);
        empty if UTR_ID is unknown, if msa has no Homo_sapiens entry
        or if the human sequence is not a stop codon
    '''
    
    res = []

    if UTR_ID not in human_utr_df.index:
        return res

    human_utr = human_utr_df.loc[UTR_ID] #single lookup of the annotation row
    gene_strand = human_utr.strand
    UTR_len = human_utr.UTR_len
    HGNC_Symbol = human_utr.HGNC_Symbol

    #for negative genes the aligned sequence shows the reverse complement of the stop codon
    if gene_strand=='+':
        stop_codons = ('TAG','TAA','TGA')
    else:
        stop_codons = ('CTA','TTA','TCA')

    homo_sapiens_key = next((k for k in msa.keys() if k.startswith('Homo_sapiens')), None)

    if homo_sapiens_key is None:
        #no human sequence in the alignment (e.g. empty MAF file): nothing to compare with
        return res

    human_codon = msa[homo_sapiens_key]['seq'].upper()
                    
    if human_codon not in stop_codons:
        #if human stop codon isn't correct, return empty array
        return res

    for species_contig,v in msa.items():
        species_seq, species_strand = v['seq'], v['strand']
        if species_seq.upper()!=human_codon:
            #keep only species whose stop codon is the same as for human
            continue
        #key is '<species>.<contig>', the contig name itself may contain dots
        species, *contig = species_contig.split('.')
        contig = '.'.join(contig)
        if gene_strand=='+':
            if species_strand=='+':
                #UTR starts right after the stop codon
                utr_start = v['next_pos']
                utr_end = min(v['next_pos'] + UTR_len,contig_lenghts[species_contig])
            else:
                utr_end = contig_lenghts[species_contig]-v['next_pos']
                utr_start = max(utr_end - UTR_len,0)
                #needs reverse complement after FASTA extraction
        else:
            if species_strand=='+':
                #UTR ends where the 3bp stop codon starts
                utr_end = v['next_pos'] - 3
                utr_start = max(utr_end - UTR_len,0)
                #needs reverse complement after FASTA extraction
            else:
                utr_start = contig_lenghts[species_contig]-v['next_pos'] + 3
                utr_end = min(utr_start + UTR_len,contig_lenghts[species_contig])

        res.append((HGNC_Symbol,UTR_ID,UTR_len,gene_strand,species,contig,utr_start,utr_end,species_strand))
            
    return res

In [22]:
#loop over all stop codon MAF files and extract 3'UTR coordinates for all species
#(for a quick check on a subset, break out of the loop once maf_idx+1 reaches e.g. 1000)

res = []

for maf_idx, maf_path in enumerate(stop_codon_mafs):

    #the MAF file name without extension is the UTR ID,
    #e.g. ENST00000698999.1_utr3_4_0_chrX_135775852_r
    UTR_ID = maf_path.stem

    msa, contig_lenghts = extract_msa(maf_path)

    res += get_utr_coords(UTR_ID, msa, contig_lenghts)

    if not (maf_idx+1) % 100:
        #progress report every 100 files
        print(f'{maf_idx+1}/{len(stop_codon_mafs)} files done')
        


100/20222 files done
200/20222 files done
300/20222 files done
400/20222 files done
500/20222 files done
600/20222 files done
700/20222 files done
800/20222 files done
900/20222 files done
1000/20222 files done
1100/20222 files done
1200/20222 files done
1300/20222 files done
1400/20222 files done
1500/20222 files done
1600/20222 files done
1700/20222 files done
1800/20222 files done
1900/20222 files done
2000/20222 files done
2100/20222 files done
2200/20222 files done
2300/20222 files done
2400/20222 files done
2500/20222 files done
2600/20222 files done
2700/20222 files done
2800/20222 files done
2900/20222 files done
3000/20222 files done
3100/20222 files done
3200/20222 files done
3300/20222 files done
3400/20222 files done
3500/20222 files done
3600/20222 files done
3700/20222 files done
3800/20222 files done
3900/20222 files done
4000/20222 files done
4100/20222 files done
4200/20222 files done
4300/20222 files done
4400/20222 files done
4500/20222 files done
4600/20222 files do

In [23]:
#one row per tuple collected from get_utr_coords
df = pd.DataFrame(
    data=res,
    columns=[
        'HGNC_Symbol',
        'human_UTR_ID',
        'human_UTR_len',
        'human_transcript_strand',
        'species',
        'contig',
        '3_prime_UTR_start',
        '3_prime_UTR_end',
        'MAF_strand',
    ],
)

In [19]:
#order rows by species, then contig, then UTR start position
df = df.sort_values(by=['species','contig','3_prime_UTR_start'])

In [20]:
#df.to_csv('/s/project/mll/sergey/MLM/UTR_coords/GRCh38_3_prime_UTR_all_species.tsv', sep='\t',index=None)

In [21]:
#load the all-species 3'UTR table from disk
df = pd.read_csv(
    '/s/project/mll/sergey/MLM/UTR_coords/GRCh38_3_prime_UTR_all_species.tsv',
    sep='\t',
)

In [8]:
#dataset size in Gb if 2000bp are used for each UTR
df.shape[0] * 2000 / 1024**3

7.048007100820541

In [9]:
#dataset size in Gb if for each UTR the full length is taken
df['human_UTR_len'].sum() / 1024**3

6.585104384459555