In [1]:
%pylab inline
from collections import defaultdict, OrderedDict
import warnings
import gffutils
import pybedtools
from riboraptor.interval import Interval
from riboraptor.fasta import FastaReader
from tqdm import tqdm
import pandas as pd
import copy
import os
import re
from gffutils.pybedtools_integration import tsses
from copy import deepcopy
from collections import OrderedDict, Callable
import errno

def mkdir_p(path):
    try:
        os.makedirs(path)
    except OSError as exc:  # Python >2.5
        if exc.errno == errno.EEXIST and os.path.isdir(path):
            pass
        else:
            raise
            
class DefaultOrderedDict(OrderedDict):
    # Source: http://stackoverflow.com/a/6190500/562769
    def __init__(self, default_factory=None, *a, **kw):
        if (default_factory is not None and
           not isinstance(default_factory, Callable)):
            raise TypeError('first argument must be callable')
        OrderedDict.__init__(self, *a, **kw)
        self.default_factory = default_factory

    def __getitem__(self, key):
        try:
            return OrderedDict.__getitem__(self, key)
        except KeyError:
            return self.__missing__(key)

    def __missing__(self, key):
        if self.default_factory is None:
            raise KeyError(key)
        self[key] = value = self.default_factory()
        return value

    def __reduce__(self):
        if self.default_factory is None:
            args = tuple()
        else:
            args = self.default_factory,
        return type(self), args, None, None, self.items()

    def copy(self):
        return self.__copy__()

    def __copy__(self):
        return type(self)(self.default_factory, self)

    def __deepcopy__(self, memo):
        import copy
        return type(self)(self.default_factory,
                          copy.deepcopy(self.items()))

    def __repr__(self):
        return 'OrderedDefaultDict(%s, %s)' % (self.default_factory,
                                               OrderedDict.__repr__(self))


Populating the interactive namespace from numpy and matplotlib


In [2]:
gtf = '/home/cmb-panasas2/skchoudh/genomes/hg38/annotation/gencode.v25.annotation.gtf'
gtf_db = '/home/cmb-panasas2/skchoudh/genomes/hg38/annotation/gencode.v25.annotation.gtf.db'
chrsizes = '/home/cmb-panasas2/skchoudh/genomes/hg38/fasta/hg38.chrom.sizes'
fasta = '/home/cmb-panasas2/skchoudh/genomes/hg38/fasta/hg38.fa'

In [3]:
def create_gene_dict(db):
    '''
    Store each feature line db.all_features() as a dict of dicts
    '''
    gene_dict = DefaultOrderedDict(lambda: DefaultOrderedDict(lambda: DefaultOrderedDict(list)))
    for line_no, feature in enumerate(db.all_features()):
        gene_ids = feature.attributes['gene_id']
        feature_type = feature.featuretype
        if feature_type == 'gene':
            if len(gene_ids)!=1:
                logging.warning('Found multiple gene_ids on line {} in gtf'.format(line_no))
                break
            else:
                gene_id = gene_ids[0]
                gene_dict[gene_id]['gene'] = feature
        else:
            transcript_ids = feature.attributes['transcript_id']

            for gene_id in gene_ids:
                for transcript_id in transcript_ids:
                    gene_dict[gene_id][transcript_id][feature_type].append(feature)
    return gene_dict

In [4]:
db = gffutils.FeatureDB(gtf_db, keep_order=True)
gene_dict = create_gene_dict(db)


In [5]:
for x in db.featuretypes():
    print(x)

CDS
Selenocysteine
UTR
exon
gene
start_codon
stop_codon
transcript


In [6]:
fasta_reader = FastaReader(fasta)

In [7]:
def get_UTR_regions(utrs, cds):
    if len(cds)==0:
        return [], []
    utr5_regions = []
    utr3_regions = []    
    cds_sorted = sorted(list(cds), key=lambda x: x.start)
    first_cds = cds_sorted[0]
    last_cds = cds_sorted[-1]
    for orig_utr in utrs:
        utr = deepcopy(orig_utr)
        ## Push all cds at once
        ## Sort later to remove duplicates
        strand = utr.strand
        if utr.start < first_cds.start:
            if utr.stop >= first_cds.start:
                utr.stop = first_cds.start - 1
            if strand == '+':
                utr5_regions.append(utr)
            else:
                utr3_regions.append(utr)
        elif utr.stop > last_cds.stop:
            if utr.start <= last_cds.stop:
                utr.start = last_cds.stop + 1
            if strand == '+':
                utr3_regions.append(utr)
            else:
                utr5_regions.append(utr)
       
    return utr5_regions, utr3_regions

def get_gene_list(gene_dict):
    return list(set(gene_dict.keys()))

# What do we do now?

Our aim to to extract all the dORFs and uORFs. We start with a very general definition of (u|d)ORF:
Given coordinates of the UTR, we will look for star_codons

start_codons = ['ATG', 'AAG', 'ACG', 'ATC', 'GTG', 'AGG', 'ATA', 'ATT', 'CTG', 'TTG']

stop_codons = ['TAG, 'TAA', 'TGA']

Le's deal with uORFs first.


In [93]:
START_CODONS = ['ATG', 'AAG', 'ACG', 'ATC', 'GTG', 'AGG', 'ATA', 'ATT', 'CTG', 'TTG']

STOP_CODONS = ['TAG', 'TAA', 'TGA']


In [8]:
gene_wise_CCDS_dict = DefaultOrderedDict(list)

for line_no, feature in enumerate(db.all_features()):
    gene_ids = feature.attributes['gene_id']    
    feature_type = feature.featuretype
    try:
        tags = feature.attributes['tag']
        if feature_type != 'UTR':
            continue
        
        else:
            assert len(gene_ids) == 1
            gene_id = gene_ids[0]
            transcript_ids = feature.attributes['transcript_id']
            assert len(transcript_ids) == 1
            transcript_id = transcript_ids[0]

            feature.attributes['gene_id'] = '{}_{}'.format(gene_id, transcript_id)
            gene_wise_CCDS_dict[gene_id].append(feature)
    except:
        continue

In [9]:
UTR_5 = []
UTR_3 = []

In [10]:
for gene_id in tqdm(get_gene_list(gene_dict)):   
    utr5_regions, utr3_regions = [], []
    cds_regions = []
    utr_regions = []
    for feature in gene_dict[gene_id].keys():
        if feature == 'gene':
            continue
        cds = list(gene_dict[gene_id][feature]['CDS'])
        utrs = list(gene_dict[gene_id][feature]['UTR'])

        cds = sorted(list(cds), key=lambda x: x.start)
        utrs  = sorted(list(utrs), key=lambda x: x.start)
        
        cds_regions += cds
        utr_regions += utrs

    cds_regions = sorted(list(cds_regions), key=lambda x: x.start)
    utr_regions = sorted(list(utr_regions), key=lambda x: x.start)
    
    utr5_regions, utr3_regions = get_UTR_regions(utr_regions, cds_regions)
    
    UTR_5 += utr5_regions
    UTR_3 += utr3_regions
    
    

100%|██████████| 58037/58037 [00:33<00:00, 1737.24it/s]


In [28]:
len(UTR_5)

103583

In [67]:
to_write = 'transcript_id\tgene_id\tgene_type\tchrom\tstart\tend\tstrand\texon_number\ttags\tis_CCDS\tuORF_type\n'
for utr5 in UTR_5:
    transcript_id = utr5.attributes['transcript_id']
    assert len(transcript_id) == 1
    transcript_id = transcript_id[0]
    gene_id = utr5.attributes['gene_id']
    assert len(gene_id) == 1
    gene_id = gene_id[0]
    
    gene_type = utr5.attributes['gene_type']
    assert len(gene_type) == 1
    gene_type = gene_type[0]

    exon_number = utr5.attributes['exon_number']
    assert len(exon_number) == 1
    exon_number = exon_number[0]
    
    try:
        tags = utr5.attributes['tag']
    except:
        tags = []
    start = utr5.start
    end = utr5.end
    chrom = utr5.chrom
    strand = utr5.strand
    uorf_type = ''
    for tag in tags:
        if 'uORF' in tag:
            uorf_type = tag
            break
    
    if 'CCDS' in tags:
        is_ccds=1
    else:
        is_ccds=0
    to_write += '{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\t{8}\t{9}\t{10}'.format(transcript_id, gene_id, gene_type,
                                                       chrom, start, end, 
                                                       strand, exon_number, (',').join(sorted(tags)), is_ccds, uorf_type)
    to_write += '\n'
    

In [68]:
"uORF" in "overlapping_uORF"

True

In [69]:
with open('/staging/as/skchoudh/hg38_UTR5.tsv', 'w') as f:
    f.write(to_write)

In [70]:
df = pd.read_table('/staging/as/skchoudh/hg38_UTR5.tsv')

In [71]:
df = df.sort_values(by=['transcript_id', 'gene_id', 'chrom', 'start', 'end', 'strand', 'exon_number']).reset_index()

In [225]:
df.head()

Unnamed: 0,index,transcript_id,gene_id,gene_type,chrom,start,end,strand,exon_number,tags,is_CCDS,uORF_type
0,39280,ENST00000000233.9,ENSG00000004059.10,protein_coding,chr7,127588345,127588498,+,1,"CCDS,appris_principal_1,basic",1,
1,31809,ENST00000000412.7,ENSG00000003056.7,protein_coding,chr12,8946405,8946405,-,2,"CCDS,appris_principal_1,basic",1,
2,31815,ENST00000000412.7,ENSG00000003056.7,protein_coding,chr12,8949488,8949955,-,1,"CCDS,appris_principal_1,basic",1,
3,47208,ENST00000000442.10,ENSG00000173153.13,protein_coding,chr11,64305578,64305736,+,1,"CCDS,NAGNAG_splice_site,appris_principal_3,basic",1,
4,47213,ENST00000000442.10,ENSG00000173153.13,protein_coding,chr11,64307168,64307179,+,2,"CCDS,NAGNAG_splice_site,appris_principal_3,basic",1,


In [75]:
df.uORF_type.value_counts()

upstream_uORF       238
overlapping_uORF     59
Name: uORF_type, dtype: int64

In [76]:
df.head()

Unnamed: 0,index,transcript_id,gene_id,gene_type,chrom,start,end,strand,exon_number,tags,is_CCDS,uORF_type
0,39280,ENST00000000233.9,ENSG00000004059.10,protein_coding,chr7,127588345,127588498,+,1,"CCDS,appris_principal_1,basic",1,
1,31809,ENST00000000412.7,ENSG00000003056.7,protein_coding,chr12,8946405,8946405,-,2,"CCDS,appris_principal_1,basic",1,
2,31815,ENST00000000412.7,ENSG00000003056.7,protein_coding,chr12,8949488,8949955,-,1,"CCDS,appris_principal_1,basic",1,
3,47208,ENST00000000442.10,ENSG00000173153.13,protein_coding,chr11,64305578,64305736,+,1,"CCDS,NAGNAG_splice_site,appris_principal_3,basic",1,
4,47213,ENST00000000442.10,ENSG00000173153.13,protein_coding,chr11,64307168,64307179,+,2,"CCDS,NAGNAG_splice_site,appris_principal_3,basic",1,


The above table is 5'UTR, so the start codon is given by "end+1" for positive strand and "end-1" for negative strand.
Remember everything is 1-based. To verify, we fetch the sequence of these coordinates and see if they are indeed ATG.
for negative coordinates we need to reverse complement.



In [84]:
# first row that is on positive strand
# ENST00000000233.9	ENSG00000004059.10	protein_coding	chr7	127588345	127588498	+	1	CCDS,appris_principal_1,basic	1	NaN
fasta_reader.query(intervals=[Interval('chr7', 127588498+1, 127588498+3)]) 



['ATG']

In [89]:
127588373
fasta_reader.query(intervals=[Interval('chr7', 127588373, 127588373+2)]) 


['AGG']

In [85]:
# second row that is on negative strand
# ENST00000000412.7	ENSG00000003056.7	protein_coding	chr12	8946405	8946405	-	2	CCDS,appris_principal_1,basic	1	NaN
fasta_reader.reverse_complement(fasta_reader.query(intervals=[Interval('chr12', 8946405-3, 8946405-1)])[0] )



'ATG'

# Select a row thats labeleed upstream_uORF and find out why is it so?


In [87]:
df_uORF = df[df.uORF_type=='upstream_uORF']
df_uORF

Unnamed: 0,index,transcript_id,gene_id,gene_type,chrom,start,end,strand,exon_number,tags,is_CCDS,uORF_type
404,103532,ENST00000199448.8,ENSG00000086289.11,protein_coding,chr7,37920561,37920939,+,1,"CCDS,appris_principal_3,basic,upstream_uORF",1,upstream_uORF
2329,7442,ENST00000245121.9,ENSG00000167216.16,protein_coding,chr18,46946824,46946872,+,1,"CCDS,appris_principal_2,basic,upstream_uORF",1,upstream_uORF
3910,57773,ENST00000261192.11,ENSG00000060982.14,protein_coding,chr12,24948933,24949459,-,1,"CCDS,appris_principal_3,basic,upstream_uORF",1,upstream_uORF
5800,103562,ENST00000270538.7,ENSG00000104980.7,protein_coding,chr19,7943652,7943920,-,1,"CCDS,appris_principal_1,basic,upstream_uORF",1,upstream_uORF
9427,18451,ENST00000303212.2,ENSG00000171119.2,protein_coding,chr19,5823802,5824165,+,1,"CCDS,appris_principal_1,basic,upstream_uORF",1,upstream_uORF
12513,97986,ENST00000321702.2,ENSG00000177191.2,protein_coding,chr19,41426779,41426810,-,3,"CCDS,appris_principal_1,basic,upstream_uORF",1,upstream_uORF
12514,97987,ENST00000321702.2,ENSG00000177191.2,protein_coding,chr19,41427284,41427570,-,2,"CCDS,appris_principal_1,basic,upstream_uORF",1,upstream_uORF
12515,97988,ENST00000321702.2,ENSG00000177191.2,protein_coding,chr19,41428596,41428730,-,1,"CCDS,appris_principal_1,basic,upstream_uORF",1,upstream_uORF
12626,14423,ENST00000322354.3,ENSG00000167565.12,protein_coding,chr19,40442081,40442086,-,2,"CCDS,appris_principal_2,basic,upstream_uORF",1,upstream_uORF
12627,14429,ENST00000322354.3,ENSG00000167565.12,protein_coding,chr19,40444215,40444705,-,1,"CCDS,appris_principal_2,basic,upstream_uORF",1,upstream_uORF


In [88]:
#103532	ENST00000199448.8	ENSG00000086289.11	protein_coding	chr7	37920561	37920939	+	1	CCDS,appris_principal_3,basic,upstream_uORF	1	upstream_uORF
sequence = fasta_reader.query([Interval('chr7', 37920561, 37920939)])
sequence

['TCCCCCCTCTTAAAACACGATGCCTCCCAGGATGCTAGTGGCACCACTGCCACTGCATTTCCTGTTGGCAGCAGTGAGCAGTGAAAACCGAAGCGGCAGAAGGCAGTGGCAGCAGGCAGTGGCAGCAGGCAGTGGCCCAGGCAGAAATAGCTCCCGCGCGATTCACTGGAGCCTTCCCCGGGCCCTGGTCCCGGCTACCGGGACTCGCGCGTCCGGATCTCAAAAGCGGCAGAGGCCACCGAAGGGACAGGAAGCACTTTGGTCCAGACCACACTCCCGGCACAGTGCGGAAAGAGCCGGCGGGAGCCACTCTGATCCCGGACGCCTCAGCGCCCCCTTGGGCTTGGGCTTGCCCTCGGGCCGGGGAAGGCTGACCGCG']

In [77]:
eif1_df = pd.read_table('/home/cmb-panasas2/wenzhenl/benchmark/real/eif1_real_app_all_data.txt').sort_values(by='transcript_stable_ID')
#sorted(eif1_df.columns)
eif1_df = eif1_df[['uTIS.ID', 'transcript_stable_ID', 'gene_stable_ID', 'uORF.CDS.pair_ID', 'position.aTIS',
                   'position.uTIS', 'start_codon_aTIS', 'start_codon_uTIS', 'strand',
                   'uORF_length_nt', 'uTIS.AUG.or.non_AUG', 'uTIS.class', 'uTIS_Kozak', 'uTIS_dist_to_aTIS_nt', 'LFQ.si.Ctrl.rep1',
                   'LFQ.si.Ctrl.rep2']]
eif1_df = eif1_df[eif1_df['LFQ.si.Ctrl.rep1']>0]
eif1_df = eif1_df[eif1_df['LFQ.si.Ctrl.rep2']>0]
eif1_df = eif1_df.sort_values(by='uTIS.ID').set_index('uTIS.ID').dropna()
eif1_df.head()

Unnamed: 0_level_0,transcript_stable_ID,gene_stable_ID,uORF.CDS.pair_ID,position.aTIS,position.uTIS,start_codon_aTIS,start_codon_uTIS,strand,uORF_length_nt,uTIS.AUG.or.non_AUG,uTIS.class,uTIS_Kozak,uTIS_dist_to_aTIS_nt,LFQ.si.Ctrl.rep1,LFQ.si.Ctrl.rep2
uTIS.ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
ENST00000000233_127588411,ENST00000000233,ENSG00000004059,ENST00000000233_127588499_127588411,127588499,127588411.0,ATG,CTG,1,264.0,non-AUG,u-oORF,weak,-88.0,29.64103,29.75042
ENST00000000412_8949529,ENST00000000412,ENSG00000003056,ENST00000000412_8946404_8949529,8946404,8949529.0,ATG,CTG,-1,162.0,non-AUG,u-oORF,weak,-43.0,26.05631,26.00443
ENST00000000412_8949563,ENST00000000412,ENSG00000003056,ENST00000000412_8946404_8949563,8946404,8949563.0,ATG,CTG,-1,141.0,non-AUG,u-oORF,weak,-77.0,26.05631,26.00443
ENST00000000412_8949593,ENST00000000412,ENSG00000003056,ENST00000000412_8946404_8949593,8946404,8949593.0,ATG,CTG,-1,30.0,non-AUG,uORF,strong,-107.0,26.05631,26.00443
ENST00000000412_8949612,ENST00000000412,ENSG00000003056,ENST00000000412_8946404_8949612,8946404,8949612.0,ATG,CTG,-1,96.0,non-AUG,uORF,strong,-126.0,26.05631,26.00443


In [249]:
127588411
# this is first row of the eif1_df table
fasta_reader.query(intervals=[Interval('chr7', 127588411, 127588411+2)]) 


['CTG']

In [91]:
#127588390
#	39280	ENST00000000233.9	ENSG00000004059.10	protein_coding	chr7	127588345	127588498	+	1	CCDS,appris_principal_1,basic	1	NaN
fasta_reader.query(intervals=[Interval('chr7', 127588390, 127588390+2)]) 


['AGG']

In [293]:
# ENST00000000412_8949529	ENST00000000412	ENSG00000003056	ENST00000000412_8946404_8949529	8946404	8949529.0	ATG	CTG	-1	162.0	non-AUG	u-oORF	w
#8949529.0	
##************************
fasta_reader.reverse_complement(fasta_reader.query(intervals=[Interval('chr12', 8949529-2, 8949529)])[0] )


'CTG'

In [None]:
re.finditer('')

In [94]:
#103532	ENST00000199448.8	ENSG00000086289.11	protein_coding	chr7	37920561	37920939	+	1	CCDS,appris_principal_3,basic,upstream_uORF	1	upstream_uORF
sequence = fasta_reader.query([Interval('chr7', 37920561, 37920939)])
sequence

['TCCCCCCTCTTAAAACACGATGCCTCCCAGGATGCTAGTGGCACCACTGCCACTGCATTTCCTGTTGGCAGCAGTGAGCAGTGAAAACCGAAGCGGCAGAAGGCAGTGGCAGCAGGCAGTGGCAGCAGGCAGTGGCCCAGGCAGAAATAGCTCCCGCGCGATTCACTGGAGCCTTCCCCGGGCCCTGGTCCCGGCTACCGGGACTCGCGCGTCCGGATCTCAAAAGCGGCAGAGGCCACCGAAGGGACAGGAAGCACTTTGGTCCAGACCACACTCCCGGCACAGTGCGGAAAGAGCCGGCGGGAGCCACTCTGATCCCGGACGCCTCAGCGCCCCCTTGGGCTTGGGCTTGCCCTCGGGCCGGGGAAGGCTGACCGCG']

In [None]:
matches = [match.start(0), ]

In [102]:
matches = re.finditer(re.compile('|'.join(START_CODONS)) , sequence[0])
results = [int(match.group(0)) for match in matches]


ValueError: invalid literal for int() with base 10: 'ACG'

In [99]:
'|'.join(START_CODONS)

'ATG|AAG|ACG|ATC|GTG|AGG|ATA|ATT|CTG|TTG'

In [105]:
for match in matches:
    print(match.groups)
    break

<built-in method groups of _sre.SRE_Match object at 0x2b3e482947e8>


In [109]:
match.pos

0

In [110]:
sequence

['TCCCCCCTCTTAAAACACGATGCCTCCCAGGATGCTAGTGGCACCACTGCCACTGCATTTCCTGTTGGCAGCAGTGAGCAGTGAAAACCGAAGCGGCAGAAGGCAGTGGCAGCAGGCAGTGGCAGCAGGCAGTGGCCCAGGCAGAAATAGCTCCCGCGCGATTCACTGGAGCCTTCCCCGGGCCCTGGTCCCGGCTACCGGGACTCGCGCGTCCGGATCTCAAAAGCGGCAGAGGCCACCGAAGGGACAGGAAGCACTTTGGTCCAGACCACACTCCCGGCACAGTGCGGAAAGAGCCGGCGGGAGCCACTCTGATCCCGGACGCCTCAGCGCCCCCTTGGGCTTGGGCTTGCCCTCGGGCCGGGGAAGGCTGACCGCG']

In [111]:
seq = ''.join(START_CODONS)

In [122]:
seq

'ATGAAGACGATCGTGAGGATAATTCTGTTG'

In [126]:
START_CODONS

['ATG', 'AAG', 'ACG', 'ATC', 'GTG', 'AGG', 'ATA', 'ATT', 'CTG', 'TTG']

In [124]:
seq = 'ATGATGATGATCGTGAGGATAATTCTGTTG'

In [135]:
seq = 'ATTG'

In [147]:
matches = re.finditer(r"(?=({}))".format('|'.join(START_CODONS)) , seq)
#'ATTG'
#matches = re.finditer(r"(?=(ATT|TTG))", seq)
for match in matches:
    print (match.start(0), match.group(1))

 0 ATT
 1 TTG


In [148]:
seq = 'ATTGATTGTTGTG'

In [150]:
seq = 'ATTGATTGTTGTG'
matches = re.finditer(r"(?=({}))".format('|'.join(START_CODONS)) , seq)
#'ATTG'
#matches = re.finditer(r"(?=(ATT|TTG))", seq)
for match in matches:
    print (1+match.start(0), match.group(1))

1 ATT
2 TTG
5 ATT
6 TTG
9 TTG
11 GTG


In [152]:
seq = 'ATTGATTGTTGTG'
matches = re.finditer(re.compile('|'.join(START_CODONS)) , seq)
#'ATTG'
#matches = re.finditer(r"(?=(ATT|TTG))", seq)
for match in matches:
    print (match.start(0), match.group(0))

0 ATT
4 ATT
8 TTG


# So look ahead is required for finding overlapping matches

In [154]:
matches = re.finditer(re.compile('|'.join(START_CODONS)) , sequence[0])
#'ATTG'
#matches = re.finditer(r"(?=(ATT|TTG))", seq)
for match in matches:
    print (match.start(0), match.group(0))

16 ACG
19 ATG
28 AGG
31 ATG
37 GTG
46 CTG
52 CTG
56 ATT
61 CTG
64 TTG
73 GTG
80 GTG
90 AAG
99 AAG
105 GTG
113 AGG
118 GTG
126 AGG
131 GTG
138 AGG
146 ATA
160 ATT
165 CTG
184 CTG
216 ATC
223 AAG
232 AGG
241 AAG
248 AGG
251 AAG
258 TTG
284 GTG
291 AAG
311 CTG
314 ATC
321 ACG
337 TTG
343 TTG
349 TTG
366 AAG
370 CTG


In [156]:
matches = re.finditer(r"(?=({}))".format('|'.join(START_CODONS)) , sequence[0])
#'ATTG'
#matches = re.finditer(r"(?=(ATT|TTG))", seq)
for match in matches:
    print (match.start(0), match.group(1))

16 ACG
19 ATG
28 AGG
31 ATG
37 GTG
46 CTG
52 CTG
56 ATT
61 CTG
64 TTG
73 GTG
80 GTG
90 AAG
99 AAG
100 AGG
105 GTG
113 AGG
118 GTG
126 AGG
131 GTG
138 AGG
146 ATA
160 ATT
165 CTG
184 CTG
216 ATC
223 AAG
232 AGG
241 AAG
242 AGG
248 AGG
251 AAG
258 TTG
284 GTG
291 AAG
311 CTG
314 ATC
321 ACG
337 TTG
343 TTG
349 TTG
366 AAG
367 AGG
370 CTG


In [185]:
def find_matches(string, list_of_codons):
    """Returns 0-based positions"""
    matches = re.finditer(re.compile('|'.join(list_of_codons)) , string)
    matching_positions = OrderedDict()
    for match in matches:
        matching_positions[match.start(0)] = match.group(0)
    return matching_positions

In [248]:
fasta_reader.query([Interval('chr7', 127588345, 127588498)])

['ACGGGGGCGGGCCCGCGGTGACGTCGGGAGGGCAGCGACGCGCGGAGGCGGCGGCGGAGCCTCCTCCTGCTGCTGCTGCGCCCCATCCCCCCGCGGCCGGCCAGTTCCAGCCCGCACCCCGCGTCGGTGCCCGCGCCCCTCCCCGGGCCCCGCC']

In [250]:
fasta_reader.query([Interval('chr7', 127588345, 127588498)])

['ACGGGGGCGGGCCCGCGGTGACGTCGGGAGGGCAGCGACGCGCGGAGGCGGCGGCGGAGCCTCCTCCTGCTGCTGCTGCGCCCCATCCCCCCGCGGCCGGCCAGTTCCAGCCCGCACCCCGCGTCGGTGCCCGCGCCCCTCCCCGGGCCCCGCC']

In [251]:
sequence = fasta_reader.query([Interval('chr7', 127588345, 127588498)])
start_codon_matches = find_matches(sequence[0], START_CODONS)
stop_codon_matches = find_matches(sequence[0], STOP_CODONS)

In [253]:
stop_codon_matches_framewise = OrderedDict()
for frame in [0,1,2]:
    stop_codon_matches_framewise[frame] = list(filter(lambda x: x[0]%3 ==frame, stop_codon_matches.items()))

In [254]:
stop_codon_matches_framewise

OrderedDict([(0, [(18, 'TGA')]), (1, []), (2, [])])

In [255]:
312-90

222

In [256]:
start_codon_matches_framewise

OrderedDict([(0,
              [(0, 'ACG'),
               (45, 'AGG'),
               (66, 'CTG'),
               (69, 'CTG'),
               (72, 'CTG'),
               (75, 'CTG'),
               (84, 'ATC'),
               (126, 'GTG')]),
             (1, [(28, 'AGG'), (37, 'ACG')]),
             (2, [(17, 'GTG'), (20, 'ACG')])])

In [335]:
def find_candidate_orfs(start_codon_matches, stop_codon_matches):
    start_codon_matches_framewise = OrderedDict()
    for frame in [0,1,2]:
        start_codon_matches_framewise[frame] = list(filter(lambda x: x[0]%3 ==frame, start_codon_matches.items()))
    stop_codon_matches_framewise = OrderedDict()
    
    for frame in [0,1,2]:
        stop_codon_matches_framewise[frame] = list(filter(lambda x: x[0]%3 ==frame, stop_codon_matches.items()))
        
    candidate_orfs = OrderedDict()
    candidate_orfs['starts'] = []
    candidate_orfs['stops'] = []

    for frame in start_codon_matches_framewise.keys() & start_codon_matches_framewise.keys():
        start_list = start_codon_matches_framewise[frame]
        stop_list = stop_codon_matches_framewise[frame]

        # For each start_codon, find the first stop codon it would encounter
        # We assume that once a stop codon is encountered, it would not be 
        # read through, at least not in the uORF

        for start_index, start_codon in start_list:
            for stop_index, stop_codon in stop_list:
                # Assume sequence coordinates, so strand etc already 
                # have been accounted for
                if stop_index-start_index>0:
                    candidate_orfs['starts'].append((start_index, start_codon, frame, stop_index-start_index))
                    candidate_orfs['stops'].append((stop_index, start_codon, frame, stop_index-start_index))
                    #break

    return candidate_orfs
    


In [258]:
sequence[0]

'ACGGGGGCGGGCCCGCGGTGACGTCGGGAGGGCAGCGACGCGCGGAGGCGGCGGCGGAGCCTCCTCCTGCTGCTGCTGCGCCCCATCCCCCCGCGGCCGGCCAGTTCCAGCCCGCACCCCGCGTCGGTGCCCGCGCCCCTCCCCGGGCCCCGCC'

In [259]:
start_codon_matches

OrderedDict([(0, 'ACG'),
             (17, 'GTG'),
             (20, 'ACG'),
             (28, 'AGG'),
             (37, 'ACG'),
             (45, 'AGG'),
             (66, 'CTG'),
             (69, 'CTG'),
             (72, 'CTG'),
             (75, 'CTG'),
             (84, 'ATC'),
             (126, 'GTG')])

In [260]:
start_codon_matches_framewise

OrderedDict([(0,
              [(0, 'ACG'),
               (45, 'AGG'),
               (66, 'CTG'),
               (69, 'CTG'),
               (72, 'CTG'),
               (75, 'CTG'),
               (84, 'ATC'),
               (126, 'GTG')]),
             (1, [(28, 'AGG'), (37, 'ACG')]),
             (2, [(17, 'GTG'), (20, 'ACG')])])

In [261]:
stop_codon_matches_framewise

OrderedDict([(0, [(18, 'TGA')]), (1, []), (2, [])])

In [262]:
# Now select the most upstream start codon which is the smallest?

In [263]:
candidate_orfs

OrderedDict([('starts', [(0, 'ACG', 0, 18)]), ('stops', [(18, 'ACG', 0, 18)])])

In [264]:
tx_start = 127588345
eif1_df_start = 127588411
ribocop_start = 127588390

In [265]:
sorted(START_CODONS)

['AAG', 'ACG', 'AGG', 'ATA', 'ATC', 'ATG', 'ATT', 'CTG', 'GTG', 'TTG']

In [266]:
sorted([tx_start + x[0] for x in candidate_orfs['starts']])

[127588345]

In [270]:

sorted([(tx_start + x, y) for x,y in start_codon_matches.items()])

[(127588345, 'ACG'),
 (127588362, 'GTG'),
 (127588365, 'ACG'),
 (127588373, 'AGG'),
 (127588382, 'ACG'),
 (127588390, 'AGG'),
 (127588411, 'CTG'),
 (127588414, 'CTG'),
 (127588417, 'CTG'),
 (127588420, 'CTG'),
 (127588429, 'ATC'),
 (127588471, 'GTG')]

In [271]:
sorted([(tx_start + x, y) for x,y in stop_codon_matches.items()])

[(127588363, 'TGA')]

In [269]:
sequence = fasta_reader.query([Interval('chr7', 127588345, 127588498)])
sequence[0]

'ACGGGGGCGGGCCCGCGGTGACGTCGGGAGGGCAGCGACGCGCGGAGGCGGCGGCGGAGCCTCCTCCTGCTGCTGCTGCGCCCCATCCCCCCGCGGCCGGCCAGTTCCAGCCCGCACCCCGCGTCGGTGCCCGCGCCCCTCCCCGGGCCCCGCC'

In [272]:
STOP_CODONS

['TAG', 'TAA', 'TGA']

In [274]:
'TTT' in sequence[0]

False

In [281]:
sequence = fasta_reader.reverse_complement(fasta_reader.query([Interval('chr7', 8949488, 8949955)])[0])
sequence

'GTTTCATTCTATTGGCACGAATAAATATAGGAGGTATTATTCTTTAAAAAAATGATTGTACTAGTTAGGGACCACAAAAAGCAAAGCAGTGAGAATAAATGTTATGAGGATATTTTAAATCCAAGAGAAGAACTGGCTTGGTTGATTGTCAGGAATAGTCTGTGCGTAAAGAGGAGTGGTTCCTAATGAGGATGGATGTATCACCTAACTATATACGAAACGGCAGTGGCAAGGAAGAAATTGTGATGCATGTAGAGACCCTTTTCTTTGAACGATGCCTGTTCTAAGCCTATGTTTGAAGCCAATAAAAAGCATTCTACTATGTATTGTACAATAAGTAAAATGGCATGATGGAAACAAATATCATCAAGCTATAAGAAGCCAGAAATTGTGAGTTCTTTTGTTTTCCAAACAAAAGAATCTTTCACTGTCCAAATGTGTGACAAGCAGACACTACATTTTTTACTC'

In [282]:
start_codon_matches = find_matches(sequence, START_CODONS)
stop_codon_matches = find_matches(sequence, STOP_CODONS)
candidate_orfs = find_candidate_orfs(start_codon_matches, stop_codon_matches)

In [285]:
candidate_orfs

OrderedDict([('starts',
              [(24, 'ATA', 0, 3),
               (51, 'ATG', 0, 132),
               (54, 'ATT', 0, 129),
               (66, 'AGG', 0, 117),
               (78, 'AAG', 0, 105),
               (132, 'CTG', 0, 51),
               (141, 'TTG', 0, 42),
               (144, 'ATT', 0, 39),
               (150, 'AGG', 0, 33),
               (159, 'CTG', 0, 24),
               (168, 'AAG', 0, 15),
               (171, 'AGG', 0, 12),
               (195, 'ATG', 0, 48),
               (210, 'ATA', 0, 33),
               (219, 'ACG', 0, 24),
               (225, 'GTG', 0, 18),
               (234, 'AAG', 0, 9),
               (249, 'ATG', 0, 3),
               (267, 'TTG', 0, 81),
               (285, 'AAG', 0, 63),
               (291, 'ATG', 0, 57),
               (309, 'AAG', 0, 39),
               (321, 'ATG', 0, 27),
               (333, 'ATA', 0, 15),
               (342, 'ATG', 0, 6),
               (10, 'ATT', 1, 42),
               (16, 'ACG', 1, 36),
           

In [296]:
fasta_reader.reverse_complement(fasta_reader.query([Interval('chr12', 8949498-3, 8949498)])[0])

'TCCC'

In [299]:

fasta_reader.reverse_complement(fasta_reader.query([Interval('chr12', 8949513-2, 8949513)])[0])

'TTG'

In [300]:
fasta_reader.reverse_complement(fasta_reader.query([Interval('chr12', 8949612-2, 8949612)])[0])

'CTG'

In [306]:

fasta_reader.reverse_complement(fasta_reader.query([Interval('chr12', 8949950-3, 8949950+3)])[0])

'GGGAGCG'

In [307]:
fasta_reader.query([Interval('chr12', 8949950-3, 8949950+3)])[0]

'CGCTCCC'

In [308]:
sequence

'GTTTCATTCTATTGGCACGAATAAATATAGGAGGTATTATTCTTTAAAAAAATGATTGTACTAGTTAGGGACCACAAAAAGCAAAGCAGTGAGAATAAATGTTATGAGGATATTTTAAATCCAAGAGAAGAACTGGCTTGGTTGATTGTCAGGAATAGTCTGTGCGTAAAGAGGAGTGGTTCCTAATGAGGATGGATGTATCACCTAACTATATACGAAACGGCAGTGGCAAGGAAGAAATTGTGATGCATGTAGAGACCCTTTTCTTTGAACGATGCCTGTTCTAAGCCTATGTTTGAAGCCAATAAAAAGCATTCTACTATGTATTGTACAATAAGTAAAATGGCATGATGGAAACAAATATCATCAAGCTATAAGAAGCCAGAAATTGTGAGTTCTTTTGTTTTCCAAACAAAAGAATCTTTCACTGTCCAAATGTGTGACAAGCAGACACTACATTTTTTACTC'

In [None]:
tx_start = 8949488
tx_end = 8949955

In [322]:
sequence = fasta_reader.reverse_complement(fasta_reader.query([Interval('chr12', 8949488, 8949955)])[0])

len(sequence)

468

In [323]:
sequence

'CCGGGAGCGGTCAGGCGCGTGACCCCGCGTGACCGGGGTGCGCGAGCCGAGAGGCCCTGGAAGCGGGACTCTGGGACCCCATTGAAAGAAAGGCAACCAGAAAAGGCGAAACGACAACCTCGATAGTGATTGGAGCTGATGGAACGAGGGCGGAGCTAAAGTCCCAGAATACTGGCCAATCAGAGTTTAAGATTATCAGGGAGCCCGGGAAGGGGAAAGGGGCGAAATGGTTCTATGGTCACATGCCGCGGGGTCTGGTGGGAGGAGCGGTTGCCCAGCGGCCTCTTGGCGCTTCCTGTTTCCGGTTCCCAGAGTGGGGCACAGCGAGGCGCTAGGGGGAACGCTGGCCTCTGAAACTAGCTCTGGGACCGGGGTCTGCGGCCGGCCCCTAGCTGGCCCCGTCTCCCATCCCCAGAAGGGTATTCACTGGGGATTCTGAGCTTTGGCTACTCCAGTTTCCCACGACAC'

In [324]:
fasta_reader.reverse_complement(fasta_reader.query([Interval('chr12', 8949955-2, 8949955)])[0])

'CCG'

In [325]:
fasta_reader.query([Interval('chr12', 8949955-2, 8949955)])[0][::-1]

'GGC'

In [336]:
sequence = fasta_reader.reverse_complement(fasta_reader.query([Interval('chr12', 8949488, 8949955)])[0])

start_codon_matches = find_matches(sequence, START_CODONS)
stop_codon_matches = find_matches(sequence, STOP_CODONS)
candidate_orfs = find_candidate_orfs(start_codon_matches, stop_codon_matches)

In [337]:
sorted([(8949955 - x, y) for x,y in start_codon_matches.items()])
#529, 563, 
#8949513

[(8949494, 'ACG'),
 (8949513, 'TTG'),
 (8949520, 'CTG'),
 (8949523, 'ATT'),
 (8949529, 'CTG'),
 (8949534, 'ATT'),
 (8949540, 'AAG'),
 (8949548, 'ATC'),
 (8949563, 'CTG'),
 (8949580, 'CTG'),
 (8949593, 'CTG'),
 (8949605, 'CTG'),
 (8949612, 'CTG'),
 (8949615, 'ACG'),
 (8949622, 'AGG'),
 (8949629, 'AGG'),
 (8949642, 'GTG'),
 (8949660, 'CTG'),
 (8949670, 'TTG'),
 (8949685, 'TTG'),
 (8949693, 'AGG'),
 (8949698, 'GTG'),
 (8949701, 'CTG'),
 (8949713, 'ATG'),
 (8949721, 'ATG'),
 (8949729, 'ATG'),
 (8949739, 'AAG'),
 (8949746, 'AAG'),
 (8949758, 'AGG'),
 (8949761, 'ATC'),
 (8949764, 'ATT'),
 (8949767, 'AAG'),
 (8949777, 'ATC'),
 (8949784, 'CTG'),
 (8949787, 'ATA'),
 (8949797, 'AAG'),
 (8949809, 'AGG'),
 (8949812, 'ACG'),
 (8949817, 'ATG'),
 (8949820, 'CTG'),
 (8949827, 'ATT'),
 (8949830, 'GTG'),
 (8949833, 'ATA'),
 (8949845, 'ACG'),
 (8949853, 'AAG'),
 (8949866, 'AAG'),
 (8949870, 'AAG'),
 (8949875, 'ATT'),
 (8949885, 'CTG'),
 (8949895, 'AAG'),
 (8949899, 'CTG'),
 (8949904, 'AGG'),
 (8949918, '

In [338]:
eif1_df.head()

Unnamed: 0_level_0,transcript_stable_ID,gene_stable_ID,uORF.CDS.pair_ID,position.aTIS,position.uTIS,start_codon_aTIS,start_codon_uTIS,strand,uORF_length_nt,uTIS.AUG.or.non_AUG,uTIS.class,uTIS_Kozak,uTIS_dist_to_aTIS_nt,LFQ.si.Ctrl.rep1,LFQ.si.Ctrl.rep2
uTIS.ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
ENST00000000233_127588411,ENST00000000233,ENSG00000004059,ENST00000000233_127588499_127588411,127588499,127588411.0,ATG,CTG,1,264.0,non-AUG,u-oORF,weak,-88.0,29.64103,29.75042
ENST00000000412_8949529,ENST00000000412,ENSG00000003056,ENST00000000412_8946404_8949529,8946404,8949529.0,ATG,CTG,-1,162.0,non-AUG,u-oORF,weak,-43.0,26.05631,26.00443
ENST00000000412_8949563,ENST00000000412,ENSG00000003056,ENST00000000412_8946404_8949563,8946404,8949563.0,ATG,CTG,-1,141.0,non-AUG,u-oORF,weak,-77.0,26.05631,26.00443
ENST00000000412_8949593,ENST00000000412,ENSG00000003056,ENST00000000412_8946404_8949593,8946404,8949593.0,ATG,CTG,-1,30.0,non-AUG,uORF,strong,-107.0,26.05631,26.00443
ENST00000000412_8949612,ENST00000000412,ENSG00000003056,ENST00000000412_8946404_8949612,8946404,8949612.0,ATG,CTG,-1,96.0,non-AUG,uORF,strong,-126.0,26.05631,26.00443


In [339]:
sorted([(8949955 - x, y) for x,y in start_codon_matches.items()])


[(8949494, 'ACG'),
 (8949513, 'TTG'),
 (8949520, 'CTG'),
 (8949523, 'ATT'),
 (8949529, 'CTG'),
 (8949534, 'ATT'),
 (8949540, 'AAG'),
 (8949548, 'ATC'),
 (8949563, 'CTG'),
 (8949580, 'CTG'),
 (8949593, 'CTG'),
 (8949605, 'CTG'),
 (8949612, 'CTG'),
 (8949615, 'ACG'),
 (8949622, 'AGG'),
 (8949629, 'AGG'),
 (8949642, 'GTG'),
 (8949660, 'CTG'),
 (8949670, 'TTG'),
 (8949685, 'TTG'),
 (8949693, 'AGG'),
 (8949698, 'GTG'),
 (8949701, 'CTG'),
 (8949713, 'ATG'),
 (8949721, 'ATG'),
 (8949729, 'ATG'),
 (8949739, 'AAG'),
 (8949746, 'AAG'),
 (8949758, 'AGG'),
 (8949761, 'ATC'),
 (8949764, 'ATT'),
 (8949767, 'AAG'),
 (8949777, 'ATC'),
 (8949784, 'CTG'),
 (8949787, 'ATA'),
 (8949797, 'AAG'),
 (8949809, 'AGG'),
 (8949812, 'ACG'),
 (8949817, 'ATG'),
 (8949820, 'CTG'),
 (8949827, 'ATT'),
 (8949830, 'GTG'),
 (8949833, 'ATA'),
 (8949845, 'ACG'),
 (8949853, 'AAG'),
 (8949866, 'AAG'),
 (8949870, 'AAG'),
 (8949875, 'ATT'),
 (8949885, 'CTG'),
 (8949895, 'AAG'),
 (8949899, 'CTG'),
 (8949904, 'AGG'),
 (8949918, '

In [340]:
sorted([(8949955 - x[0]) for x in candidate_orfs['starts']])


[8949534,
 8949540,
 8949593,
 8949605,
 8949612,
 8949615,
 8949622,
 8949622,
 8949629,
 8949629,
 8949642,
 8949660,
 8949670,
 8949670,
 8949685,
 8949685,
 8949693,
 8949698,
 8949698,
 8949701,
 8949701,
 8949713,
 8949713,
 8949721,
 8949721,
 8949729,
 8949739,
 8949739,
 8949746,
 8949746,
 8949758,
 8949758,
 8949761,
 8949761,
 8949764,
 8949764,
 8949767,
 8949767,
 8949777,
 8949777,
 8949784,
 8949784,
 8949787,
 8949787,
 8949797,
 8949797,
 8949809,
 8949809,
 8949812,
 8949812,
 8949817,
 8949817,
 8949817,
 8949820,
 8949820,
 8949820,
 8949827,
 8949827,
 8949830,
 8949830,
 8949833,
 8949833,
 8949845,
 8949845,
 8949853,
 8949853,
 8949853,
 8949853,
 8949853,
 8949866,
 8949866,
 8949870,
 8949870,
 8949870,
 8949875,
 8949875,
 8949885,
 8949885,
 8949885,
 8949885,
 8949895,
 8949895,
 8949895,
 8949895,
 8949895,
 8949899,
 8949899,
 8949904,
 8949904,
 8949904,
 8949904,
 8949904,
 8949918,
 8949918,
 8949918,
 8949918,
 8949927,
 8949927,
 8949927,
 8949927,


In [332]:
candidate_orfs

OrderedDict([('starts',
              [(12, 'AGG', 0, 111),
               (18, 'GTG', 0, 105),
               (51, 'AGG', 0, 72),
               (60, 'AAG', 0, 63),
               (102, 'AAG', 0, 21),
               (135, 'CTG', 0, 21),
               (138, 'ATG', 0, 18),
               (168, 'ATA', 0, 183),
               (171, 'CTG', 0, 180),
               (216, 'AAG', 0, 135),
               (234, 'ATG', 0, 117),
               (270, 'TTG', 0, 81),
               (285, 'TTG', 0, 66),
               (333, 'AGG', 0, 18),
               (28, 'GTG', 1, 54),
               (37, 'GTG', 1, 45),
               (70, 'CTG', 1, 12),
               (85, 'AAG', 1, 51),
               (178, 'ATC', 1, 9),
               (226, 'ATG', 1, 210),
               (262, 'AGG', 1, 174),
               (295, 'CTG', 1, 141),
               (313, 'GTG', 1, 123),
               (340, 'ACG', 1, 96),
               (343, 'CTG', 1, 93),
               (415, 'AAG', 1, 21),
               (421, 'ATT', 1, 15),
   

In [311]:
tx_end-tx_start

467