In [3]:
from ribotricer.orf import ORF
from collections import defaultdict

In [16]:
def count_orfs(ribotricer_index,
               detected_orfs,
               features,
               prefix,
               report_all=False):
    """
    Parameters
    ----------
    ribotricer_index: str
                      Path to the index file generated by ribotricer prepare_orfs
    detected_orfs: str
                   Path to the detected orfs file generated by ribotricer detect_orfs
    features: set
              set of ORF types, such as {annotated}
    prefix: str
            prefix for output file
    report_all: bool
                if True, all coverages will be exported
    """
    orf_index = {}
    read_counts = defaultdict(dict)
    with open(ribotricer_index, 'r') as fin:
        # Skip header
        fin.readline()
        for line in fin:
            orf = ORF.from_string(line)
            if orf.category in features:
                orf_index[orf.oid] = orf
    with open(detected_orfs, 'r') as fin:
        # Skip header
        fin.readline()
        for line in fin:
            fields = line.strip().split('\t')
            oid, otype, status = fields[:3]
            gene_id, gene_name, gene_type = fields[9:12]
            chrom, strand, start_codon, profile = fields[12:]
            if otype in features:
                # do not output 'nontranslating' events unless report_all is set
                if status != 'nontranslating' or report_all:
                    intervals = orf_index[oid].intervals
                    coor = [
                        x for iv in intervals
                        for x in range(iv.start, iv.end + 1)
                    ]
                    if strand == '-':
                        coor = coor[::-1]
                    try:
                        profile = list(map(int, profile.strip()[1:-1].split(', ')))
                    except:
                        print(profile.strip())
                        print(fields)
                        break
                    for pos, cov in zip(coor, profile):
                        if pos not in read_counts[gene_id, gene_name]:
                            read_counts[gene_id, gene_name][pos] = cov

    # Output count table
    with open('{}_cnt.txt'.format(prefix), 'w') as fout:
        fout.write('gene_id\tcount\n')
        for gene_id, gene_name in sorted(read_counts):
            total = sum(read_counts[gene_id, gene_name].values())
            fout.write('{}\t{}\n'.format(gene_id, total))

In [11]:
def count_orfs(ribotricer_index,
               detected_orfs,
               features,
               prefix,
               report_all=False):
    """
    Parameters
    ----------
    ribotricer_index: str
                      Path to the index file generated by ribotricer prepare_orfs
    detected_orfs: str
                   Path to the detected orfs file generated by ribotricer detect_orfs
    features: set
              set of ORF types, such as {annotated}
    prefix: str
            prefix for output file
    report_all: bool
                if True, all coverages will be exported
    """
    orf_index = {}
    read_counts = defaultdict(dict)
    with open(ribotricer_index, 'r') as fin:
        # Skip header
        fin.readline()
        for line in fin:
            orf = ORF.from_string(line)
            if orf.category in features:
                orf_index[orf.oid] = orf
    with open(detected_orfs, 'r') as fin:
        # Skip header
        fin.readline()
        for line in fin:
            fields = line.strip().split('\t')
            oid, otype, status = fields[:3]
            gene_id, gene_name, gene_type = fields[9:12]
            chrom, strand, start_codon, profile = fields[12:]
            if otype in features:
                # do not output 'nontranslating' events unless report_all is set
                if status != 'nontranslating' or report_all:
                    intervals = orf_index[oid].intervals
                    coor = [
                        x for iv in intervals
                        for x in range(iv.start, iv.end + 1)
                    ]
                    if strand == '-':
                        coor = coor[::-1]
                    profile_stripped = profile.strip()[1:-1].split(', ')
                    profile = list()
                    
                    try:
                        if profile_stripped:
                            profile = list(map(int, profile_stripped))
                    except:
                        print(profile_stripped)
                        if profile_stripped
                        print(fields)
                        break
                    for pos, cov in zip(coor, profile):
                        if pos not in read_counts[gene_id, gene_name]:
                            read_counts[gene_id, gene_name][pos] = cov

    # Output count table
    with open('{}_cnt.txt'.format(prefix), 'w') as fout:
        fout.write('gene_id\tcount\tlength\n')
        for gene_id, gene_name in sorted(read_counts):
            values = read_counts[gene_id, gene_name].values()
            length = len(values)
            total = sum(values)
            fout.write('{}\t{}\t{}\n'.format(gene_id, total, length))

In [12]:
ribotricer_index = '/home/cmb-panasas2/skchoudh/github_projects/C_albicans_project/notebooks/June_06_GTF_analysis/C_albicans_SC5314_version_A22-s07-m01-r27_features.UTR5_CDS_UTR3_cleaned_ribotricer_longest_candidate_orfs.tsv'
detected_orfs = '/home/cmb-panasas2/skchoudh/rna/Dec_12_2017_Kadosh_C_albicans_30C_37C_all_fastq_merged_riboraptor/ribotricer_results_June_2019/ribotricer_results_stringtie_gtf_longest/ribo_30C_2_translating_ORFs.tsv'

In [13]:
features = 'annotated'
prefix = 'test'

In [14]:
count_orfs(ribotricer_index,
               detected_orfs,
               features,
               prefix,
               report_all=True)

['']
['C1_10870W_A-T_2406453_2406452_0', 'annotated', 'nontranslating', '0.0', '0', '0', '0', 'C1_10870W_A-T', 'protein_coding', 'C1_10870W_A', 'RPS17B', 'protein_coding', 'Ca22chr1A_C_albicans_SC5314', '+', 'Non', '[]']
