# Find locations of codons in CCDS transcripts for downstream analyses

### Globals

In [1]:
# %matplotlib inline
# import matplotlib.pyplot as plt
# import seaborn as sns
import pandas as pd
# import numpy as np
# import re
import pyfaidx
# import copy
from Bio.SeqUtils.CodonUsage import SynonymousCodons
pd.set_option('display.max_rows', 10)

# Per-transcript annotation table (UTR/CDS lengths plus gene identifiers).
ccds_annotations = pd.read_table(
    '../data/gencode/gencode.v32.canonical_ccds.parameters.tsv.gz'
)
# Transcript sequences, accessed by sequence name.
ccds_tx_seqs = pyfaidx.Fasta(
    '../data/gencode/gencode.v32.canonical_ccds_tx.fa'
)

# Every nucleotide triplet, with the first base varying slowest, and a
# zero-initialised counter keyed by codon.
nt = ['A', 'C', 'T', 'G']
allCodons = [
    first + second + third
    for first in nt
    for second in nt
    for third in nt
]
codonDict = dict.fromkeys(allCodons, 0)

In [2]:
# Key the annotation table by transcript ID so rows can be looked up by
# transcript name.
ccds_annotations = ccds_annotations.set_index('transcript_id')

In [3]:
# Print a summary of the annotation table: index range, columns,
# non-null counts and dtypes.
ccds_annotations.info()

<class 'pandas.core.frame.DataFrame'>
Index: 19174 entries, ENST00000000233.10 to ENST00000673498.1
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   utr5_length  19174 non-null  int64 
 1   cds_length   19174 non-null  int64 
 2   utr3_length  19174 non-null  int64 
 3   gene_id      19174 non-null  object
 4   gene_name    19174 non-null  object
 5   ccdsid       18751 non-null  object
dtypes: int64(3), object(3)
memory usage: 1.0+ MB


### Find locations of all codons and write them to an output file

In [4]:
# Collect the position of every complete in-frame codon in each transcript's
# CDS and write the resulting table to disk.
#
# The output has one row per codon with columns:
#   transcript_id - sequence name taken from the FASTA
#   loc           - 0-based offset of the codon's first base within the
#                   full transcript (5' UTR included)
#   motif         - the 3-nt codon sequence
#
# Rows are accumulated as three parallel column lists instead of one dict
# per codon, which avoids creating a separate Python dict for every row.
tx_ids, locs, motifs = [], [], []
for tx in ccds_tx_seqs.keys():
    utr5_length = ccds_annotations.loc[tx, 'utr5_length']
    utr3_length = ccds_annotations.loc[tx, 'utr3_length']
    tx_seq = str(ccds_tx_seqs[tx])
    # Use an explicit end index: a plain `[:-utr3_length]` slice would return
    # an empty string when utr3_length is 0. Clamp at 0 so a UTR annotation
    # longer than the sequence yields an empty CDS instead of wrapping around.
    cds_end = max(len(tx_seq) - utr3_length, 0)
    seq = tx_seq[utr5_length:cds_end]
    # Stop before any trailing partial codon (CDS length not a multiple of 3).
    starts = range(0, len(seq) - 2, 3)
    tx_ids.extend([tx] * len(starts))
    locs.extend(start + utr5_length for start in starts)
    motifs.extend(seq[start:start + 3] for start in starts)

codon_locs = pd.DataFrame({
    'transcript_id': tx_ids,
    'loc': locs,
    'motif': motifs,
})

codon_locs.to_csv('../data/motif_counts/ccds_codon_locs.tsv.gz', sep="\t", index=False)