# Calculation of codon counts for each CCDS

<ol class="toc-item"><li><a href="#Globals">Globals</a></li><li><a href="#Calculate-codon-counts-for-each-CCDS">Calculate codon counts for each CCDS</a></li><li><a href="#Calculate-genome-wide-codon-frequency-(per-1000-codons)">Calculate genome-wide codon frequency (per 1000 codons)</a></li><li><a href="#Plot-genome-wide-codon-frequency-(per-1000-codons)">Plot genome-wide codon frequency (per 1000 codons)</a></li></ol>

### Globals

In [1]:
import pandas as pd
import numpy as np
import re
import pyfaidx
import copy
from Bio.SeqUtils.CodonUsage import SynonymousCodons

# Limit DataFrame reprs in this notebook to 10 rows.
pd.options.display.max_rows = 10

# Location of the CCDS FASTA used throughout the notebook.
# NOTE(review): absolute, site-specific path; the notebook only runs where
# this file is mounted.
ccds_fasta_path = (
    '/fh/fast/subramaniam_a/db/rasi/genomes/human/hg38/'
    'gencode/annotations/gencode.v32.canonical_ccds.20210126.fa'
)

# Handle on the FASTA records; later cells look sequences up by record name
# (presumably transcript IDs -- verify against the FASTA headers).
ccdsseqs = pyfaidx.Fasta(ccds_fasta_path)

# The four DNA bases, in the order used to enumerate codons.
nt = ['A', 'C', 'T', 'G']

# All 64 trinucleotides, enumerated with the first base varying slowest and
# the third base varying fastest (AAA, AAC, AAT, AAG, ACA, ...).
allCodons = [
    dinucleotide + third
    for dinucleotide in [first + second for first in nt for second in nt]
    for third in nt
]

# Zero-initialised count table; copied once per transcript when counting.
codonDict = dict.fromkeys(allCodons, 0)

The text.latex.unicode rcparam was deprecated in Matplotlib 3.0 and will be removed in 3.2.
  import matplotlib
The examples.directory rcparam was deprecated in Matplotlib 3.0 and will be removed in 3.2. In the future, examples will be found relative to the 'datapath' directory.
  import matplotlib
The examples.directory rcparam was deprecated in Matplotlib 3.0 and will be removed in 3.2. In the future, examples will be found relative to the 'datapath' directory.
  import matplotlib


### Calculate codon counts for each CCDS

In [2]:
def _count_codons_in_frame(seq, frame):
    """Count every complete codon of ``seq`` when read in ``frame``.

    Parameters
    ----------
    seq : str
        Nucleotide sequence to scan.
    frame : int
        Offset of the first codon; codons start at ``frame``, ``frame + 3``, ...

    Returns
    -------
    dict
        A fresh copy of ``codonDict`` (same keys, same order) whose values are
        the number of times each codon occurs; codons that never occur stay 0.

    Raises
    ------
    KeyError
        If a triplet of ``seq`` is not a key of ``codonDict`` (for example a
        triplet containing ``N`` or lowercase letters).
    """
    # The values are plain ints, so a shallow copy is enough for an
    # independent per-transcript table.
    counts = dict(codonDict)
    # Stopping at len(seq) - 2 means a trailing partial codon (possible in
    # the out-of-frame scans) is never generated.
    for start in range(frame, len(seq) - 2, 3):
        counts[seq[start:start + 3]] += 1
    return counts


# Read every sequence from the FASTA once; the sequences do not depend on the
# frame, so they are reused for all three scans below.
ccds_sequences = {tx: str(ccdsseqs[tx]) for tx in ccdsseqs.keys()}

# One table per reading frame: rows are FASTA records, columns are codons.
# NOTE: after the loop `codoncounts` is left bound to the frame-2 table.
for frame in range(3):
    codoncounts = pd.DataFrame.from_dict(
        {tx: _count_codons_in_frame(seq, frame)
         for tx, seq in ccds_sequences.items()},
        orient='index',
    )
    codoncounts.to_csv('../data/motif_counts/ccds_codon_counts_frame{}.tsv'.format(frame),
                       sep="\t", index_label='transcript_id')