In [77]:
import requests
import sys
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import ensembl_rest as enr
from collections import defaultdict

In [66]:
class SeqLoader:
    """
    input:
    @GRCh37: bool
    """
    def __init__(self, GRCh37):
        if GRCh37:
            self.__server = "http://grch37.rest.ensembl.org"
            self.__ext = ""
        
    def loadSeq(self, gene_id):
        self.__ext = "/sequence/id/{gene_id}?".format(gene_id = gene_id)
        seq = requests.get(self.__server + self.__ext, headers={ 'content-Type' : 'text/plain'})
        if not seq.ok:
          seq.raise_for_status()
          sys.exit()
        return seq.text
    
    def countATGC(self, seq):
        """
        input:
        @seq: plain text sequence from loadSeq()
        
        return:
        a list of pertcentages of A, T, G, C and GC
        """
        dicA = defaultdict(int)
        dicT = defaultdict(int)
        dicG = defaultdict(int)
        dicC = defaultdict(int)
        seqLen = len(seq)
        for char in seq:
            if char == 'A':
                dicA['A'] = dicA['A'] + 1
            elif char == 'T':
                dicT['T'] = dicT['T'] + 1
            elif char == 'G':
                dicG['G'] = dicG['G'] + 1
            else:
                dicC['C'] = dicC['C'] + 1
        ratioA = dicA['A'] / seqLen
        ratioT = dicT['T'] / seqLen
        ratioG = dicG['G'] / seqLen
        ratioC = dicC['C'] / seqLen
        ratioGC = ratioG + ratioC
        print(ratioA)
        print(ratioT)
        print(ratioG)
        print(ratioC)
        print(ratioGC)
        return [ratioA, ratioT, ratioG, ratioC, ratioGC]

In [110]:
class CGREncoder:
    def __init__(self):
        self.__kmer_counts = defaultdict(int)
        self.__kmer_prob = defaultdict(int)
        self.__kmerSize = -1
        self.__matrixsize = -1
        self.__chaos = None
        
    def __kmerGenerator(self, seq, k):
        """
        input:
        @seq: string, a plain text sequence 
        @k: a positive integer
        
        return:
        self.__kmer: a dictionary with keys being the kmers, values being the counts of kmers
        """
        print("Generating kmer pool")
        size = int(math.sqrt(4 ** k))
        self.__matrixsize = size
        self.__chaos = np.zeros((size, size), dtype = np.float)
        self.__kmerSize = len(seq) - k + 1
        for i in range(self.__kmerSize):
            self.__kmer_counts[seq[i : i + k]] += 1
            
        for key, value in self.__kmer_counts.items():
            self.__kmer_prob[key] = float(value) / (len(seq) - k + 1)
            
    def getKmerSize(self):
        return self.__kmerSize
    
    def getKmer(self):
        return [self.__kmer_counts, self.__kmer_prob]
    
    def encoding(self, seq, k):
        self.__kmerGenerator(seq, k)
        for key, value in self.__kmer_prob.items():
            minx = 0
            miny = 0
            maxx = self.__matrixsize
            maxy = self.__matrixsize
            charIdx = 0
            self.__helper(charIdx, minx, maxx, miny, maxy, key, value)
    
    def getChaoMatrix(self):
        return self.__chaos.copy()
    
    def __midPoint(self, small, large):
        return int(small + (large - small) / 2)
    
    def __helper(self, charIdx, minx, maxx, miny, maxy, key, value):
        if (minx >= maxx or miny >= maxy or charIdx >= len(key)):
            idxx = self.__midPoint(minx, maxx)
            idxy = self.__midPoint(miny, maxy)
            self.__chaos[idxx][idxy] = value
            return
        #print(key)
        char = key[charIdx]
        #print(char)
        charIdx = charIdx + 1
        if char == 'A':
            minx = minx
            maxx = self.__midPoint(minx, maxx)
            
        elif char == 'T':
            minx = self.__midPoint(minx, maxx)
            maxx = maxx
                
        elif char == 'G':
            minx = self.__midPoint(minx, maxx)
            maxx = maxx
            miny = self.__midPoint(miny, maxy)
            maxy = maxy
        else:
            miny = self.__midPoint(miny, maxy)
            maxy = maxy
       
        self.__helper(charIdx, minx, maxx, miny, maxy, key, value)

In [72]:
seq = SeqLoader(True)
testSeq = seq.loadSeq('ENSG00000000003')

In [111]:
encoder = CGREncoder()
encoder.encoding(testSeq, 3)

Generating kmer pool


In [112]:
testPool = encoder.getChaoMatrix()

In [113]:
testPool

array([[0.        , 0.        , 0.        , 0.        , 0.03312721,
        0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.0225265 ,
        0.        , 0.01519435, 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.01810954,
        0.        , 0.02164311, 0.0135159 ],
       [0.        , 0.        , 0.        , 0.        , 0.02561837,
        0.        , 0.01713781, 0.01757951],
       [0.        , 0.        , 0.        , 0.        , 0.02128975,
        0.        , 0.01925795, 0.01130742],
       [0.        , 0.        , 0.        , 0.        , 0.01969965,
        0.        , 0.01183746, 0.00300353],
       [0.        , 0.        , 0.        , 0.        , 0.02332155,
        0.        , 0.01881625, 0.00309187],
       [0.        , 0.        , 0.        , 0.        , 0.04372792,
        0.        , 0.02473498, 0.00477032]])

In [114]:
testKmer = encoder.getKmer()

In [116]:
size = encoder.getKmerSize()

In [117]:
size

11320

In [118]:
133/11320

0.01174911660777385

In [115]:
testKmer[0]

defaultdict(int,
            {'AGC': 133,
             'GCT': 152,
             'CTC': 170,
             'TCT': 226,
             'CTT': 251,
             'TTC': 238,
             'TCA': 187,
             'CAG': 206,
             'AGT': 226,
             'GTA': 192,
             'TAG': 194,
             'GTT': 241,
             'TTT': 495,
             'CTG': 224,
             'TGA': 213,
             'GAA': 218,
             'AAC': 153,
             'ACA': 177,
             'CAT': 161,
             'ATC': 113,
             'CTA': 131,
             'AGA': 245,
             'GAC': 102,
             'ACG': 39,
             'CGG': 49,
             'GGT': 163,
             'AGG': 199,
             'GGA': 158,
             'GAT': 150,
             'ATG': 181,
             'TGT': 280,
             'AAT': 255,
             'ATA': 205,
             'TAA': 241,
             'AAG': 223,
             'AAA': 375,
             'TGC': 174,
             'GCA': 166,
             'CAA': 172,
          

In [108]:
testKmer[1]

defaultdict(int,
            {'AGC': 0.01174911660777385,
             'GCT': 0.01342756183745583,
             'CTC': 0.015017667844522967,
             'TCT': 0.019964664310954065,
             'CTT': 0.022173144876325088,
             'TTC': 0.021024734982332156,
             'TCA': 0.016519434628975264,
             'CAG': 0.018197879858657243,
             'AGT': 0.019964664310954065,
             'GTA': 0.01696113074204947,
             'TAG': 0.01713780918727915,
             'GTT': 0.02128975265017668,
             'TTT': 0.04372791519434629,
             'CTG': 0.019787985865724382,
             'TGA': 0.01881625441696113,
             'GAA': 0.019257950530035334,
             'AAC': 0.013515901060070671,
             'ACA': 0.015636042402826855,
             'CAT': 0.0142226148409894,
             'ATC': 0.009982332155477032,
             'CTA': 0.01157243816254417,
             'AGA': 0.021643109540636044,
             'GAC': 0.00901060070671378,
             'ACG': 0.003445