# Detecting laterally transferred genes: use of entropic clustering methods and genome position
- A) dna -> count(nucleotide)/length
- B) compositional difference between genes -> 12 symbol
- C) dinucleotide -> 48 symbol
- D) codon usage

## Data rep

type I error: false positive


type II error: false negative

In [6]:
seq = 'AAAAGGGGCCCCTTTT'
seq2 = 'AGAAGGGGCCCCTGTT'
rna = ("atgtttcagaaagaggatcttgctacatggatgcaaatttcaatgagtggtcaatttgatgatacagcattagaggaatggagtacaaatggtaaagaacctgagatctgtgagaaatctccaaaagctgatggagttactacgattatggagagagctctatgtccatgggatagcagagtcaactaccaagagagccgagaacctaaattgattgctgaatcagtttgtctatgccgtaagagccgtggttctacaggagctttctgtatgccaattgttcgaaaagttccaattctccgacgtgtctcttgtgatcgttcaacaggtttatggaattatgtaagatcaactgagctaataactgttggatgtcattctgtattgccaagaactcaaagagcagcacgtcttgcccatttatcatcttctcgtattattgtttaa").upper()

A)

In [2]:
def relative_freq(sequence):
    g_count=0 
    a_count=0 
    c_count=0 
    t_count = 0
    for base in sequence:
            if base == 'G':
                g_count +=1
            elif base == 'A':
                a_count +=1
            elif base == 'C':
                c_count +=1
            elif base == 'T':
                t_count +=1

    rep = {
       'G': g_count/len(sequence),
        'A': a_count/len(sequence),
        'C': c_count/len(sequence),
        'T': t_count/len(sequence)
    }
    return rep
    
rep = relative_freq(rna)
print(rep)

{'G': 0.22595078299776286, 'A': 0.29977628635346754, 'C': 0.174496644295302, 'T': 0.29977628635346754}


b) 

still something wrong with this! proba dist of codons does not add up to 1 for each pos! actually rather the calculations are right but the only way to make it a proba dist is to actually divide by total counts of each nucleotide instead of sequence!

In [3]:
# faulty
def comp_nucl(nucleotide_sequence):
    d = {}
    for nt in ["A", "T", "G", "C"]:
        d[nt] = [0, 0, 0]
    for i in range(0, len(seq), 3):
        codon = nucleotide_sequence[i : i + 3]
        if len(codon) < 3:
            codon += "  "
        for pos in range(0, 3):
            for nt in ["A", "T", "G", "C"]:
                if codon[pos] == nt or codon[pos] == nt.lower():
                    d[nt][pos] += 1

    #print(d)
    # calculate length and divide
    for codon_pos in range(0,3):
        seq_length = len(nucleotide_sequence)-codon_pos
        if seq_length%3 != 0:
            number_of_codons = (seq_length-seq_length%3)/3
            #print((length-length%3)/3)
        else:
            number_of_codons = (seq_length)/3
            #print(length/3)
        for nt in ["A", "T", "G", "C"]:
            d[nt][codon_pos] =d[nt][codon_pos]/number_of_codons
            
    return d

In [4]:
# correct mathematically
def comp_nucl_correct(nucleotide_sequence):
    d = {}
    for nt in ["A", "T", "G", "C"]:
        d[nt] = [0, 0, 0]
    for i in range(0, len(seq), 3):
        codon = nucleotide_sequence[i : i + 3]
        if len(codon) < 3:
            codon += "  "
        for pos in range(0, 3):
            for nt in ["A", "T", "G", "C"]:
                if codon[pos] == nt or codon[pos] == nt.lower():
                    d[nt][pos] += 1

    #print(d)
    # calculate length and divide
    for codon_pos in range(0,3):
        total_counts_per_pos = 0
        for nt in ["A", "T", "G", "C"]:
            total_counts_per_pos += d[nt][codon_pos]
        for nt in ["A", "T", "G", "C"]:
            d[nt][codon_pos] =d[nt][codon_pos]/total_counts_per_pos
            
    return d

In [5]:
d = comp_nucl(rna)
print(d)

{'A': [0.013422818791946308, 0.02702702702702703, 0.006756756756756757], 'T': [0.006711409395973154, 0.013513513513513514, 0.013513513513513514], 'G': [0.013422818791946308, 0.0, 0.02027027027027027], 'C': [0.006711409395973154, 0.0, 0.0]}


In [6]:
d = comp_nucl_correct(rna)
print(d)

{'A': [0.3333333333333333, 0.6666666666666666, 0.16666666666666666], 'T': [0.16666666666666666, 0.3333333333333333, 0.3333333333333333], 'G': [0.3333333333333333, 0.0, 0.5], 'C': [0.16666666666666666, 0.0, 0.0]}


In [7]:
def codon_pos_length(length):
    if length%3 != 0:
        print((length-length%3)/3)
    else:
        print(length/3)

In [6]:
codon_pos_length(len(seq))
codon_pos_length(len(seq)-1)
codon_pos_length(len(seq)-2)

5.0
5.0
4.0


c)

In [4]:
def comp_dinucl(sequence):
    dn = {}
    for nt in ["AA", "AG", "AT", "AC", "TA", "TG", "TT", "TC","GA", "GG", "GT", "GC","CA", "CG", "CT", "CC"]:
        dn[nt] = [0, 0, 0]
    for i in range(0, len(sequence), 3):
        codon = sequence[i : i + 4]
        if len(codon) < 4:
            codon += "  "
        for pos in range(0, 3):
            for nt in ["AA", "AG", "AT", "AC", "TA", "TG", "TT", "TC","GA", "GG", "GT", "GC","CA", "CG", "CT", "CC"]:
                if codon[pos] == nt[0] and codon[pos+1] == nt[1] :
                    dn[nt][pos] += 1
                elif codon[pos] == nt[0].lower() and codon[pos+1] == nt[1].lower():
                    dn[nt][pos] += 1
                    
    # calculate length and divide
    for codon_pos in range(0,3):
        seq_length = len(sequence)-codon_pos
        if seq_length%3 != 0:
            number_of_codons = (seq_length-seq_length%3)/3
            #print((length-length%3)/3)
        else:
            number_of_codons = (seq_length)/3
            #print(length/3)
        for nt in ["AA", "AG", "AT", "AC", "TA", "TG", "TT", "TC","GA", "GG", "GT", "GC","CA", "CG", "CT", "CC"]:
            dn[nt][codon_pos] =dn[nt][codon_pos]/number_of_codons
    return dn


In [9]:
def comp_dinucl_correct(sequence):
    dn = {}
    for nt in ["AA", "AG", "AT", "AC", "TA", "TG", "TT", "TC","GA", "GG", "GT", "GC","CA", "CG", "CT", "CC"]:
        dn[nt] = [0, 0, 0]
    for i in range(0, len(sequence), 3):
        codon = sequence[i : i + 4]
        if len(codon) < 4:
            codon += "  "
        for pos in range(0, 3):
            for nt in ["AA", "AG", "AT", "AC", "TA", "TG", "TT", "TC","GA", "GG", "GT", "GC","CA", "CG", "CT", "CC"]:
                if codon[pos] == nt[0] and codon[pos+1] == nt[1] :
                    dn[nt][pos] += 1
                elif codon[pos] == nt[0].lower() and codon[pos+1] == nt[1].lower():
                    dn[nt][pos] += 1
                    
    # calculate length and divide
    for codon_pos in range(0,3):
        total_counts_per_pos = 0
        for nt in ["AA", "AG", "AT", "AC", "TA", "TG", "TT", "TC","GA", "GG", "GT", "GC","CA", "CG", "CT", "CC"]:
            total_counts_per_pos += dn[nt][codon_pos]
        for nt in ["AA", "AG", "AT", "AC", "TA", "TG", "TT", "TC","GA", "GG", "GT", "GC","CA", "CG", "CT", "CC"]:
            dn[nt][codon_pos] =dn[nt][codon_pos]/total_counts_per_pos
    return dn

In [7]:
dn = comp_dinucl(rna)
print(dn)

{'AA': [0.06711409395973154, 0.10135135135135136, 0.10135135135135136], 'AG': [0.06711409395973154, 0.060810810810810814, 0.11486486486486487], 'AT': [0.09395973154362416, 0.07432432432432433, 0.10810810810810811], 'AC': [0.06711409395973154, 0.013513513513513514, 0.02702702702702703], 'TA': [0.020134228187919462, 0.060810810810810814, 0.0945945945945946], 'TG': [0.0738255033557047, 0.0472972972972973, 0.14864864864864866], 'TT': [0.053691275167785234, 0.11486486486486487, 0.08108108108108109], 'TC': [0.0738255033557047, 0.033783783783783786, 0.10135135135135136], 'GA': [0.11409395973154363, 0.07432432432432433, 0.08108108108108109], 'GG': [0.04697986577181208, 0.02702702702702703, 0.02702702702702703], 'GT': [0.06711409395973154, 0.12162162162162163, 0.006756756756756757], 'GC': [0.06040268456375839, 0.02702702702702703, 0.02702702702702703], 'CA': [0.04697986577181208, 0.12162162162162163, 0.013513513513513514], 'CG': [0.06040268456375839, 0.006756756756756757, 0.0], 'CT': [0.0402684

In [10]:
dn = comp_dinucl_correct(rna)
print(dn)

{'AA': [0.06711409395973154, 0.10067114093959731, 0.10135135135135136], 'AG': [0.06711409395973154, 0.06040268456375839, 0.11486486486486487], 'AT': [0.09395973154362416, 0.0738255033557047, 0.10810810810810811], 'AC': [0.06711409395973154, 0.013422818791946308, 0.02702702702702703], 'TA': [0.020134228187919462, 0.06040268456375839, 0.0945945945945946], 'TG': [0.0738255033557047, 0.04697986577181208, 0.14864864864864866], 'TT': [0.053691275167785234, 0.11409395973154363, 0.08108108108108109], 'TC': [0.0738255033557047, 0.03355704697986577, 0.10135135135135136], 'GA': [0.11409395973154363, 0.0738255033557047, 0.08108108108108109], 'GG': [0.04697986577181208, 0.026845637583892617, 0.02702702702702703], 'GT': [0.06711409395973154, 0.12080536912751678, 0.006756756756756757], 'GC': [0.06040268456375839, 0.026845637583892617, 0.02702702702702703], 'CA': [0.04697986577181208, 0.12080536912751678, 0.013513513513513514], 'CG': [0.06040268456375839, 0.006711409395973154, 0.0], 'CT': [0.040268456

d) need to discuss about this

In [12]:
import collections
code = {
    'ttt': 'F', 'tct': 'S', 'tat': 'Y', 'tgt': 'C',
    'ttc': 'F', 'tcc': 'S', 'tac': 'Y', 'tgc': 'C',
    'tta': 'L', 'tca': 'S', 'taa': '*', 'tga': '*',
    'ttg': 'L', 'tcg': 'S', 'tag': '*', 'tgg': 'W',
    'ctt': 'L', 'cct': 'P', 'cat': 'H', 'cgt': 'R',
    'ctc': 'L', 'ccc': 'P', 'cac': 'H', 'cgc': 'R',
    'cta': 'L', 'cca': 'P', 'caa': 'Q', 'cga': 'R',
    'ctg': 'L', 'ccg': 'P', 'cag': 'Q', 'cgg': 'R',
    'att': 'I', 'act': 'T', 'aat': 'N', 'agt': 'S',
    'atc': 'I', 'acc': 'T', 'aac': 'N', 'agc': 'S',
    'ata': 'I', 'aca': 'T', 'aaa': 'K', 'aga': 'R',
    'atg': 'M', 'acg': 'T', 'aag': 'K', 'agg': 'R',
    'gtt': 'V', 'gct': 'A', 'gat': 'D', 'ggt': 'G',
    'gtc': 'V', 'gcc': 'A', 'gac': 'D', 'ggc': 'G',
    'gta': 'V', 'gca': 'A', 'gaa': 'E', 'gga': 'G',
    'gtg': 'V', 'gcg': 'A', 'gag': 'E', 'ggg': 'G'
}
def count_codons(cds):
    counts = collections.defaultdict(int)
    if len(cds)%3 != 0:
        last_pos = len(cds) - (len(cds)%3)
    else:
        last_pos = len(cds)
    for i in range(0,last_pos,3):
       codon = cds[i:i+3]
       counts[codon] += 1
    return counts


In [16]:
cb = count_codons(seq+'TT')
print(cb)
cb['ATC']

defaultdict(<class 'int'>, {'AAA': 1, 'AGG': 1, 'GGC': 1, 'CCC': 1, 'TTT': 2})


0

In [42]:
sm = 0
for i in code:
    if code[i]=='K':
        sm += cb[i.upper()]
print(sm)

7


## JS divergence

this stuff is symmetric!

https://en.wikipedia.org/wiki/Jensen–Shannon_divergence

JS(S1,s1) = H(S) - (L1/L)H(S1) - L2/LH(S2)

where L= L1+L2

and H(Sk) = - sigma ( fk(i) * log2fk(i))

with S = s1 directsum s2

In [17]:
import math
seq = 'AAAAGGGGCCCCTTTT'
seq2 = 'GGGGGGGGGGGAACCTT'
seq3 = 'TTTTAAAAGGGGCCCC'

In [33]:
def calculate_JS_relative_freq(sequence_1, sequence_2):
    # calculate H(s1):
    freq_1 = relative_freq(sequence_1)
    freq_2 = relative_freq(sequence_2)
    sigma_HS1 = 0
    sigma_HS2 = 0
    sigma_HS12 = 0
    length_L1L2 = len(sequence_1)+len(sequence_2)
    for nt in ["A", "T", "G", "C"]:
        sigma_HS1 += freq_1[nt]*math.log(freq_1[nt],2)
        sigma_HS2 += freq_2[nt]*math.log(freq_2[nt],2)
        sigma_HS12 += ((((len(sequence_1)/length_L1L2)*freq_1[nt])+((len(sequence_2)/length_L1L2)*freq_2[nt])))*math.log(((((len(sequence_1)/length_L1L2)*freq_1[nt])+((len(sequence_2)/length_L1L2)*freq_2[nt]))),2)
        
    sigma_HS1= -1 *sigma_HS1
    sigma_HS2= -1 *sigma_HS2
    sigma_HS12 = -1 * sigma_HS12
    JS_div = sigma_HS12 - ((len(sequence_1)/length_L1L2)*sigma_HS1) - ((len(sequence_2)/length_L1L2)*sigma_HS2)
    print(sigma_HS1)
    print(sigma_HS2)
    print(sigma_HS12)
    print(JS_div)


problem when distribution is 0

In [40]:
def calculate_JS_freq_nucl(sequence_1, sequence_2):
    # calculate H(s1):
    freq_1 = comp_nucl(sequence_1)
    freq_2 = comp_nucl(sequence_2)
    sigma_HS1 = 0
    sigma_HS2 = 0
    sigma_HS12 = 0
    length_L1L2 = len(sequence_1)+len(sequence_2)
    # think about the length again for this case
    for pos in range(0,3):
        for nt in ["A", "T", "G", "C"]:
            sigma_HS1 += freq_1[nt][pos]*math.log(freq_1[nt][pos],2)
            sigma_HS2 += freq_2[nt][pos]*math.log(freq_2[nt][pos],2)
            sigma_HS12 += ((((len(sequence_1)/length_L1L2)*freq_1[nt][pos])+((len(sequence_2)/length_L1L2)*freq_2[nt][pos])))*math.log(((((len(sequence_1)/length_L1L2)*freq_1[nt][pos])+((len(sequence_2)/length_L1L2)*freq_2[nt][pos]))),2)
        
        
    sigma_HS1= -1 *sigma_HS1
    sigma_HS2= -1 *sigma_HS2
    sigma_HS12 = -1 * sigma_HS12
    JS_div = sigma_HS12 - ((len(sequence_1)/length_L1L2)*sigma_HS1) - ((len(sequence_2)/length_L1L2)*sigma_HS2)
    print(sigma_HS1)
    print(sigma_HS2)
    print(sigma_HS12)
    print(JS_div)

In [44]:
freq_1 = comp_nucl(seq2)
print(freq_1)
math.log(freq_1['A'][1],2)

{'A': [0.2, 0.0, 0.2], 'T': [0.2, 0.2, 0.0], 'G': [0.8, 0.8, 0.6], 'C': [0.0, 0.2, 0.2]}


ValueError: math domain error

In [38]:
calculate_JS_relative_freq(seq2,seq)

1.496065911543853
2.0
1.8585552118703144
0.11815762107499617


In [41]:
calculate_JS_freq_nucl(seq2,seq)

ValueError: math domain error

In [20]:
calculate_JS_relative_freq(seq,seq2)

2.0
1.496065911543853
1.8585552118703144
0.11815762107499617


In [21]:
calculate_JS_relative_freq(seq,seq)

2.0
2.0
2.0
0.0


In [22]:
calculate_JS_relative_freq(seq,seq3)

2.0
2.0
2.0
0.0


In [23]:
calculate_JS_relative_freq(rna, seq)

1.9664293749375243
2.0
1.9687608372464125
0.0011713543153683698


## Custom agglomerative clustering