In [36]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np

In [37]:
import pandas as pd
import numpy as np
from pandas import Series, DataFrame
import Bio
from Bio import SeqIO,AlignIO
from sklearn.feature_extraction.text import CountVectorizer

In [40]:
def parseFasta(data):
    d = {fasta.id : str(fasta.seq) for fasta in SeqIO.parse(data, "fasta")}
    pd.DataFrame([d])

    s = pd.Series(d, name='Sequence')
    s.index.name = 'ID'
    s.reset_index()
    return pd.DataFrame(s)

In [38]:
def get_kmer_table(genes,gene_len,k_min,k_max):
    
    count_vect = CountVectorizer(analyzer='char',ngram_range=(k_min,k_max))
    X = count_vect.fit_transform(genes)
    chars = count_vect.get_feature_names()
    chars
    kmers = X.toarray()
    kmer_freq = []
    for i in range(len(genes)):
        kmer_freq.append(kmers[i]/gene_len[i])
    input = pd.DataFrame(kmer_freq,columns=chars)
    return input

def get_ids(filename):
    ids = []
    for record in SeqIO.parse(filename, "fasta"):
        ids.append(record.id)

def get_gene_sequences(filename):
    genes = []
    for record in SeqIO.parse(filename, "fasta"):
        genes.append(str(record.seq))
    return genes

# genes: a list of gene sequences, which can directly be generated from get_gene_sequences().
def get_gene_len(genes):
    gene_len = []
    
    for i in range(len(genes)):
        gene_len.append(len(genes[i]))
    return gene_len

In [41]:
virus1 = parseFasta("label0.fasta")

# put confirmed virus killers at bottom, and removed the duplicates already in the data
virus01 = parseFasta("label1.fasta")
virus01 = virus01.append(virus1)
virus01 = virus01.drop_duplicates(keep="last")

In [46]:
genes = list(virus01['Sequence'])

In [48]:
len(genes)

350

In [49]:
genes_0 = get_gene_sequences("label0.fasta")
genes_1 = get_gene_sequences("label1.fasta")

gene_len_0 = get_gene_len(genes_0)
gene_len_1 = get_gene_len(genes_1)

all_genes = genes

all_gene_len = gene_len_0 + gene_len_1
kmer_table = get_kmer_table(all_genes,all_gene_len,5,6)
kmer_table


Unnamed: 0,aaaaa,aaaaaa,aaaaac,aaaaag,aaaaat,aaaac,aaaaca,aaaacc,aaaacg,aaaact,...,ygcwa,ygcwag,ygtac,ygtacg,ysccg,ysccgg,ytcag,ytcagt,yttac,yttacg
0,0.002347,0.000204,0.000918,0.000612,0.000612,0.001735,0.000918,0.000102,0.000102,0.000612,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000243,0.000243,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.001649,0.001335,0.000236,0.000000,0.000000,0.001021,0.000157,0.000550,0.000157,0.000157,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.000748,0.000000,0.000499,0.000000,0.000249,0.001746,0.000499,0.000249,0.000249,0.000748,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.001517,0.000433,0.000217,0.000433,0.000433,0.001733,0.000000,0.000650,0.000433,0.000650,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
345,0.000928,0.000464,0.000000,0.000464,0.000000,0.000464,0.000000,0.000000,0.000464,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
346,0.000565,0.000282,0.000000,0.000282,0.000000,0.000565,0.000282,0.000282,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
347,0.002254,0.000282,0.000000,0.001127,0.000845,0.000845,0.000563,0.000282,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
348,0.003480,0.000580,0.000290,0.001450,0.001160,0.002320,0.000580,0.000870,0.000580,0.000290,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [50]:
from sklearn.mixture import GaussianMixture as GMM
gmm = GMM(n_components=2).fit(kmer_table)
labels = gmm.predict(kmer_table)

In [51]:
# 5-6 
labels_5_6 = labels
labels_5_6

array([0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1,
       1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [52]:
kmer_table2 = get_kmer_table(all_genes,all_gene_len,5,5)
kmer_table2

Unnamed: 0,aaaaa,aaaac,aaaag,aaaat,aaaca,aaacc,aaacg,aaact,aaaga,aaagc,...,yatga,ycgcg,ycgtc,ygacg,ygatg,ygcwa,ygtac,ysccg,ytcag,yttac
0,0.002347,0.001735,0.001735,0.002143,0.001531,0.000918,0.000918,0.002041,0.001837,0.001123,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.000000,0.000243,0.000000,0.000243,0.000243,0.000243,0.000243,0.000973,0.000486,0.000486,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.001649,0.001021,0.000707,0.000785,0.000628,0.001492,0.001178,0.001178,0.001099,0.001335,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.000748,0.001746,0.000748,0.001746,0.001247,0.000499,0.000998,0.002245,0.001497,0.001247,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.001517,0.001733,0.002384,0.001517,0.001083,0.001300,0.001300,0.002600,0.002600,0.000650,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
345,0.000928,0.000464,0.000928,0.000000,0.000928,0.000000,0.000928,0.000464,0.001392,0.000464,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
346,0.000565,0.000565,0.000565,0.000565,0.001130,0.000565,0.000565,0.000000,0.001130,0.000565,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
347,0.002254,0.000845,0.001972,0.002535,0.002254,0.001127,0.000563,0.000845,0.003662,0.000563,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
348,0.003480,0.002320,0.003770,0.002030,0.002900,0.000870,0.001160,0.001160,0.008411,0.001160,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [53]:
gmm2 = GMM(n_components=2).fit(kmer_table2)
labels_5 = gmm2.predict(kmer_table2)
labels_5

array([1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0,
       1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0,
       0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0,
       1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1,
       1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,

In [15]:
last7_5 = labels_5[350:357]
last7_5

array([1, 1, 1, 1, 0, 0, 1])

In [54]:
#4-mer

kmer_table3 = get_kmer_table(all_genes,all_gene_len,4,4)
kmer_table3

gmm3 = GMM(n_components=2,covariance_type="full").fit(kmer_table3)
labels_4 = gmm3.predict(kmer_table3)
labels_4

array([1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0,
       1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0,
       0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0,
       1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1,
       1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,

In [23]:
last7_4 = labels3[350:357]

In [55]:
#4-5-mer

kmer_table3 = get_kmer_table(all_genes,all_gene_len,4,5)
kmer_table3

gmm3 = GMM(n_components=2,covariance_type="full").fit(kmer_table3)
labels_4_5 = gmm3.predict(kmer_table3)
labels_4_5

array([0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1,
       0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0,
       0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0,

In [25]:
last7_4_5 = labels3[350:357]

In [56]:
# 6

kmer_table4 = get_kmer_table(all_genes,all_gene_len,6,6)
kmer_table4

gmm4 = GMM(n_components=2,covariance_type="full").fit(kmer_table4)
labels_6 = gmm4.predict(kmer_table4)
labels_6

array([0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1,
       0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0,

In [27]:
last7_6 = labels4[350:357]

In [58]:
kmer_table = get_kmer_table(all_genes,all_gene_len,5,5)
kmer_table

gmm = GMM(n_components=3,covariance_type="full").fit(kmer_table)
labels = gmm.predict(kmer_table)
labels

array([0, 2, 2, 0, 0, 0, 2, 2, 2, 2, 2, 0, 1, 2, 0, 0, 2, 2, 0, 0, 0, 2,
       0, 1, 2, 1, 2, 2, 2, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 0, 2, 0, 0, 0, 0, 0, 2, 0,
       2, 0, 0, 0, 0, 2, 0, 2, 2, 0, 1, 2, 1, 1, 1, 2, 0, 0, 2, 0, 0, 1,
       1, 2, 1, 1, 2, 1, 1, 1, 1, 2, 0, 0, 0, 1, 1, 1, 1, 0, 0, 2, 1, 1,
       2, 1, 2, 1, 0, 1, 2, 1, 2, 0, 0, 2, 1, 1, 1, 1, 1, 2, 0, 2, 2, 0,
       0, 2, 1, 2, 2, 2, 2, 2, 2, 2, 1, 0, 1, 1, 2, 2, 0, 0, 0, 0, 2, 1,
       1, 1, 2, 2, 0, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 2, 0, 2, 0, 2,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 1, 0, 1, 1, 2, 0, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0,
       2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 2, 2, 2, 2, 1, 1, 1, 0, 1, 1, 0, 0,
       0, 0, 0, 2, 2, 2, 0, 2, 1, 1, 1, 1, 0, 0, 2, 2, 1, 1, 1, 1, 0, 0,
       2, 2, 0, 0, 0, 0, 1, 2, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1,
       1, 2, 2, 2, 0, 1, 2, 1, 1, 2, 1, 1, 0, 0, 0,

In [32]:
gmm5 = GMM(n_components=4).fit(kmer_table)
labels5 = gmm5.predict(kmer_table)
labels5

array([0, 2, 0, 0, 1, 3, 2, 2, 0, 0, 2, 1, 1, 2, 3, 3, 2, 2, 3, 3, 3, 0,
       0, 2, 0, 1, 0, 2, 2, 1, 1, 1, 1, 3, 2, 2, 0, 0, 0, 3, 2, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 3, 2, 1, 1, 1, 1, 1, 2, 0,
       0, 0, 3, 0, 3, 0, 0, 0, 0, 3, 2, 2, 2, 2, 2, 2, 3, 3, 0, 0, 0, 2,
       2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1,
       2, 2, 2, 2, 1, 2, 2, 2, 2, 3, 3, 0, 2, 2, 2, 2, 2, 2, 3, 0, 2, 0,
       0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 1, 2, 0, 3, 3, 3, 3, 0, 2,
       2, 1, 0, 0, 3, 2, 0, 1, 2, 2, 1, 1, 1, 1, 1, 3, 3, 2, 3, 0, 0, 0,
       0, 3, 3, 3, 3, 3, 2, 2, 1, 1, 1, 0, 1, 0, 2, 2, 0, 3, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 3, 3, 3, 3, 3, 0, 0, 3, 3, 3, 3, 3,
       0, 0, 2, 0, 0, 0, 3, 1, 3, 3, 0, 0, 0, 0, 2, 2, 2, 3, 1, 1, 0, 3,
       1, 3, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 3, 2, 0, 2, 2, 2, 2, 3, 1,
       0, 0, 3, 0, 3, 0, 2, 0, 3, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1,
       1, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1,

In [33]:
last7_5_4cluster = labels5[350:357]

In [28]:
n_components = np.arange(2, 3)
models = [GMM(n, covariance_type='full', random_state=0).fit(kmer_table)
          for n in n_components]
print(m.bic(kmer_table) for m in models)


<generator object <genexpr> at 0x13036c750>


In [30]:
gmm.bic(kmer_table)

156283363.65769827

In [32]:
gmm6 = GMM(n_components=4).fit(kmer_table)
labels6 = gmm6.predict(kmer_table)
labels6

array([1, 0, 1, 1, 1, 2, 0, 0, 0, 1, 0, 1, 1, 0, 3, 3, 0, 0, 3, 3, 3, 0,
       1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 3, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 0, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 3, 1, 1, 1, 1, 1, 1, 3, 0, 0, 0, 0, 0, 0, 3, 3, 1, 1, 2, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 3, 3, 1, 0, 0, 0, 0, 0, 0, 3, 1, 0, 1,
       1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 1, 0, 1, 3, 3, 3, 3, 1, 0,
       0, 1, 1, 1, 3, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 3, 2, 0, 3, 1, 3, 1,
       1, 3, 3, 3, 3, 3, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 3, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 3, 3, 3, 1, 1, 1, 3, 3, 3, 3, 1,
       0, 1, 0, 1, 1, 1, 3, 1, 3, 3, 1, 0, 0, 0, 0, 0, 0, 3, 1, 1, 1, 3,
       1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 3, 1,
       0, 1, 3, 1, 3, 1, 0, 1, 3, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,

In [57]:
df = pd.DataFrame(data=[labels_4,labels_4_5,labels_5,labels_5_6,labels_6], index=['4 kmer','4 to 5 kmer','5 kmer','5 to 6 kmers','6 kmer'])
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,340,341,342,343,344,345,346,347,348,349
4 kmer,1,0,0,1,1,1,0,0,0,0,...,1,1,1,0,0,0,0,1,1,1
4 to 5 kmer,0,1,1,0,0,0,1,1,1,1,...,0,0,0,1,1,1,1,0,0,0
5 kmer,1,0,0,1,1,1,0,0,0,1,...,1,1,1,0,0,0,0,1,1,1
5 to 6 kmers,0,1,0,1,0,0,1,1,1,0,...,1,1,1,1,1,1,1,0,0,0
6 kmer,0,1,0,0,0,0,1,1,1,0,...,0,0,0,1,1,1,1,0,0,0
