In [48]:
# import packages
import matplotlib
#matplotlib.use('TkAgg')
import matplotlib.pyplot as plt
import matplotlib.pyplot as plt
import numpy as np
import pandas as pdz
import numpy as np
import pandas as pd

from pandas import Series, DataFrame
import Bio
from Bio import SeqIO,AlignIO

import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.mixture import GaussianMixture as GMM

from sklearn.manifold import TSNE
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA


In [49]:
# methods

def parseFasta(data):
    """Load a FASTA source into a one-column DataFrame of sequences keyed by record ID.

    Parameters
    ----------
    data : str or file handle
        FASTA source handed straight to ``SeqIO.parse``.

    Returns
    -------
    pandas.DataFrame
        Index named ``ID`` (the record identifiers) and a single ``Sequence``
        column holding each sequence as a plain string.

    Notes
    -----
    Records are gathered in a dict keyed by ID, so when two records share an
    identifier only the last one read is kept.
    """
    sequences_by_id = {record.id: str(record.seq) for record in SeqIO.parse(data, "fasta")}
    sequences = pd.Series(sequences_by_id, name='Sequence')
    sequences.index.name = 'ID'
    return pd.DataFrame(sequences)

def get_kmer_table(path,k_min,k_max):
    """Build a length-normalised k-mer count table for every sequence in a FASTA file.

    Parameters
    ----------
    path : str
        FASTA file, forwarded to ``read_fasta``.
    k_min, k_max : int
        Inclusive range of k-mer lengths, passed to ``CountVectorizer`` as
        ``ngram_range=(k_min, k_max)`` with ``analyzer='char'``.

    Returns
    -------
    pandas.DataFrame
        One row per sequence and one column per k-mer; each value is the
        k-mer count divided by the length of that sequence.
    """
    genes, gene_len = read_fasta(path)
    count_vect = CountVectorizer(analyzer='char', ngram_range=(k_min, k_max))
    counts = count_vect.fit_transform(genes)

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; prefer the replacement and fall back only on older installs.
    if hasattr(count_vect, 'get_feature_names_out'):
        kmer_names = count_vect.get_feature_names_out().tolist()
    else:
        kmer_names = count_vect.get_feature_names()

    # Divide every row by its own sequence length in a single broadcast step.
    lengths = np.asarray(gene_len, dtype=float).reshape(-1, 1)
    kmer_freq = counts.toarray() / lengths
    return pd.DataFrame(kmer_freq, columns=kmer_names)

def get_gene_sequences(filename):
    """Return every sequence of a FASTA file as a plain string, in file order."""
    return [str(entry.seq) for entry in SeqIO.parse(filename, "fasta")]

# genes: an indexable collection of gene sequences, e.g. the list returned by get_gene_sequences().
def get_gene_len(genes):
    """Return the length of each entry of ``genes`` as a list, in the same order."""
    return [len(genes[position]) for position in range(len(genes))]

# Read a single FASTA file containing all the gene sequences.
def read_fasta(path):
    """Return the sequences of a FASTA file together with their lengths.

    Parameters
    ----------
    path : str or file handle
        FASTA source, forwarded to ``get_gene_sequences``.

    Returns
    -------
    tuple of (list of str, list of int)
        The sequences in file order and, at the same positions, their lengths.

    Notes
    -----
    Every record is returned, duplicates included. The source is parsed
    exactly once, so an already-open handle is not exhausted before the
    sequences are collected.
    NOTE(review): an earlier version also built a de-duplicated table but
    never used it; confirm that no de-duplication is the intended behaviour.
    """
    gene_seq = get_gene_sequences(path)
    gene_len = get_gene_len(gene_seq)
    return gene_seq, gene_len

def get_predictions_default(path,k_min,k_max,num_class,cov_type):
    """Cluster the k-mer table with a freshly reseeded global NumPy generator.

    Returns
    -------
    tuple
        ``(labels, ran_state)``: the predicted component of every sequence and
        the global NumPy random state captured right after reseeding
        (presumably meant to be replayed through ``get_predictions_from_state``).
    """
    # np.random.seed() returns None, so the mixture model is created with
    # random_state=None; the reseeding of the global generator is the part
    # that matters here.
    np.random.seed(None)
    ran_state = np.random.get_state()

    kmer_table = get_kmer_table(path, k_min, k_max)
    model = GMM(n_components=num_class, covariance_type=cov_type, random_state=None)
    model.fit(kmer_table)
    labels = model.predict(kmer_table)
    return labels, ran_state

def get_predictions_from_state(path,k_min,k_max,num_class,cov_type,state):
    """Cluster the k-mer table after restoring a saved global NumPy random state.

    ``state`` is a value accepted by ``np.random.set_state`` (for instance the
    second item returned by ``get_predictions_default``). Returns the predicted
    component of every sequence.
    """
    kmer_table = get_kmer_table(path, k_min, k_max)

    # np.random.set_state() returns None, so the model itself is built with
    # random_state=None; restoring the global generator is the side effect
    # this function relies on.
    np.random.set_state(state)
    model = GMM(n_components=num_class, covariance_type=cov_type, random_state=None)
    model.fit(kmer_table)
    return model.predict(kmer_table)

def get_predictions(path,k_min,k_max,num_class,cov_type, seed, init_params='kmeans'):
    """Cluster the sequences of a FASTA file with a Gaussian mixture on k-mer frequencies.

    Parameters
    ----------
    path : str
        FASTA file, forwarded to ``get_kmer_table``.
    k_min, k_max : int
        Inclusive range of k-mer lengths.
    num_class : int
        Number of mixture components.
    cov_type : str
        Covariance type passed to the mixture model.
    seed : int or None
        ``random_state`` of the mixture model.
    init_params : str, optional
        Initialisation method of the mixture model. The default ``'kmeans'``
        is what the model uses when nothing is specified, so existing callers
        get unchanged results; pass ``'random'`` to request random
        initialisation.

    Returns
    -------
    numpy.ndarray
        Predicted component index for every sequence.
    """
    kmer_table = get_kmer_table(path, k_min, k_max)
    # The initialisation method must be set before fitting: assigning it on an
    # already-fitted model does not influence the predictions.
    gmm = GMM(n_components=num_class, covariance_type=cov_type,
              random_state=seed, init_params=init_params)
    gmm.fit(kmer_table)
    return gmm.predict(kmer_table)

def cal_accuracy(labels, predictions):
    """Fraction of known labels that equal the prediction at the same position.

    Parameters
    ----------
    labels : sequence
        Reference labels; entries equal to ``-1`` are treated as unknown and
        left out of the score.
    predictions : sequence
        Predicted labels, compared position by position with ``labels``.

    Returns
    -------
    float
        ``1 - errors / known`` where ``known`` is the number of labels that are
        not ``-1``. ``nan`` is returned when there is no known label at all
        (including empty input), since the accuracy is undefined in that case.
    """
    # Materialise both inputs so they are compared by position; indexing a
    # pandas Series with a non-default index by integer would be label-based.
    known_labels = list(labels)
    predicted = list(predictions)

    scored = 0
    errors = 0
    for position in range(len(known_labels)):
        if known_labels[position] == -1:
            continue
        scored += 1
        if known_labels[position] != predicted[position]:
            errors += 1

    if scored == 0:
        return float("nan")
    return 1 - errors / scored

def get_predictions_semi(path,k_min,k_max,num_class,cov_type,seed,labels):
    """Cluster sequences with a Gaussian mixture whose means are seeded from known labels.

    Parameters
    ----------
    path : str
        FASTA file, forwarded to ``get_kmer_table``.
    k_min, k_max : int
        Inclusive range of k-mer lengths.
    num_class : int
        Number of mixture components; classes are expected to be ``0 .. num_class - 1``.
    cov_type : str
        Covariance type passed to the mixture model.
    seed : int or None
        ``random_state`` of the mixture model.
    labels : sequence
        One label per sequence, matched by position. Any list, array or pandas
        Series works, whatever its name or index. Values outside
        ``0 .. num_class - 1`` (such as ``-1``) are ignored when seeding.

    Returns
    -------
    numpy.ndarray
        Predicted component index for every sequence.

    Notes
    -----
    The component means are initialised with the per-class mean k-mer profile
    only when every class has at least one labelled sequence. Otherwise the
    labels are not used and the model falls back to plain random
    initialisation.
    """
    kmer_table = get_kmer_table(path, k_min, k_max)

    # Align labels with the table rows by position; missing trailing labels
    # become NaN and therefore never match a class.
    known = pd.Series(np.asarray(labels)).reindex(kmer_table.index)

    targets = [cls for cls in range(num_class) if (known == cls).any()]
    means_init = None
    if len(targets) == num_class:
        means_init = np.array([kmer_table[known == cls].mean(axis=0) for cls in targets])

    gmm = GMM(n_components=num_class, covariance_type=cov_type, random_state=seed,
              init_params='random', means_init=means_init)
    gmm.fit(kmer_table)
    return gmm.predict(kmer_table)


In [50]:
def model_selection(path,labels,num_class,
                    cov_types=('full','diag','tied','spherical'),
                    k_values=(2,3,4,5),
                    seed=0):
    """Grid-search the semi-supervised mixture model and return the best predictions.

    Every covariance type is combined with every ``(k_min, k_max)`` pair drawn
    from ``k_values`` with ``k_max >= k_min``. Each candidate is fitted with
    ``get_predictions_semi`` and scored with ``cal_accuracy`` against
    ``labels``. The first candidate reaching the highest score wins.

    Parameters
    ----------
    path : str
        FASTA file with the sequences.
    labels : sequence
        Partially known labels, one per sequence.
    num_class : int
        Number of mixture components.
    cov_types : iterable of str, optional
        Covariance types to try.
    k_values : iterable of int, optional
        Candidate values for both ends of the k-mer length range.
    seed : int or None, optional
        Random state handed to every candidate model.

    Returns
    -------
    numpy.ndarray
        Predictions of the best-scoring candidate.

    Raises
    ------
    ValueError
        If no candidate produced a comparable score (for example an empty
        grid, or a score that is not a number for every candidate).

    Notes
    -----
    The score compares component indices with class labels directly, so it is
    only meaningful when component ``i`` corresponds to class ``i``.
    """
    # Start below any valid accuracy so that a candidate scoring exactly 0 is
    # still recorded instead of leaving the "best" parameters undefined.
    best_accu = -1.0
    best_prediction = []
    best_kmin = best_kmax = best_cov = None

    for cov in cov_types:
        for k1 in k_values:
            for k2 in k_values:
                if k2 < k1:
                    continue
                prediction = get_predictions_semi(path,k1,k2,num_class,cov,seed,labels)
                accu = cal_accuracy(labels,prediction)
                if accu > best_accu:
                    best_accu = accu
                    best_kmin = k1
                    best_kmax = k2
                    best_cov = cov
                    best_prediction = prediction

    if best_cov is None:
        raise ValueError("model_selection: no candidate model could be scored against the given labels")

    print('Best model has the following parameters:')
    print('minimum length of kmer: ', best_kmin)
    print('maximum length of kmer: ', best_kmax)
    print('covariance type: ', best_cov)
    print('It has an accuracy regard to known labels of ',best_accu)
    return best_prediction

In [51]:
def PCA_plot(x,y,n_dim,path,title):
    """Save a scatter plot of the first two principal components, coloured by label.

    Parameters
    ----------
    x : array-like or pandas.DataFrame
        Feature table, one row per sample.
    y : sequence
        One label per row of ``x``, matched by position.
    n_dim : int
        Number of principal components to compute; the first two are drawn.
    path : str
        Destination of the image file.
    title : str
        Title of the figure.

    Raises
    ------
    ValueError
        If fewer than two principal components are produced.

    Notes
    -----
    Labels 0 and 1 always appear in the legend (red and green). Any other
    label present in ``y`` is added after them with further colours.
    """
    # normalization of X is omitted, since it gives weird plots
    pca = PCA(n_components=n_dim)
    principalComponents = pca.fit_transform(x)
    n_found = principalComponents.shape[1]
    if n_found < 2:
        raise ValueError("PCA_plot needs at least two principal components to draw a 2-D scatter plot")

    # Name the columns after however many components were actually computed.
    component_names = ['principal component %d' % (idx + 1) for idx in range(n_found)]
    principalDf = pd.DataFrame(data=principalComponents, columns=component_names)
    # Attach labels by position so a Series with a custom index cannot misalign.
    finalDf = pd.concat([principalDf, pd.Series(np.asarray(y), name='target')], axis=1)

    other_labels = sorted(set(finalDf['target'].dropna().unique().tolist()) - {0, 1})
    targets = [0, 1] + other_labels
    palette = ['r', 'g', 'b', 'c', 'm', 'y', 'k']

    fig = plt.figure(figsize=(8,8))
    ax = fig.add_subplot(1,1,1)
    for position, target in enumerate(targets):
        indicesToKeep = finalDf['target'] == target
        ax.scatter(finalDf.loc[indicesToKeep, 'principal component 1'],
                   finalDf.loc[indicesToKeep, 'principal component 2'],
                   c=palette[position % len(palette)])
    ax.legend(targets)
    ax.set_xlabel('PCA Component 1', fontsize=15)
    ax.set_ylabel('PCA Component 2', fontsize=15)
    ax.set_title(title)
    fig.savefig(path)
    plt.close(fig)

In [52]:
def tsne_plot(x,y,path,title):
    """Save a 2-D t-SNE scatter plot of ``x`` coloured by ``y``.

    Parameters
    ----------
    x : array-like or pandas.DataFrame
        Feature table, one row per sample.
    y : sequence
        One label per row of ``x``; used as the hue of the points.
    path : str
        Destination of the image file.
    title : str
        Title of the figure.
    """
    tsne = TSNE(random_state=0)
    X_embedded = tsne.fit_transform(x)

    # Draw on a dedicated figure instead of whatever pyplot figure is current.
    fig, ax = plt.subplots()
    # x and y must be passed by keyword: recent seaborn releases no longer
    # accept them positionally.
    sns.scatterplot(x=X_embedded[:,0], y=X_embedded[:,1], hue=y, legend='full', ax=ax)
    ax.set_xlabel('Dimension 1')
    ax.set_ylabel('Dimension 2')
    ax.set_title(title)
    fig.savefig(path)
    plt.close(fig)

# Dataset Overview

In [7]:
# Combined FASTA file used throughout this notebook.
path = "../datasets/combined_Bat_Cat_flu.fa"
# read_fasta returns (sequences, lengths); the following cells summarise gene_len.
gene,gene_len = read_fasta(path)

In [8]:
max(gene_len)

2341

In [9]:
min(gene_len)

538

In [10]:
sum(gene_len)/len(gene_len)

1613.9943820224719

## Unsupervised

In [54]:
# Ground-truth labels, one per sequence: 0 for each bat record followed by 1 for each cat record.
# NOTE(review): assumes the combined FASTA lists every bat record before the cat records - confirm.
bat_len = len(get_gene_sequences("../datasets/bat_flu.fa"))
cat_len = len(get_gene_sequences("../datasets/cat_flu.fa"))
zeros = [0 for _ in range(bat_len)]
labels_all = np.append(zeros, [1 for _ in range(cat_len)])

In [55]:
# Configuration of the unsupervised baseline.
path = "../datasets/combined_Bat_Cat_flu.fa"
# Inclusive range of k-mer lengths used as features.
k_min = 2 
k_max = 6 
# Number of mixture components.
num_class = 2 
cov_type = 'full' 
# Fixed random_state for the mixture model.
seed = 0
predictions1 = get_predictions(path,k_min,k_max,num_class,cov_type,seed)

In [56]:
predictions1

array([0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0,
       1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1,
       1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1])

In [11]:
# Save the unsupervised cluster assignment of every sequence, numbered from 1.
number = [position + 1 for position in range(len(predictions1))]
rows = list(zip(number, predictions1))
df = pd.DataFrame(rows, columns=['Number', 'Labels'])
df.to_csv('predictions_unsup.csv', index=False)

In [12]:
# Save the ground-truth label of every sequence with the same 1-based numbering.
rows = list(zip(number, labels_all))
df2 = pd.DataFrame(rows, columns=['Number', 'Labels'])
df2.to_csv('actual_labels.csv', index=False)

In [13]:
cal_accuracy(labels_all,predictions1)

0.752808988764045

In [57]:
# Length-normalised table of all 2- to 6-mers, the same feature range as the unsupervised run above.
df_0 = get_kmer_table("../datasets/combined_Bat_Cat_flu.fa",2,6)
df_0

Unnamed: 0,aa,aaa,aaaa,aaaaa,aaaaaa,aaaaac,aaaaag,aaaaat,aaaac,aaaaca,...,ttttg,ttttga,ttttgc,ttttgg,ttttgt,ttttt,ttttta,tttttc,tttttg,tttttt
0,0.163382,0.059133,0.023215,0.006570,0.000876,0.001752,0.001752,0.002190,0.002190,0.000876,...,0.000438,0.000438,0.000000,0.000000,0.000000,0.000438,0.000000,0.000000,0.000438,0.000000
1,0.150594,0.057684,0.019815,0.005284,0.000881,0.001761,0.000881,0.001761,0.002642,0.001761,...,0.002202,0.000881,0.000440,0.000000,0.000881,0.001761,0.000440,0.000881,0.000000,0.000440
2,0.164332,0.063025,0.020542,0.007937,0.002801,0.001867,0.001401,0.001867,0.004669,0.002801,...,0.000934,0.000467,0.000000,0.000467,0.000000,0.000934,0.000000,0.000934,0.000000,0.000000
3,0.148280,0.058126,0.022539,0.006524,0.001779,0.001186,0.001186,0.002372,0.003559,0.003559,...,0.002966,0.000593,0.000593,0.001186,0.000593,0.001779,0.000000,0.001186,0.000593,0.000000
4,0.133869,0.048193,0.014726,0.002677,0.000000,0.000000,0.001339,0.001339,0.003347,0.001339,...,0.002008,0.002008,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,0.119607,0.036309,0.009825,0.002563,0.000427,0.000000,0.001282,0.000854,0.000854,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000854,0.000000,0.000854,0.000000,0.000000
174,0.119944,0.040446,0.012552,0.002325,0.000000,0.000465,0.001395,0.000465,0.001860,0.001395,...,0.001395,0.000465,0.000465,0.000000,0.000465,0.001860,0.000000,0.000930,0.000465,0.000465
175,0.113463,0.038801,0.013521,0.001176,0.000588,0.000588,0.000000,0.000000,0.002939,0.000588,...,0.002352,0.000000,0.001176,0.000588,0.000588,0.001176,0.000000,0.000000,0.001176,0.000000
176,0.091489,0.021277,0.004965,0.001418,0.000000,0.000000,0.000709,0.000709,0.000000,0.000000,...,0.000709,0.000000,0.000000,0.000000,0.000709,0.000709,0.000709,0.000000,0.000000,0.000000


In [67]:
# Project the 2- to 6-mer table onto two principal components and save them
# next to the unsupervised predictions.
x = df_0
pca = PCA(n_components=2)
principalComponents = pca.fit_transform(x)
principalDf = pd.DataFrame(data = principalComponents,columns = ['principal component 1', 'principal component 2'])
finalDf = pd.concat([principalDf, pd.Series(predictions1)], axis = 1)
# NOTE(review): the exported headers differ in capitalisation ('principal Component 1'
# vs 'Principal Component 2'); confirm what readers of the CSV expect before normalising.
finalDf.columns = ['principal Component 1', 'Principal Component 2','Predicted Label']
finalDf.to_csv('pca_unsup.csv',index = False)

In [68]:
# Same projection (principalDf comes from the previous cell), this time paired
# with the ground-truth labels.
finalDf = pd.concat([principalDf, pd.Series(labels_all)], axis = 1)
finalDf.columns = ['principal Component 1', 'Principal Component 2','True Label']
finalDf.to_csv('pca_labels.csv',index = False)

In [62]:
PCA_plot(df_0,predictions1,2,'figures/pca_0.png','PCA of Predictions')

In [63]:
PCA_plot(df_0,labels_all,2,'figures/pca_labels.png','PCA of All Labels')

In [18]:
tsne_plot(df_0,predictions1,'figures/tsne_0.png','TSNE of Predictions from Unsupervised Model')



In [19]:
tsne_plot(df_0,labels_all,'figures/tsne_labels.png','TSNE of All Labels')



# Semi-supervised 50%

In [20]:
# Partially known labels for the semi-supervised run; only the 'Labels' column is kept.
# NOTE(review): cal_accuracy skips entries equal to -1, so presumably -1 marks
# the unlabelled sequences in this CSV - confirm against the file.
labels_50 = pd.read_csv('../datasets/labels_fifty_percent.csv')
labels_50 = pd.Series(labels_50['Labels'])
path = "../datasets/combined_Bat_Cat_flu.fa"
num_class = 2

In [21]:
predictions_50_0 = model_selection(path,labels_50,num_class)

Best model has the following parameters:
minimum length of kmer:  2
maximum length of kmer:  3
covariance type:  tied
It has an accuracy regard to known labels of  1.0


In [22]:
cal_accuracy(labels_all,predictions_50_0)

0.8651685393258427

In [23]:
# The k-mer range 2-3 is copied by hand from the best model printed by
# model_selection above; update it if the selection changes.
df_50_0 = get_kmer_table("../datasets/combined_Bat_Cat_flu.fa",2,3)
tsne_plot(df_50_0,predictions_50_0,'figures/tsne_50_best.png','TSNE of Predictions from 50% Known Labels')



In [64]:
PCA_plot(df_50_0,predictions_50_0,2,'figures/pca_50_best.png','PCA of Predictions from 50% Known Labels')

In [59]:
# Project the 2- to 3-mer table onto two principal components and save them
# next to the predictions of the 50% semi-supervised model.
x = df_50_0
pca = PCA(n_components=2)
principalComponents = pca.fit_transform(x)
principalDf = pd.DataFrame(data = principalComponents,columns = ['principal component 1', 'principal component 2'])
finalDf = pd.concat([principalDf, pd.Series(predictions_50_0)], axis = 1)
# NOTE(review): header capitalisation is inconsistent ('principal Component 1'
# vs 'Principal Component 2'); confirm what readers of the CSV expect before normalising.
finalDf.columns = ['principal Component 1', 'Principal Component 2','Predicted Label']
finalDf.to_csv('pca_50%.csv',index = False)

In [25]:
# Save the predictions of the best 50% semi-supervised model, one numbered row per sequence.
rows = list(zip(number, predictions_50_0))
df3 = pd.DataFrame(rows, columns=['Number', 'Labels'])
df3.to_csv('predictions_50%_bestModel.csv', index=False)

In [28]:
tsne_plot(df_50_0,predictions_50_0,'figures/tsne_50.png','TSNE of Predictions from 50% Known Labels')



# Semi-supervised 10%

In [29]:
# Partially known labels for the 10% run; only the 'Labels' column is kept.
labels_10 = pd.read_csv('../datasets/labels_ten_percent.csv')
labels_10 = pd.Series(labels_10['Labels'])

In [30]:
# Select the best model on the 10% known labels (path and num_class come from
# earlier cells), then score it against the full ground truth.
predictions_10_0 = model_selection(path,labels_10,num_class)
cal_accuracy(labels_all,predictions_10_0)

Best model has the following parameters:
minimum length of kmer:  2
maximum length of kmer:  4
covariance type:  tied
It has an accuracy regard to known labels of  1.0


0.702247191011236

In [31]:
# The k-mer range 2-4 is copied by hand from the best model printed by
# model_selection above; update it if the selection changes.
df_10_0 = get_kmer_table("../datasets/combined_Bat_Cat_flu.fa",2,4)
tsne_plot(df_10_0,predictions_10_0,'figures/tsne_10_best.png','TSNE of Predictions from 10% Known Labels')
PCA_plot(df_10_0,predictions_10_0,2,'figures/pca_10_best.png','PCA of Predictions from 10% Known Labels')



In [60]:
# Project the 2- to 4-mer table onto two principal components and save them
# next to the predictions of the 10% semi-supervised model.
x = df_10_0
pca = PCA(n_components=2)
principalComponents = pca.fit_transform(x)
principalDf = pd.DataFrame(data = principalComponents,columns = ['principal component 1', 'principal component 2'])
finalDf = pd.concat([principalDf, pd.Series(predictions_10_0)], axis = 1)
# NOTE(review): header capitalisation is inconsistent ('principal Component 1'
# vs 'Principal Component 2'); confirm what readers of the CSV expect before normalising.
finalDf.columns = ['principal Component 1', 'Principal Component 2','Predicted Label']
finalDf.to_csv('pca_10%.csv',index = False)

In [33]:
# Save the predictions of the best 10% semi-supervised model, one numbered row per sequence.
rows = list(zip(number, predictions_10_0))
df4 = pd.DataFrame(rows, columns=['Number', 'Labels'])
df4.to_csv('predictions_10%_bestModel.csv', index=False)

# Semi-supervised 10% only 0s

In [34]:
# Partially known labels for the "only 0s" run; only the 'Labels' column is kept.
labels_10_0 = pd.read_csv('../datasets/labels_ten_percent_only0s.csv')
labels_10_0 = labels_10_0['Labels']

In [35]:
# Select the best model on the "only 0s" labels, then score it against the full ground truth.
# NOTE(review): get_predictions_semi seeds the component means only when every
# class has a known example; if this CSV really holds no class-1 labels the
# models here are fitted without using the labels - confirm.
predictions_10_only0 = model_selection(path,labels_10_0,num_class)
cal_accuracy(labels_all,predictions_10_only0)

Best model has the following parameters:
minimum length of kmer:  2
maximum length of kmer:  3
covariance type:  diag
It has an accuracy regard to known labels of  1.0


0.550561797752809

In [36]:
# The k-mer range 2-3 is copied by hand from the best model printed by
# model_selection above; update it if the selection changes.
df_10_only0 = get_kmer_table("../datasets/combined_Bat_Cat_flu.fa",2,3)
# NOTE(review): both titles read '10% Known Labels', identical to the plain 10%
# plots; confirm whether they should mention the only-0s setting.
tsne_plot(df_10_only0,predictions_10_only0,'figures/tsne_10_0_best.png','TSNE of Predictions from 10% Known Labels')
PCA_plot(df_10_only0,predictions_10_only0,2,'figures/pca_10_0_best.png','PCA of Predictions from 10% Known Labels')



In [38]:
# Save the predictions of the best "only 0s" model, one numbered row per sequence.
rows = list(zip(number, predictions_10_only0))
df5 = pd.DataFrame(rows, columns=['Number', 'Labels'])
df5.to_csv('predictions_10%_only0s_bestModel.csv', index=False)

In [61]:
# Project the 2- to 3-mer table onto two principal components and save them
# next to the predictions of the "only 0s" model.
x = df_10_only0
pca = PCA(n_components=2)
principalComponents = pca.fit_transform(x)
principalDf = pd.DataFrame(data = principalComponents,columns = ['principal component 1', 'principal component 2'])
finalDf = pd.concat([principalDf, pd.Series(predictions_10_only0)], axis = 1)
# NOTE(review): header capitalisation is inconsistent ('principal Component 1'
# vs 'Principal Component 2'); confirm what readers of the CSV expect before normalising.
finalDf.columns = ['principal Component 1', 'Principal Component 2','Predicted Label']
finalDf.to_csv('pca_10%_only0.csv',index = False)

In [40]:
predictions1

array([0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0,
       1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1,
       1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1])

In [41]:
cal_accuracy(labels_all,predictions1)

0.752808988764045

In [42]:
predictions_50_0

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1])

In [43]:
cal_accuracy(labels_all,predictions_50_0)

0.8651685393258427

In [44]:
predictions_10_0

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0,
       1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0,
       0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0,
       0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1])

In [45]:
cal_accuracy(labels_all,predictions_10_0)

0.702247191011236

In [46]:
predictions_10_only0

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0,
       1, 1])

In [47]:
cal_accuracy(labels_all,predictions_10_only0)

0.550561797752809