In [13]:
# Copyright 2020 by Luke Selberg, Solis-Lemus Lab, WID.
# All rights reserved.
# This file is part of the BioKlustering Website.

import pandas as pd
from Bio import SeqIO
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.cluster import MeanShift
from sklearn import preprocessing
import numpy as np
import os
from helpers import plotly_dash_show_plot

def parseFasta(data):
    """Read a FASTA source into a one-column DataFrame of sequences.

    Parameters
    ----------
    data :
        Whatever ``Bio.SeqIO.parse`` accepts as its first argument; it is
        parsed with the ``"fasta"`` format.

    Returns
    -------
    pandas.DataFrame
        A single ``Sequence`` column holding each record's sequence as a
        string, indexed by record id (index name ``ID``). Records are
        collected in a dict keyed by id, so records sharing an id collapse
        to one row and the last one read wins.
    """
    sequences_by_id = {record.id: str(record.seq) for record in SeqIO.parse(data, "fasta")}
    sequences = pd.Series(sequences_by_id, name='Sequence')
    sequences.index.name = 'ID'
    return pd.DataFrame(sequences)

def kmerXTable(s, a, b):
    """Build a TF-IDF weighted character k-mer table for a set of sequences.

    Parameters
    ----------
    s : pandas.DataFrame
        Must expose a ``Sequence`` column of strings; its index is reused as
        the index of the result.
    a, b : int
        Lower and upper k-mer length, forwarded as ``ngram_range=(a, b)``.

    Returns
    -------
    pandas.DataFrame
        One row per entry of ``s`` and one column per k-mer reported by the
        vectorizer.
    """
    tfid_vector = TfidfVectorizer(analyzer='char', ngram_range=(a, b))
    s_hat = tfid_vector.fit_transform(s.Sequence)
    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and only fall back on releases that predate it.
    if hasattr(tfid_vector, "get_feature_names_out"):
        kmerNames = tfid_vector.get_feature_names_out()
    else:
        kmerNames = tfid_vector.get_feature_names()
    return pd.DataFrame(s_hat.toarray(), columns=kmerNames, index=s.index)

# credit to chunrong
def read_fasta_sequences(sequence_paths):
    """Load the sequences of one FASTA source into a DataFrame.

    Despite the plural parameter name, ``sequence_paths`` is handed to
    ``parseFasta`` as a single input.

    Parameters
    ----------
    sequence_paths :
        A single FASTA input, passed unchanged to ``parseFasta``.

    Returns
    -------
    pandas.DataFrame
        The frame produced by ``parseFasta`` (a ``Sequence`` column indexed
        by ``ID``). A fresh frame is built on every call, so callers may
        modify it freely.
    """
    # The parsed frame is returned directly: concatenating it onto an empty
    # DataFrame added no rows and relied on pandas discarding the empty frame.
    return parseFasta(sequence_paths)


In [14]:
def get_unique_numbers(numbers):
    """Return the distinct values of ``numbers`` as a list.

    The values pass through a ``set``, so the order of the result follows
    set iteration order rather than the order of the input.
    """
    return list(set(numbers))

def kmeans_semiSupervised(userId, fasta, klength_min, klength_max, rNum, y_hat, method):
    """Label sequences by clustering them and voting with the known labels.

    Steps: read the FASTA input, strip ``-`` characters, build the TF-IDF
    k-mer table, normalize it, reduce it with PCA, find cluster centres with
    MeanShift, refine them with KMeans initialised at those centres, then
    give every cluster the known label that occurs most often inside it.
    Entries whose given label is ``-1`` receive their cluster's label; all
    other entries keep the label they were given.

    Parameters
    ----------
    userId :
        Forwarded to ``plotly_dash_show_plot``.
    fasta :
        FASTA input, forwarded to ``read_fasta_sequences``.
    klength_min, klength_max : int
        K-mer length range, forwarded to ``kmerXTable``.
    rNum :
        Used as ``random_state`` for PCA and KMeans so repeated runs agree
        (the sibling ``kmeans`` function seeds KMeans the same way).
    y_hat :
        Given labels, one per sequence, as an object with ``.tolist()``;
        ``-1`` marks an entry without a known label.
    method : str
        Forwarded to ``plotly_dash_show_plot``; when it equals ``"PCA"`` the
        k-mer table handed to the plot is normalized first.

    Returns
    -------
    list
        ``[[inputData], [plotly_div]]`` where ``inputData`` is the sequence
        frame with a leading ``Labels`` column.

    Raises
    ------
    ValueError
        If ``y_hat`` does not have one entry per sequence, or if it holds no
        label other than ``-1`` (there is then nothing to vote with).
    """
    import warnings

    inputData = read_fasta_sequences(fasta)
    inputData["Sequence"] = inputData["Sequence"].apply(lambda x: x.replace("-", ""))
    kmerXTableInput = kmerXTable(inputData, klength_min, klength_max)

    NkmerXTableInput = preprocessing.normalize(kmerXTableInput)
    # PCA cannot extract more components than min(n_samples, n_features), so
    # cap the usual 10 components for small inputs instead of crashing.
    n_components = min(10, *NkmerXTableInput.shape)
    PCAembedding = PCA(n_components=n_components, random_state=rNum)
    PCAembedding_low = PCAembedding.fit_transform(NkmerXTableInput)

    ms = MeanShift()
    ms.fit(PCAembedding_low)
    cluster_centers = ms.cluster_centers_

    # Warnings raised while fitting are silenced (presumably the notice that
    # an explicit init array forces a single initialisation - TODO confirm).
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        kmms = KMeans(init=cluster_centers, n_clusters=len(cluster_centers), random_state=rNum)
        kmms_labels = kmms.fit_predict(PCAembedding_low)

    actual_labels = y_hat.tolist()
    if len(actual_labels) != len(kmms_labels):
        raise ValueError(
            f"y_hat has {len(actual_labels)} labels but {len(kmms_labels)} sequences were read"
        )

    # For every given label, the cluster ids of the sequences carrying it.
    predicted = kmms_labels.tolist()
    newLabelsClusters = dict()
    for actual_label in get_unique_numbers(actual_labels):
        newLabelsClusters[actual_label] = [
            cluster for given, cluster in zip(actual_labels, predicted) if given == actual_label
        ]
    print(newLabelsClusters)

    known_labels = [key for key in newLabelsClusters if key != -1]
    if not known_labels:
        raise ValueError("y_hat contains no known labels (every entry is -1)")

    # Majority vote per cluster; on a tie (including a cluster with no
    # labelled member) the first known label in dict order wins.
    new_labels_dict = dict()
    for plabel in get_unique_numbers(kmms_labels):
        votes = {key: newLabelsClusters[key].count(plabel) for key in known_labels}
        new_labels_dict[plabel] = max(votes, key=votes.get)
    print(f"new_labels_dict: {new_labels_dict}")

    newLabels = []
    for i in range(len(kmms_labels)):
        if actual_labels[i] == -1:
            newLabels.append(new_labels_dict[kmms_labels[i]])
        else:
            newLabels.append(actual_labels[i])
    print(f"new labels: {newLabels}")

    plotly_kmertable = kmerXTableInput
    plotly_labels = np.array(newLabels)
    if method == "PCA":
        plotly_kmertable = preprocessing.normalize(kmerXTableInput)
    plotly_div = plotly_dash_show_plot(userId, plotly_kmertable, plotly_labels, "Semi-supervised Kmeans", method)

    inputData.insert(0, "Labels", newLabels)

    return [[inputData], [plotly_div]]

In [15]:
from numpy import genfromtxt
# Example run of kmeans_semiSupervised(userId, fasta, klength_min, klength_max, rNum, y_hat, method):
# the sequences come from sequence_59.fasta and the given labels from
# labels_59.csv (the function treats -1 as "no known label").
PATH01 = "sequence_59.fasta"
known_labels = genfromtxt('labels_59.csv', delimiter=',')
data = PATH01
labels_true = [-1] * 100
result = kmeans_semiSupervised(
    userId=1,
    fasta=PATH01,
    klength_min=6,
    klength_max=7,
    rNum=0,
    y_hat=known_labels,
    method="PCA",
)

{66.0: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0], 20.0: [0], 52.0: [0], 55.0: [0], 57.0: [0], 59.0: [1, 0, 3, 1, 1, 1, 1, 2, 3, 1, 1, 0, 1, 1, 1, 1], -1.0: [1, 1, 3, 1, 1, 2, 2, 1, 1, 1, 2, 1, 1, 1, 1, 3, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]}
new_labels_dict: {0: 66.0, 1: 59.0, 2: 59.0, 3: 59.0}
new labels: [59.0, 59.0, 59.0, 59.0, 59.0, 59.0, 59.0, 59.0, 59.0, 59.0, 59.0, 59.0, 59.0, 59.0, 59.0, 59.0, 59.0, 59.0, 59.0, 59.0, 59.0, 59.0, 59.0, 59.0, 59.0, 59.0, 59.0, 59.0, 59.0, 59.0, 59.0, 59.0, 59.0, 66.0, 66.0, 66.0, 66.0, 66.0, 66.0, 66.0, 66.0, 66.0, 66.0, 66.0, 66.0, 66.0, 66.0, 66.0, 66.0, 66.0, 66.0, 66.0, 66.0, 66.0, 66.0, 66.0, 66.0, 66.0, 66.0, 66.0, 66.0, 66.0, 66.0, 66.0, 66.0, 66.0, 66.0, 66.0, 66.0, 66.0, 66.0, 66.0, 66.0, 66.0, 66.0, 66.0, 66.0, 66.0, 59.0, 66.0, 66.0, 66.0, 66.0, 66.0, 66.0, 66.0, 66.0, 66.0, 66.0, 66.0, 66.0, 20.0, 55.0, 66.0, 66.0, 52.0, 

In [8]:
def kmeans(userId, fasta, klength_min, klength_max, rNum, cNum, method):
    """Cluster the sequences of a FASTA input with plain KMeans.

    Parameters
    ----------
    userId :
        Forwarded to ``plotly_dash_show_plot``.
    fasta :
        FASTA input, forwarded to ``read_fasta_sequences``.
    klength_min, klength_max : int
        K-mer length range, forwarded to ``kmerXTable``.
    rNum :
        ``random_state`` for KMeans.
    cNum : int
        Number of clusters.
    method : str
        Forwarded to ``plotly_dash_show_plot``; when it equals ``"PCA"`` the
        table handed to the plot is normalized first. The clustering itself
        always runs on the un-normalized k-mer table.

    Returns
    -------
    list
        ``[[inputData], [plot_div]]`` where ``inputData`` is the sequence
        frame with a leading ``Labels`` column of cluster ids.
    """
    inputData = read_fasta_sequences(fasta)
    inputData["Sequence"] = inputData["Sequence"].apply(lambda x: x.replace("-", ""))

    kmerXTableInput = kmerXTable(inputData, klength_min, klength_max)
    km = KMeans(random_state=rNum, n_clusters=cNum)
    km.fit(kmerXTableInput)
    y_hat = km.predict(kmerXTableInput)

    plotly_kmertable = kmerXTableInput
    if method == "PCA":
        plotly_kmertable = preprocessing.normalize(kmerXTableInput)
    plot_div = plotly_dash_show_plot(userId, plotly_kmertable, y_hat, "Unsupervised Kmeans", method)
    inputData.insert(0, "Labels", y_hat)

    return [[inputData], [plot_div]]

In [10]:
# Example run of the unsupervised variant on the same FASTA file, asking for four clusters.
result = kmeans(
    userId=1,
    fasta=PATH01,
    klength_min=6,
    klength_max=7,
    rNum=0,
    cNum=4,
    method="PCA",
)

sequence_59.fasta
plotly_kmertable:               aaaaaa   aaaaaaa   aaaaaag   aaaaaat    aaaaac   aaaaaca  \
ID                                                                       
OA979645.1  0.002543  0.000000  0.001297  0.001271  0.020343  0.010171   
OD912801.1  0.002540  0.000000  0.001296  0.001270  0.020324  0.010162   
FR998715.1  0.002755  0.000000  0.001405  0.001378  0.022041  0.011021   
OA966210.1  0.001861  0.000000  0.000949  0.000931  0.014891  0.007445   
OA979303.1  0.002540  0.000000  0.001296  0.001270  0.020322  0.010161   
...              ...       ...       ...       ...       ...       ...   
LR883375.1  0.001378  0.000000  0.000000  0.001378  0.020669  0.011023   
MT470109.1  0.040994  0.098533  0.001394  0.001366  0.021863  0.010932   
MZ332124.1  0.002758  0.000000  0.001407  0.001379  0.022063  0.011031   
MT956916.1  0.041009  0.098569  0.001394  0.001367  0.021871  0.010936   
LC632047.1  0.041002  0.098553  0.001394  0.001367  0.021868  0.010934   

 

In [12]:
# import packages
import numpy as np
import pandas as pd
from Bio import SeqIO, AlignIO
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.mixture import GaussianMixture as GMM
import os

from helpers import *


def parseFasta(data):
    """Parse a FASTA source into a DataFrame with one ``Sequence`` column.

    Parameters
    ----------
    data :
        First argument for ``Bio.SeqIO.parse``, read with the ``"fasta"``
        format.

    Returns
    -------
    pandas.DataFrame
        Sequences as strings in a ``Sequence`` column, indexed by record id
        under the index name ``ID``. Because the records are first gathered
        in a dict keyed by id, a repeated id keeps only its last sequence.
    """
    id_to_sequence = {record.id: str(record.seq) for record in SeqIO.parse(data, "fasta")}
    sequence_series = pd.Series(id_to_sequence, name='Sequence')
    sequence_series.index.name = 'ID'
    return pd.DataFrame(sequence_series)


def get_kmer_table(path, k_min, k_max):
    """Build a length-normalized k-mer count table for a FASTA input.

    Parameters
    ----------
    path :
        FASTA input, forwarded to ``read_fasta``.
    k_min, k_max : int
        K-mer length range, forwarded as ``ngram_range=(k_min, k_max)``.

    Returns
    -------
    tuple
        ``(kmer_table, output_df)``: ``kmer_table`` is a DataFrame with one
        row per gene and one column per k-mer, each count divided by the
        length of its gene; ``output_df`` is the frame returned by
        ``read_fasta``.
    """
    genes, gene_len, output_df = read_fasta(path)
    count_vect = CountVectorizer(analyzer='char', ngram_range=(k_min, k_max))
    X = count_vect.fit_transform(genes)
    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and only fall back on releases that predate it.
    if hasattr(count_vect, "get_feature_names_out"):
        chars = count_vect.get_feature_names_out()
    else:
        chars = count_vect.get_feature_names()
    kmers = X.toarray()
    # Divide every row of counts by that gene's length in one broadcast step
    # (pd.concat cannot join a plain list with an ndarray, so the previous
    # row-by-row accumulation raised TypeError).
    kmer_freq = kmers / np.asarray(gene_len)[:, None]
    kmer_table = pd.DataFrame(kmer_freq, columns=chars)
    return kmer_table, output_df


def get_gene_sequences(filename):
    """Return the sequence of every FASTA record in ``filename`` as a string.

    Records are kept in the order ``SeqIO.parse`` yields them; unlike
    ``parseFasta`` nothing is keyed by id, so no record is dropped.
    """
    return [str(entry.seq) for entry in SeqIO.parse(filename, "fasta")]


# genes: a list of gene sequences, which can directly be generated from get_gene_sequences().
def get_gene_len(genes):
    """Return the length of each entry of ``genes``, in order.

    ``genes`` is accessed by position (``genes[0]`` .. ``genes[len - 1]``).
    """
    return [len(genes[position]) for position in range(len(genes))]


def read_fasta(paths):
    """Read one or more FASTA inputs into gene lists and an id-indexed frame.

    Parameters
    ----------
    paths :
        Either a single FASTA input (anything ``parseFasta`` accepts) or a
        list/tuple of such inputs. A single input is treated as a one-element
        list.

    Returns
    -------
    tuple
        ``(all_genes, all_gene_len, output_df)``: the sequence strings of
        every record, their lengths, and the concatenated ``parseFasta``
        frames (an empty DataFrame when no input is given).
    """
    if isinstance(paths, (list, tuple)):
        path_list = list(paths)
    else:
        path_list = [paths]

    all_genes = []
    all_gene_len = []
    frames = []
    for path in path_list:
        frames.append(parseFasta(path))
        genes_seq = get_gene_sequences(path)
        all_genes = all_genes + genes_seq
        all_gene_len = all_gene_len + get_gene_len(genes_seq)

    # NOTE(review): parseFasta keys records by id while get_gene_sequences
    # keeps every record, so output_df can have fewer rows than all_genes
    # when an input repeats an id - confirm inputs have unique ids.
    output_df = pd.concat(frames) if frames else pd.DataFrame()
    return all_genes, all_gene_len, output_df


def get_predictions(userId, path, k_min, k_max, num_class, cov_type, seed, method):
    """Cluster the genes of a FASTA input with a Gaussian mixture model.

    The k-mer table from ``get_kmer_table`` is fitted with ``num_class``
    components using covariance type ``cov_type`` and random state ``seed``.
    The component assigned to each row is plotted through
    ``plotly_dash_show_plot`` and inserted into the output frame as a leading
    ``Labels`` column.

    Returns ``[[output_df], [plot_div]]``.
    """
    kmer_table, output_df = get_kmer_table(path, k_min, k_max)

    mixture = GMM(n_components=num_class, covariance_type=cov_type, random_state=seed)
    mixture = mixture.fit(kmer_table)
    component_ids = mixture.predict(kmer_table)

    figure_div = plotly_dash_show_plot(
        userId, kmer_table, component_ids, "Unsupervised Gaussian Mixture Model", method
    )
    output_df.insert(0, "Labels", component_ids)
    return [[output_df], [figure_div]]


# modified for website
def get_predictions_semi(userId, path, k_min, k_max, num_class, cov_type, seed, labels, method):
    """Semi-supervised Gaussian mixture labelling of the genes in a FASTA input.

    A GMM with ``num_class`` components is fitted on the k-mer table. When
    every class id ``0 .. num_class - 1`` occurs among the given labels, the
    component means are initialised with the mean k-mer profile of each
    class. Predicted components are then paired with the given labels by
    rank of frequency (largest component with most frequent label, and so
    on). Entries labelled ``-1`` receive their component's paired label; all
    others keep their given label.

    Parameters
    ----------
    userId :
        Forwarded to ``plotly_dash_show_plot`` and ``update_parameters``.
    path :
        FASTA input, forwarded to ``get_kmer_table``.
    k_min, k_max : int
        K-mer length range.
    num_class : int
        Number of mixture components.
    cov_type : str
        GMM ``covariance_type``.
    seed :
        GMM ``random_state``.
    labels :
        Given labels, one per gene. The code concatenates it column-wise,
        reads it back as column ``0`` and indexes it with ``labels[i]``, so
        it expects a pandas Series named ``0`` with a default integer index.
    method : str
        Forwarded to ``plotly_dash_show_plot``.

    Returns
    -------
    list
        ``[[output_df], [plot_div]]`` where ``output_df`` carries a leading
        ``Labels`` column.

    Raises
    ------
    IndexError
        If there are more predicted components than distinct known labels,
        because a component is then left without a label to pair with.
    """
    kmer_table, output_df = get_kmer_table(path, k_min, k_max)

    finalDf = pd.concat([kmer_table, labels], axis=1)
    gmm = GMM(n_components=num_class, covariance_type=cov_type, random_state=seed)
    print(finalDf[0])

    # Class ids that actually occur among the given labels. The membership
    # set is built once instead of rebuilding a list for every class id.
    present_labels = set(finalDf[0])
    targets = [i for i in range(num_class) if i in present_labels]
    if len(targets) == num_class:
        gmm.means_init = np.array([kmer_table[finalDf[0] == i].mean(axis=0) for i in targets])
    gmm.fit(kmer_table)
    predictions = gmm.predict(kmer_table)
    print(type(predictions))

    labels_list = list(labels)
    given_labels_count = {label: labels_list.count(label) for label in get_unique_numbers(labels)}
    predicted_labels_count = {
        label: (predictions == label).sum() for label in get_unique_numbers(predictions)
    }
    # -1 marks "unknown" and must not take part in the pairing; it may be
    # absent altogether when every entry is labelled.
    given_labels_count.pop(-1, None)
    given_labels_count = sorted(given_labels_count.items(), key=lambda x: x[1], reverse=True)
    predicted_labels_count = sorted(predicted_labels_count.items(), key=lambda x: x[1], reverse=True)

    # Pair components and labels by frequency rank.
    map_predict_to_actual = {}
    for i in range(len(predicted_labels_count)):
        map_predict_to_actual[predicted_labels_count[i][0]] = given_labels_count[i][0]

    predictions_final = []
    for i in range(len(predictions)):
        if labels[i] == -1:
            predictions_final.append(map_predict_to_actual[predictions[i]])
        else:
            predictions_final.append(labels[i])
    predictions = np.array(predictions_final)

    plot_div = plotly_dash_show_plot(userId, kmer_table, predictions, "Semi-supervised Gaussian Mixture Model", method)
    output_df.insert(0, "Labels", predictions)
    # update parameters for predictInfo object (i.e. for front end)
    acc = cal_accuracy(labels, predictions_final)
    update_parameters(userId, {"accuracy": acc})
    return [[output_df], [plot_div]]


def get_unique_numbers(numbers):
    """Collect each distinct value of ``numbers`` exactly once.

    Returns a list whose order is the iteration order of ``set(numbers)``,
    not the order in which values appear in the input.
    """
    return [value for value in set(numbers)]

def cal_accuracy(labels, predictions):
    """Fraction of known labels that the predictions reproduce.

    Entries whose label is ``-1`` are treated as unknown and left out of
    both the numerator and the denominator.

    Parameters
    ----------
    labels, predictions :
        Position-indexable collections; ``predictions[i]`` is compared with
        ``labels[i]`` for every ``i`` in ``range(len(labels))``.

    Returns
    -------
    float
        ``1 - errors / known``. When there is no known label at all (empty
        input or only ``-1``) the accuracy is undefined and ``0.0`` is
        returned instead of dividing by zero.
    """
    known = 0
    errors = 0
    for i in range(len(labels)):
        if labels[i] == -1:
            continue
        known += 1
        if labels[i] != predictions[i]:
            errors += 1

    if known == 0:
        return 0.0
    return 1 - errors / known


def get_predictions_semi_original(path, k_min, k_max, num_class, cov_type, seed, labels):
    """Fit a GMM on the k-mer table, seeding its means from labelled genes.

    Parameters
    ----------
    path :
        FASTA input, forwarded to ``get_kmer_table``.
    k_min, k_max : int
        K-mer length range.
    num_class : int
        Number of mixture components; class ids are ``0 .. num_class - 1``.
    cov_type : str
        GMM ``covariance_type``.
    seed :
        GMM ``random_state``.
    labels :
        Given labels, one per row of the k-mer table, in row order.

    Returns
    -------
    numpy.ndarray
        The component predicted for every row of the k-mer table.
    """
    kmer_table, _ = get_kmer_table(path, k_min, k_max)
    # Compare against the raw label values: the concatenated frame never had
    # a column called "Labels", so attribute access on it could not work.
    label_values = np.asarray(labels)
    gmm = GMM(n_components=num_class, covariance_type=cov_type, random_state=seed)

    classes = range(num_class)
    # Seed the means only when every class has at least one labelled row;
    # otherwise the per-class mean would be NaN. This mirrors the guard used
    # by get_predictions_semi.
    if all((label_values == i).any() for i in classes):
        gmm.means_init = np.array([kmer_table[label_values == i].mean(axis=0) for i in classes])
    gmm.fit(kmer_table)
    return gmm.predict(kmer_table)


def model_selection(userId, path, labels, num_class, seed, method):
    """Grid-search GMM settings, then run the semi-supervised prediction.

    Every covariance type in ``full/diag/tied/spherical`` is combined with
    every k-mer range ``k_min in {2, 3, 4}``, ``k_max in {2, 3, 4, 5}`` where
    ``k_max >= k_min``. Each combination is scored with ``cal_accuracy`` on
    the output of ``get_predictions_semi_original``. The first combination
    reaching the highest score is kept, reported through
    ``update_parameters`` and used for the final ``get_predictions_semi``
    call.

    Parameters
    ----------
    userId :
        Forwarded to ``update_parameters`` and ``get_predictions_semi``.
    path :
        FASTA input.
    labels :
        Given labels; ``-1`` entries are ignored by ``cal_accuracy``.
    num_class : int
        Number of mixture components.
    seed :
        Random state used both during the search and for the final fit, so
        the reported accuracy belongs to the same seed as the returned model.
    method : str
        Forwarded to ``get_predictions_semi``.

    Returns
    -------
    list
        Whatever ``get_predictions_semi`` returns for the selected settings.
    """
    cov_type = ['full', 'diag', 'tied', 'spherical']
    k_min = [2, 3, 4]
    k_max = [2, 3, 4, 5]

    # None means "nothing scored yet", so the first combination is always
    # recorded, even when its accuracy is 0.
    best_accu = None
    best_kmin = best_kmax = best_cov = None
    for cov in cov_type:
        for k1 in k_min:
            for k2 in k_max:
                if k2 < k1:
                    continue
                prediction = get_predictions_semi_original(path, k1, k2, num_class, cov, seed, labels)
                accu = cal_accuracy(labels, prediction)
                if best_accu is None or accu > best_accu:
                    best_accu = accu
                    best_kmin = k1
                    best_kmax = k2
                    best_cov = cov

    # update parameters for predictInfo object (i.e. for front end)
    new_params = {
        'k_min': best_kmin,
        'k_max': best_kmax,
        'cov_type': best_cov,
        'accuracy': best_accu
    }
    update_parameters(userId, new_params)
    return get_predictions_semi(userId, path, best_kmin, best_kmax, num_class, best_cov, seed, labels, method)


# Example run of get_predictions_semi(userId, path, k_min, k_max, num_class, cov_type, seed, labels, method)
# on sequence_59.fasta, with the given labels read from the first column of labels_59.csv.
k_min, k_max = 2, 3
num_class = 2
cov_type = 'full'
seed = 0
PATH01 = "sequence_59.fasta"
labels_50 = pd.read_csv('labels_59.csv', delimiter=',', header=None)
labels_50 = pd.Series(labels_50[0])
# The function returns [[output_df], [plot_div]]; unpack the single item of each inner list.
[labels], [plot] = get_predictions_semi(
    userId=0,
    path=PATH01,
    k_min=k_min,
    k_max=k_max,
    num_class=num_class,
    cov_type=cov_type,
    seed=seed,
    labels=labels_50,
    method='PCA',
)
print(labels)


0     59
1     -1
2     59
3     59
4     -1
      ..
95    52
96    -1
97    -1
98    57
99    -1
Name: 0, Length: 100, dtype: int64
<class 'numpy.ndarray'>


NameError: name 'PredictInfo' is not defined