In [50]:
import pandas as pd
from Bio import SeqIO
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.cluster import MeanShift
from sklearn import preprocessing 
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from mpl_toolkits.mplot3d import Axes3D
import numpy as np
import copy
from IPython.display import Image



### 
### This includes code copied and pasted from the main methods used for the website in BioKlustering-Website/BioKlustering/mlmodel/parser/kmeans.py
### These methods are copy-pasted instead of directly included due to difficulties importing Django classes for running locally without running the server
###

def parseFasta(data):
    d = {fasta.id : str(fasta.seq) for fasta in SeqIO.parse(data, "fasta")}
    pd.DataFrame([d])

    s = pd.Series(d, name='Sequence')
    s.index.name = 'ID'
    s.reset_index()
    return pd.DataFrame(s)

def kmerXTable(s, a, b):
    tfid_vector = TfidfVectorizer(analyzer='char', ngram_range=(a,b))
    s_hat = tfid_vector.fit_transform(s.Sequence)
    kmerNames = tfid_vector.get_feature_names_out()
    kmers = s_hat.toarray()
    return pd.DataFrame(kmers,columns=kmerNames, index = s.index)
    
def kmeans(fasta, cNum, klength_min = 6, klength_max = 6, rNum = 50, seed=None):
    inputData = parseFasta(fasta)
#     temp = virus01.append(inputData)
#     temp = temp.drop_duplicates(keep="last")
        
    inputData["Sequence"] = inputData["Sequence"].apply(lambda x: x.replace("-", ""))
    kmerXTableInput = kmerXTable(inputData, klength_min, klength_max)
        
        
    #km = KMeans(random_state = rNum, n_clusters = cNum)
    #m.fit(kmerXTableInput) 
    #y_hat = km.predict(kmerXTableInput)
    PCAembedding = PCA(n_components=10)
    NkmerXTableInput = preprocessing.normalize(kmerXTableInput)
    PCAembedding_low = PCAembedding.fit_transform(NkmerXTableInput)
    
    np.random.seed(rNum)
    ms = MeanShift()
    cluster_centers = ms.cluster_centers_

    n_cluster_centers = len(cluster_centers)

    import warnings
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        kmms = KMeans(init = cluster_centers, n_clusters = n_cluster_centers, random_state=seed)
        #kmms = KMeans(init = 'k-means++', n_clusters = 2, n_init=20, max_iter=600)
        y_hat = kmms.fit_predict(PCAembedding_low)

    if n_cluster_centers > cNum:
        res = y_hat
        unique_predicted_labels = get_unique_numbers(res)
        predicted_labels_count = {}
        for label in unique_predicted_labels:
            predicted_labels_count[label] = (res == label).sum()
        max_item = max(predicted_labels_count, key=predicted_labels_count.get)
        predicted_labels_count = sorted(predicted_labels_count.items(), key=lambda x: x[1], reverse=True)
        map_predict_to_actual = {}
        max_value = cNum-1
        for i in range(len(predicted_labels_count)):
            if i < max_value:
                map_predict_to_actual[predicted_labels_count[i][0]] = i
            else:
                print(f"{predicted_labels_count[i][0]} mapped to {max_value}")
                map_predict_to_actual[predicted_labels_count[i][0]] = max_value

        # predictions_final contains the final results
        # it takes care of the case when num_class > number of unique labels given
        predictions_final = []
        print(f"map_predict_to_actual: {map_predict_to_actual}")
        for i in range(len(res)):
            if res[i] in map_predict_to_actual.keys():
                predictions_final.append(map_predict_to_actual[res[i]])
            else:
                predictions_final.append(map_predict_to_actual[max_item])
        print(predictions_final)
        y_hat = np.array(predictions_final)

        
    return y_hat, kmerXTableInput
        
def kmeans_semiSupervised(fasta, y_hat, klength_min = 6, klength_max = 6, rNum = 50, cNum=2, seed=None, verbose=False):
    inputData = parseFasta(fasta)
    inputData["Sequence"] = inputData["Sequence"].apply(lambda x: x.replace("-", ""))
    kmerXTableInput = kmerXTable(inputData, klength_min, klength_max)
    
    PCAembedding = PCA(n_components=10)
    NkmerXTableInput = preprocessing.normalize(kmerXTableInput)
    PCAembedding_low = PCAembedding.fit_transform(NkmerXTableInput)
    
    np.random.seed(rNum)
    ms = MeanShift()
    ms.fit(PCAembedding_low)
    cluster_centers = ms.cluster_centers_

    n_cluster_centers = len(cluster_centers)
    
    print("n cluster centers", n_cluster_centers)

    import warnings
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        kmms = KMeans(init = cluster_centers, n_clusters = n_cluster_centers, random_state= seed)
        kmms_labels = kmms.fit_predict(PCAembedding_low)

    if n_cluster_centers > cNum:
        res = kmms_labels
        unique_predicted_labels = get_unique_numbers(res)
        predicted_labels_count = {}
        for label in unique_predicted_labels:
            predicted_labels_count[label] = (res == label).sum()
        max_item = max(predicted_labels_count, key=predicted_labels_count.get)
        predicted_labels_count = sorted(predicted_labels_count.items(), key=lambda x: x[1], reverse=True)
        map_predict_to_actual = {}
        max_value = cNum-1
        for i in range(len(predicted_labels_count)):
            if i < max_value:
                map_predict_to_actual[predicted_labels_count[i][0]] = i
            else:
                if verbose:
                    print(f"{predicted_labels_count[i][0]} mapped to {max_value}")
                map_predict_to_actual[predicted_labels_count[i][0]] = max_value

        # predictions_final contains the final results
        # it takes care of the case when num_class > number of unique labels given
        predictions_final = []
        if verbose:
            print(f"map_predict_to_actual: {map_predict_to_actual}")
        for i in range(len(res)):
            if res[i] in map_predict_to_actual.keys():
                predictions_final.append(map_predict_to_actual[res[i]])
            else:
                predictions_final.append(map_predict_to_actual[max_item])
        kmms_labels = np.array(predictions_final)

     # convert all clusters into two clusters
    kmerXTableInput["pLabels"] = kmms_labels
    kmerXTableInput["aLabels"] = actual_labels = y_hat

    # Get the counts for the given labels and the predicted labels
    given_labels_count = {}
    labels_list = list(actual_labels)
    unique_given_labels = get_unique_numbers(actual_labels)
    predictions = kmms_labels
    for label in unique_given_labels:
        given_labels_count[label] = labels_list.count(label)
    unique_predicted_labels = get_unique_numbers(predictions)
    predicted_labels_count = {}
    for label in unique_predicted_labels:
        predicted_labels_count[label] = (predictions == label).sum()
    max_item = max(predicted_labels_count, key=predicted_labels_count.get)
    if -1 in given_labels_count.keys():
        del given_labels_count[-1]
    given_labels_count = sorted(given_labels_count.items(), key=lambda x: x[1], reverse=True)
    predicted_labels_count = sorted(predicted_labels_count.items(), key=lambda x: x[1], reverse=True)

    res = np.array(predictions)
    
    # Map the predicted labels to the given/actual labels
    unselected_given = copy.deepcopy(unique_given_labels)
    if -1 in unselected_given:
        unselected_given.remove(-1)
    unselected_pred = copy.deepcopy(unique_predicted_labels)
    
    print("unselected pred ", unselected_pred)
    print("unselected given ", unselected_given)

    # Map the predicted labels to the given/actual labels
    map_predict_to_actual = {}
    for label_GIVEN_dict_entry in given_labels_count:
        label_GIVEN = label_GIVEN_dict_entry[0]
        predicted_labels_count_GIVEN = {}
        label_GIVEN_idx = [index for (index, item) in enumerate(labels_list) if item == label_GIVEN]
        res_GIVEN = [res[i] for i in label_GIVEN_idx]
        unique_predicted_labels_GIVEN = list(set(get_unique_numbers(res_GIVEN)) & set(unselected_pred))
        if len(unique_predicted_labels_GIVEN) == 0:
                        continue
        for lab in unique_predicted_labels_GIVEN:
            predicted_labels_count_GIVEN[lab] = (res_GIVEN == lab).sum()
        max_pred = max(predicted_labels_count_GIVEN, key=predicted_labels_count_GIVEN.get)
        map_predict_to_actual[max_pred] = label_GIVEN
        unselected_given.remove(label_GIVEN)
        unselected_pred.remove(max_pred)
            
    # in the case where multiple given labels completely map to the same 
    # predicted label, we need to finish assigning given labels to any 
    # predicted label
    for lab_remain in unselected_given:
        for upl in unique_predicted_labels:
            if upl not in map_predict_to_actual.keys():
                map_predict_to_actual[upl] = lab_remain
                unselected_given.remove(lab_remain)
                unselected_pred.remove(upl)
                break
    
    if len(unique_given_labels) <= cNum:
        max_value = max(unique_given_labels) + 1
        for upl in unique_predicted_labels:
            if upl not in map_predict_to_actual.keys():
                # print(f"{upl} mapped to {max_value}")
                map_predict_to_actual[upl] = max_value
                max_value += 1
                unselected_pred.remove(upl)
                
    print("unselected pred ", unselected_pred)
    print("unselected given ", unselected_given)
                
            
    print(f"map_predict_to_actual: {map_predict_to_actual}")
    if len(unselected_given) != len(unselected_pred):
        print("error: num unselected given =",len(unselected_given), "!= unselected pred =",len(unselected_pred))
        
    print(f"map_predict_to_actual: {map_predict_to_actual}")
    
    for l in range(min(len(unselected_pred),len(unselected_given))):
        map_predict_to_actual[unselected_pred[l]] = unselected_given[l]
        unselected_pred.remove(unselected_pred[l])
        unselected_given.remove(unselected_pred[l])
        
    # this should never happen
    #if len(unselected_pred) > 0 and len(unselected_given) == 0:
    #    for unsel in unselected_pred:
    #        map_predict_to_actual[unsel] = max_value
            
    # this can happen if there are fewer predicted labels than given
    #if len(unselected_pred) == 0 and len(unselected_given) > 0:
    #    for unsel in unselected_given:
    #        map_predict_to_actual[-1] = unsel
    
    predictions_final = []
    predictions_tmp = []

    # predictions_final contains the final results
    # it takes care of the case when num_class > number of unique labels given
    for i in range(len(predictions)):
        if actual_labels[i] == -1:
            if predictions[i] in map_predict_to_actual.keys():
                predictions_final.append(map_predict_to_actual[predictions[i]])
                predictions_tmp.append(map_predict_to_actual[predictions[i]])
            else:
                predictions_final.append(map_predict_to_actual[max_item])
                predictions_tmp.append(map_predict_to_actual[max_item])
        else:
            predictions_tmp.append(map_predict_to_actual[predictions[i]])
            predictions_final.append(actual_labels[i])
    
    newLabels = predictions_final

    # get accuracy with regard to known labels

    unknown_label = -1
    total_labeled = 0
    for i in range(len(actual_labels)):
        if actual_labels[i] != unknown_label:
            total_labeled = total_labeled + 1

    correct_count = 0
    temp_accuracy = 0
    for k in range(len(actual_labels)):
        if (actual_labels[k] != unknown_label):
            if (actual_labels[k] == predictions_tmp[k]):
                correct_count += 1
    temp_accuracy = correct_count / total_labeled
    
    return newLabels, kmerXTableInput.drop(columns=["pLabels", "aLabels"]), kmms_labels
    

# this method credit to Zhiwen
def get_gene_sequences(filename):
    genes = []
    for record in SeqIO.parse(filename, "fasta"):
        genes.append(str(record.seq))
    return genes

# added helper method for semi-supervised labeling
def get_unique_numbers(numbers):

    list_of_unique_numbers = []

    unique_numbers = set(numbers)

    for number in unique_numbers:
        list_of_unique_numbers.append(number)

    return list_of_unique_numbers

# Nucleotides

In [51]:
fasta = "../combined_nucleotide.fasta"
labels_nuc = pd.Series(pd.read_csv("../combined_labels_nucleotide.csv").Labels)
k_min = 5
k_max = 6
cNum = 2
seed = 32624222

predictions_nuc, kmers, old_labels = kmeans_semiSupervised(fasta, labels_nuc, k_min, k_max, cNum=cNum, seed=seed)

inputData = parseFasta(fasta)
inputData["Sequence"] = inputData["Sequence"].apply(lambda x: x.replace("-", ""))
kmerXTableInput = kmerXTable(inputData, k_min, k_max)

PCAembedding = PCA(n_components=2)
NkmerXTableInput = preprocessing.normalize(kmerXTableInput)
PCAembedding_low = PCAembedding.fit_transform(NkmerXTableInput)

predictions_nuc = pd.Series(predictions_nuc)

principalDf = pd.DataFrame(data = PCAembedding_low,columns = ['principal component 1', 'principal component 2'])
finalDf = pd.concat([pd.Series(kmers.index), principalDf, predictions_nuc], axis = 1)
finalDf.columns = ['ID','principal Component 1', 'principal Component 2','label']
finalDf.to_csv('../results/nuc_kmeans_pca.csv',index = False)

n cluster centers 2
unselected pred  [0, 1]
unselected given  [0, 1]
unselected pred  []
unselected given  []
map_predict_to_actual: {0: 0, 1: 1}
map_predict_to_actual: {0: 0, 1: 1}


In [52]:
n_unlab = sum(labels_nuc == -1)

print("Ones: ", sum(predictions_nuc), " out of ", n_unlab)
print("Zeros: ", sum(1- predictions_nuc), " out of ", n_unlab)

Ones:  25  out of  350
Zeros:  341  out of  350


# Amino Acids

In [53]:
fasta = "../combined_amino.fasta"
labels_am = pd.Series(pd.read_csv("../combined_labels_amino.csv").Labels)
k_min = 5
k_max = 6
cNum = 2
seed = 32624

predictions_am, kmers, old_labels = kmeans_semiSupervised(fasta, labels_am, k_min, k_max, cNum=cNum, seed=seed)

inputData = parseFasta(fasta)
inputData["Sequence"] = inputData["Sequence"].apply(lambda x: x.replace("-", ""))
kmerXTableInput = kmerXTable(inputData, k_min, k_max)

PCAembedding = PCA(n_components=2)
NkmerXTableInput = preprocessing.normalize(kmerXTableInput)
PCAembedding_low = PCAembedding.fit_transform(NkmerXTableInput)

predictions_am= pd.Series(predictions_am)

principalDf = pd.DataFrame(data = PCAembedding_low,columns = ['principal component 1', 'principal component 2'])
finalDf = pd.concat([pd.Series(kmers.index), principalDf, predictions_am], axis = 1)
finalDf.columns = ['ID','principal Component 1', 'principal Component 2','label']
finalDf.to_csv('../results/am_kmeans_pca.csv',index = False)

n cluster centers 27
unselected pred  [0, 1]
unselected given  [0, 1]
unselected pred  []
unselected given  []
map_predict_to_actual: {1: 0, 0: 1}
map_predict_to_actual: {1: 0, 0: 1}


In [54]:
n_unlab = sum(labels_am == -1)

print("Ones: ", sum(predictions_am), " out of ", n_unlab)
print("Zeros: ", sum(1- predictions_am), " out of ", n_unlab)

Ones:  423  out of  465
Zeros:  58  out of  465
