In [104]:
import pandas as pd
from Bio import SeqIO
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.cluster import MeanShift
from sklearn import preprocessing 
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from mpl_toolkits.mplot3d import Axes3D
import numpy as np
from IPython.display import Image
import copy


### 
### This includes code copied and pasted from the main methods used for the website in BioKlustering-Website/BioKlustering/mlmodel/parser/kmeans.py
### These methods are copy-pasted instead of directly included due to difficulties importing Django classes for running locally without running the server
###

def parseFasta(data):
    """Read a FASTA source into a one-column DataFrame of sequences.

    Parameters
    ----------
    data : anything ``SeqIO.parse`` accepts as its first argument
        (the notebook passes a file path).

    Returns
    -------
    pandas.DataFrame
        A single column named ``Sequence`` holding each record's sequence as a
        plain string, indexed by record id (index name ``ID``).

    Notes
    -----
    Records are collected in a dict keyed by ``fasta.id``, so when an id occurs
    more than once only the last record with that id is kept.
    """
    d = {fasta.id: str(fasta.seq) for fasta in SeqIO.parse(data, "fasta")}

    # The original also built `pd.DataFrame([d])` and called `s.reset_index()`
    # without using either result; both were no-ops and have been dropped.
    s = pd.Series(d, name='Sequence')
    s.index.name = 'ID'
    return pd.DataFrame(s)

def kmerXTable(s, a, b):
    """Build a TF-IDF table of character n-grams for every sequence.

    Parameters
    ----------
    s : DataFrame with a ``Sequence`` column; its index is reused for the result.
    a, b : lower and upper n-gram length passed as ``ngram_range=(a, b)``.

    Returns
    -------
    pandas.DataFrame
        One row per sequence (same index as ``s``), one column per n-gram
        reported by the vectorizer, holding the dense TF-IDF weights.
    """
    vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(a, b))
    weights = vectorizer.fit_transform(s.Sequence)
    return pd.DataFrame(
        weights.toarray(),
        index=s.index,
        columns=vectorizer.get_feature_names_out(),
    )
    
def kmeans(fasta, cNum, klength_min = 6, klength_max = 6, rNum = 50):
    """Cluster the sequences of a FASTA source without any labels.

    Steps: parse the FASTA input, strip ``-`` gap characters, build the
    character n-gram TF-IDF table, normalise it with
    ``preprocessing.normalize``, reduce it with PCA, let MeanShift propose
    cluster centres, and run KMeans initialised at those centres.  If that
    yields more than ``cNum`` clusters, the ``cNum - 1`` largest clusters keep
    ids ``0 .. cNum - 2`` (largest first) and every remaining cluster is merged
    into id ``cNum - 1``.

    Parameters
    ----------
    fasta : FASTA source handed to ``parseFasta``.
    cNum : maximum number of distinct labels in the result.
    klength_min, klength_max : n-gram length range for the TF-IDF table.
    rNum : seed forwarded as ``random_state`` to PCA and KMeans.  (It was
        previously accepted but never used.)

    Returns
    -------
    (numpy.ndarray, pandas.DataFrame)
        The label of each sequence, and the TF-IDF table the clustering was
        computed from.

    Side effects
    ------------
    Prints the raw KMeans labels, the number of cluster centres and, when
    clusters are merged, the mapping that was applied.
    """
    import warnings

    inputData = parseFasta(fasta)
    inputData["Sequence"] = inputData["Sequence"].apply(lambda x: x.replace("-", ""))
    kmerXTableInput = kmerXTable(inputData, klength_min, klength_max)

    NkmerXTableInput = preprocessing.normalize(kmerXTableInput)
    # PCA cannot extract more components than min(n_samples, n_features), so
    # cap the requested 10 to keep small inputs from raising.
    n_components = min(10, *NkmerXTableInput.shape)
    PCAembedding = PCA(n_components=n_components, random_state=rNum)
    PCAembedding_low = PCAembedding.fit_transform(NkmerXTableInput)

    # MeanShift decides how many clusters there are; KMeans then refines the
    # assignment starting from the MeanShift centres.
    ms = MeanShift()
    ms.fit(PCAembedding_low)
    cluster_centers = ms.cluster_centers_
    n_cluster_centers = len(cluster_centers)

    # Warnings raised while fitting are suppressed, as in the original code.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        kmms = KMeans(init=cluster_centers, n_clusters=n_cluster_centers, random_state=rNum)
        y_hat = kmms.fit_predict(PCAembedding_low)
    print(y_hat)
    print(f"n_cluster_centers: {n_cluster_centers}")

    if n_cluster_centers > cNum:
        cluster_ids, cluster_sizes = np.unique(y_hat, return_counts=True)
        # Largest cluster first; the stable sort keeps ties in ascending id order.
        by_size = np.argsort(-cluster_sizes, kind="stable")
        max_value = cNum - 1
        # Rank r keeps id r while r < max_value; everything else shares max_value.
        map_predict_to_actual = {
            cluster_ids[idx]: min(rank, max_value) for rank, idx in enumerate(by_size)
        }
        print(f"map_predict_to_actual: {map_predict_to_actual}")
        # Every label present in y_hat is a key of the map, so no fallback is needed.
        y_hat = np.array([map_predict_to_actual[label] for label in y_hat])

    return y_hat, kmerXTableInput

def get_unique_numbers(numbers):
    """Return the distinct values found in ``numbers`` as a list.

    The values are de-duplicated through a ``set``, so the order of the
    returned list is the set's iteration order and should not be relied upon.
    """
    return list(set(numbers))


def kmeans_semiSupervised(fasta, y_hat, klength_min = 6, klength_max = 6, rNum = 50, cNum=2):
    """Cluster FASTA sequences and name the clusters after partially known labels.

    The clustering itself mirrors ``kmeans``: gap-stripped sequences ->
    character n-gram TF-IDF -> ``preprocessing.normalize`` -> PCA -> MeanShift
    centres -> KMeans started from those centres, followed by merging the
    smallest clusters when more than ``cNum`` were found.  Afterwards each known
    label claims the cluster that contains most of its members, clusters nobody
    claimed receive fresh label ids (starting at ``max(y_hat) + 1``), and every
    sequence whose label is unknown inherits the label of its cluster.

    Parameters
    ----------
    fasta : FASTA source handed to ``parseFasta``.
    y_hat : sequence of labels positionally aligned with the parsed records;
        ``-1`` marks a sequence whose label is unknown.
    klength_min, klength_max : n-gram length range for the TF-IDF table.
    rNum : seed forwarded as ``random_state`` to PCA and KMeans.  (It was
        previously accepted but never used.)
    cNum : maximum number of clusters kept before labels are assigned.

    Returns
    -------
    list
        One label per sequence: the given label where it was known, otherwise
        the label assigned to the sequence's cluster.

    Raises
    ------
    ValueError
        If ``y_hat`` does not contain exactly one entry per parsed sequence.

    Side effects
    ------------
    Prints each cluster merge / fresh-id assignment and the merge mapping.
    """
    import warnings

    unknown_label = -1

    inputData = parseFasta(fasta)
    actual_labels = list(y_hat)
    # Fail before the expensive clustering; the original only detected this
    # implicitly when assigning the labels as a DataFrame column.
    if len(actual_labels) != len(inputData):
        raise ValueError(
            f"y_hat has {len(actual_labels)} labels but the FASTA input has "
            f"{len(inputData)} sequences"
        )

    inputData["Sequence"] = inputData["Sequence"].apply(lambda x: x.replace("-", ""))
    kmerXTableInput = kmerXTable(inputData, klength_min, klength_max)

    NkmerXTableInput = preprocessing.normalize(kmerXTableInput)
    # PCA cannot extract more components than min(n_samples, n_features).
    n_components = min(10, *NkmerXTableInput.shape)
    PCAembedding = PCA(n_components=n_components, random_state=rNum)
    PCAembedding_low = PCAembedding.fit_transform(NkmerXTableInput)

    ms = MeanShift()
    ms.fit(PCAembedding_low)
    cluster_centers = ms.cluster_centers_
    n_cluster_centers = len(cluster_centers)

    # Warnings raised while fitting are suppressed, as in the original code.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        kmms = KMeans(init=cluster_centers, n_clusters=n_cluster_centers, random_state=rNum)
        kmms_labels = kmms.fit_predict(PCAembedding_low)

    # --- Step 1: merge surplus clusters so at most cNum ids remain -----------
    if n_cluster_centers > cNum:
        cluster_ids, cluster_sizes = np.unique(kmms_labels, return_counts=True)
        # Largest cluster first; the stable sort keeps ties in ascending id order.
        by_size = np.argsort(-cluster_sizes, kind="stable")
        max_value = cNum - 1
        map_predict_to_actual = {}
        for rank, idx in enumerate(by_size):
            if rank >= max_value:
                print(f"{cluster_ids[idx]} mapped to {max_value}")
            map_predict_to_actual[cluster_ids[idx]] = min(rank, max_value)
        print(f"map_predict_to_actual: {map_predict_to_actual}")
        kmms_labels = np.array([map_predict_to_actual[label] for label in kmms_labels])

    predictions = kmms_labels

    # --- Step 2: let each known label claim its dominant cluster -------------
    given_counts = {}
    for label in actual_labels:
        if label != unknown_label:
            given_counts[label] = given_counts.get(label, 0) + 1
    # Most frequent label first; ties resolved by ascending label value.
    given_by_size = sorted(given_counts, key=lambda label: (-given_counts[label], label))

    map_predict_to_actual = {}
    for label_GIVEN in given_by_size:
        members = [pred for pred, given in zip(predictions, actual_labels) if given == label_GIVEN]
        clusters, sizes = np.unique(members, return_counts=True)
        # NOTE(review): if two known labels share the same dominant cluster, the
        # later (less frequent) label overwrites the earlier one here and the
        # other label ends up with no cluster. Behaviour kept from the original;
        # confirm whether "first claim wins" was intended instead.
        map_predict_to_actual[clusters[np.argmax(sizes)]] = label_GIVEN

    # --- Step 3: give unclaimed clusters fresh ids above every given label ---
    max_value = max(actual_labels) + 1
    for upl in np.unique(predictions):
        if upl not in map_predict_to_actual:
            print(f"{upl} mapped to {max_value}")
            map_predict_to_actual[upl] = max_value
            max_value += 1

    # --- Step 4: known labels are kept, unknown ones take their cluster's label
    # (The original also computed an accuracy on the known labels that was never
    # used and divided by zero when no label was known; it has been removed.)
    newLabels = [
        map_predict_to_actual[pred] if given == unknown_label else given
        for pred, given in zip(predictions, actual_labels)
    ]
    return newLabels

In [105]:
# Install a global "ignore" filter: from this point on no warning of any
# category is shown in the notebook.
import warnings
warnings.filterwarnings('ignore')

In [106]:
# Parse the FASTA file and preview the first rows (ID index, Sequence column).
# NOTE(review): `ouput_df` looks like a typo for `output_df`; left unchanged
# because it is a notebook-global name.
path = "./concatenated.fasta"
ouput_df = parseFasta(path)
ouput_df.head()

Unnamed: 0_level_0,Sequence
ID,Unnamed: 1_level_1
TA151,ATGAGTGATCTGCCAAGTCCGAAGAAACACAAGACCTCGAACTGGT...
IC1,ATGAGTGATCTGCCAAGTCCGAAGAAACACAAGACCTCGAACTGGT...
A237,ATGAGTGATCTGCCAAGTCCGAAGAAACACAAGACCTCGAACTGGT...
5920,ATGAGTGATCTGCCAAGTCCGAAGAAACACAAGACCTCGAACTGGT...
LiA96,ATGAGTGATCTGCCAAGTCCGAAGAAACACAAGACCTCGAACTGGT...


## Unsupervised

In [107]:
from operator import mod

# Ground-truth labels, taken from the `class` column of the responses file.
actual_labels = pd.read_csv("./responses-carb.csv")["class"].tolist()

# Clustering configuration: 3-mers only, as many clusters as there are
# distinct ground-truth classes; `seed` is passed as kmeans' `rNum` argument.
fasta = "./concatenated.fasta"
klength_min = klength_max = 3
cNum = len(np.unique(actual_labels))
seed = 1232

# predictions1: cluster label per sequence; x: the k-mer TF-IDF table.
predictions1, x = kmeans(fasta, cNum, klength_min, klength_max, seed)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 2 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0]
n_cluster_centers: 3
map_predict_to_actual: {0: 0, 1: 1, 2: 1}


In [108]:
# import numpy as np
# from scipy.stats import mode

# # Assuming you have an array of 500 numbers stored in the variable 'numbers'
# def get_accuracy(numbers):
#     chunk_size = 100
#     num_chunks = len(numbers) // chunk_size

#     modes = []

#     for i in range(num_chunks):
#         chunk = numbers[i * chunk_size: (i + 1) * chunk_size]
#         mode_result = mode(chunk)
#         mode_value = mode_result.mode[0]
#         mode_count = mode_result.count[0]
#         modes.append((mode_value, mode_count))

#     print(modes)

In [109]:
import itertools


def get_accuracy(predicted_labels, actual_labels):
    """Find the relabelling of the predicted clusters that best matches the truth.

    Cluster ids are arbitrary, so every permutation of the distinct predicted
    labels is tried as a renaming (predicted label -> new label) and the one
    with the highest accuracy against ``actual_labels`` is returned.

    Parameters
    ----------
    predicted_labels : sequence of predicted cluster labels.
    actual_labels : sequence of ground-truth labels, same length.

    Returns
    -------
    dict
        ``{'Matching': {predicted_label: mapped_label, ...}, 'Accuracy': float}``.
        If several renamings tie, the first one in permutation order wins.

    Raises
    ------
    ValueError
        If the two sequences differ in length or are empty.

    Notes
    -----
    The search tries k! permutations for k distinct predicted labels, so it is
    only practical for a handful of clusters.  Mapped labels are drawn from the
    predicted label set itself.
    """
    predicted_labels = list(predicted_labels)
    actual_labels = list(actual_labels)
    if len(predicted_labels) != len(actual_labels):
        raise ValueError(
            f"predicted_labels has {len(predicted_labels)} entries but "
            f"actual_labels has {len(actual_labels)}"
        )
    if not actual_labels:
        raise ValueError("cannot compute accuracy for empty label sequences")

    # Fixed, explicit ordering of the distinct labels; each permutation is
    # paired with it to form a real label -> label dict.  (The original indexed
    # the permutation tuple by label value, which only worked for labels that
    # happened to be exactly 0..k-1.)
    cluster_ids = sorted(set(predicted_labels))

    def calculate_accuracy(mapping):
        hits = sum(
            1
            for actual, predicted in zip(actual_labels, predicted_labels)
            if actual == mapping[predicted]
        )
        return hits / len(actual_labels)

    # Start below any reachable accuracy so that a mapping is always selected,
    # even when no renaming gets a single label right.
    best_mapping = None
    best_accuracy = -1.0
    for permutation in itertools.permutations(cluster_ids):
        mapping = dict(zip(cluster_ids, permutation))
        accuracy = calculate_accuracy(mapping)
        if accuracy > best_accuracy:
            best_mapping = mapping
            best_accuracy = accuracy

    return {
        'Matching': best_mapping,
        'Accuracy': best_accuracy
    }

# `best_mapping` holds the full result dict returned by get_accuracy
# (keys 'Matching' and 'Accuracy'), not only the label mapping.
best_mapping = get_accuracy(predictions1, actual_labels)
print(best_mapping)

{'Matching': {0: 0, 1: 1}, 'Accuracy': 0.7950819672131147}


In [110]:
# Rename every predicted cluster id through the best matching found above.
# NOTE: this rebinds `predictions1` in terms of itself, so running the cell a
# second time applies the mapping again to already-mapped labels.
predictions1 = [best_mapping['Matching'][label] for label in predictions1]

In [111]:
# Save the unsupervised predictions as a two-column CSV; `Number` is a
# 1-based running index over the sequences.
number = list(range(1, len(predictions1) + 1))
df = pd.DataFrame({'Number': number, 'Labels': predictions1})
df.to_csv('kmeans_unsup_predictions.csv', index=False)

In [112]:
# Reload the responses file as a DataFrame; its `class` column is used as the
# "Actual Label" column when the PCA table is built.
actual_label = pd.read_csv("./responses-carb.csv")

In [113]:
# Project the k-mer TF-IDF table `x` onto two principal components.
pca = PCA(n_components=2)
principalComponents = pca.fit_transform(x)
principalDf = pd.DataFrame(
    data=principalComponents,
    columns=['principal component 1', 'principal component 2'],
)

# Put the 2-D coordinates, the predicted labels and the actual labels side by
# side (aligned on the default positional index) and export them.
finalDf = pd.concat(
    [principalDf, pd.Series(predictions1), pd.Series(actual_label['class'])],
    axis=1,
)
# NOTE(review): the two component headers differ in capitalisation
# ('principal' vs 'Principal'); kept byte-for-byte because the CSV header may
# be read elsewhere.
finalDf.columns = ['principal Component 1', 'Principal Component 2','Predicted Label', 'Actual Label']
finalDf.to_csv('kmeans_unsup_pca.csv', index=False)

In [114]:
# Fraction of rows whose predicted label equals the actual label.
sum(finalDf['Predicted Label'] == finalDf['Actual Label'])/len(finalDf)

0.7950819672131147

## Semi-supervised

In [115]:
# Partially known labels for the semi-supervised run (`Labels` column).
semi_labels = pd.read_csv("./carb-semi-labels.csv").Labels.tolist()

# Ground-truth labels, taken from the `class` column of the responses file.
actual_labels = pd.read_csv("./responses-carb.csv")["class"].tolist()

fasta = "./concatenated.fasta"
klength_min = 3
klength_max = 3
cNum = len(np.unique(actual_labels))
seed = 80723

# Pass the configured seed and cluster count explicitly. They were previously
# defined here but never handed to the function, so the call silently used its
# defaults (rNum=50, cNum=2).
yhat = np.array(
    kmeans_semiSupervised(
        fasta, semi_labels, klength_min, klength_max, rNum=seed, cNum=cNum
    )
)

1 mapped to 1
2 mapped to 1
3 mapped to 1
4 mapped to 1
map_predict_to_actual: {0: 0, 1: 1, 2: 1, 3: 1, 4: 1}
1 mapped to 2


In [116]:
# Positions of each label value, captured before anything is reassigned.
zeros = np.flatnonzero(yhat == 0).tolist()
ones = np.flatnonzero(yhat == 1).tolist()
twos = np.flatnonzero(yhat == 2).tolist()

# Collapse to two classes in place: 0 stays 0, both 1 and 2 become 1.
yhat[zeros] = 0
yhat[ones] = 1
yhat[twos] = 1

print(yhat[21:30])

[1 1 1 1 1 1 1 1 1]


In [117]:
# Save the semi-supervised predictions as a two-column CSV; `Number` is a
# 1-based running index over the sequences.
number = list(range(1, len(yhat) + 1))
df = pd.DataFrame({'Number': number, 'Labels': yhat})
df.to_csv('kmeans_semisup_predictions.csv', index=False)