In [21]:
import pandas as pd
from Bio import SeqIO
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.cluster import MeanShift
from sklearn import preprocessing 
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from mpl_toolkits.mplot3d import Axes3D
import numpy as np
from IPython.display import Image


### 
### This includes code copied and pasted from the main methods used for the website in BioKlustering-Website/BioKlustering/mlmodel/parser/kmeans.py
### These methods are copy-pasted instead of directly included due to difficulties importing Django classes for running locally without running the server
###

def parseFasta(data):
    d = {fasta.id : str(fasta.seq) for fasta in SeqIO.parse(data, "fasta")}
    pd.DataFrame([d])

    s = pd.Series(d, name='Sequence')
    s.index.name = 'ID'
    s.reset_index()
    return pd.DataFrame(s)

def kmerXTable(s, a, b):
    tfid_vector = TfidfVectorizer(analyzer='char', ngram_range=(a,b))
    s_hat = tfid_vector.fit_transform(s.Sequence)
    kmerNames = tfid_vector.get_feature_names()
    kmers = s_hat.toarray()
    return pd.DataFrame(kmers,columns=kmerNames, index = s.index)
    
def kmeans(fasta, cNum, klength_min = 6, klength_max = 6, rNum = 50):
    inputData = parseFasta(fasta)
#     temp = virus01.append(inputData)
#     temp = temp.drop_duplicates(keep="last")
        
    inputData["Sequence"] = inputData["Sequence"].apply(lambda x: x.replace("-", ""))
    kmerXTableInput = kmerXTable(inputData, klength_min, klength_max)
        
        
    #km = KMeans(random_state = rNum, n_clusters = cNum)
    #m.fit(kmerXTableInput) 
    #y_hat = km.predict(kmerXTableInput)
    PCAembedding = PCA(n_components=10)
    NkmerXTableInput = preprocessing.normalize(kmerXTableInput)
    PCAembedding_low = PCAembedding.fit_transform(NkmerXTableInput)
    
    ms = MeanShift()
    ms.fit(PCAembedding_low)
    cluster_centers = ms.cluster_centers_

    n_cluster_centers = len(cluster_centers)

    import warnings
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        kmms = KMeans(init = cluster_centers, n_clusters = n_cluster_centers)
        #kmms = KMeans(init = 'k-means++', n_clusters = 2, n_init=20, max_iter=600)
        y_hat = kmms.fit_predict(PCAembedding_low)

    if n_cluster_centers > cNum:
        res = y_hat
        unique_predicted_labels = get_unique_numbers(res)
        predicted_labels_count = {}
        for label in unique_predicted_labels:
            predicted_labels_count[label] = (res == label).sum()
        max_item = max(predicted_labels_count, key=predicted_labels_count.get)
        predicted_labels_count = sorted(predicted_labels_count.items(), key=lambda x: x[1], reverse=True)
        map_predict_to_actual = {}
        max_value = cNum-1
        for i in range(len(predicted_labels_count)):
            if i < max_value:
                map_predict_to_actual[predicted_labels_count[i][0]] = i
            else:
                # print(f"{predicted_labels_count[i][0]} mapped to {max_value}")
                map_predict_to_actual[predicted_labels_count[i][0]] = max_value

        # predictions_final contains the final results
        # it takes care of the case when num_class > number of unique labels given
        predictions_final = []
        print(f"res: {res}")
        print(f"map_predict_to_actual: {map_predict_to_actual}")
        for i in range(len(res)):
            if res[i] in map_predict_to_actual.keys():
                predictions_final.append(map_predict_to_actual[res[i]])
            else:
                predictions_final.append(map_predict_to_actual[max_item])
        # print(predictions_final)
        y_hat = np.array(predictions_final)

        
    return y_hat, kmerXTableInput

def get_unique_numbers(numbers):

    list_of_unique_numbers = []

    unique_numbers = set(numbers)

    for number in unique_numbers:
        list_of_unique_numbers.append(number)

    return list_of_unique_numbers

In [13]:
import warnings
warnings.filterwarnings('ignore')

In [14]:
path = "./size_500_test.fasta"
ouput_df = parseFasta(path)
ouput_df.head()

Unnamed: 0_level_0,Sequence
ID,Unnamed: 1_level_1
MT994979.1,AACCAACCAACTTTCGATCTCTTGTAGATCTGTTCTCTAAACGAAC...
MZ043011.1,ATTAAAGGTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGA...
MT994951.1,AACCAACCAACTTTCGATCTCTTGTAGATCTGTTCTCTAAACGAAC...
MT994950.1,AACCAACCAACTTTCGATCTCTTGTAGATCTGTTCTCTAAACGAAC...
MW010029.1,AACCAACCAACTTTCGATCTCTTGTAGATCTGTTCTCTAAACGAAC...


## Unsupervised

In [22]:
from operator import mod


fasta = "./size_500_test.fasta"
klength_min = 3
klength_max = 3
cNum = 5
seed = 1232
predictions1, x = kmeans(fasta, cNum, klength_min, klength_max, seed)

res: [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  2  0  0  0  0  0  3  0  0  0  0
  0  0  0  0  0  2  0  0  0  0  1  0  0  0  2  0  1  2  1  0  0  0  0  0
  0  0  0  2  0  2  0  1  0  0  0  0  0  0  0  0  0  1  0  0  2  0  1  1
  0  0  0  0  0  2  0  0  0  0  0  0  0  0  0  0  0  2  0  0  0  0  0  0
  2  2  0  1  0  0  0  0  1  2  7  2  1  1  1  1  1  1  1  1  1  3  1  0
  0  2  1  1  2  2  1  1  3  1  1  4  1  1  1  3  7  1  1  2  4  1  1  1
  0  1  1  1  2  1  8  1  1  1  1  1  1  1  1  1  4  4  1  1  1  1  1  2
  5  1  3  5  1  2  0  1  1  1  1  2  9  1  2  2 10  1  1  1  2  1  1  1
  1  1  1  1  1  1  1  6  1  1  6  1  0  0  0  0  0  0  0  2  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 

In [18]:
# accuracy
actual_label = pd.read_csv("./size_500_test.csv")
sum(actual_label['class'] == predictions1)/len(predictions1)

0.242

In [23]:
number = list(range(1, len(predictions1)+1))
df = pd.DataFrame(list(zip(number, predictions1)), 
               columns =['Number', 'Labels']) 
df.to_csv('kmeans_unsup_predictions.csv',index = False)

In [24]:
pca = PCA(n_components=2)
principalComponents = pca.fit_transform(x)
principalDf = pd.DataFrame(data = principalComponents,columns = ['principal component 1', 'principal component 2'])
finalDf = pd.concat([principalDf, pd.Series(predictions1)], axis = 1)
finalDf = pd.concat([finalDf, pd.Series(actual_label['class'])], axis = 1)
finalDf.columns = ['principal Component 1', 'Principal Component 2','Predicted Label', 'Actual Label']
finalDf.to_csv('kmeans_unsup_pca.csv',index = False)