In [19]:
# import packages
import matplotlib
#matplotlib.use('TkAgg')
import matplotlib.pyplot as plt
import matplotlib.pyplot as plt
import numpy as np
import pandas as pdz
import numpy as np
import pandas as pd

from pandas import Series, DataFrame
import Bio
from Bio import SeqIO,AlignIO

import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.mixture import GaussianMixture as GMM

from sklearn.manifold import TSNE
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

import copy

In [20]:
import warnings
# Silence every warning for the rest of the session: 'ignore' with no
# category/module filter applies to all warnings, including deprecation
# notices from the libraries imported above.
warnings.filterwarnings('ignore')

In [21]:
# methods

### 
### This includes code copied and pasted from the main methods used for the website in BioKlustering-Website/BioKlustering/mlmodel/parser/GMM.py
### These methods are copy-pasted instead of directly included due to difficulties importing Django classes for running locally without running the server
###

def parseFasta(data):
    """Load a FASTA file into a one-column DataFrame indexed by record ID.

    Parameters
    ----------
    data : path or handle
        Forwarded unchanged to ``SeqIO.parse(data, "fasta")``.

    Returns
    -------
    pandas.DataFrame
        A single ``'Sequence'`` column holding each record's sequence as a
        string; the index is named ``'ID'`` and holds the record IDs.

    Notes
    -----
    Records are collected in a dict keyed by ID, so when several records
    share an ID only the last one survives.
    """
    sequences_by_id = {
        record.id: str(record.seq) for record in SeqIO.parse(data, "fasta")
    }
    sequences = pd.Series(sequences_by_id, name='Sequence')
    sequences.index.name = 'ID'
    return pd.DataFrame(sequences)

def get_kmer_table(path,k_min,k_max):
    """Build a table of length-normalised k-mer counts for a FASTA file.

    Parameters
    ----------
    path : str
        FASTA file location, forwarded to ``read_fasta``.
    k_min, k_max : int
        Inclusive bounds of the k-mer sizes to count; passed to the
        vectorizer as ``ngram_range=(k_min, k_max)``.

    Returns
    -------
    tuple
        ``(kmer_table, output_df)``. ``kmer_table`` is a DataFrame with one
        row per sequence and one column per k-mer, each count divided by the
        length of the sequence it came from. ``output_df`` is the frame
        returned by ``read_fasta``, passed through untouched.
    """
    genes, gene_len, output_df = read_fasta(path)
    count_vect = CountVectorizer(analyzer='char', ngram_range=(k_min, k_max))
    counts = count_vect.fit_transform(genes).toarray()
    kmer_names = count_vect.get_feature_names_out()
    # Column vector of sequence lengths so the division broadcasts row-wise,
    # replacing the per-row Python loop.
    lengths = np.asarray(gene_len).reshape(-1, 1)
    kmer_table = pd.DataFrame(counts / lengths, columns=kmer_names)
    return kmer_table, output_df

def get_gene_sequences(filename):
    """Return the sequence of every FASTA record in ``filename`` as a string.

    The list follows the order in which ``SeqIO.parse`` yields the records;
    every record is kept, including ones that share an ID.
    """
    return [str(entry.seq) for entry in SeqIO.parse(filename, "fasta")]

# genes: a list of gene sequences, e.g. the list produced by get_gene_sequences().
def get_gene_len(genes):
    """Return the length of each sequence in ``genes``, in the same order."""
    return [len(sequence) for sequence in genes]

# Read a single FASTA file containing all the gene sequences.
def read_fasta(path):
    """Parse a FASTA file once and return its sequences, lengths and a lookup frame.

    Parameters
    ----------
    path : str
        Location of the FASTA file, forwarded to ``SeqIO.parse``.

    Returns
    -------
    tuple
        ``(sequences, lengths, output_df)`` where

        * ``sequences`` is a list with every record's sequence as a string,
        * ``lengths`` is the list of their lengths, in the same order,
        * ``output_df`` is a DataFrame with a ``'Sequence'`` column and an
          index named ``'ID'`` holding the record IDs.

    Notes
    -----
    All three outputs have exactly one entry per FASTA record, in file
    order. Previously the frame was built from a dict keyed by record ID, so
    records sharing an ID collapsed to one row and the frame could end up
    shorter than the sequence list that the k-mer table is built from.
    For files whose IDs are unique the returned values are unchanged.
    """
    record_ids = []
    sequences = []
    for record in SeqIO.parse(path, "fasta"):
        record_ids.append(record.id)
        sequences.append(str(record.seq))

    lengths = [len(sequence) for sequence in sequences]
    output_df = pd.DataFrame(
        {'Sequence': sequences},
        index=pd.Index(record_ids, name='ID'),
    )
    return sequences, lengths, output_df

def get_predictions(path,k_min,k_max,num_class,cov_type, seed):
    """Cluster the sequences of a FASTA file with a Gaussian mixture model.

    Parameters
    ----------
    path : str
        FASTA file location, forwarded to ``get_kmer_table``.
    k_min, k_max : int
        K-mer size bounds, forwarded to ``get_kmer_table``.
    num_class : int
        Number of mixture components (``n_components``).
    cov_type : str
        Covariance structure (``covariance_type``).
    seed : int
        Value used as the model's ``random_state``.

    Returns
    -------
    The component index predicted for each row of the k-mer table, as
    returned by the fitted model's ``predict``.
    """
    features, labelled_df = get_kmer_table(path, k_min, k_max)
    mixture = GMM(n_components=num_class, covariance_type=cov_type, random_state=seed)
    mixture.fit(features)
    cluster_ids = mixture.predict(features)
    # The labels are also inserted as the first column of the frame coming
    # from get_kmer_table; that frame is local and is not returned.
    labelled_df.insert(0, "Labels", cluster_ids)
    return cluster_ids


# Dataset Overview

In [22]:
# FASTA file holding the sequences to cluster.
path = "./concatenated.fasta"
# NOTE(review): `ouput_df` looks like a typo for `output_df`; it is not read
# again in the cells shown here.
gene,gene_len,ouput_df = read_fasta(path)
# Mean sequence length, shown as the cell's output.
sum(gene_len)/len(gene_len)

436428.61475409835

## Unsupervised

In [23]:
from operator import mod

# Ground-truth classes, taken from the 'class' column of the response file.
actual_labels = pd.read_csv("./responses-carb.csv")["class"].tolist()

# Clustering configuration.
path = "./concatenated.fasta"
k_min = k_max = 3  # count 3-mers only
num_class = len(np.unique(actual_labels))  # one mixture component per distinct class
cov_type = 'full'
seed = 1232

# Fit the mixture model and keep the cluster assignments as a plain list.
predictions1 = get_predictions(
    path=path,
    k_min=k_min,
    k_max=k_max,
    num_class=num_class,
    cov_type=cov_type,
    seed=seed,
).tolist()

In [24]:
import itertools

def get_accuracy(predicted_labels, actual_labels):
    """Find the one-to-one relabelling of clusters that best matches the true classes.

    Cluster ids produced by an unsupervised model are arbitrary, so every
    injective assignment of cluster ids to candidate labels is tried and the
    one giving the highest accuracy is kept.

    Parameters
    ----------
    predicted_labels : sequence of hashable
        Cluster id assigned to each sample.
    actual_labels : sequence of hashable
        True class of each sample, in the same order.

    Returns
    -------
    dict
        ``{'Matching': {cluster_id: label, ...}, 'Accuracy': float}``.
        ``'Matching'`` has one key per distinct value in ``predicted_labels``.
        ``'Accuracy'`` is the number of matching positions divided by
        ``len(actual_labels)`` (0.0 when ``actual_labels`` is empty).

    Notes
    -----
    * Cluster ids no longer have to be exactly ``0..k-1``: mappings are dicts
      keyed by cluster id rather than tuples indexed by it.
    * Candidate labels are the cluster ids followed by any true class that is
      not a cluster id, so a class absent from the predictions can still be
      chosen. When every true class is also a cluster id the search space and
      tie-breaking order are the same as a plain permutation of the ids.
    * A mapping is always returned, even when nothing matches.
    * The search is exhaustive, so its cost grows factorially with the number
      of distinct labels; it is meant for a handful of clusters.
    """
    # Distinct cluster ids in a stable order so the result is reproducible.
    try:
        clusters = sorted(set(predicted_labels))
    except TypeError:
        # Ids that cannot be ordered: fall back to first-appearance order.
        clusters = list(dict.fromkeys(predicted_labels))

    cluster_set = set(clusters)
    extra_classes = [
        label for label in dict.fromkeys(actual_labels) if label not in cluster_set
    ]
    candidates = clusters + extra_classes

    n_samples = len(actual_labels)

    def calculate_accuracy(mapping):
        # Share of positions where the relabelled prediction equals the truth.
        if n_samples == 0:
            return 0.0
        hits = sum(
            1
            for actual, predicted in zip(actual_labels, predicted_labels)
            if mapping[predicted] == actual
        )
        return hits / n_samples

    best_mapping = None
    best_accuracy = -1.0  # below any real score, so the first mapping is always kept

    for assignment in itertools.permutations(candidates, len(clusters)):
        mapping = dict(zip(clusters, assignment))
        accuracy = calculate_accuracy(mapping)
        if accuracy > best_accuracy:
            best_mapping = mapping
            best_accuracy = accuracy

    return {
        'Matching': best_mapping,
        'Accuracy': best_accuracy
    }

# Despite its name, `best_mapping` holds the whole result dict returned by
# get_accuracy (keys 'Matching' and 'Accuracy'), not just the mapping.
best_mapping = get_accuracy(predictions1, actual_labels)
print(best_mapping)

{'Matching': {0: 0}, 'Accuracy': 0.9508196721311475}


In [25]:
# Relabel each cluster id through the best matching found above.
# NOTE(review): this overwrites `predictions1`, so re-running the cell applies
# the mapping a second time to already-mapped labels.
predictions1 = [best_mapping['Matching'][label] for label in predictions1]

In [26]:
# Rebuild the 3-mer frequency table for the same FASTA file: get_predictions
# computes it internally but only returns the labels.
df_0, output_df = get_kmer_table("./concatenated.fasta",3,3)
# Preview the first rows of the table.
df_0.head()

Unnamed: 0,aaa,aac,aag,aat,aca,acc,acg,act,aga,agc,...,tcg,tct,tga,tgc,tgg,tgt,tta,ttc,ttg,ttt
0,0.003681,0.013174,0.013201,0.00379,0.008649,0.022957,0.016592,0.007628,0.008062,0.022007,...,0.025837,0.006968,0.010662,0.022401,0.024736,0.006024,0.000738,0.013608,0.004279,0.002408
1,0.003801,0.013027,0.013276,0.003833,0.008713,0.022748,0.016381,0.007709,0.008137,0.021706,...,0.026021,0.007188,0.010703,0.022629,0.024683,0.006154,0.00078,0.013976,0.004441,0.002527
2,0.003762,0.01288,0.013292,0.003787,0.008676,0.022743,0.016396,0.007689,0.008113,0.021859,...,0.026054,0.007012,0.010551,0.022643,0.024766,0.006037,0.000734,0.013928,0.004296,0.002454
3,0.003801,0.012784,0.013351,0.003785,0.008708,0.022545,0.016202,0.007648,0.008144,0.021858,...,0.026063,0.007159,0.010539,0.022794,0.024779,0.006092,0.000798,0.013985,0.004483,0.002565
4,0.003836,0.013042,0.013188,0.003879,0.008719,0.022727,0.016411,0.00767,0.008077,0.021893,...,0.025836,0.007073,0.010691,0.022638,0.02478,0.00609,0.000824,0.013791,0.004435,0.002536


In [27]:
# One row per sequence: a 1-based running number and its remapped label.
number = list(range(1, len(predictions1) + 1))
df = pd.DataFrame({'Number': number, 'Labels': predictions1})
df.to_csv('gmm_unsup_predictions.csv', index=False)

In [28]:
# Load the label file as a DataFrame (distinct from the `actual_labels` list);
# its 'class' column supplies the actual labels for the PCA export below.
actual_label = pd.read_csv("./responses-carb.csv")

In [29]:
# Project the k-mer frequency table onto its first two principal components.
x = df_0
pca = PCA(n_components=2)
principalComponents = pca.fit_transform(x)
principalDf = pd.DataFrame(
    principalComponents,
    columns=['principal component 1', 'principal component 2'],
)

# Place the predicted and actual labels beside the coordinates, then export.
# NOTE(review): the first header starts with a lower-case 'p' while the second
# is capitalised; left untouched because the names end up in the CSV.
finalDf = pd.concat(
    [principalDf, pd.Series(predictions1), actual_label['class']],
    axis=1,
)
finalDf.columns = ['principal Component 1', 'Principal Component 2','Predicted Label', 'Actual Label']
finalDf.to_csv('gmm_unsup_pca.csv', index=False)

In [30]:
# Fraction of rows whose predicted label equals the actual label.
sum(finalDf['Predicted Label'] == finalDf['Actual Label'])/len(finalDf)

0.9508196721311475