In [None]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import numpy as np
import pandas as pd
from Bio import SeqIO
import matplotlib.pyplot as plt
import umap

def get_conversion_table(path, norm=True):
    """Load a feature table and return a row-label -> feature-vector lookup.

    The file at ``path`` is read as a space-separated table: the first line is
    the header, the first column supplies the row labels (one per residue
    symbol) and the remaining columns are the numeric features.

    Args:
        path: Location of the space-separated feature table.
        norm: When True, each feature column is min-max scaled to the range
            [-1, 1] before the lookup is built; when False the raw values are
            used unchanged.

    Returns:
        dict mapping every row label to a 1-D ``np.ndarray`` of its features,
        plus a padding entry ``"X"`` mapped to a list of zeros with one element
        per feature column. A row labelled ``"X"`` in the file is overwritten
        by that padding entry.
    """
    table_input = pd.read_csv(path, sep=" ", index_col=0)
    residues = list(table_input.index)
    if norm:
        values = MinMaxScaler(feature_range=(-1, 1)).fit_transform(table_input)
    else:
        values = np.array(table_input.values)

    table = {aa: np.array(values[row]) for row, aa in enumerate(residues)}
    # Size the padding vector from the table itself rather than assuming five
    # feature columns, so padded positions always match real ones in width.
    table["X"] = [0] * values.shape[1]
    return table


def read_fasta(fasta_path):
    """Parse a FASTA file into a dict of record id -> sequence string.

    Records are read with ``SeqIO.parse`` in file order; when two records share
    an id, the later one replaces the earlier one.
    """
    return {
        str(entry.id): str(entry.seq)
        for entry in SeqIO.parse(fasta_path, "fasta")
    }


def padding_seqs(seqs, length=None, pad_value=None):
    """Right-pad every sequence in ``seqs`` to a common length.

    Args:
        seqs: Mapping of sequence id -> sequence string.
        length: Target length. ``None`` selects the default of 30; any explicit
            integer, including 0, is used as given.
        pad_value: String appended once per missing position. ``None`` or an
            empty string selects the default ``"X"``, since an empty string
            cannot pad anything.

    Returns:
        A new dict with the same keys whose values are the padded sequences.

    Raises:
        ValueError: If any sequence is longer than ``length``.
    """
    # Compare against None so that an explicit length of 0 is not silently
    # replaced by the default.
    if length is None:
        length = 30
    if not pad_value:
        pad_value = "X"

    data = {}
    for key, seq in seqs.items():
        if len(seq) > length:
            raise ValueError(
                "Length exceeds {} (sequence '{}' has length {})".format(
                    length, key, len(seq)
                )
            )
        data[key] = seq + pad_value * (length - len(seq))
    return data

def encode(fasta, table):
    """Replace each character of every sequence with its entry in ``table``.

    Args:
        fasta: Mapping of sequence id -> sequence string.
        table: Mapping of single character -> encoding for that character.

    Returns:
        dict with the same keys as ``fasta``; each value is the list of
        ``table`` entries for the sequence's characters, in order. A character
        missing from ``table`` raises ``KeyError``.
    """
    return {
        name: [table[symbol] for symbol in residues]
        for name, residues in fasta.items()
    }

def get_encoded_seqs(seqs, table, length=30):
    """Pad, encode and flatten sequences into one feature vector per sequence.

    Args:
        seqs: Mapping of sequence id -> sequence string.
        table: Mapping of single character -> per-position feature vector; it
            must also contain the padding character used by ``padding_seqs``.
        length: Length every sequence is padded to before encoding. Defaults
            to 30, the value this function previously hard-coded.

    Returns:
        A list with one entry per sequence, in the iteration order of ``seqs``.
        Each entry is the flattened list of that sequence's per-position
        features.
    """
    padded = padding_seqs(seqs, length)
    encoded = encode(padded, table)
    # Stack the per-position vectors into an array and flatten them into a
    # single feature vector for the sequence.
    return [list(np.array(vectors).flatten()) for vectors in encoded.values()]

def draw_umap(data, label, out_path="umap.png"):
    """Embed ``data`` in 2-D with UMAP and save a colour-coded scatter plot.

    Args:
        data: Feature vectors, one row per sample, passed to
            ``umap.UMAP.fit_transform``.
        label: Per-sample group ids in the same order as ``data``. Each value
            is converted with ``int`` and used as an index into the six-entry
            colour list, so it must lie in 0..5.
        out_path: File the figure is written to. Defaults to ``"umap.png"``,
            the name this function previously hard-coded.

    The figure is left open after saving, so an inline notebook backend can
    still display it.
    """
    embs = umap.UMAP(n_neighbors=12,
                     min_dist=0.75,
                     n_components=2,
                     metric='correlation',  # 'euclidean' is the alternative tried
                     random_state=1234).fit_transform(data)

    colors = ['lightskyblue', 'royalblue', 'violet', 'limegreen', 'darkorange', 'red']
    legend_names = ['Real', 'Epoch 1', 'Epoch 100', 'Epoch 200', 'Epoch 400', 'Epoch 800']

    fig, ax = plt.subplots(figsize=(12, 8), dpi=100)
    ax.set_xlim([np.min(embs[:, 0]) - 1, np.max(embs[:, 0]) + 1])
    ax.set_ylim([np.min(embs[:, 1]) - 1, np.max(embs[:, 1]) + 1])

    # Draw all points with one scatter call: one artist instead of one per
    # sample, with the same colours and drawing order.
    point_colors = [colors[int(label[i])] for i in range(len(embs))]
    ax.scatter(embs[:, 0], embs[:, 1], c=point_colors, alpha=0.6, s=15)

    labelsize = 18
    plt.xticks(fontname='Arial', size=labelsize)
    plt.yticks(fontname='Arial', size=labelsize)

    # Empty line plots serve only as legend handles, one per group colour.
    handles = [
        ax.plot([], [], marker="o", ms=10, ls="", mec=None,
                color=color, label="{:s}".format(name))[0]
        for color, name in zip(colors, legend_names)
    ]
    for text in ax.legend(handles=handles, fontsize=18).get_texts():
        text.set_fontname('Arial')

    fig.savefig(out_path, dpi=300)

In [None]:
# Build the character -> feature-vector lookup from the raw (unscaled) table.
table_path = './data/AAF.txt'
table = get_conversion_table(table_path, norm=False)
seqs = read_fasta('./data/amp.fasta')

# Keep only the real sequences shorter than 30 characters, then encode them.
train_seqs = {id_: seq for id_, seq in seqs.items() if len(seq) < 30}
train_encoded = get_encoded_seqs(train_seqs, table)

# Label 0 marks the real sequences; the generated sets receive 1, 2, ... in
# the order their epochs are listed.
epoches = ['1', '100', '200', '400', '800']
seqlist = list(train_encoded)
labels = list(np.zeros(len(train_seqs)))
for group_id, epoch in enumerate(epoches, start=1):
    generated = read_fasta('./data/epoch_' + epoch + '_generated_seq.fasta')
    generated_encoded = get_encoded_seqs(generated, table)
    seqlist.extend(generated_encoded)
    labels.extend(group_id * np.ones(len(generated_encoded)))

draw_umap(seqlist, labels)

In [None]:
from sklearn.manifold import TSNE
def tsne_each(ex, label, name, perp):
    """Embed ``ex`` in 2-D with t-SNE and save a colour-coded scatter plot.

    Args:
        ex: Feature vectors, one row per sample; converted with ``np.array``
            before fitting.
        label: Per-sample group ids in the same order as ``ex``. Each value is
            converted with ``int`` and used as an index into the six-entry
            colour list, so it must lie in 0..5.
        name: Output path without extension; the plot is written to
            ``name + ".png"``.
        perp: Perplexity passed to ``TSNE``.
    """
    # The iteration count is left at scikit-learn's default of 1000, because
    # the keyword for it was renamed from `n_iter` to `max_iter` and passing
    # either name fails on some releases.
    tsne = TSNE(n_components=2, perplexity=perp, init='pca', random_state=1234)
    tsne_ex = tsne.fit_transform(np.array(ex))

    colors = ['lightskyblue', 'royalblue', 'violet', 'limegreen', 'darkorange', 'red']
    legend_names = ['Real', 'Epoch 1', 'Epoch 100', 'Epoch 200', 'Epoch 400', 'Epoch 800']

    plt.figure(figsize=(12, 8))
    plt.xlim(tsne_ex[:, 0].min() - 15, tsne_ex[:, 0].max() + 15)
    plt.ylim(tsne_ex[:, 1].min() - 15, tsne_ex[:, 1].max() + 15)

    # Draw all points with one scatter call: one artist instead of one per
    # sample, with the same colours and drawing order.
    point_colors = [colors[int(label[i])] for i in range(len(tsne_ex))]
    plt.scatter(tsne_ex[:, 0], tsne_ex[:, 1], c=point_colors, alpha=0.6, s=15)

    plt.xticks(fontname='Arial', size=18)
    plt.yticks(fontname='Arial', size=18)

    # Empty line plots serve only as legend handles, one per group colour.
    handles = [
        plt.plot([], [], marker="o", ms=10, ls="", mec=None,
                 color=color, label="{:s}".format(legend_name))[0]
        for color, legend_name in zip(colors, legend_names)
    ]
    for text in plt.legend(handles=handles, fontsize=18).get_texts():
        text.set_fontname('Arial')

    plt.savefig(name + ".png", dpi=300)

# t-SNE view of the same encoded sequences and labels, at perplexity 35.
tsne_each(ex=seqlist, label=labels, name='tsne-35-each-all', perp=35)