In [2]:
import pandas as pd
from Bio import SeqIO
def parseFasta(data):
    d = {fasta.id : str(fasta.seq) for fasta in SeqIO.parse(data, "fasta")}
    pd.DataFrame([d])

    s = pd.Series(d, name='Sequence')
    s.index.name = 'ID'
    s.reset_index()
    return pd.DataFrame(s)

from sklearn.feature_extraction.text import TfidfVectorizer
def kmerXTable(s, a, b):
    tfid_vector = TfidfVectorizer(analyzer='char', ngram_range=(a,b))
    s_hat = tfid_vector.fit_transform(s.Sequence)
    kmerNames = tfid_vector.get_feature_names()
    kmers = s_hat.toarray()
    return pd.DataFrame(kmers,columns=kmerNames, index = s.index)

# not used
from sklearn.cluster import KMeans
def doKmeans(rState, nClusters, data):
    km = KMeans(random_state = rState, n_clusters = nClusters)
    km.fit(data)
    return km

# not used
from sklearn.decomposition import PCA
def doPCA(data):
    embedding = PCA()
    embedding.fit(data)
    return embedding

# not used
import matplotlib.pyplot as plt
def graphPCA(pca, data, labels):
    show = pd.DataFrame(pca.transform(data))
    ax = show.plot.scatter(x=0, y=1, style="o", c=labels, cmap = "viridis", s=2)
    plt.title('PCA')
    return ax

# not used
from sklearn.manifold import TSNE
def doTSNE(rState, nComponents, data):
    tSNEembedding = TSNE(n_components= nComponents, random_state = rState)
    tSNEembedding_low = tSNEembedding.fit_transform(data)
    return tSNEembedding_low

# not used
def graphTSNE(tSNEembedding_low, labels):
    tSNEshow = pd.DataFrame(tSNEembedding_low, labels)
    ax = tSNEshow.plot.scatter(x=0, y=1, style="o", c=labels, cmap = "viridis", s=2)
    plt.title('tSNE')
    return ax

# other imports 
import matplotlib.patches as mpatches
from sklearn.cluster import MeanShift
import warnings

In [3]:
path1 = "data/Sclerotinia_biocontrol_mycovirus_nucleotide.fasta"
path01 = "data/mycovirus_genbank_all_refseq_nucleotide_unique.fasta"

In [40]:
def kmeansPCA(fastaList):
    # read in fasta files
    virus1 = parseFasta(path1)
    virus01 = parseFasta(path01)
    virus01 = virus01.append(virus1)
    virus01 = virus01.drop_duplicates(keep="last")

    # make kmer tables
    kmer7Table1 = kmerXTable(virus1, 7,7)
    kmer7Table01 = kmerXTable(virus01, 7,7)

    # only use columns that confirmed virus killers have no zeros in for kmer length 7
    cols = kmer7Table1.loc[:, (kmer7Table1 == 0).any(axis=0) != True].columns

    #Kmeans
    km = KMeans(random_state = 42, n_clusters = 2)
    km.fit(kmer7Table01[cols])
    
    output = []
    for i in range(0, len(fastaList)):
        inputData = parseFasta(fastaList[i])
        inputData["Sequence"] = inputData["Sequence"].apply(lambda x: x.replace("-", ""))
        kmer7TableInput = kmerXTable(inputData, 7,7)
        
        # generate array of labels
        y_hat = km.predict(kmer7TableInput[cols])

        #PCA
        if len(inputData) > 1:
            embedding = PCA()
            embedding.fit(kmer7TableInput[cols])
            show = pd.DataFrame(embedding.transform(kmer7TableInput[cols]))
            # show kmeans clustering
            show["labels"] = y_hat
            ax = show[show["labels"]==1].plot.scatter(x=0, y=1, style="o", color="red", s=2)
            show[show["labels"]==0].plot.scatter(x=0, y=1, style="o", color="blue", s=2, ax=ax)
            red = mpatches.Patch(color='red', label='Fungus Killers')
            blue = mpatches.Patch(color='blue', label='Fungus Non-Killers')
            plt.legend(handles=[red, blue])
            plt.title('PCA Visualization for KMeans Clustering\n' + fastaList[i])
            plt.xlabel('First Principal Component')
            plt.ylabel('Second Principal Component')
            plt.savefig('nonNotebookFiles/kmeansPCA' + str(i) + '.png', bbox_inches='tight')
            plt.close()

        inputData["Labels"] = y_hat
        output.append(inputData)
    return output

In [41]:
x = kmeansPCA(["data/mycovirus_genbank_all_refseq_nucleotide_unique.fasta",  "data/Sclerotinia_biocontrol_mycovirus_nucleotide.fasta"])
for df in x:
    print(df.head())

                                                      Sequence  Labels
ID                                                                    
NC_000960.1  GGGGGTTAGAGAAATCTTGGGAGATTTCTATCGTCATAGACATATG...       1
NC_001278.1  GGGAAATTTGTGAGATTATCGCCCTAAAGGATAACTCTCATGCGGG...       0
NC_001492.1  GCCTATGGGTGGTCTACATAGGTGAGCATGCGTTGCTCGATATAGA...       1
NC_001633.1  ACAAAATAATTGAAGAAATTATTTTTGTTTTACATATTTAGATCTT...       1
NC_001641.1  GAATTTTTCGGTGAACCGGAATTATGTCGTCTCTGTTAAATTCATT...       0
                                                      Sequence  Labels
ID                                                                    
NC_007415.1  CTTTAGTAAATACCACCGCCTCTTAATAAGTCTCGAGGCACTCCCC...       1
NC_015939.1  GGGGTGATGGGAAACTTACGTTTCGCCAACGTTAAGGTACTGTGAT...       1
NC_022896.1  TTTTTGGGGATGGTACTCTCAGGTTTGATCTTTGTAGATCCTAACT...       1
NC_026510.1  TTGGCTCCTGGAGACCGTTAGGTCCCCAGAAGCGGTTCAATAGAAC...       1
NC_027138.1  GCAATAAAAAGCACAGCCGGAAGGCTTTCTTTTTATTGCCCAGTTT...       1


In [43]:
def kmeansTSNE(fastaList):
    # read in fasta files
    virus1 = parseFasta(path1)
    virus01 = parseFasta(path01)
    virus01 = virus01.append(virus1)
    virus01 = virus01.drop_duplicates(keep="last")

    # make kmer tables
    kmer7Table1 = kmerXTable(virus1, 7,7)
    kmer7Table01 = kmerXTable(virus01, 7,7)

    # only use columns that confirmed virus killers have no zeros in for kmer length 7
    cols = kmer7Table1.loc[:, (kmer7Table1 == 0).any(axis=0) != True].columns

    #Kmeans
    km = KMeans(random_state = 42, n_clusters = 2)
    km.fit(kmer7Table01[cols])
    
    output = []
    for i in range(0, len(fastaList)):
        inputData = parseFasta(fastaList[i])
        inputData["Sequence"] = inputData["Sequence"].apply(lambda x: x.replace("-", ""))
        kmer7TableInput = kmerXTable(inputData, 7,7)
        
        # generate array of labels
        y_hat = km.predict(kmer7TableInput[cols])

        #tSNE
        if len(inputData) > 1:
            tSNEembedding = TSNE(n_components= 2, random_state = 0)
            tSNEembedding_low = tSNEembedding.fit_transform(kmer7TableInput[cols])
            show = pd.DataFrame(tSNEembedding_low)
            # show kmeans clustering
            show["labels"] = y_hat
            ax = show[show["labels"]==1].plot.scatter(x=0, y=1, style="o", color="red", s=2)
            show[show["labels"]==0].plot.scatter(x=0, y=1, style="o", color="blue", s=2, ax=ax)
            red = mpatches.Patch(color='red', label='Fungus Killers')
            blue = mpatches.Patch(color='blue', label='Fungus Non-Killers')
            plt.legend(handles=[red, blue])
            plt.title('tSNE Visualization for KMeans Clustering\n' + fastaList[i])
            plt.xlabel('First Component')
            plt.ylabel('Second Component')
            plt.savefig('nonNotebookFiles/kmeansTSNE' + str(i) + '.png', bbox_inches='tight')
            plt.close()

        inputData["Labels"] = y_hat
        output.append(inputData)
    return output

In [44]:
x = kmeansTSNE(["data/mycovirus_genbank_all_refseq_nucleotide_unique.fasta",  "data/Sclerotinia_biocontrol_mycovirus_nucleotide.fasta"])
for df in x:
    print(df.head())

                                                      Sequence  Labels
ID                                                                    
NC_000960.1  GGGGGTTAGAGAAATCTTGGGAGATTTCTATCGTCATAGACATATG...       1
NC_001278.1  GGGAAATTTGTGAGATTATCGCCCTAAAGGATAACTCTCATGCGGG...       0
NC_001492.1  GCCTATGGGTGGTCTACATAGGTGAGCATGCGTTGCTCGATATAGA...       1
NC_001633.1  ACAAAATAATTGAAGAAATTATTTTTGTTTTACATATTTAGATCTT...       1
NC_001641.1  GAATTTTTCGGTGAACCGGAATTATGTCGTCTCTGTTAAATTCATT...       0
                                                      Sequence  Labels
ID                                                                    
NC_007415.1  CTTTAGTAAATACCACCGCCTCTTAATAAGTCTCGAGGCACTCCCC...       1
NC_015939.1  GGGGTGATGGGAAACTTACGTTTCGCCAACGTTAAGGTACTGTGAT...       1
NC_022896.1  TTTTTGGGGATGGTACTCTCAGGTTTGATCTTTGTAGATCCTAACT...       1
NC_026510.1  TTGGCTCCTGGAGACCGTTAGGTCCCCAGAAGCGGTTCAATAGAAC...       1
NC_027138.1  GCAATAAAAAGCACAGCCGGAAGGCTTTCTTTTTATTGCCCAGTTT...       1


In [4]:
def kmeansMeanshiftPCA(fastaList):
    # read in fasta files
    virus1 = parseFasta(path1)
    virus01 = parseFasta(path01)
    virus01 = virus01.append(virus1)
    virus01 = virus01.drop_duplicates(keep="last")

    # make kmer tables
    kmer7Table1 = kmerXTable(virus1, 7,7)
    kmer7Table01 = kmerXTable(virus01, 7,7)

    # only use columns that confirmed virus killers have no zeros in for kmer length 7
    cols = kmer7Table1.loc[:, (kmer7Table1 == 0).any(axis=0) != True].columns
    
    #meanshift
    ms = MeanShift()
    ms.fit(kmer7Table01[cols])
    cluster_centers = ms.cluster_centers_

    #Kmeans
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        km = KMeans(init = cluster_centers, n_clusters = len(cluster_centers))
        km.fit(kmer7Table01[cols])
    newLabelClusters = km.predict(kmer7Table01[cols])[343:]
    
    output = []
    for i in range(0, len(fastaList)):
        inputData = parseFasta(fastaList[i])
        inputData["Sequence"] = inputData["Sequence"].apply(lambda x: x.replace("-", ""))
        kmer7TableInput = kmerXTable(inputData, 7,7)
        
        # generate array of labels
        y_hat = km.predict(kmer7TableInput[cols])
        newLabels = []
        for label in y_hat:
            if label in newLabelClusters:
                newLabels.append(1)
            else:
                newLabels.append(0)  

        #PCA
        if len(inputData) > 1:
            embedding = PCA()
            embedding.fit(kmer7TableInput[cols])
            show = pd.DataFrame(embedding.transform(kmer7TableInput[cols]))
            # show kmeans clustering
            show["labels"] = newLabels
            ax = show[show["labels"]==1].plot.scatter(x=0, y=1, style="o", color="red", s=2)
            show[show["labels"]==0].plot.scatter(x=0, y=1, style="o", color="blue", s=2, ax=ax)
            red = mpatches.Patch(color='red', label='Fungus Killers')
            blue = mpatches.Patch(color='blue', label='Fungus Non-Killers')
            plt.legend(handles=[red, blue])
            plt.title('PCA Visualization for KMeans Clustering with Mean Shift\n' + fastaList[i])
            plt.xlabel('First Principal Component')
            plt.ylabel('Second Principal Component')
            plt.savefig('nonNotebookFiles/kmeansMeanshiftPCA' + str(i) + '.png', bbox_inches='tight')
            plt.close()

        inputData["Labels"] = newLabels
        print(newLabels)
        output.append(inputData)
    return output

In [5]:
x = kmeansMeanshiftPCA(["data/mycovirus_genbank_all_refseq_nucleotide_unique.fasta",  "data/Sclerotinia_biocontrol_mycovirus_nucleotide.fasta"])
for df in x:
    print(df.head())

[1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [10]:
def kmeansMeanshiftTSNE(fastaList):
    # read in fasta files
    virus1 = parseFasta(path1)
    virus01 = parseFasta(path01)
    virus01 = virus01.append(virus1)
    virus01 = virus01.drop_duplicates(keep="last")

    # make kmer tables
    kmer7Table1 = kmerXTable(virus1, 7,7)
    kmer7Table01 = kmerXTable(virus01, 7,7)

    # only use columns that confirmed virus killers have no zeros in for kmer length 7
    cols = kmer7Table1.loc[:, (kmer7Table1 == 0).any(axis=0) != True].columns
    
    #meanshift
    ms = MeanShift()
    ms.fit(kmer7Table01[cols])
    cluster_centers = ms.cluster_centers_

    #Kmeans
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        km = KMeans(init = cluster_centers, n_clusters = len(cluster_centers))
        km.fit(kmer7Table01[cols])
    newLabelClusters = km.predict(kmer7Table01[cols])[343:]
    
    output = []
    for i in range(0, len(fastaList)):
        inputData = parseFasta(fastaList[i])
        inputData["Sequence"] = inputData["Sequence"].apply(lambda x: x.replace("-", ""))
        kmer7TableInput = kmerXTable(inputData, 7,7)
        
        # generate array of labels
        y_hat = km.predict(kmer7TableInput[cols])
        newLabels = []
        for label in y_hat:
            if label in newLabelClusters:
                newLabels.append(1)
            else:
                newLabels.append(0)  

        #PCA
        if len(inputData) > 1:
            tSNEembedding = TSNE(n_components= 2, random_state = 0)
            tSNEembedding_low = tSNEembedding.fit_transform(kmer7TableInput[cols])
            show = pd.DataFrame(tSNEembedding_low)
            # show kmeans clustering
            show["labels"] = newLabels
            ax = show[show["labels"]==1].plot.scatter(x=0, y=1, style="o", color="red", s=2)
            show[show["labels"]==0].plot.scatter(x=0, y=1, style="o", color="blue", s=2, ax=ax)
            red = mpatches.Patch(color='red', label='Fungus Killers')
            blue = mpatches.Patch(color='blue', label='Fungus Non-Killers')
            plt.legend(handles=[red, blue])
            plt.title('tSNE Visualization for KMeans Clustering with Mean Shift\n' + fastaList[i])
            plt.xlabel('First Component')
            plt.ylabel('Second Component')
            plt.savefig('nonNotebookFiles/kmeansMeanshiftTSNE' + str(i) + '.png', bbox_inches='tight')
            plt.close()

        inputData["Labels"] = newLabels
        output.append(inputData)
    return output

In [11]:
x = kmeansMeanshiftTSNE(["data/mycovirus_genbank_all_refseq_nucleotide_unique.fasta",  "data/Sclerotinia_biocontrol_mycovirus_nucleotide.fasta"])
for df in x:
    print(df.head())

                                                      Sequence  Labels
ID                                                                    
NC_000960.1  GGGGGTTAGAGAAATCTTGGGAGATTTCTATCGTCATAGACATATG...       1
NC_001278.1  GGGAAATTTGTGAGATTATCGCCCTAAAGGATAACTCTCATGCGGG...       0
NC_001492.1  GCCTATGGGTGGTCTACATAGGTGAGCATGCGTTGCTCGATATAGA...       0
NC_001633.1  ACAAAATAATTGAAGAAATTATTTTTGTTTTACATATTTAGATCTT...       0
NC_001641.1  GAATTTTTCGGTGAACCGGAATTATGTCGTCTCTGTTAAATTCATT...       0
                                                      Sequence  Labels
ID                                                                    
NC_007415.1  CTTTAGTAAATACCACCGCCTCTTAATAAGTCTCGAGGCACTCCCC...       1
NC_015939.1  GGGGTGATGGGAAACTTACGTTTCGCCAACGTTAAGGTACTGTGAT...       0
NC_022896.1  TTTTTGGGGATGGTACTCTCAGGTTTGATCTTTGTAGATCCTAACT...       1
NC_026510.1  TTGGCTCCTGGAGACCGTTAGGTCCCCAGAAGCGGTTCAATAGAAC...       1
NC_027138.1  GCAATAAAAAGCACAGCCGGAAGGCTTTCTTTTTATTGCCCAGTTT...       1


In [67]:
x = kmeansPCA(["data/mycovirus_genbank_all_refseq_nucleotide_unique.fasta",  "data/Sclerotinia_biocontrol_mycovirus_nucleotide.fasta"])
for df in x:
    print(df.head())
    
x = kmeansTSNE(["data/mycovirus_genbank_all_refseq_nucleotide_unique.fasta",  "data/Sclerotinia_biocontrol_mycovirus_nucleotide.fasta"])
for df in x:
    print(df.head())
    
x = kmeansMeanshiftPCA(["data/mycovirus_genbank_all_refseq_nucleotide_unique.fasta",  "data/Sclerotinia_biocontrol_mycovirus_nucleotide.fasta"])
for df in x:
    print(df.head())
    
x = kmeansMeanshiftTSNE(["data/mycovirus_genbank_all_refseq_nucleotide_unique.fasta",  "data/Sclerotinia_biocontrol_mycovirus_nucleotide.fasta"])
for df in x:
    print(df.head())

                                                      Sequence  Labels
ID                                                                    
NC_000960.1  GGGGGTTAGAGAAATCTTGGGAGATTTCTATCGTCATAGACATATG...       1
NC_001278.1  GGGAAATTTGTGAGATTATCGCCCTAAAGGATAACTCTCATGCGGG...       0
NC_001492.1  GCCTATGGGTGGTCTACATAGGTGAGCATGCGTTGCTCGATATAGA...       1
NC_001633.1  ACAAAATAATTGAAGAAATTATTTTTGTTTTACATATTTAGATCTT...       1
NC_001641.1  GAATTTTTCGGTGAACCGGAATTATGTCGTCTCTGTTAAATTCATT...       0
                                                      Sequence  Labels
ID                                                                    
NC_007415.1  CTTTAGTAAATACCACCGCCTCTTAATAAGTCTCGAGGCACTCCCC...       1
NC_015939.1  GGGGTGATGGGAAACTTACGTTTCGCCAACGTTAAGGTACTGTGAT...       1
NC_022896.1  TTTTTGGGGATGGTACTCTCAGGTTTGATCTTTGTAGATCCTAACT...       1
NC_026510.1  TTGGCTCCTGGAGACCGTTAGGTCCCCAGAAGCGGTTCAATAGAAC...       1
NC_027138.1  GCAATAAAAAGCACAGCCGGAAGGCTTTCTTTTTATTGCCCAGTTT...       1
      