In [1]:
import sys

import os

import vamb

import numpy as np

In [None]:
# Base directory of the project (judging by the path). The default is the
# original hard-coded location; set the VAMB_PROJECT_DIR environment variable
# to point the notebook at a different checkout without editing this cell.
BASE_DIR = os.environ.get(
    'VAMB_PROJECT_DIR',
    '/home/pathinformatics/jupyter_projects/vamb/stanford_cs230_project',
)

In [2]:
# Directory holding the VAMB input arrays. This is a relative path, so it is
# resolved against the notebook's current working directory.
vamb_inputs_base = 'example_input_data/new_simulations/camisim_outputs/vamb_inputs'


def _read_vamb_input(filename):
    """Load one .npz file from `vamb_inputs_base` with vamb's npz reader."""
    npz_path = os.path.join(vamb_inputs_base, filename)
    return vamb.vambtools.read_npz(npz_path)


# Loaded in the same order as before: names, lengths, TNFs, RPKMs.
contignames = _read_vamb_input('contignames.npz')
lengths = _read_vamb_input('lengths.npz')
tnfs = _read_vamb_input('tnfs.npz')
rpkms = _read_vamb_input('rpkms.npz')

FileNotFoundError: [Errno 2] No such file or directory: 'example_input_data/new_simulations/camisim_outputs/vamb_inputs/contignames.npz'

In [11]:
# The VAE is sized from the second dimension of `rpkms`, which is passed in
# as the number of samples.
vae = vamb.encode.VAE(nsamples=rpkms.shape[1])

# `mask` is kept because a later cell uses it to filter `contignames`.
dataloader, mask = vamb.encode.make_dataloader(rpkms, tnfs)

# Create the checkpoint directory first: opening the model file in 'wb' mode
# raises FileNotFoundError if 'vamb_models/' does not exist yet (e.g. on a
# fresh checkout).
model_path = 'vamb_models/model.pt'
os.makedirs(os.path.dirname(model_path), exist_ok=True)

# Train for 100 epochs, writing the model to `model_path` and the training
# log to the cell output (stdout).
with open(model_path, 'wb') as modelfile:
    vae.trainmodel(dataloader, nepochs=100, modelfile=modelfile, batchsteps=None, logfile=sys.stdout)

	Network properties:
	CUDA: False
	Alpha: 0.5
	Beta: 200
	Dropout: 0.2
	N hidden: 256, 256
	N latent: 32

	Training properties:
	N epochs: 100
	Starting batch size: 256
	Batchsteps: None
	Learning rate: 0.001
	N sequences: 1342
	N samples: 1

	Epoch: 1	Loss: 1.163908	CE: 1.0341598	SSE: 132.820998	KLD: 13.2234	Batchsize: 256
	Epoch: 2	Loss: 0.907602	CE: 0.7336207	SSE: 110.827435	KLD: 17.8821	Batchsize: 256
	Epoch: 3	Loss: 0.702348	CE: 0.5756330	SSE: 84.597641	KLD: 24.7280	Batchsize: 256
	Epoch: 4	Loss: 0.527197	CE: 0.3765369	SSE: 68.760202	KLD: 32.9052	Batchsize: 256
	Epoch: 5	Loss: 0.512551	CE: 0.3832766	SSE: 64.824719	KLD: 39.8714	Batchsize: 256
	Epoch: 6	Loss: 0.458212	CE: 0.3228889	SSE: 59.671340	KLD: 45.4466	Batchsize: 256
	Epoch: 7	Loss: 0.413203	CE: 0.2733659	SSE: 55.329079	KLD: 50.7638	Batchsize: 256
	Epoch: 8	Loss: 0.369315	CE: 0.2200949	SSE: 51.609453	KLD: 55.9093	Batchsize: 256
	Epoch: 9	Loss: 0.349445	CE: 0.2010029	SSE: 49.322278	KLD: 60.8942	Batchsize: 256
	Epoch: 10	Loss: 

In [12]:
# Run the dataloader through the trained VAE's encoder and keep the result
# for the clustering step.
latent = vae.encode(dataloader)

# Show the dimensions of the encoding. Presumably one row per sequence and one
# column per latent unit (the training log reports 1342 sequences and 32
# latent units) -- confirm against the vamb documentation.
print(latent.shape)

(1342, 32)


In [13]:
# Keep only the contig names whose mask entry is truthy; presumably these are
# the contigs that were retained by make_dataloader -- confirm in vamb docs.
filtered_labels = [name for name, keep in zip(contignames, mask) if keep]

# Cluster the latent representation and materialise the iterator as a dict.
cluster_iterator = vamb.cluster.cluster(latent, labels=filtered_labels)
clusters = dict(cluster_iterator)

# Peek at a single entry to show what the keys and values look like.
medoid, contigs = next(iter(clusters.items()))
print('First key:', medoid, '(of type:', type(medoid), ')')
print('Type of values:', type(contigs))
first_contig = next(iter(contigs))
print('First element of value:', first_contig, 'of type:', type(first_contig))

First key: S0C18790 (of type: <class 'numpy.str_'> )
Type of values: <class 'set'>
First element of value: S0C31786 of type: <class 'numpy.str_'>


In [14]:
def filterclusters(clusters, lengthof, minsize=200000):
    """Keep only the bins whose summed contig length is at least `minsize`.

    Parameters
    ----------
    clusters : dict
        Maps a bin name (medoid) to an iterable of contig names.
    lengthof : dict
        Maps a contig name to its length.
    minsize : int, optional
        Minimum total length a bin must reach to be kept. Defaults to
        200000, the threshold that was previously hard-coded.

    Returns
    -------
    dict
        The entries of `clusters` that pass the size threshold, in their
        original order. Values are the same objects as in `clusters`.

    Raises
    ------
    KeyError
        If a contig in `clusters` has no entry in `lengthof`.
    """
    return {
        medoid: contigs
        for medoid, contigs in clusters.items()
        if sum(lengthof[contig] for contig in contigs) >= minsize
    }
        
# Look-up table from contig name to contig length, used to size each bin.
lengthof = {name: length for name, length in zip(contignames, lengths)}

# Split the clusters with 'C' as the separator, then drop the bins that are
# too small. (With names such as S0C18790 the split is presumably per sample
# prefix -- confirm against the vamb documentation.)
split_bins = vamb.vambtools.binsplit(clusters, 'C')
filtered_bins = filterclusters(split_bins, lengthof)

print('Number of bins before splitting and filtering:', len(clusters))
print('Number of bins after splitting and filtering:', len(filtered_bins))

Number of bins before splitting and filtering: 40
Number of bins after splitting and filtering: 5


In [17]:
vamb_outputs_base = 'example_input_data/new_simulations/camisim_outputs/vamb_outputs'

# Create the output directory up front so that opening clusters.tsv for
# writing cannot fail with FileNotFoundError on a fresh checkout.
os.makedirs(vamb_outputs_base, exist_ok=True)

# This writes a .tsv file with the clusters and corresponding sequences
with open(os.path.join(vamb_outputs_base, 'clusters.tsv'), 'w') as file:
    vamb.vambtools.write_clusters(file, filtered_bins)

# Only keep contigs in any filtered bin in memory.
# `set().union(...)` is used instead of `set.union(...)`: the unbound form
# raises TypeError when `filtered_bins` is empty, whereas this yields an
# empty set.
keptcontigs = set().union(*filtered_bins.values())

# Read the assembly FASTA in binary mode, loading only the kept contigs.
with open('example_input_data/new_simulations/camisim_outputs/2021.01.26_04.04.06_sample_0/contigs/anonymous_gsa.fasta', 'rb') as file:
    fastadict = vamb.vambtools.loadfasta(file, keep=keptcontigs)

# Write the filtered bins under <vamb_outputs_base>/bins.
bindir = os.path.join(vamb_outputs_base, 'bins')
vamb.vambtools.write_bins(bindir, filtered_bins, fastadict, maxbins=500)