In [7]:
import sys

import os

import vamb

import numpy as np

In [10]:
# Directory name (under camisim_outputs) of the sample whose contigs FASTA is read later on.
EXAMPLE_FASTA_FILE = '2021.01.26_15.46.45_sample_0'

# Every data path in this notebook is resolved against the kernel's current working directory.
BASE_DIR = os.getcwd()

In [11]:
# Directory that holds the precomputed VAMB input arrays.
vamb_inputs_base = os.path.join(BASE_DIR,'example_input_data/new_simulations/camisim_outputs/vamb_inputs')

# Read the four .npz inputs in one pass; the unpacking order matches the file order below.
contignames, lengths, tnfs, rpkms = (
    vamb.vambtools.read_npz(os.path.join(vamb_inputs_base, npz_name))
    for npz_name in ('contignames.npz', 'lengths.npz', 'tnfs.npz', 'rpkms.npz')
)

In [12]:
# Build the VAE; nsamples is taken from the second axis of rpkms.
vae = vamb.encode.VAE(nsamples=rpkms.shape[1])
dataloader, mask = vamb.encode.make_dataloader(rpkms, tnfs)

# Create the model directory up front so open() below does not fail with
# FileNotFoundError on a fresh checkout; exist_ok keeps the cell re-runnable.
model_path = 'vamb_models/model.pt'
os.makedirs(os.path.dirname(model_path), exist_ok=True)

# Train for 3 epochs with the training log streamed into the cell output.
# The open file is handed to trainmodel as `modelfile` (presumably where the
# trained model is saved — confirm against the VAMB API).
with open(model_path, 'wb') as modelfile:
    vae.trainmodel(dataloader, nepochs=3, modelfile=modelfile, batchsteps=None, logfile=sys.stdout)

	Network properties:
	CUDA: False
	Alpha: 0.5
	Beta: 200
	Dropout: 0.2
	N hidden: 256, 256
	N latent: 32

	Training properties:
	N epochs: 3
	Starting batch size: 256
	Batchsteps: None
	Learning rate: 0.001
	N sequences: 1342
	N samples: 1

	Epoch: 1	Loss: 1.207205	CE: 1.1356501	SSE: 131.278476	KLD: 13.4779	Batchsize: 256
	Epoch: 2	Loss: 0.916718	CE: 0.7624369	SSE: 109.725491	KLD: 18.2490	Batchsize: 256
	Epoch: 3	Loss: 0.709793	CE: 0.6107335	SSE: 82.483774	KLD: 25.7261	Batchsize: 256


In [13]:
# Run the trained VAE's encoder over the dataloader.
latent = vae.encode(dataloader)

# Quick sanity check on the dimensions of the encoding.
latent_shape = latent.shape
print(latent_shape)

(1342, 32)


In [14]:
# Keep only the contig names whose mask entry is truthy
# (presumably so the labels line up with the rows of `latent` — confirm against make_dataloader).
filtered_labels = [name for name, keep in zip(contignames, mask) if keep]

# Cluster the latent representation and materialise the resulting pairs into a dict.
cluster_iterator = vamb.cluster.cluster(latent, labels=filtered_labels)
clusters = dict(cluster_iterator)

# Peek at the first cluster to show what the keys and values look like.
first_item = next(iter(clusters.items()))
medoid, contigs = first_item
first_contig = next(iter(contigs))
print('First key:', medoid, '(of type:', type(medoid), ')')
print('Type of values:', type(contigs))
print('First element of value:', first_contig, 'of type:', type(first_contig))

First key: S0C10113 (of type: <class 'numpy.str_'> )
Type of values: <class 'set'>
First element of value: S0C49355 of type: <class 'numpy.str_'>


In [15]:
def filterclusters(clusters, lengthof, minsize=200000):
    """Return only the bins whose total contig length reaches `minsize`.

    Parameters
    ----------
    clusters : dict
        Maps a bin identifier (medoid) to a collection of contig names.
    lengthof : dict
        Maps every contig name appearing in `clusters` to its length.
    minsize : int, optional
        Minimum summed length a bin must have to be kept. The default of
        200000 is the threshold that used to be hard-coded here.

    Returns
    -------
    dict
        A new dict with the surviving bins, in the iteration order of
        `clusters`. The contig collections are the same objects, not copies.

    Raises
    ------
    KeyError
        If a contig in `clusters` has no entry in `lengthof`.
    """
    return {
        medoid: contigs
        for medoid, contigs in clusters.items()
        if sum(lengthof[contig] for contig in contigs) >= minsize
    }
        
# Look-up table pairing each entry of contignames with the matching entry of lengths.
lengthof = {name: length for name, length in zip(contignames, lengths)}

# Split the clusters on the 'C' separator first, then drop the bins that filterclusters rejects.
split_clusters = vamb.vambtools.binsplit(clusters, 'C')
filtered_bins = filterclusters(split_clusters, lengthof)

print('Number of bins before splitting and filtering:', len(clusters))
print('Number of bins after splitting and filtering:', len(filtered_bins))

Number of bins before splitting and filtering: 25
Number of bins after splitting and filtering: 4


In [33]:
# Directory that receives everything this notebook writes (cluster table and bin FASTA files).
vamb_outputs_base = os.path.join(BASE_DIR, 'example_input_data/new_simulations/camisim_outputs/vamb_outputs')

# makedirs with exist_ok is idempotent on re-runs and, unlike os.mkdir,
# also creates any missing parent directories.
os.makedirs(vamb_outputs_base, exist_ok=True)
    

# This writes a .tsv file with the clusters and corresponding sequences
with open(os.path.join(vamb_outputs_base, 'clusters.tsv'), 'w') as file:
    vamb.vambtools.write_clusters(file, filtered_bins)

# Only keep contigs in any filtered bin in memory.
# Starting from an empty set keeps this working when no bin survived the
# filtering: set.union(*[]) raises TypeError, set().union(*[]) returns set().
keptcontigs = set().union(*filtered_bins.values())



# decompress fasta.gz if present
fasta_path = os.path.join(BASE_DIR, f"example_input_data/new_simulations/camisim_outputs/{EXAMPLE_FASTA_FILE}/contigs/anonymous_gsa.fasta.gz")
if os.path.exists(fasta_path):
    !gzip -dk $fasta_path


with open(os.path.join(BASE_DIR, f"example_input_data/new_simulations/camisim_outputs/{EXAMPLE_FASTA_FILE}/contigs/anonymous_gsa.fasta"), 'rb') as file:
    fastadict = vamb.vambtools.loadfasta(file, keep=keptcontigs)
    
bindir = os.path.join(vamb_outputs_base, 'bins')
vamb.vambtools.write_bins(bindir, filtered_bins, fastadict, maxbins=500)