In [35]:
import sys

import os

import vamb

import numpy as np

In [10]:
BASE_DIR = os.getcwd()

EXAMPLE_FASTA_FILE = '2021.01.26_15.46.45_sample_0'

In [11]:
vamb_inputs_base = os.path.join(BASE_DIR,'example_input_data/new_simulations/camisim_outputs/vamb_inputs')

contignames = vamb.vambtools.read_npz(os.path.join(vamb_inputs_base, 'contignames.npz'))

lengths = vamb.vambtools.read_npz(os.path.join(vamb_inputs_base, 'lengths.npz'))

tnfs = vamb.vambtools.read_npz(os.path.join(vamb_inputs_base, 'tnfs.npz'))
    
rpkms = vamb.vambtools.read_npz(os.path.join(vamb_inputs_base, 'rpkms.npz'))

In [38]:
vae = vamb.encode.VAE(nsamples=rpkms.shape[1])
dataloader, mask = vamb.encode.make_dataloader(rpkms, tnfs)

with open('vamb_models/model.pt', 'wb') as modelfile:
    vae.trainmodel(dataloader, nepochs=300, modelfile=modelfile, batchsteps=None, logfile=sys.stdout)

	Network properties:
	CUDA: False
	Alpha: 0.5
	Beta: 200
	Dropout: 0.2
	N hidden: 256, 256
	N latent: 32

	Training properties:
	N epochs: 300
	Starting batch size: 256
	Batchsteps: None
	Learning rate: 0.001
	N sequences: 1342
	N samples: 1

	Epoch: 1	Loss: 1.205212	CE: 1.1396912	SSE: 130.438051	KLD: 13.9037	Batchsize: 256
	Epoch: 2	Loss: 0.919071	CE: 0.8160212	SSE: 104.666295	KLD: 19.0206	Batchsize: 256
	Epoch: 3	Loss: 0.728025	CE: 0.6677800	SSE: 80.329303	KLD: 26.7993	Batchsize: 256
	Epoch: 4	Loss: 0.616844	CE: 0.5535992	SSE: 68.898555	KLD: 35.7468	Batchsize: 256
	Epoch: 5	Loss: 0.542628	CE: 0.4548426	SSE: 63.555280	KLD: 42.7888	Batchsize: 256
	Epoch: 6	Loss: 0.471755	CE: 0.3570463	SSE: 58.855779	KLD: 48.1563	Batchsize: 256
	Epoch: 7	Loss: 0.443954	CE: 0.3295514	SSE: 55.794067	KLD: 53.3332	Batchsize: 256
	Epoch: 8	Loss: 0.392823	CE: 0.2556900	SSE: 52.729322	KLD: 57.6648	Batchsize: 256
	Epoch: 9	Loss: 0.377114	CE: 0.2429441	SSE: 50.687283	KLD: 61.3559	Batchsize: 256
	Epoch: 10	Loss: 

In [39]:
latent = vae.encode(dataloader)

print(latent.shape)

latent_output_path = os.path.join(BASE_DIR, 'example_input_data/new_simulations/camisim_outputs/vamb_outputs/latent_space.npy')
with open(latent_output_path, 'wb') as outfile:
    np.save(outfile, latent)

(1342, 32)


In [40]:
filtered_labels = [n for (n,m) in zip(contignames, mask) if m]
cluster_iterator = vamb.cluster.cluster(latent, labels=filtered_labels)
clusters = dict(cluster_iterator)

medoid, contigs = next(iter(clusters.items()))
print('First key:', medoid, '(of type:', type(medoid), ')')
print('Type of values:', type(contigs))
print('First element of value:', next(iter(contigs)), 'of type:', type(next(iter(contigs))))

First key: S0C34345 (of type: <class 'numpy.str_'> )
Type of values: <class 'set'>
First element of value: S0C34345 of type: <class 'numpy.str_'>


In [41]:
def filterclusters(clusters, lengthof):
    filtered_bins = dict()
    for medoid, contigs in clusters.items():
        binsize = sum(lengthof[contig] for contig in contigs)
    
        if binsize >= 200000:
            filtered_bins[medoid] = contigs
    
    return filtered_bins
        
lengthof = dict(zip(contignames, lengths))
filtered_bins = filterclusters(vamb.vambtools.binsplit(clusters, 'C'), lengthof)
print('Number of bins before splitting and filtering:', len(clusters))
print('Number of bins after splitting and filtering:', len(filtered_bins))

Number of bins before splitting and filtering: 638
Number of bins after splitting and filtering: 2


In [43]:
vamb_outputs_base = os.path.join(BASE_DIR, 'example_input_data/new_simulations/camisim_outputs/vamb_outputs')

if not os.path.exists(vamb_outputs_base):
    os.mkdir(vamb_outputs_base)
    

# This writes a .tsv file with the clusters and corresponding sequences
with open(os.path.join(vamb_outputs_base, 'clusters.tsv'), 'w') as file:
    vamb.vambtools.write_clusters(file, filtered_bins)

# Only keep contigs in any filtered bin in memory
keptcontigs = set.union(*filtered_bins.values())



# decompress fasta.gz if present
fasta_path = os.path.join(BASE_DIR, f"example_input_data/new_simulations/camisim_outputs/{EXAMPLE_FASTA_FILE}/contigs/anonymous_gsa.fasta.gz")
if os.path.exists(fasta_path) and not os.path.exists(fasta_path.replace('.fasta.gz','.fasta')):
    !gzip -dk $fasta_path


with open(os.path.join(BASE_DIR, f"example_input_data/new_simulations/camisim_outputs/{EXAMPLE_FASTA_FILE}/contigs/anonymous_gsa.fasta"), 'rb') as file:
    fastadict = vamb.vambtools.loadfasta(file, keep=keptcontigs)
    
bindir = os.path.join(vamb_outputs_base, 'bins')
vamb.vambtools.write_bins(bindir, filtered_bins, fastadict, maxbins=500)

gzip: /home/pathinformatics/jupyter_projects/vamb/stanford_cs230_project/example_input_data/new_simulations/camisim_outputs/2021.01.26_15.46.45_sample_0/contigs/anonymous_gsa.fasta already exists; do you wish to overwrite (y or n)? ^C
