In [79]:
import glob
import gzip
import os
import shutil
import sys

import numpy as np
import pandas as pd
import vamb

In [10]:
# Root against which the data paths in this notebook are resolved; it is the
# working directory at the moment this cell is executed.
BASE_DIR = os.getcwd()

# Name of the sample sub-directory (under camisim_outputs/) whose contigs and
# mapping table are read by the later cells.
EXAMPLE_FASTA_FILE = "2021.01.26_15.46.45_sample_0"

In [11]:
# Directory holding the pre-computed VAMB input arrays.
vamb_inputs_base = os.path.join(BASE_DIR,'example_input_data/new_simulations/camisim_outputs/vamb_inputs')

# Load the four input arrays; each one lives in `<name>.npz` inside
# `vamb_inputs_base`. The generator is consumed left to right, so the files are
# read in the order the names are listed.
contignames, lengths, tnfs, rpkms = (
    vamb.vambtools.read_npz(os.path.join(vamb_inputs_base, f'{array_name}.npz'))
    for array_name in ('contignames', 'lengths', 'tnfs', 'rpkms')
)

In [56]:
# Build the VAE with one input per sample column of `rpkms`.
# NOTE(review): the output recorded below this cell reports "Beta: 500", which
# does not match beta=1 here -- the output appears to come from an earlier
# version of the cell. Re-run to refresh it, or confirm the intended beta.
vae = vamb.encode.VAE(nsamples=rpkms.shape[1], beta=1)
dataloader, mask = vamb.encode.make_dataloader(rpkms, tnfs)

# Anchor the model directory to BASE_DIR like every other path in the notebook
# and create it up front: open(..., 'wb') does not create missing directories,
# so a fresh checkout would otherwise fail here.
model_dir = os.path.join(BASE_DIR, 'vamb_models')
os.makedirs(model_dir, exist_ok=True)

# Train and let VAMB write the trained model to `modelfile`; progress is logged
# to the cell output.
with open(os.path.join(model_dir, 'model.pt'), 'wb') as modelfile:
    vae.trainmodel(dataloader, nepochs=300, modelfile=modelfile, batchsteps=None, logfile=sys.stdout)

	Network properties:
	CUDA: False
	Alpha: 0.5
	Beta: 500
	Dropout: 0.2
	N hidden: 256, 256
	N latent: 32

	Training properties:
	N epochs: 300
	Starting batch size: 256
	Batchsteps: None
	Learning rate: 0.001
	N sequences: 1342
	N samples: 1

	Epoch: 1	Loss: 1.212272	CE: 1.1258856	SSE: 133.586969	KLD: 13.5737	Batchsize: 256
	Epoch: 2	Loss: 0.941549	CE: 0.7512895	SSE: 116.340164	KLD: 18.3391	Batchsize: 256
	Epoch: 3	Loss: 0.765455	CE: 0.6306508	SSE: 92.397978	KLD: 25.5302	Batchsize: 256
	Epoch: 4	Loss: 0.625864	CE: 0.5666762	SSE: 70.116847	KLD: 34.4478	Batchsize: 256
	Epoch: 5	Loss: 0.534236	CE: 0.4486325	SSE: 63.306853	KLD: 41.6710	Batchsize: 256
	Epoch: 6	Loss: 0.481981	CE: 0.3791310	SSE: 59.616489	KLD: 48.2357	Batchsize: 256
	Epoch: 7	Loss: 0.443746	CE: 0.3444976	SSE: 55.239960	KLD: 53.4744	Batchsize: 256
	Epoch: 8	Loss: 0.398267	CE: 0.2719633	SSE: 53.277222	KLD: 58.5346	Batchsize: 256
	Epoch: 9	Loss: 0.360826	CE: 0.2194593	SSE: 50.903364	KLD: 63.8770	Batchsize: 256
	Epoch: 10	Loss: 

In [57]:
# Encode every sequence in the dataloader into the VAE's latent space.
latent = vae.encode(dataloader)

print(latent.shape)

latent_output_path = os.path.join(BASE_DIR, 'example_input_data/new_simulations/camisim_outputs/vamb_outputs/latent_space.npy')

# The vamb_outputs directory is only created by a later cell, so make sure it
# exists here; otherwise a top-to-bottom run fails with FileNotFoundError.
os.makedirs(os.path.dirname(latent_output_path), exist_ok=True)

# The path already ends in '.npy', so np.save writes exactly this file.
np.save(latent_output_path, latent)

(1342, 32)


In [58]:
# Keep only the contig names whose entry in `mask` is truthy, then cluster the
# latent representation using those names as labels.
filtered_labels = [name for name, keep in zip(contignames, mask) if keep]
cluster_iterator = vamb.cluster.cluster(latent, labels=filtered_labels)
clusters = dict(cluster_iterator)

# Peek at the first cluster to show what the mapping looks like.
medoid = next(iter(clusters))
contigs = clusters[medoid]
first_member = next(iter(contigs))
print('First key:', medoid, '(of type:', type(medoid), ')')
print('Type of values:', type(contigs))
print('First element of value:', first_member, 'of type:', type(first_member))

First key: S0C34345 (of type: <class 'numpy.str_'> )
Type of values: <class 'set'>
First element of value: S0C34345 of type: <class 'numpy.str_'>


In [64]:
def filterclusters(clusters, lengthof, minsize=50000):
    """Return only the bins whose total contig length reaches `minsize`.

    Parameters
    ----------
    clusters : dict
        Maps a bin identifier (medoid) to an iterable of contig names.
    lengthof : dict
        Maps each contig name to its length.
    minsize : int, optional
        Minimum summed contig length a bin needs in order to be kept.
        Defaults to 50000, the threshold that was previously hard-coded.

    Returns
    -------
    dict
        A new dict holding the (medoid, contigs) pairs that pass the filter;
        the input mapping is not modified.

    Raises
    ------
    KeyError
        If a contig in `clusters` has no entry in `lengthof`.
    """
    return {
        medoid: contigs
        for medoid, contigs in clusters.items()
        if sum(lengthof[contig] for contig in contigs) >= minsize
    }
        
# Look-up table from contig name to contig length, used to size each bin.
lengthof = {name: length for name, length in zip(contignames, lengths)}

# Split the clusters on the 'C' separator (contig names look like 'S0C34345',
# so this presumably separates bins per sample prefix -- confirm against the
# VAMB docs), then drop the bins that filterclusters rejects.
split_bins = vamb.vambtools.binsplit(clusters, 'C')
filtered_bins = filterclusters(split_bins, lengthof)

n_before = len(clusters)
n_after = len(filtered_bins)
print('Number of bins before splitting and filtering:', n_before)
print('Number of bins after splitting and filtering:', n_after)

Number of bins before splitting and filtering: 532
Number of bins after splitting and filtering: 6


In [99]:
# Read the gsa_mapping.tsv table that sits next to the sample's contigs.
contig_mapping_table = pd.read_csv(os.path.join(BASE_DIR, f"example_input_data/new_simulations/camisim_outputs/{EXAMPLE_FASTA_FILE}/contigs/gsa_mapping.tsv"), sep='\t')

contig_mapping_output_path = os.path.join(BASE_DIR, 'example_input_data/new_simulations/camisim_outputs/vamb_outputs/encoding_mapping.tsv')

# to_csv does not create missing directories, and the vamb_outputs directory is
# otherwise only created by the final cell of the notebook.
os.makedirs(os.path.dirname(contig_mapping_output_path), exist_ok=True)

# Keep the rows whose contig id is one of `contignames`, index by contig id and
# reorder the rows to follow `contignames`; names missing from the table come
# out as rows of empty values. The former reset_index().drop('index') pair was
# a no-op because set_index replaces the index anyway.
(
    contig_mapping_table
    .loc[contig_mapping_table['#anonymous_contig_id'].isin(contignames)]
    .set_index('#anonymous_contig_id')
    .reindex(contignames)
    .to_csv(contig_mapping_output_path, sep='\t')
)

In [65]:
vamb_outputs_base = os.path.join(BASE_DIR, 'example_input_data/new_simulations/camisim_outputs/vamb_outputs')

# makedirs(exist_ok=True) also creates missing parent directories and is safe
# to re-run, unlike an exists()-guarded os.mkdir.
os.makedirs(vamb_outputs_base, exist_ok=True)

# This writes a .tsv file with the clusters and corresponding sequences
with open(os.path.join(vamb_outputs_base, 'clusters.tsv'), 'w') as file:
    vamb.vambtools.write_clusters(file, filtered_bins)

# Only keep contigs in any filtered bin in memory.
# set().union(...) yields an empty set when no bin survived the filtering,
# whereas set.union(*[]) raises TypeError.
keptcontigs = set().union(*filtered_bins.values())

# Decompress fasta.gz if present and not already decompressed. This is done in
# pure Python instead of `!gzip -dk $fasta_path` so it works for paths that
# contain spaces and on systems without a gzip executable; as with `-k`, the
# compressed file is left in place.
contigs_dir = os.path.join(BASE_DIR, f"example_input_data/new_simulations/camisim_outputs/{EXAMPLE_FASTA_FILE}/contigs")
fasta_path = os.path.join(contigs_dir, 'anonymous_gsa.fasta.gz')
fasta_plain_path = os.path.join(contigs_dir, 'anonymous_gsa.fasta')
if os.path.exists(fasta_path) and not os.path.exists(fasta_plain_path):
    with gzip.open(fasta_path, 'rb') as compressed, open(fasta_plain_path, 'wb') as plain:
        shutil.copyfileobj(compressed, plain)

# Load only the sequences that belong to a kept bin.
with open(fasta_plain_path, 'rb') as file:
    fastadict = vamb.vambtools.loadfasta(file, keep=keptcontigs)

# Remove the files left over from a previous run before writing the new bins.
# Sub-directories are skipped because os.remove cannot delete them.
bindir = os.path.join(vamb_outputs_base, 'bins')
files = glob.glob(os.path.join(bindir,'*'))
for f in files:
    if os.path.isfile(f):
        os.remove(f)

vamb.vambtools.write_bins(bindir, filtered_bins, fastadict, maxbins=500)