In [1]:
import os

import h5py
import numpy as np
from pandas import read_csv
from umap import UMAP
from bio_embeddings.project import tsne_reduce
from bio_embeddings.visualize import render_3D_scatter_plotly, save_plotly_figure_to_html

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
# Load the ID mapping table; the first CSV column becomes the DataFrame index.
# Later cells use that index as HDF5 keys and read the 'original_id' column.
mapping_csv_path = 'mapping_file.csv'
mapping_file = read_csv(mapping_csv_path, index_col=0)

In [3]:
# Collect one embedding per row of the mapping table, in index order, so that
# position i of `embeddings` lines up with row i of `mapping_file`.
embeddings = []
with h5py.File('reduced_embeddings_file.h5', 'r') as f:
    for remapped_id in mapping_file.index:
        # h5py only accepts str/bytes object names. read_csv may have parsed
        # purely numeric ids as integers, so coerce the key before the lookup.
        # np.array(...) copies the dataset into memory before the file closes.
        embeddings.append(np.array(f[str(remapped_id)]))

In [4]:
# UMAP settings gathered in a single literal so they can be read (and tuned)
# at a glance. Three output components are requested because the projection
# is later stored as x / y / z columns.
umap_params = {
    'n_components': 3,
    'min_dist': 0.6,
    'random_state': 420,
    'n_neighbors': 15,
    'verbose': 1,
    'metric': 'cosine',
}

# Fit on the loaded embeddings and keep only the projected coordinates.
transformed_embeddings = UMAP(**umap_params).fit_transform(embeddings)

UMAP(a=None, angular_rp_forest=True, b=None,
     force_approximation_algorithm=False, init='spectral', learning_rate=1.0,
     local_connectivity=1.0, low_memory=False, metric='cosine',
     metric_kwds=None, min_dist=0.6, n_components=3, n_epochs=None,
     n_neighbors=15, negative_sample_rate=5, output_metric='euclidean',
     output_metric_kwds=None, random_state=420, repulsion_strength=1.0,
     set_op_mix_ratio=1.0, spread=1.0, target_metric='categorical',
     target_metric_kwds=None, target_n_neighbors=-1, target_weight=0.5,
     transform_queue_size=4.0, transform_seed=42, unique=False, verbose=1)
Construct fuzzy simplicial set
Thu Apr 30 17:59:10 2020 Finding Nearest Neighbors



[1mThe TBB threading layer requires TBB version 2019.5 or later i.e., TBB_INTERFACE_VERSION >= 11005. Found TBB_INTERFACE_VERSION = 11000. The TBB threading layer is disabled.[0m



Thu Apr 30 17:59:13 2020 Finished Nearest Neighbor Search
Thu Apr 30 17:59:15 2020 Construct embedding
	completed  0  /  500 epochs
	completed  50  /  500 epochs
	completed  100  /  500 epochs
	completed  150  /  500 epochs
	completed  200  /  500 epochs
	completed  250  /  500 epochs
	completed  300  /  500 epochs
	completed  350  /  500 epochs
	completed  400  /  500 epochs
	completed  450  /  500 epochs
Thu Apr 30 17:59:18 2020 Finished embedding


In [5]:
# Store each projected dimension on the mapping table as its own column:
# column 0 -> 'x', column 1 -> 'y', column 2 -> 'z'.
for axis, column_name in enumerate(('x', 'y', 'z')):
    mapping_file[column_name] = transformed_embeddings[:, axis]

In [6]:
# Persist the mapping table, which at this point also carries the x / y / z
# projection columns, to CSV in the working directory.
projection_csv_path = 'projected_embeddings_file_UMAP.csv'
mapping_file.to_csv(projection_csv_path)

In [8]:
# Render one 3D scatter plot (saved as HTML) per annotation file, with points
# positioned by the UMAP coordinates stored on `mapping_file`.
annotations_files_folder = 'annotations/'
figures_files_folder = 'figures/'
# The original variable name was misspelled; the alias is kept in case other
# cells (not visible here) still refer to it.
figures_files_fodler = figures_files_folder

annotation_files = ['disprot_2019_09_floats.csv',
                    'disprot_2019_09_extreme_ends_0.2vs0.8.csv',
                    'disprot_2019_09_extreme_ends_0.3vs0.7.csv',
                    'disprot_2019_09_extreme_ends_0.5vs0.5.csv',
                    'disprot_2019_09_3classes_0.2_0.8.csv']

# Make sure the output directory exists before any figure is written.
# NOTE(review): presumably the HTML writer does not create missing folders - confirm.
os.makedirs(figures_files_folder, exist_ok=True)

# Re-indexing the mapping table by 'original_id' does not depend on the
# annotation file, so do it once instead of on every loop iteration.
coordinates_by_original_id = mapping_file.set_index('original_id')

for annotation_file_path in annotation_files:
    annotation_file = read_csv(os.path.join(annotations_files_folder, annotation_file_path), index_col=0)

    # With fewer than three distinct labels, turn the labels into strings.
    # NOTE(review): presumably so the plot treats them as discrete classes
    # rather than a numeric scale - confirm against render_3D_scatter_plotly.
    if annotation_file['label'].nunique() < 3:
        annotation_file['label'] = annotation_file['label'].apply(str)

    # Default (left) join on the index: every annotated entry is kept, and
    # entries without a matching 'original_id' get missing coordinates.
    merged_annotation_file = annotation_file.join(coordinates_by_original_id)
    figure = render_3D_scatter_plotly(merged_annotation_file)
    # The output name keeps the original scheme: "<annotation file name>.html".
    save_plotly_figure_to_html(figure, os.path.join(figures_files_folder, annotation_file_path + ".html"))