# Convert Data

Need to convert the test data to be vitessce complient
Looks like I'll need about 5 or so files

## Files for running in python notebook
1. The CODEX images
    - in python, can ome-zarr or ome-tiff. Need conversion for either,
        - ome-zarr: https://github.com/vitessce/vitessce-python/blob/main/demos/codeluppi-2018/src/convert_to_ome_zarr.py
        - ome.tiff: https://pypi.org/project/tifffile/
    - as of now, multi-image only works for ome.tiff
2. Segmentation masks
    - this can be ome-tiff
    - Try as JSON? This can remember cell associations
3. Expression values
4. Metadata/annotations
5. Dimension reductions

## Files for React app
Based on https://github.com/vitessce/vitessce-python/tree/main/demos/codeluppi-2018
1. Segmentations
    - JSON file that just has polygon coordinates. dump from dictionary that has cell ID keys
2. Cell meta
    - csv
    - reductions, cell centroids, cell IDs as index
3. Expression
    - simple csv. Cell ID as row, marker as columns, fill with X
4. Images
    - ome.zarr

NOTES:

- Not sure if the image transpose is necessary...


In [1]:
import tifffile as tff
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import anndata as ad
import pickle as pk
import scanpy as sc
from vitessce.data_utils import (
    multiplex_img_to_ome_zarr,
    optimize_adata,
    VAR_CHUNK_SIZE,
)
from vitessce import (
    VitessceConfig,
    Component as cm,
    CoordinationType as ct,
    AnnDataWrapper,
    OmeTiffWrapper,
    OmeZarrWrapper,
    MultiImageWrapper,
)


In [8]:
# channel_names = pd.read_table("/mnt/isilon/cscb/codex/pillaiv/SCTC-VP-15/code/forQuPath/channelNames.txt", header=None)
channel_names = channel_names.iloc[:,0].to_list()

In [9]:
img = tff.TiffFile("../data/test/codex_small.tiff")

In [10]:
img_dat = img.series[0].asarray()
img_dat = img_dat.astype(np.dtype("<u2"))
img_dat = np.transpose(img_dat, axes=(0, 2, 1))

In [27]:
masks = tff.TiffFile("../data/test/bitmask_small.tiff")
masks_dat = masks.series[0].asarray()
masks_dat = masks_dat.astype(np.dtype("<u2"))
masks_dat = np.transpose(masks_dat, axes=(1, 0))

In [31]:
masks_dat.shape

(2000, 2000)

### Making ome-tiff

In [36]:
# CODEX data
subresolutions = 1
pixelsize = 0.29  # micrometer, just using what was in example, probably wrong
with tff.TiffWriter('../data/test/converted/codex.ome.tif', ome=True) as tif:
    metadata={
        'axes': 'CYX',
        'Channel': {'Name': channel_names},
    }
    # options = dict(
    #     photometric='minisblack',
    #     tile=(128, 128),
    #     compression='jpeg',
    #     resolutionunit='CENTIMETER'
    # )
    tif.write(
        img_dat,
        # subifds=subresolutions,
        metadata=metadata,
    )

In [39]:
# Mask data
subresolutions = 1
pixelsize = 0.29  # micrometer, just using what was in example, probably wrong
with tff.TiffWriter('../data/test/converted/masks.ome.tif', ome=True) as tif:
    metadata={
        'axes': 'YX',
    }
    options = dict(
        photometric='minisblack',
        # tile=(128, 128),
        # compression='jpeg',
        # resolutionunit='CENTIMETER'
    )
    tif.write(
        masks_dat,
        # subifds=subresolutions,
        metadata=metadata,
        **options
    )

In [None]:
# Mask csv
# see "Make_small_data" notebook

### Making ome-zarr
code from https://github.com/vitessce/vitessce-python/blob/main/demos/codeluppi-2018/src/convert_to_ome_zarr.py

(44, 2000, 2000)

In [15]:
# for channel_i, channel_name in enumerate(channel_names):
#     img_arr[channel_i, :, :] = img_dat[channel_i]

multiplex_img_to_ome_zarr(
    img_dat,
    channel_names,
    "../data/test/converted/codex_img.ome.zarr",
    img_name="Image",
    chunks=(1, 512, 512),
    axes="cyx",
)

### Preparing the adata

In [12]:
adata_small = ad.read_h5ad("../data/test/adata_small.h5ad")
adata_small

AnnData object with n_obs × n_vars = 10556 × 44
    obs: 'Unnamed: 0', 'orig.ident', 'nCount_originalexp', 'nFeature_originalexp', 'Reg', 'Tile.Row', 'Tile.Col', 'Tile.Z', 'In.Tile.Y', 'In.Tile.X', 'Absolute.Y', 'Absolute.X', 'Cell.Size', 'n_genes_by_counts', 'total_counts', 'nCount_raw', 'nFeature_raw', 'cell_type_pred_knn', 'cell_type_pred_knn_prob', 'motif', 'index_info', 'new_pheno'
    var: 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'mean', 'std'
    uns: 'spatial_distance', 'spatial_interaction', 'spatial_lda', 'spatial_lda_probability'

In [13]:
adata_small.raw = adata_small
sc.pp.scale(adata_small, max_value=10)
sc.tl.pca(adata_small, svd_solver='arpack')
sc.pp.neighbors(adata_small, n_neighbors=10, n_pcs=15)
sc.tl.umap(adata_small)

In [20]:
adata_small

AnnData object with n_obs × n_vars = 10556 × 44
    obs: 'Unnamed: 0', 'orig.ident', 'nCount_originalexp', 'nFeature_originalexp', 'Reg', 'Tile.Row', 'Tile.Col', 'Tile.Z', 'In.Tile.Y', 'In.Tile.X', 'Absolute.Y', 'Absolute.X', 'Cell.Size', 'n_genes_by_counts', 'total_counts', 'nCount_raw', 'nFeature_raw', 'cell_type_pred_knn', 'cell_type_pred_knn_prob', 'motif', 'index_info', 'new_pheno'
    var: 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'mean', 'std'
    uns: 'spatial_distance', 'spatial_interaction', 'spatial_lda', 'spatial_lda_probability', 'pca', 'neighbors', 'umap'
    obsm: 'X_pca', 'X_umap'
    varm: 'PCs'
    obsp: 'distances', 'connectivities'

In [16]:
zarr_filepath = "../data/test/converted/adata_small.zarr"
if not os.path.isdir(zarr_filepath):
    adata = optimize_adata(
        adata_small,
        obs_cols=["new_pheno"],
        obsm_keys=["X_umap", "X_pca"],
        optimize_X=True,
    )
    adata.write_zarr(zarr_filepath, chunks=[adata.shape[0], VAR_CHUNK_SIZE])

In [28]:
# making a csv output
cells_df = adata_small.obs[["Unnamed: 0", "Absolute.Y", "Absolute.X", "new_pheno", "motif"]]
cells_df["Unnamed: 0"] = cells_df["Unnamed: 0"].astype(int)
cells_df.set_index("Unnamed: 0", inplace=True)
cells_df.index = cells_df.index.rename("cell_id")
cells_df.tail()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cells_df["Unnamed: 0"] = cells_df["Unnamed: 0"].astype(int)


Unnamed: 0_level_0,Absolute.Y,Absolute.X,new_pheno,motif
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
41524,1833.046512,1941.697674,B_Cell,Motif_1
41525,1712.140741,1583.82963,B_Cell,Motif_1
41526,1859.259542,2002.175573,B_Cell,Motif_1
41527,1631.783784,1839.081081,B_Cell,Motif_1
41530,1581.342105,1512.394737,B_Cell,Motif_1


In [29]:
cells_df['UMAP_1'] = adata_small.obsm['X_umap'][:,0]
cells_df['UMAP_2'] = adata_small.obsm['X_umap'][:,1]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cells_df['UMAP_1'] = adata_small.obsm['X_umap'][:,0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cells_df['UMAP_2'] = adata_small.obsm['X_umap'][:,1]


In [31]:
matrix_df = pd.DataFrame(
    index=cells_df.index.values.tolist(),
    columns=adata_small.var.index.tolist(),
    data=adata_small.raw.X
)
matrix_df.index = matrix_df.index.rename("cell_id")

In [32]:
cells_df.to_csv("../data/test/converted/test_cells.csv", index=True)
matrix_df.to_csv("../data/test/converted/test_matrix.csv", index=True)

## Making the widget

In [22]:
# Option with ome-zarr
vc = VitessceConfig(schema_version="1.0.15", name='CODEX', description='Testing small slice of CODEX image')
dataset = vc.add_dataset(name='R1reg1').add_object(
    OmeZarrWrapper(img_path="/mnt/isilon/cscb/codex/pillaiv/SCTC-VP-15/vitessce-gh-pages/data/test/converted/codex_img.ome.zarr")
)
spatial = vc.add_view(cm.SPATIAL, dataset=dataset)
status = vc.add_view(cm.STATUS, dataset=dataset)
lc = vc.add_view(cm.LAYER_CONTROLLER, dataset=dataset).set_props(disableChannelsIfRgbDetected=True)
vc.layout(spatial | lc / status);

In [18]:
# Option with ome.tiff
vc = VitessceConfig(schema_version="1.0.15", name='CODEX', description='Testing small slice of CODEX image')
dataset = vc.add_dataset(name='R1reg1').add_object(
    MultiImageWrapper(
        image_wrappers=[
            OmeTiffWrapper(img_path='/mnt/isilon/cscb/codex/pillaiv/SCTC-VP-15/vitessce-gh-pages/data/test/converted/codex.ome.tif', name='Image'),
            OmeTiffWrapper(img_path='/mnt/isilon/cscb/codex/pillaiv/SCTC-VP-15/vitessce-gh-pages/data/test/converted/masks.ome.tif', name='Mask', is_bitmask=True),
        ]
 )
)
# making a UMAP
cell_dataset = vc.add_dataset(name='R1reg1_obs').add_object(AnnDataWrapper(
    adata_path=zarr_filepath,
    obs_set_paths=["obs/new_pheno"],
    obs_set_names=["phenotype"],
    obs_embedding_paths=["obsm/X_umap", "obsm/X_pca"],
    obs_embedding_names=["UMAP", "PCA"],
    obs_feature_matrix_path="X"
))
spatial = vc.add_view(cm.SPATIAL, dataset=dataset)
# status = vc.add_view(cm.STATUS, dataset=dataset)
lc = vc.add_view(cm.LAYER_CONTROLLER, dataset=dataset)
umap = vc.add_view(cm.SCATTERPLOT, dataset=cell_dataset, mapping="UMAP")
cell_sets = vc.add_view(cm.OBS_SETS, dataset=cell_dataset)
genes = vc.add_view(cm.FEATURE_LIST, dataset=cell_dataset)
vc.layout(spatial | (lc | umap));

In [18]:
vw = vc.widget()
vw

VitessceWidget(config={'version': '1.0.15', 'name': 'CODEX', 'description': 'Testing small slice of CODEX imag…

In [19]:
vc.display()