In [None]:
import os
import anndata
import pandas as pd
from kh import sketch
from utils import *

### Read in preprocessed tumor and T47D annotated data objects

In [None]:
# Folder that holds the preprocessed inputs; later cells write their outputs here too.
directory = 'data'

# Read both preprocessed AnnData files in a fixed order: tumor first, then T47D.
tum_ad, t47_ad = (
    anndata.read_h5ad(os.path.join(directory, h5ad_name))
    for h5ad_name in ('tumor_preprocessed.h5ad', 'T47D_preprocessed.h5ad')
)

### Downsample cells using kernel herding sketching: see associated [paper](https://dl.acm.org/doi/abs/10.1145/3535508.3545539) and [GitHub](https://github.com/jranek/SketchKH)
- finds 2000 prototypical cells from each treatment condition (e.g. 0, 10, 100 nM)

In [None]:
# Both datasets are sketched with exactly the same keyword arguments, so they
# are declared once and unpacked into each call.
sketch_kwargs = dict(
    sample_set_key = 'well',
    gamma = 1,
    num_subsamples = 2000,
    n_jobs = -1,
    frequency_seed = 0,
)

# Each call returns a pair; by the names used here, the first item holds the
# selected indices and the second the subsampled AnnData (see `kh.sketch`).
kh_indices_tum, tum_ad_subsample = sketch(tum_ad, **sketch_kwargs)
kh_indices_t47, t47_ad_subsample = sketch(t47_ad, **sketch_kwargs)

### Perform integration of tumor and T47D samples with [TRANSACT](https://www.pnas.org/doi/10.1073/pnas.2106682118)

In [None]:
# Wrap each subsampled matrix in a DataFrame, using obs_names as the row index
# and var_names as the column labels.
tum_subsample_df = pd.DataFrame(
    data = tum_ad_subsample.X,
    index = tum_ad_subsample.obs_names,
    columns = tum_ad_subsample.var_names,
)
t47_subsample_df = pd.DataFrame(
    data = t47_ad_subsample.X,
    index = t47_ad_subsample.obs_names,
    columns = t47_ad_subsample.var_names,
)

# Integrate the two samples (helper from `utils`); tumor is passed first, T47D second.
integrated_df = transact_integrate(tum_subsample_df, t47_subsample_df)

# Two-component PHATE embedding of the integrated data (helper from `utils`).
X_phate = compute_phate(integrated_df, n_components = 2, knn = 150)

### Save anndata object with embeddings and updated metadata
- `adata.X`: preprocessed data, where subsampled cells from both samples are vertically concatenated
- `adata.obs['Origin']`: metadata regarding sample origin
- `adata.obsm['X_integrated']`: embedding following TRANSACT integration
- `adata.obsm['X_phate']`: PHATE embedding following TRANSACT integration

In [None]:
# Sample-origin label for every row: all tumor rows first, then all T47D rows.
# `metadata` and `joined_df` below are concatenated in this same order.
# (The original cell computed `origin_ids` twice on consecutive lines; the
# redundant duplicate statement has been removed.)
origin_ids = ['Tumor']*len(tum_subsample_df.index) + ['T47D']*len(t47_subsample_df.index)
cell_ids = list(tum_subsample_df.index) + list(t47_subsample_df.index)
origin = pd.DataFrame(origin_ids, index = cell_ids)

# Per-cell metadata of both subsamples, stacked row-wise (tumor, then T47D).
metadata = pd.concat([tum_ad_subsample.obs, t47_ad_subsample.obs], axis = 0)

# Stack the two subsampled data frames row-wise and wrap them in a new AnnData object.
joined_df = pd.concat([tum_subsample_df, t47_subsample_df])
sketched_integated = anndata.AnnData(joined_df)

# Attach the metadata, the origin labels and both embeddings.
sketched_integated.obs = metadata.copy()
sketched_integated.obs['Origin'] = origin.copy()
sketched_integated.obsm['X_integrated'] = integrated_df.values
sketched_integated.obsm['X_phate'] = X_phate.copy()

# Persist the combined object next to the input files.
sketched_integated.write(os.path.join(directory, 'sketched_integrated.h5ad'))

### Save df version for R analysis: See `ci.R`, `ti.R`

In [None]:
# Build one wide table by placing three pieces side by side (axis = 1):
# the concatenated data, the per-cell metadata, and the two PHATE coordinates.
sketched_integrated_df = pd.concat(
    [
        joined_df,
        sketched_integated.obs,
        pd.DataFrame(
            X_phate,
            index = joined_df.index,
            columns = ['PHATE_1', 'PHATE_2'],
        ),
    ],
    axis = 1,
)

# Write the table as CSV into the data folder.
sketched_integrated_df.to_csv(
    os.path.join(directory, 'sketched_integrated_df.csv')
)