In [None]:
!pip install -Iv rpy2==3.4.3
%load_ext rpy2.ipython

!pip install scanpy
!pip install scib
!pip install anndata2ri

In [None]:
%%R
# Bootstrap BiocManager only when it cannot be loaded in this session.
if (!require("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager")
}

# One BiocManager::install() call per package, in the original order.
invisible(lapply(
  c("S4Vectors", "SingleCellExperiment"),
  function(bioc_pkg) BiocManager::install(bioc_pkg)
))

# Install Seurat through install.packages(), then attach it.
install.packages("Seurat")
library(Seurat)


In [None]:

%%R

# Seurat integration functions
# functions taken from the scIB benchmarking pipeline at:
# https://github.com/theislab/scib-pipeline/blob/main

library(rlang)
library(Seurat)

#' Read a serialized object back from an RDS file.
#'
#' @param filename Path of the `.rds` file to read.
#' @return The object stored in `filename`, exactly as `readRDS()` returns it.
loadSeuratObject <- function(filename) {
  readRDS(filename)
}


#' Serialize an object to an RDS file.
#'
#' Writing goes through base R's `saveRDS()`, so no package has to be loaded
#' here. The previous `require(Seurat)` call only warned and returned FALSE
#' when Seurat was missing, without affecting the write.
#'
#' @param sobj Object to store (a Seurat object in this notebook).
#' @param path Destination file path.
#' @return `NULL`, invisibly (the value of `saveRDS()`).
saveSeuratObject <- function(sobj, path) {
  saveRDS(sobj, file = path)
}


#' Integrate the batches of an object with Seurat's anchor-based workflow.
#'
#' Splits `data` by `batch`, finds integration anchors across the resulting
#' objects, and integrates them into a new assay named "integrated".
#'
#' @param data Object to integrate; passed to `Seurat::SplitObject()`
#'   (expected to be a Seurat object).
#' @param batch Value passed to `SplitObject()` as `split.by`.
#' @param hvg Value passed to `FindIntegrationAnchors()` as `anchor.features`
#'   (default 2000).
#' @return The object returned by `Seurat::IntegrateData()`.
runSeurat <- function(data, batch, hvg = 2000) {
  # Fail loudly when Seurat is missing: require() would only return FALSE and
  # let the first Seurat call die with "could not find function".
  if (!requireNamespace("Seurat", quietly = TRUE)) {
    stop(
      "Package 'Seurat' is required by runSeurat() but is not installed.",
      call. = FALSE
    )
  }

  batch_list <- Seurat::SplitObject(data, split.by = batch)

  anchors <- Seurat::FindIntegrationAnchors(
    object.list = batch_list,
    anchor.features = hvg,
    scale = TRUE,
    l2.norm = TRUE,
    dims = 1:30,
    k.anchor = 5,
    k.filter = 200,
    k.score = 30,
    max.features = 200,
    eps = 0
  )

  integrated <- Seurat::IntegrateData(
    anchorset = anchors,
    new.assay.name = "integrated",
    features = NULL,
    features.to.integrate = NULL,
    dims = 1:30,
    k.weight = 100,
    weight.reduction = NULL,
    sd.weight = 1,
    sample.tree = NULL,
    preserve.order = FALSE,
    # do.cpp = TRUE,
    eps = 0,
    verbose = TRUE
  )

  integrated
}


# Per-dataset parameters, keyed by dataset name.
# Only `batch_key` is read by the code visible in this cell (it is handed to
# runSeurat() as the batch argument); the remaining fields are presumably
# consumed elsewhere.
datasets <- local({
  # The three small_atac_* entries share identical settings and carry no
  # ATAC field.
  atac_settings <- list(
    label_key = "final_cell_label",
    batch_key = "batchname",
    subsample = 1,
    log = TRUE
  )

  list(
    small_atac_windows = atac_settings,
    small_atac_peaks = atac_settings,
    small_atac_gene_activity = atac_settings,
    human_pancreas_norm_complexBatch = list(
      label_key = "celltype", batch_key = "tech",
      ATAC = FALSE, subsample = 1, log = FALSE
    ),
    Lung_atlas_public = list(
      label_key = "cell_type", batch_key = "batch",
      subsample = 1, ATAC = FALSE, log = FALSE
    ),
    Immune_ALL_hum_mou = list(
      label_key = "final_annotation", batch_key = "batch",
      subsample = 1, ATAC = FALSE, log = FALSE
    )
  )
})


# Directory prefix (with trailing slash) for the input "<dataset>_hvg.rds"
# files; the integrated objects and the timing table are written next to them.
seurat_path <- "/content/drive/MyDrive/Colab Notebooks/integrationDatasets/Seurat/originalObjects/"

# Elapsed integration time per dataset, filled in by the loop below.
times <- list()

for (dataset_name in names(datasets)) {
  batch_key <- datasets[[dataset_name]][["batch_key"]]

  # Input and output locations for this dataset.
  inPath <- sprintf("%s%s_hvg.rds", seurat_path, dataset_name)
  outPath <- sprintf("%s%s_integrated.rds", seurat_path, dataset_name)

  print("Loading Seurat dataset:")
  print(dataset_name)
  sobj <- loadSeuratObject(inPath)
  hvg <- 3000

  # Only the integration itself is timed; loading and saving are excluded.
  time_taken <- system.time({
    out <- runSeurat(sobj, batch_key, hvg)
  })

  print("Finished integrating:")
  print(dataset_name)
  print("time_taken['elapsed']:")
  print(time_taken["elapsed"])

  # Keep the elapsed (wall-clock) component for the summary table.
  times[[dataset_name]] <- time_taken["elapsed"]

  saveSeuratObject(out, outPath)
}

# Note: data.frame() rewrites the "Execution Time" name to Execution.Time
# (check.names defaults to TRUE), and write.csv() also writes a row-name column.
times_df <- data.frame(
  Dataset = names(times),
  `Execution Time` = unlist(times),
  row.names = NULL
)
write.csv(times_df, sprintf("%sexecution_times.csv", seurat_path))
