# Importing useful libraries

In [1]:
# Importing useful libraries
import numpy as np
import pandas as pd
import scanpy as sc
import anndata

# Defining useful variables

In [2]:
# Path prefix prepended to every input/output location used in this notebook
workdir = '../'

# Names of the batch and cell-type label columns (used in the tables and in adata.obs)
batch_key, cell_type_key = 'batch', 'cell_type'

# Importing dataset 1 - Human Dendritic Cells

In [3]:
# Read the raw table for dataset 1 (pd.read_table defaults to tab-separated input)
dataset_1_path = f'{workdir}/data/dataset_1.txt'
data = pd.read_table(dataset_1_path)

# Preprocessing

## Retrieving cell types, batch IDs and single-cell IDs from the column names

In [4]:
# Create a dataframe containing Cell Type, Batch ID and Single cell ID for each column of `data`.
# Every column name is split on "_"; pieces 0, 1 and 2 are taken as
# cell type, batch ID and single-cell ID respectively.
cell_types = np.array([name.split("_") for name in data.columns.values])

annotation_rows = {
    'Cell Type': cell_types[:, 0],
    'Batch ID': cell_types[:, 1],
    'Single cell ID': cell_types[:, 2],
}

# One row per annotation, one column per cell, labelled like the columns of `data`
d = pd.DataFrame(data=annotation_rows).T
d.columns = data.columns

## Parsing information and keeping desired cell types

In [5]:
# Creating meta_data
# Step 1: split the annotation columns by Batch ID into the two batches
in_first_batch = d.loc['Batch ID'].isin(['P7','P8','P9','P10']).to_numpy()
in_second_batch = d.loc['Batch ID'].isin(['P3','P4','P13','P14']).to_numpy()
d_1 = d.loc[:, in_first_batch]
d_2 = d.loc[:, in_second_batch]

# Step 2: within each batch keep only the listed cell types
d1 = d_1.loc[:, d_1.loc['Cell Type'].isin(['CD141','pDC','DoubleNeg']).to_numpy()]
d2 = d_2.loc[:, d_2.loc['Cell Type'].isin(['pDC','DoubleNeg','CD1C']).to_numpy()]

# Step 3: put both batches side by side; meta_data has one row per kept cell
d_ = pd.concat([d1, d2], axis=1)
meta_data = pd.concat([d1, d2], axis=1).T

# Step 4: label the rows -- the d1 cells come first ('B1'), the d2 cells follow ('B2')
Batch = ['B1'] * d1.shape[1] + ['B2'] * d2.shape[1]
meta_data[batch_key] = Batch
meta_data[cell_type_key] = meta_data['Cell Type']

## Removing identical cell types

In [6]:
# Split the expression table into the two batches, selecting columns by Batch ID
first_batch_mask = d.loc['Batch ID'].isin(['P7','P8','P9','P10']).to_numpy()
second_batch_mask = d.loc['Batch ID'].isin(['P3','P4','P13','P14']).to_numpy()
Batch_1 = data.loc[:, first_batch_mask]
Batch_2 = data.loc[:, second_batch_mask]

# Within each batch keep only the listed cell types.
# d_1 / d_2 hold the annotations of exactly these columns, in the same order,
# so their masks are applied by position.
first_type_mask = d_1.loc['Cell Type'].isin(['CD141','pDC','DoubleNeg']).to_numpy()
second_type_mask = d_2.loc['Cell Type'].isin(['pDC','DoubleNeg','CD1C']).to_numpy()
Batch_1 = Batch_1.loc[:, first_type_mask]
Batch_2 = Batch_2.loc[:, second_type_mask]

# Plain numpy copies of both batches
Batch_1_num = Batch_1.to_numpy()
Batch_2_num = Batch_2.to_numpy()

## Concatenating all information

In [7]:
# Creating preprocessed data
Input_ = pd.concat((Batch_1,Batch_2),axis=1)
Input_ = pd.concat((Input_,meta_data[[batch_key,cell_type_key]].T),axis=0).T

## Normalization of counts

In [8]:
# The last two columns of Input_ are the batch / cell-type labels; everything before is expression
expression_columns = Input_.iloc[:, :-2]
label_columns = Input_.iloc[:, -2:]

# Per-cell normalization with a target total of 1 per cell (counts_per_cell_after=1)
counts_scaled = sc.pp.normalize_per_cell(expression_columns,
                                         counts_per_cell_after=1, copy=True)

# Put the label columns back next to the normalized values
Input_norm = pd.concat([counts_scaled, label_columns], axis=1)

## Log1p and normalization of counts

In [9]:
# Per-cell normalization followed by log1p.
# NOTE(review): no counts_per_cell_after is passed here (unlike the plain normalized table),
# so scanpy's default target applies -- confirm this difference is intended.
counts_normalized = sc.pp.normalize_per_cell(Input_.iloc[:, :-2], copy=True)
counts_logged = sc.pp.log1p(counts_normalized.values)

# Append the two label columns, then restore the row/column labels taken from Input_norm
values_with_labels = np.concatenate([counts_logged, Input_.iloc[:, -2:]], axis=1)
Input_norm_log = pd.DataFrame(values_with_labels,
                              index=Input_norm.index,
                              columns=Input_norm.columns)

# Saving to csv file

In [9]:
# Each preprocessed table (raw, normalized, normalized + log1p) goes to its own CSV file;
# the index is written so it can be restored when loading
csv_targets = {
    f'{workdir}/data_preprocessed/dataset_1.csv': Input_,
    f'{workdir}/data_preprocessed/dataset_1_norm.csv': Input_norm,
    f'{workdir}/data_preprocessed/dataset_1_norm_log.csv': Input_norm_log,
}
for csv_path, table in csv_targets.items():
    table.to_csv(csv_path, index=True)

To load the preprocessed file, use the following command (adjust the path to your working directory):
`pd.read_csv('./data_preprocessed/dataset_1.csv',index_col=0,header=0)`

# Converting to AnnData format

In [11]:
def _table_to_adata(table):
    """Build an AnnData object from one of the preprocessed tables.

    Parameters
    ----------
    table : pandas.DataFrame
        One row per cell. The last two columns are the batch and cell-type
        labels (named by `batch_key` / `cell_type_key`); all columns before
        them are expression values.

    Returns
    -------
    anndata.AnnData
        `X` holds the expression values as float32; `obs` carries the
        cell-type and batch labels.
    """
    # The tables were assembled together with string label columns, so the expression
    # slice can be object-dtype. Cast explicitly instead of relying on AnnData to do it.
    # NOTE(review): float32 is what older anndata releases cast X to by default --
    # confirm against the installed version if exact dtype matters downstream.
    expression = table.iloc[:, :-2].to_numpy(dtype=np.float32)

    # Creating annotated matrix in AnnData format with expression matrix
    annotated = anndata.AnnData(X=expression)

    # Adding cell type and batch as observations
    annotated.obs[cell_type_key] = table[cell_type_key].tolist()
    annotated.obs[batch_key] = table[batch_key].tolist()
    return annotated


# One AnnData object per preprocessed table: raw, normalized, normalized + log1p
adata = _table_to_adata(Input_)
adata_norm = _table_to_adata(Input_norm)
adata_norm_log = _table_to_adata(Input_norm_log)

# Saving to h5ad file

In [11]:
# Each AnnData object (raw, normalized, normalized + log1p) goes to its own .h5ad file
h5ad_targets = {
    f'{workdir}/data_preprocessed/dataset_1.h5ad': adata,
    f'{workdir}/data_preprocessed/dataset_1_norm.h5ad': adata_norm,
    f'{workdir}/data_preprocessed/dataset_1_norm_log.h5ad': adata_norm_log,
}
for h5ad_path, annotated in h5ad_targets.items():
    annotated.write(h5ad_path)

  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'cell_type' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'batch' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'cell_type' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'batch' as categorical


To load the preprocessed file, use the following command (adjust the path to your working directory):
`sc.read('./data_preprocessed/dataset_1.h5ad')`