# Setup

In [1]:
import loompy as lp
import anndata as ad
import fast_matrix_market as fmm
import pandas as pd
import scvelo as scv
import os

# Create H5AD file

We provide combined data in `velocity.loom`.

In [20]:
adata = ad.read_loom("velocyto.loom")

Variable names are not unique. To make them unique, call `.var_names_make_unique`.


In [21]:
adata

AnnData object with n_obs × n_vars = 127278 × 58395
    obs: 'Clusters', '_X', '_Y'
    var: 'Accession', 'Chromosome', 'End', 'Start', 'Strand'
    layers: 'matrix', 'ambiguous', 'spliced', 'unspliced'

## Make unique gene names and update cell names to match those from R

In [22]:
adata.obs.index

Index(['MSA_K1418:AAAGAACAGAAACACTx', 'MSA_K1418:AAGAACAGTCATCCGGx',
       'MSA_K1418:AAGCCATAGCGAGTACx', 'MSA_K1418:AAGATAGCAAGAGGTCx',
       'MSA_K1418:AACCCAAAGTGACCTTx', 'MSA_K1418:AAGAACACATCTGTTTx',
       'MSA_K1418:AACGAAATCTGCCTCAx', 'MSA_K1418:AAGCGTTAGAAGATCTx',
       'MSA_K1418:AAAGTCCTCCACGGGTx', 'MSA_K1418:AAAGTGAAGATAACGTx',
       ...
       'PD_K1449:TTTCATGGTCCGAAAGx', 'PD_K1449:TTTGATCTCAAATGCCx',
       'PD_K1449:TTTGGTTAGGTCCTGCx', 'PD_K1449:TTTGTTGTCGCTTGAAx',
       'PD_K1449:TTTGTTGCACCGGCTAx', 'PD_K1449:TTTGTTGGTAAGCGGTx',
       'PD_K1449:TTTGGTTCACGTGAGAx', 'PD_K1449:TTTGTTGGTAGAAACTx',
       'PD_K1449:TTTGTTGTCCACACAAx', 'PD_K1449:TTTGGTTCAAGCCCACx'],
      dtype='object', length=127278)

In [40]:
new_index = []
for string in adata.obs.index:
    new_index.append(string.replace(":", "!!").replace("x", "-1").replace("_K", "_"))

adata.obs.index = new_index

In [24]:
adata.var_names_make_unique()

In [26]:
adata.obs.index.name = "cells"
adata.var.index.name = "genes"

# Create subsets

In [3]:
adata

AnnData object with n_obs × n_vars = 127278 × 58395
    obs: 'Clusters', '_X', '_Y'
    var: 'Accession', 'Chromosome', 'End', 'Start', 'Strand'
    layers: 'ambiguous', 'matrix', 'spliced', 'unspliced'

## Microglia

In [4]:
adata_subset = ad.AnnData(X = fmm.mmread("microglia_pvms.mtx"))
cellids = pd.read_csv("microglia_pvms.cells", header = None)[0].str.replace("_K", "_")
genes = pd.read_csv("microglia_pvms.genes", header = None)

In [5]:
anno = pd.read_csv("microglia_pvms_annotation.csv", header = None)
anno.index = anno[0].str.replace("_K", "_")
anno.index.name = "cells"
anno

Unnamed: 0_level_0,0,1
cells,Unnamed: 1_level_1,Unnamed: 2_level_1
CTRL_037!!AATTTCCAGATGCAGC-1,CTRL_037!!AATTTCCAGATGCAGC-1,Steady-state
CTRL_037!!CACTAAGGTAATCAAG-1,CTRL_037!!CACTAAGGTAATCAAG-1,Steady-state
CTRL_037!!CAGATCAGTACCTTCC-1,CTRL_037!!CAGATCAGTACCTTCC-1,Intermediate2
CTRL_037!!GTCTACCTCGCCTATC-1,CTRL_037!!GTCTACCTCGCCTATC-1,Intermediate2
CTRL_037!!TCTCACGAGACATCCT-1,CTRL_037!!TCTCACGAGACATCCT-1,Steady-state
...,...,...
PD_1449!!GTCTGTCTCCGGGACT-1,PD_K1449!!GTCTGTCTCCGGGACT-1,Intermediate1
PD_1449!!CCGATGGTCTAGGCCG-1,PD_K1449!!CCGATGGTCTAGGCCG-1,PVMs
PD_1449!!TACCGAAGTATCTTCT-1,PD_K1449!!TACCGAAGTATCTTCT-1,Intermediate2
PD_1449!!CCGGGTAGTCACAGTT-1,PD_K1449!!CCGGGTAGTCACAGTT-1,Steady-state


In [6]:
adata_subset.obs.index = cellids
adata_subset.var.index = genes[0]
adata_subset.obs["annotation"] = anno[1]
adata_subset.obs.index.name = "cells"
adata_subset.var.index.name = "genes"

In [7]:
emb = pd.read_csv("microglia_pvms.embedding", header = None)
emb.index = anno[0]
emb.index.name = "cells"
emb

Unnamed: 0_level_0,0,1
cells,Unnamed: 1_level_1,Unnamed: 2_level_1
CTRL_037!!AATTTCCAGATGCAGC-1,0.017858,-1.925463
CTRL_037!!CACTAAGGTAATCAAG-1,-1.087168,-1.582778
CTRL_037!!CAGATCAGTACCTTCC-1,-0.930021,-0.564554
CTRL_037!!GTCTACCTCGCCTATC-1,-0.342711,-0.211587
CTRL_037!!TCTCACGAGACATCCT-1,-0.240792,-2.031446
...,...,...
PD_K1449!!GTCTGTCTCCGGGACT-1,0.636007,-0.344692
PD_K1449!!CCGATGGTCTAGGCCG-1,0.707924,0.247146
PD_K1449!!TACCGAAGTATCTTCT-1,-0.192256,-0.212284
PD_K1449!!CCGGGTAGTCACAGTT-1,-0.123691,-0.119759


In [8]:
adata_subset.obsm["X_umap"] = emb.to_numpy()
adata_subset.X = adata_subset.X.tocsr()

### Merge with loom

Adjust cells and genes

In [9]:
cell_list = list(set(adata.obs.index.to_list()) & set(adata_subset.obs.index.to_list()))
gene_list = list(set(adata.var.index.to_list()) & set(adata_subset.var.index.to_list()))
adata2 = adata[cell_list, gene_list]
adata_subset2 = adata_subset[cell_list, gene_list]

Change format of raw matrix in adata_micro

In [10]:
adata_velo = scv.utils.merge(adata_subset2, adata2)

In [11]:
adata_velo.write_h5ad("microglia.h5ad")

... storing 'annotation' as categorical


## Astrocytes

In [5]:
adata_subset = ad.AnnData(X = fmm.mmread("astrocytes.mtx"))
cellids = pd.read_csv("astrocytes.cells", header = None)[0].str.replace("_K", "_")
genes = pd.read_csv("astrocytes.genes", header = None)

In [6]:
adata_subset

AnnData object with n_obs × n_vars = 10766 × 19415

In [7]:
anno = pd.read_csv("astrocytes.annotation", header = None)
anno.index = anno[0].str.replace("_K", "_")
anno.index.name = "cells"
anno

Unnamed: 0_level_0,0,1
cells,Unnamed: 1_level_1,Unnamed: 2_level_1
CTRL_037!!CATGAGTTCCACACAA-1,CTRL_037!!CATGAGTTCCACACAA-1,Homeostatic_astrocytes
CTRL_037!!GGCTTTCCAATGACCT-1,CTRL_037!!GGCTTTCCAATGACCT-1,Homeostatic_astrocytes
CTRL_037!!TCCGATCAGACCATGG-1,CTRL_037!!TCCGATCAGACCATGG-1,Homeostatic_astrocytes
CTRL_037!!CAACCTCTCGGATACT-1,CTRL_037!!CAACCTCTCGGATACT-1,Homeostatic_astrocytes
CTRL_037!!GTTCATTAGTTTGCTG-1,CTRL_037!!GTTCATTAGTTTGCTG-1,Homeostatic_astrocytes
...,...,...
PD_1449!!TTCATTGTCACACCCT-1,PD_K1449!!TTCATTGTCACACCCT-1,Homeostatic_astrocytes
PD_1449!!TCCCACACACTACCCT-1,PD_K1449!!TCCCACACACTACCCT-1,Reactive_astrocytes
PD_1449!!TATTTCGCACAACCGC-1,PD_K1449!!TATTTCGCACAACCGC-1,Reactive_astrocytes
PD_1449!!GACATCAGTAGACACG-1,PD_K1449!!GACATCAGTAGACACG-1,Reactive_astrocytes


In [9]:
adata_subset.obs.index = cellids
adata_subset.var.index = genes[0]
adata_subset.obs["annotation"] = anno[1]
adata_subset.obs.index.name = "cells"
adata_subset.var.index.name = "genes"

In [10]:
emb = pd.read_csv("astrocytes.embedding", header = None)
emb.index = anno[0]
emb.index.name = "cells"
emb

Unnamed: 0_level_0,0,1
cells,Unnamed: 1_level_1,Unnamed: 2_level_1
CTRL_037!!CATGAGTTCCACACAA-1,0.473453,0.244449
CTRL_037!!GGCTTTCCAATGACCT-1,1.252497,-0.255273
CTRL_037!!TCCGATCAGACCATGG-1,2.033582,0.652308
CTRL_037!!CAACCTCTCGGATACT-1,2.388103,-0.010170
CTRL_037!!GTTCATTAGTTTGCTG-1,2.095024,-0.010392
...,...,...
PD_K1449!!TTCATTGTCACACCCT-1,0.305426,0.247235
PD_K1449!!TCCCACACACTACCCT-1,0.037591,-0.271056
PD_K1449!!TATTTCGCACAACCGC-1,-1.719751,0.185266
PD_K1449!!GACATCAGTAGACACG-1,-0.828474,-0.135783


In [11]:
adata_subset.obsm["X_umap"] = emb.to_numpy()
adata_subset.X = adata_subset.X.tocsr()

### Merge with loom

In [13]:
cell_list = list(set(adata.obs.index.to_list()) & set(adata_subset.obs.index.to_list()))
gene_list = list(set(adata.var.index.to_list()) & set(adata_subset.var.index.to_list()))
adata2 = adata[cell_list, gene_list]
adata_subset2 = adata_subset[cell_list, gene_list]

In [14]:
adata_velo = scv.utils.merge(adata_subset2, adata2)

In [15]:
adata_velo

AnnData object with n_obs × n_vars = 9076 × 19407
    obs: 'annotation', 'Clusters', '_X', '_Y', 'initial_size_unspliced', 'initial_size_spliced', 'initial_size'
    var: 'Accession', 'Chromosome', 'End', 'Start', 'Strand'
    obsm: 'X_umap'
    layers: 'ambiguous', 'matrix', 'spliced', 'unspliced'

In [16]:
adata_velo.write_h5ad("astrocytes.h5ad")

... storing 'annotation' as categorical
