# Prepare the input data objects


Activate the analysis environment before running this notebook: `micromamba activate gena-env`

In [1]:
import numpy as np
import pandas as pd
import multianndata as mad
import cna
import scanpy as sc
import matplotlib.pyplot as plt
# Seed NumPy's global random number generator with a fixed value so that any
# later step drawing from it gives the same result on every run.
np.random.seed(0)

1. Prepare the required inputs:
    * cell metadata with donor info for each cell
    * cell-by-gene counts matrix
    * donor-level metadata storing the covariates to be used in the GWAS, as well as batch information

In [2]:
# Load the latest tenk cohort from its .h5ad file into an AnnData object.
# The path is kept in its own variable so the read call stays readable.
cohort_h5ad = "/directflow/SCCGGroupShare/projects/blabow/tenk10k_phase1/data_processing/scanpy/output/integrated_objects/240_libraries/240_libraries_concatenated_harmony_leiden_filtered_reanalysed.h5ad"
adata = sc.read(cohort_h5ad)

In [None]:
# Get the cell barcode -> donor mapping.
# The column label must be a string: the bare name `individual` is undefined.
# Selecting with a list ([['individual']]) keeps a one-column DataFrame indexed
# by cell barcode, and rename(columns=...) relabels that column to 'id', the
# name later passed as `sampleid` when the MultiAnnData object is built.
# (On a Series, rename({...}) would relabel index entries instead.)
cell_meta = adata.obs[['individual']].rename(columns={'individual': 'id'})
cell_meta.head(3)


In [None]:
# Get the cells-by-genes matrix stored on the AnnData object.
# NOTE(review): confirm that adata.X still holds raw counts at this point and
# not normalized/transformed values from the upstream processing.
cells_x_genes = adata.X
# adata.X is an array / sparse matrix, not a DataFrame, so it has no .head();
# preview the first three cells (rows) by slicing instead.
cells_x_genes[:3]
# TODO: export to tsv

In [None]:
# Make MultiAnnData object from the expression matrix and the per-cell
# metadata; `sampleid='id'` names the column of `cell_meta` that identifies
# the donor each cell belongs to.
madata = mad.MultiAnnData(X = cells_x_genes, obs = cell_meta, sampleid = 'id') 
# Add all covariate information to madata.samplem
# NOTE(review): `sample_meta` (the donor-level covariate table) is not defined
# anywhere in the visible notebook -- it must be loaded before this cell runs.
# Presumably it is indexed by the same donor ids as madata.samplem; confirm.
madata.samplem = madata.samplem.join(sample_meta)
# Define nearest neighbor graph and UMAP
# (PCA first, then the neighbor graph, then the UMAP embedding that the
# plotting cell reads from madata.obsm['X_umap'].)
sc.pp.pca(madata)
sc.pp.neighbors(madata)
sc.tl.umap(madata)

# Define neighborhood abundance matrix (NAM) and NAM-PCs
cna.tl.nam(madata)

In [None]:
# Save the processed single-cell data object under the csa_qtl output directory.
outdir = '/directflow/SCCGGroupShare/projects/blabow/tenk10k_phase1/data_processing/csa_qtl'
sc_object_path = f'{outdir}/data/scDataObject.h5ad'
madata.write(sc_object_path)

# Data visualization

In [None]:
# Visualize data: draw the UMAP embedding, one scatter layer per cluster.
# The original cell mixed `madata` with an undefined name `d`; everything now
# reads from `madata`, the object built and embedded in the cells above.
# NOTE(review): this expects 0/1 indicator columns `clust1`, `clust2`, `clust3`
# in madata.obs -- confirm they exist for this dataset, or swap in the real
# cluster annotation columns.
umap_xy = madata.obsm['X_umap']
cluster_layers = [
    # (obs indicator column, point colour, legend label)
    ('clust1', 'green', 'Cluster 1'),
    ('clust2', 'purple', 'Cluster 2'),
    ('clust3', 'orange', 'Cluster 3'),
]
for column, colour, label in cluster_layers:
    # Boolean mask selecting the cells flagged as members of this cluster.
    in_cluster = (madata.obs[column] == 1).to_numpy()
    plt.scatter(umap_xy[in_cluster, 0], umap_xy[in_cluster, 1],
                c=colour, edgecolor='none', s=2, label=label)
# markerscale enlarges the legend markers, since the plotted points are tiny.
plt.legend(loc="lower left", markerscale=7, frameon=False)
plt.axis("off")
plt.show()

### Other required inputs:
* PLINK 2 format genotype data for each sample