In [1]:
import scanpy as sc
import pandas as pd
from pathlib import Path
import anndata as ad
import numpy as np
import os

%config InlineBackend.figure_format = 'retina'
%matplotlib inline

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

# Plot styling constants.
DPI = 300  # resolution used for saved figures (passed as dpi_save below)
FONTSIZE = 20  # 42  -- not referenced in the cells visible here

# Scanpy-wide figure defaults: 100 dpi for inline display, DPI for files
# written to disk, transparent backgrounds and vector-friendly rendering.
sc.settings.set_figure_params(
    scanpy=True,
    dpi=100,
    dpi_save=DPI,
    transparent=True,
    vector_friendly=True,
)
from matplotlib import rcParams

# Set matplotlib's PDF font type to 42 for exported PDFs.
rcParams["pdf.fonttype"] = 42

In [2]:
# Directory that holds the input .h5ad object and receives every exported file.
# The location can be overridden without editing the notebook by setting the
# CRC_DIR2SAVE environment variable; when it is unset the original path is used.
DIR2SAVE = Path(
    os.environ.get(
        "CRC_DIR2SAVE",
        "/data/BCI-CRC/nasrine/data/CRC/Primary_CRC_dataset/final_object/20mt",
    )
)

In [3]:
# Read the annotated AnnData object from DIR2SAVE and display its shape.
raw_h5ad_path = DIR2SAVE / "SMC_KUL_Pelka_Che_Wu_pCRC_annotations_raw.h5ad"
adata = sc.read_h5ad(raw_h5ad_path)
adata.shape

(246779, 39609)

In [5]:
# List the per-cell annotation columns available in .obs.
adata.obs.columns

Index(['Patient', 'Sample', 'Cell_type', 'Cell_subtype', 'Tissue', 'Therapy',
       'doublet_score', 'predicted_doublet', 'n_genes_by_counts',
       'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo',
       'pct_counts_ribo', 'cell_source', 'Annotation_scVI',
       'Annotation_scVI_detailed'],
      dtype='object')

In [9]:
# Write a subset of the per-cell annotations to a comma-separated metadata file.
# 'Cell_type' and 'Cell_subtype' from .obs are not part of this export.
metadata_columns = [
    'Patient',
    'Sample',
    'Tissue',
    'Therapy',
    'doublet_score',
    'predicted_doublet',
    'n_genes_by_counts',
    'total_counts',
    'total_counts_mt',
    'pct_counts_mt',
    'total_counts_ribo',
    'pct_counts_ribo',
    'cell_source',
    'Annotation_scVI',
    'Annotation_scVI_detailed',
]
metadata_csv_path = DIR2SAVE / "SMC_KUL_Pelka_Che_Wu_pCRC_metadata.csv"
adata.obs.loc[:, metadata_columns].to_csv(
    metadata_csv_path, sep=',', header=True, index=True
)

In [12]:
# List the per-gene annotation columns available in .var.
adata.var.columns

Index(['n_cells_by_counts-Che', 'total_counts-Che', 'mean_counts-Che',
       'pct_dropout_by_counts-Che', 'n_cells_by_counts-KUL',
       'total_counts-KUL', 'mean_counts-KUL', 'pct_dropout_by_counts-KUL',
       'n_cells_by_counts-Pelka', 'total_counts-Pelka', 'mean_counts-Pelka',
       'pct_dropout_by_counts-Pelka', 'n_cells_by_counts-SMC',
       'total_counts-SMC', 'mean_counts-SMC', 'pct_dropout_by_counts-SMC',
       'n_cells_by_counts-Wu', 'total_counts-Wu', 'mean_counts-Wu',
       'pct_dropout_by_counts-Wu', 'n_cells'],
      dtype='object')

In [14]:
from scipy.io import mmread,mmwrite

# Export adata.X in Matrix Market format; rows/columns follow adata.X as stored.
mmwrite(DIR2SAVE.joinpath("SMC_KUL_Pelka_Che_Wu_pCRC_counts.mtx"),adata.X)

# Rename the indexes of .var and .obs to something more sensible (gene_name and
# barcode) so the index column of each exported CSV gets a meaningful header.
adata.var.index.set_names(names="gene_name", inplace=True)
adata.obs.index.set_names(names="barcode", inplace=True)

# Remove the per-dataset QC summary columns from .var before exporting it.
# errors="ignore" keeps this cell re-runnable: without it a second execution
# raises KeyError because the columns were already dropped in place.
adata.var.drop(columns=['n_cells_by_counts-Che', 'total_counts-Che', 'mean_counts-Che',
       'pct_dropout_by_counts-Che', 'n_cells_by_counts-KUL',
       'total_counts-KUL', 'mean_counts-KUL', 'pct_dropout_by_counts-KUL',
       'n_cells_by_counts-Pelka', 'total_counts-Pelka', 'mean_counts-Pelka',
       'pct_dropout_by_counts-Pelka', 'n_cells_by_counts-SMC',
       'total_counts-SMC', 'mean_counts-SMC', 'pct_dropout_by_counts-SMC',
       'n_cells_by_counts-Wu', 'total_counts-Wu', 'mean_counts-Wu',
       'pct_dropout_by_counts-Wu', 'n_cells'], inplace=True, errors="ignore")

# Save the full .obs table (index column: barcode).
adata.obs.to_csv(DIR2SAVE.joinpath("SMC_KUL_Pelka_Che_Wu_pCRC_obs.csv"), header=True, index=True)

# Save the .var table (index column: gene_name).
adata.var.to_csv(DIR2SAVE.joinpath("SMC_KUL_Pelka_Che_Wu_pCRC_var.csv"), header=True, index=True)

In [13]:
# Display the output directory the files above were written to.
DIR2SAVE

PosixPath('/data/BCI-CRC/nasrine/data/CRC/Primary_CRC_dataset/final_object/20mt')