In [2]:
!ls /home/raquelcr/scanpy/raw_data

NMR1_cerebral_cortex.h5ad      NMR4_hippocampus_raw.h5ad
NMR1_cerebral_cortex_raw.h5ad  NMR5_midbrain_raw.h5ad
NMR2_cerebral_cortex_raw.h5ad  NMR6_midbrain_raw.h5ad
NMR3_hippocampus_raw.h5ad


In [None]:
!pendingfiles=("NMR2_cerebral_cortex_raw.h5ad" "NMR3_hippocampus_raw.h5ad" "NMR4_hippocampus_raw.h5ad" "NMR5_midbrain_raw.h5ad" "NMR6_midbrain_raw.h5ad")

In [1]:
!mkdir -p /home/raquelcr/scanpy/denoised

#### Work on tmux
```bash
tmux
```

#### Create cell bender environment (only if environment not already present)
```bash
raquelcr@aleph:~$ conda env create -f /home/raquelcr/seurat/cellbender/celbender-env.yml
```

#### Activate cellbender environment
```bash
conda activate cellbender-env
```

#### Make output directory
```bash
mkdir -p /home/raquelcr/scanpy/denoised
```
#### Run CellBender for every sample

If you want only certain files use:
```bash 
pendingfiles=("NMR2_cerebral_cortex_raw.h5ad" "NMR3_hippocampus_raw.h5ad" "NMR4_hippocampus_raw.h5ad" "NMR5_midbrain_raw.h5ad" "NMR6_midbrain_raw.h5ad")
```

If you want all files use:
```bash
pendingfiles=ls /home/raquelcr/scanpy/raw_data
```

Then run:

```bash
for file in "${pendingfiles[@]}"; do
    infile="/home/raquelcr/scanpy/raw_data/$file"
    outfile="/home/raquelcr/scanpy/denoised/${file/_raw.h5ad/_denoised.h5ad}"
    logfile="${outfile%.h5ad}.log"

    echo "Processing $infile → $outfile"
    cellbender remove-background \
        --input "$infile" \
        --output "$outfile" \
        --total-droplets-included 20000 \
        > "$logfile" 2>&1
done
```

In [None]:
### 

#### Make output directory
```bash
mkdir -p /home/raquelcr/jcgorozco/denoised
```
#### Run CellBender for every sample

If you want only certain files use:
```bash 
pendingfiles=("NMR2_cerebral_cortex_raw.h5ad" "NMR3_hippocampus_raw.h5ad" "NMR4_hippocampus_raw.h5ad" "NMR5_midbrain_raw.h5ad" "NMR6_midbrain_raw.h5ad")
```

If you want all files use:
```bash
pendingfiles=ls /home/raquelcr/jcgorozco/Axolotl_midbrain_sample1_STARsolo-out/GeneFull/raw
```

Then run:

```bash
for file in "${pendingfiles[@]}"; do
    infile="/home/raquelcr/scanpy/raw_data/$file"
    outfile="/home/raquelcr/scanpy/denoised/${file/_raw.h5ad/_denoised.h5ad}"
    logfile="${outfile%.h5ad}.log"

    echo "Processing $infile → $outfile"
    cellbender remove-background \
        --input "$infile" \
        --output "$outfile" \
        --total-droplets-included 20000 \
        > "$logfile" 2>&1
done

### Open filtered cells

In [2]:
import os
import glob
import scanpy as sc
import anndata
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [14]:
files= glob.glob('/home/raquelcr/scanpy/clean_data/*h5')
sizes_mb = [os.stat(file).st_size / 1_000_000 for file in files]   # MB (decimal)
for i in range(len(files)):
    print(f'File: {files[i]} Size (MB): {sizes_mb[i]}')

File: /home/raquelcr/scanpy/clean_data/NMR1_cerebral_cortex.h_posterior.h5 Size (MB): 222.24975
File: /home/raquelcr/scanpy/clean_data/NMR1_cerebral_cortex.h5 Size (MB): 39.354106
File: /home/raquelcr/scanpy/clean_data/NMR1_cerebral_cortex_filtered.h5 Size (MB): 27.18458


In [None]:
sample1_raw = sc.read_10x_h5('/home/raquelcr/scanpy/clean_data/NMR1_cerebral_cortex.h5')
sample1_raw

AnnData object with n_obs × n_vars = 3163481 × 20774
    var: 'gene_ids', 'feature_types', 'genome'

In [3]:
sample1_filtered = sc.read_10x_h5('/home/raquelcr/scanpy/clean_data/NMR1_cerebral_cortex_filtered.h5')
sample1_filtered

AnnData object with n_obs × n_vars = 14268 × 20774
    var: 'gene_ids', 'feature_types', 'genome'

In [4]:
sample1_filtered.var['genome']= 'HetGla_female_1.0'
sample1_filtered.var.loc['COX7C'] # This gene was on the 'Table of top genes removed'

gene_ids         ENSHGLG00000004572
feature_types       Gene Expression
genome            HetGla_female_1.0
Name: COX7C, dtype: object

In [8]:
sample1_filtered.var_names_make_unique()
sample1_filtered.obs['tissue']= 'cerebral_cortex'
sample1_filtered.obs['replicate']= '1'


In [None]:
sample1_filtered.obs['n_counts'] = sample1_filtered.X.sum(axis=1).A1
sample1_filtered.obs['n_genes'] = (sample1_filtered.X > 0).sum(axis=1).A1
sample1_filtered.obs['percent_mito'] = np.sum(
    sample1_filtered[:, sample1_filtered.var['gene_ids'].str.startswith('MT-')

Index(['CGGGACTGTTGGCTAT-1', 'CCACGAGTCGCCGATG-1', 'TTTATGCCACCAACAT-1',
       'ACCTGTCTCCGTGGCA-1', 'ATCGCCTAGCCTTCTC-1', 'CCTTTGGCACACACTA-1',
       'CGTCCATAGAATAACC-1', 'AGGTCATTCAACACGT-1', 'CGAAGGAAGGATAATC-1',
       'CTGGTCTTCAACTTTC-1',
       ...
       'GTCGTAAGTTAGAAAC-1', 'TACGGGCAGTACTCGT-1', 'TGAACGTGTGCAGGAT-1',
       'TAATTCCAGTCGGGAT-1', 'TCATTCACATTGAAGA-1', 'TAAGCGTTCGTGGGTC-1',
       'TGGGCGTAGGGCAGTT-1', 'ACGGAAGCACCGCTAG-1', 'GCTTTCGTCGGTCAGC-1',
       'AACGGGAAGACCGCCT-1'],
      dtype='object', length=14268)

In [None]:
filtered_list=[]
for i in range(len(files)):
    filtered_list.append(sc.read(files[i]))
filtered_list[0]