# Add metadata column indicating Prescott 2020 subtype

## 0. Imports

In [1]:
%load_ext autoreload
%autoreload 2
import scprinter as scp
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import time
import pandas as pd
import numpy as np
import os
import pickle
import torch
import matplotlib as mpl
mpl.rcParams['pdf.fonttype'] = 42
from scanpy.plotting.palettes import zeileis_28
from tqdm.contrib.concurrent import *
from tqdm.auto import *
import anndata
import scanpy as sc
import statistics as stat
import json
import csv
import re
import copy
from sklearn.preprocessing import OneHotEncoder

In [2]:
import snapatac2 as snap

In [3]:
# Display the installed scPrinter version so the run records which release was used.
scp.__version__

'1.0.0a'

### 0.1 Setup

In [4]:
# Specify the reference genome (mm10 here). This must match the genome of your ATAC fragments file.
genome = scp.genome.mm10

# Display the selected Genome object as a quick confirmation.
genome

<scprinter.genome.Genome at 0x7ff96d977590>

In [5]:
import snapatac2 as snap

In [6]:
# Display the scPrinter version.
# NOTE(review): this repeats the version check made right after the imports; one of the two cells can likely be removed.
scp.__version__

'1.0.0a'

In [7]:
# Label of the analysis being run; it is interpolated into the output directory name
# (f'1d_{chromVAR_or_seq2PRINT}_Outputs') when the paths are built.
# NOTE(review): presumably the alternative value is a seq2PRINT label; confirm the exact spelling expected.
chromVAR_or_seq2PRINT = 'chromvar'

## 1. Paths

### 1.1 Data directories

In [8]:
# Root directory from which every input and output path in this notebook is built.
# NOTE(review): absolute, machine-specific path; update it when running on another system.
master_data_dir = '/bap/bap/collab_asthma_multiome/'

In [9]:
# inputs
# Note: despite the "output" in its name, printer_h5ad_output_dir is the directory the
# existing scPrinter object is read from in this notebook.
printer_h5ad_output_dir = os.path.join(master_data_dir, 'ATAC', '2_Analysis_Outputs', '1b_ChromVAR_scPrinter_object')
scprinter_obj_path = os.path.join(printer_h5ad_output_dir, 'Asthma_Multiome_Collab_scPrinter.h5ad')
# NOTE(review): unlike the other paths, this one goes through an extra 'outputs' directory; confirm this is intentional.
barcode_subtype_mapping_csv_file_path = os.path.join(master_data_dir, 'outputs', 'ATAC', '2_Analysis_Outputs', '1a_ChromVAR_Inputs', 'sample_barcode_predicted_cluster_df.csv')


# outputs
output_dir = os.path.join(master_data_dir, 'ATAC', '2_Analysis_Outputs', f'1d_{chromVAR_or_seq2PRINT}_Outputs')

# Create the output directory if it does not exist yet. exist_ok=True makes the call
# idempotent and avoids the race between a separate existence check and the creation.
os.makedirs(output_dir, exist_ok=True)

In [10]:
# Display the resolved output directory.
output_dir

'/bap/bap/collab_asthma_multiome/ATAC/2_Analysis_Outputs/1d_chromvar_Outputs'

## 2. ```scPrinter``` analysis

### 2.1 Load the scPrinter object

When you finish using the object, run ```printer.close()```; otherwise, you won't be able to load it properly next time.

In [11]:
# Open the scPrinter object stored at scprinter_obj_path, using the genome selected during setup.
# It must be closed with printer.close() at the end of the notebook (see the note above this cell).
printer = scp.load_printer(scprinter_obj_path, genome)

In [12]:
# Display a summary of the loaded object (dimensions and the available obs / uns / obsm keys).
printer

head project
AnnData object with n_obs x n_vars = 7418 x 0 backed at '/bap/bap/collab_asthma_multiome/ATAC/2_Analysis_Outputs/1b_ChromVAR_scPrinter_object/Asthma_Multiome_Collab_scPrinter.h5ad'
    obs: 'sample', 'n_fragment', 'frac_dup', 'frac_mito', 'frag_path', 'frag_sample_name', 'tsse'
    uns: 'binding score', 'bias_path', 'gff_db', 'genome', 'bias_bw', 'insertion', 'peak_calling', 'footprints', 'reference_sequences', 'unique_string'
    obsm: 'insertion_chr16', 'insertion_chr14', 'insertion_chr4', 'insertion_chrY', 'insertion_chr10', 'insertion_chr13', 'insertion_chrX', 'insertion_chr3', 'insertion_chr2', 'insertion_chr9', 'insertion_chr15', 'insertion_chr11', 'insertion_chr17', 'insertion_chr1', 'insertion_chr5', 'insertion_chr6', 'insertion_chr12', 'insertion_chr19', 'insertion_chr18', 'insertion_chr7', 'insertion_chr8'




In [13]:
# Display the per-cell metadata table held in printer.obs.
printer.obs[:]

  printer.obs[:]


sample,n_fragment,frac_dup,frac_mito,frag_path,frag_sample_name,tsse
str,u64,f64,f64,str,str,f64
"""NT""",12836,0.309558,0.0,"""/bap/bap/collab_asthma_multiom…","""NT""",18.232891
"""NT""",27125,0.34161,0.0,"""/bap/bap/collab_asthma_multiom…","""NT""",12.819292
"""NT""",13429,0.326901,0.0,"""/bap/bap/collab_asthma_multiom…","""NT""",18.673219
"""NT""",538,0.317259,0.0,"""/bap/bap/collab_asthma_multiom…","""NT""",3.636364
"""NT""",80889,0.338385,0.0,"""/bap/bap/collab_asthma_multiom…","""NT""",12.59537
…,…,…,…,…,…,…
"""PBS""",19617,0.322383,0.0,"""/bap/bap/collab_asthma_multiom…","""PBS""",12.753124
"""PBS""",65500,0.317872,0.0,"""/bap/bap/collab_asthma_multiom…","""PBS""",14.318
"""PBS""",40833,0.314779,0.0,"""/bap/bap/collab_asthma_multiom…","""PBS""",15.953079
"""PBS""",35084,0.317007,0.0,"""/bap/bap/collab_asthma_multiom…","""PBS""",11.588141


In [14]:
# Peek at the first ten cell barcodes, then report the total number of cells.
print(printer.obs_names[:10], len(printer.obs_names), sep='\n')

['NT_AAACAGCCAGTTTCTC-1', 'NT_AAACGGATCAATAGCC-1', 'NT_AAACGGATCCTAATGA-1', 'NT_AAACGTACAAAGCCTC-1', 'NT_AAACGTACATGTGGGA-1', 'NT_AAAGCAAGTTAGGTGC-1', 'NT_AAAGGCTCAATAGTCT-1', 'NT_AAAGGCTCAGAAATTG-1', 'NT_AAAGGTTAGTCAATTG-1', 'NT_AAATCCGGTGAGCGAA-1']
7418


### 2.2 Load the barcode:subtype mapping metadata

In [15]:
# Read the barcode -> subtype CSV; header=0 takes the column names from the first row of the file.

barcode_subtype_mapping_df = pd.read_csv(barcode_subtype_mapping_csv_file_path, header=0)

# Display the table; its "Barcode" and "Cluster" columns are the ones used by the cells below.
barcode_subtype_mapping_df

Unnamed: 0,Barcode,Cluster
0,NT_AAACAGCCAGTTTCTC-1,NG4
1,NT_AAACGGATCAATAGCC-1,JG6
2,NT_AAACGGATCCTAATGA-1,JG1
3,NT_AAACGTACAAAGCCTC-1,NG5
4,NT_AAACGTACATGTGGGA-1,JG1
...,...,...
7413,OVA_C_TTTGGTGCATTATCCC-1,JG4
7414,OVA_C_TTTGTGAAGTTATGTG-1,JG5
7415,OVA_C_TTTGTGGCAGAATGAC-1,JG1
7416,OVA_C_TTTGTGGCAGGTTACC-1,NG20


In [16]:
# Check which container type obs_names is; the order check further down compares it directly with a Python list.
type(printer.obs_names)

list

In [17]:
# Goal: map the barcodes in "printer.obs_names" to the "Cluster" column of barcode_subtype_mapping_df,
# matching on its "Barcode" column.

# First, verify that every barcode in printer.obs_names is present in barcode_subtype_mapping_df["Barcode"].
# This is enforced with an assertion so that a missing barcode stops the notebook with a clear message,
# instead of only showing up as a displayed False.
printer_barcode_set = set(printer.obs_names)
mapping_barcode_set = set(barcode_subtype_mapping_df['Barcode'])

barcodes_without_subtype = printer_barcode_set - mapping_barcode_set
assert not barcodes_without_subtype, (
    f"{len(barcodes_without_subtype)} barcode(s) in printer.obs_names have no entry in the mapping table, "
    f"e.g. {sorted(barcodes_without_subtype)[:5]}"
)

# Then display whether the two barcode sets are exactly equal
# (True means the mapping table contains no extra barcodes either).
printer_barcode_set == mapping_barcode_set

True

In [18]:
# Align the mapping table with the scPrinter object: index the table by barcode,
# pull the rows out in printer.obs_names order, then turn the barcode index back
# into a regular column.
ordered_df = (
    barcode_subtype_mapping_df
    .set_index('Barcode')
    .loc[printer.obs_names]
    .reset_index()
)

ordered_df


Unnamed: 0,Barcode,Cluster
0,NT_AAACAGCCAGTTTCTC-1,NG4
1,NT_AAACGGATCAATAGCC-1,JG6
2,NT_AAACGGATCCTAATGA-1,JG1
3,NT_AAACGTACAAAGCCTC-1,NG5
4,NT_AAACGTACATGTGGGA-1,JG1
...,...,...
7413,PBS_TTTGTGGCAATAAGCA-1,JG4
7414,PBS_TTTGTGGCAGGACCAA-1,NG13
7415,PBS_TTTGTGTTCCTAGTTT-1,NG13
7416,PBS_TTTGTGTTCTGTTGCC-1,JG6


In [19]:
# Barcodes of the reordered table, as a plain Python list.
ordered_barcodes = ordered_df.loc[:, 'Barcode'].tolist()

# Report how many barcodes were extracted.
print(len(ordered_barcodes))

7418


In [20]:
# Sanity check: the reordered barcodes must line up, element by element, with printer.obs_names.
printer_obs_names_l = printer.obs_names

barcode_order_error = "The order of barcodes does not match the order in printer.obs_names"
assert ordered_barcodes == printer_obs_names_l, barcode_order_error

In [21]:
# Finally, get the list of subtypes
ordered_subtypes = ordered_df['Cluster'].tolist()

In [22]:
# Peek at the first ten subtype labels, then report how many labels there are.
print(ordered_subtypes[:10], len(ordered_subtypes), sep='\n')

['NG4', 'JG6', 'JG1', 'NG5', 'JG1', 'NG11', 'NG19', 'NG25', 'NG11', 'NG9']
7418


### 2.3 Add subtype as column of ```printer``` object metadata

In [23]:
# Store the subtype labels as a new "predicted_cluster" column of printer.obs.
# ordered_subtypes comes from ordered_df, whose barcode order was asserted to match printer.obs_names.
printer.obs['predicted_cluster'] = ordered_subtypes

## Close object

In [24]:
# Close the object so it can be loaded properly next time (see the note in section 2.1).
printer.close()

# END