# CODEX Generation

### Imports (run first)

In [None]:
import pandas as pd
import os
import warnings

# Switch into the directory that holds the local pipeline code before importing
# from it (presumably so `codex_project` resolves without being installed --
# confirm). NOTE: this changes the working directory of the whole process.
os.chdir('/home/LULAB/wboohar/CODEX/data_processing/code')
from codex_project import codex_project, replace_marker_match_centroids, read_marker_combos, get_marker_list
# Suppress all warnings from this point on.
warnings.filterwarnings('ignore')

### Parameters
annotations_path  
> path to the .json annotations list

base_dir  
> path to the base directory; it should contain a raw_data folder with the project folders inside

project_name
> name of project, will create a folder with this name in the base directory

project_folders
> list of folders with marker data inside to be used for this project (names of folders are not used)

sample_labels
> dict mapping each name in project_folders to a label; the label is stored in the 'Parent' column (typically used to record mouse information)

In [None]:
# Workspace root; raw inputs are read from <base_dir>/raw_data.
base_dir = '/store/Projects/wboohar/PhenoCycler'
data_path = f'{base_dir}/raw_data'

# Project name, which also names the project folder inside base_dir.
project_name = 'QuantCellPaperO3'
project_path = f'{base_dir}/{project_name}'

# JSON file holding the annotation strategy (marker combinations).
annotations_path = '/store/Projects/wboohar/PhenoCycler/annotation_strategies/marker_combos_062525_updated_verified.json'

# Folders containing marker data for this project, and the label given to each.
project_folders = ['QuantCellPaperO3']
sample_labels = {'QuantCellPaperO3': 'QuantCellPaperO3'}


Initialize the codex_project object from the marker .csv files. If there are multiple projects together, you can label them using the name of the folder in which all of the .csv files are found.

In [None]:
# Create the project's output directory up front (no error if it already exists).
os.makedirs(project_path, exist_ok=True)

# Keyword arguments for initialize(), gathered in one place for readability.
init_options = {
    'data_path': data_path,
    'folders': project_folders,
    'project_name': project_name,
    'annotation_strategy_path': annotations_path,
}

codex = codex_project()
codex.initialize(**init_options)
codex.set_sample_labels(sample_labels)

Ensure that all of the required .csv files for every marker in marker_combos have been found. If any are missing, or if there are any extra files, they will be reported here.

In [None]:
# Check the marker .csv files against the annotation strategy; per this notebook's
# description, missing or extra markers are reported (implementation not shown here).
codex.verify_marker_annotations()

Annotate cell types using conventional strategy. Any cell matching more than one cell type will be left as "Other".

drop_missing params:
 - max_missing_per_row: int = 2 # drop columns with more than X NAs
 - max_missing_per_col: int = 100 # drop rows with more than X NAs
 - drop_axis=0 # if there are any remaining NAs, 0 to drop cells (rows), 1 to drop features (columns)

In [None]:
# Assign cell types first, then prune missing values.
codex.annotate()
# max_missing_per_col is raised far above its documented default of 100;
# the remaining drop_missing parameters keep their defaults.
codex.drop_missing(max_missing_per_col=1_000_000)

Sanity check to make sure every defined cell type is present at least once

In [None]:
# Sanity check: the number of distinct labels in 'cell_type' must equal the
# number of defined marker combos plus one. The extra label is presumably the
# "Other" fallback for cells matching more than one type -- confirm.
# An explicit raise replaces the bare `assert` so the check still runs under
# `python -O` and a failure reports the counts that disagree.
observed_cell_types = codex.codex.loc[:, 'cell_type'].unique()
expected_type_count = len(codex._marker_combos.keys()) + 1
if len(observed_cell_types) != expected_type_count:
    raise AssertionError(
        f'Expected {expected_type_count} distinct cell types '
        f'(defined marker combos + 1), found {len(observed_cell_types)}: '
        f'{sorted(map(str, observed_cell_types))}'
    )

If spatial location is specified and you have different samples you want to keep separate spatially, do that here.

First, DBSCAN is used to find all clusters. Use visualize_sectioning to see if you need to adjust eps

In [None]:
# eps passed to the sectioning step; adjust after inspecting visualize_sectioning().
sectioning_eps = 500
codex.section_samples(eps=sectioning_eps)

In [None]:
# Optional: uncomment to replace all sections with the name of the data folder + _0
# (use for IF images).
# codex.override_sectioning()
# Visualize the current sectioning to judge whether eps needs adjusting.
codex.visualize_sectioning()

Verify every cell is assigned to a section

In [None]:
# Confirm that every cell has been assigned to a section
# (behavior as described in this notebook; implementation not shown here).
codex.verify_sectioning()

Save to a .csv file; the data is now ready for QuantCell

In [None]:
# Write the annotated, sectioned table into the project folder.
conventional_csv_path = f'{project_path}/codex_conventional_{project_name}.csv'
codex.save_csv(conventional_csv_path)

# Load in
Includes all data except the folders in which the raw data is located

In [None]:
# Rebuild a project object from the previously saved CSV, then attach the
# annotation strategy from the same JSON file used above.
saved_csv_path = f'{project_path}/codex_conventional_{project_name}.csv'

other_codex = codex_project()
other_codex.read_csv(saved_csv_path, project_name=project_name)
other_codex.read_annotation_strategy(annotations_path)