In [1]:
from pathlib import Path
import pandas as pd
import anndata
from facsforge.core.index import load_index_csv
import os
import glob

In [2]:
# INPUT folders
INDEX_DIR = Path("./")
BARCODE_XLSX = Path("../Sequencing/Well-ID_S8-Sort.xlsx")
ADATA_PATH = Path("../First_salamander_batch_analyzed.adata")

# OUTPUT
OUT_CSV = Path("facs_index_merged.csv")
OUT_ADATA = Path("First_salamander_batch_analyzed_with_facs.h5ad")

# Column contract
CELL_ID_COL     = "cell_id"   # target unified ID
WELL_COL        = "Well"
BARCODE_COL     = "barcode"
SORT_FLAG_COL   = "Well"


In [3]:
os.getcwd()

'/home/med-sal/BMC_Projects/Leigh/Marlene/Phagozytosis_FACS_Seq/BioInformatics_IntegrationOfFACS-DataANDscRNA-Seq/RAW_Sort-data'

In [4]:
f"{INDEX_DIR}/*.csv"

'./*.csv'

In [5]:
glob.glob(f"{INDEX_DIR}/*.csv")

['./E2-E3.csv',
 './P24.csv',
 './A1-B10.csv',
 './E1.csv',
 './A7-A12.csv',
 './A24-A23.csv',
 './C9.csv',
 './C8-E4.csv',
 './A13-A24.csv',
 './J16-O5.csv',
 './O4-P23.csv',
 './B11-C10.csv',
 './F1-J15.csv']

In [6]:
def load_all_sorted_csvs(folder: Path):
    frames = []
    for csv in folder.glob("*.csv"):
        print(f"Loading {csv.name}")
        df = load_index_csv(csv)   # ✅ YOUR API
        df["__source_file"] = csv.name
        frames.append(df)

    all_cells = pd.concat(frames, axis=0, ignore_index=True)
    return all_cells

In [7]:
df_index = load_all_sorted_csvs(INDEX_DIR)
df_index.head()

Loading E2-E3.csv
Loaded index CSV E2-E3.csv: 2 rows, removed 382 zero rows.
Loading P24.csv
Loaded index CSV P24.csv: 1 rows, removed 383 zero rows.
Loading A1-B10.csv
Loaded index CSV A1-B10.csv: 16 rows, removed 368 zero rows.
Loading E1.csv
Loaded index CSV E1.csv: 1 rows, removed 383 zero rows.
Loading A7-A12.csv
Loaded index CSV A7-A12.csv: 6 rows, removed 378 zero rows.
Loading A24-A23.csv
Loaded index CSV A24-A23.csv: 2 rows, removed 382 zero rows.
Loading C9.csv
Loaded index CSV C9.csv: 1 rows, removed 383 zero rows.
Loading C8-E4.csv
Loaded index CSV C8-E4.csv: 53 rows, removed 331 zero rows.
Loading A13-A24.csv
Loaded index CSV A13-A24.csv: 12 rows, removed 372 zero rows.
Loading J16-O5.csv
Loaded index CSV J16-O5.csv: 125 rows, removed 259 zero rows.
Loading O4-P23.csv
Loaded index CSV O4-P23.csv: 27 rows, removed 357 zero rows.
Loading B11-C10.csv
Loaded index CSV B11-C10.csv: 29 rows, removed 355 zero rows.
Loading F1-J15.csv
Loaded index CSV F1-J15.csv: 111 rows, removed

Unnamed: 0,Well,EventID,Sort Population,ImagePath,EventIndex,Time,FSC-A,FSC-H,FSC-W,SSC (Violet)-A,...,EventNumber,DeltaTime,DropId,SaturatedChannels1,SaturatedChannels2,SpectralEventWidth,EventWidthInDrops,SpectralUnmixingFlags,WaveformPresent,__source_file
0,E02,1,Leukocytes 1-3,images\E2-E3_E02_Leukocytes 1-3_00001624.tiff,1624,8.0775,48192072,35821,1345,17417544,...,1187404,144294,0,0,0,5189,0,0,0,E2-E3.csv
1,E03,1,Leukocytes 1-3,images\E2-E3_E03_Leukocytes 1-3_00001301.tiff,1301,5.7294,48734760,30120,1618,4906290,...,1187081,267876,0,0,0,6845,0,0,0,E2-E3.csv
2,P24,1,Leukocytes 2-2,images\P24_P24_Leukocytes 2-2_00000970.tiff,970,12.4942,41336260,37630,1098,8144450,...,1363747,6789746,0,0,0,4790,0,0,0,P24.csv
3,A01,1,Erythrocytes 1,images\Sort_003_A01_Erythrocytes 1_00014065.tiff,14065,125.6402,85159640,34788,2447,2900781,...,994472,165914,0,0,0,6962,0,0,0,A1-B10.csv
4,A02,1,Erythrocytes 1,images\Sort_003_A02_Erythrocytes 1_00011974.tiff,11974,102.9451,107748880,53907,1998,27843924,...,992381,2290597,0,0,0,5415,0,0,0,A1-B10.csv


In [8]:
def mark_double_sorts(df, key):
    counts = df[key].value_counts()
    doubles = counts[counts > 1].index

    df["double_sorted"] = df[key].isin(doubles)

    print(f"Double-sorted events: {df['double_sorted'].sum()}")
    return df, doubles

In [9]:
df_index, DUP_IDS = mark_double_sorts(df_index, "Well")
DUP_IDS

Double-sorted events: 4


Index(['A23', 'A24'], dtype='object', name='Well')

In [10]:
df_barcode = pd.read_excel(BARCODE_XLSX)
df_barcode.head()

Unnamed: 0,BCset,plateBC,XC_DNBPE,XC_DNBPE_TS,WellID,S8_Sort,gate,phagocytosing no phagocytosing
0,8E,11280436,AGTCACCGAATGCTTGTCGT,AGGTATAGCTTGCTTGTCGT,A1,Low_Phagocytosis_Erythrocytes,,
1,8E,11280436,AGTCACCGAATGTCTGAAGG,AGGTATAGCTTGTCTGAAGG,A10,Low_Phagocytosis_Erythrocytes,,
2,8E,11280436,AGTCACCGAAGCCAAGAATC,AGGTATAGCTGCCAAGAATC,A11,Low_Phagocytosis_Erythrocytes,,
3,8E,11280436,AGTCACCGAAAGCAGTTAGC,AGGTATAGCTAGCAGTTAGC,A12,Low_Phagocytosis_Erythrocytes,,
4,8E,11280436,AGTCACCGAAAAGCTGACCA,AGGTATAGCTAAGCTGACCA,A13,Low_Phagocytosis_Erythrocytes,,


In [11]:
def normalize_well(w):
    if pd.isna(w):
        return None
    row = w[0].upper()
    col = int(w[1:])
    return f"{row}{col:02d}"

In [12]:
df_barcode["Well_norm"] = df_barcode["WellID"].apply(normalize_well)

In [13]:
df_index["Well_norm"] = df_index["Well"].apply(normalize_well)

In [14]:
merged =df_barcode.merge(df_index ,left_on="Well_norm",right_on="Well_norm")
merged

Unnamed: 0,BCset,plateBC,XC_DNBPE,XC_DNBPE_TS,WellID,S8_Sort,gate,phagocytosing no phagocytosing,Well_norm,Well,...,DeltaTime,DropId,SaturatedChannels1,SaturatedChannels2,SpectralEventWidth,EventWidthInDrops,SpectralUnmixingFlags,WaveformPresent,__source_file,double_sorted
0,8E,11280436,AGTCACCGAATGCTTGTCGT,AGGTATAGCTTGCTTGTCGT,A1,Low_Phagocytosis_Erythrocytes,,,A01,A01,...,165914,0,0,0,6962,0,0,0,A1-B10.csv,False
1,8E,11280436,AGTCACCGAATGTCTGAAGG,AGGTATAGCTTGTCTGAAGG,A10,Low_Phagocytosis_Erythrocytes,,,A10,A10,...,37988,0,0,0,5638,0,0,0,A7-A12.csv,False
2,8E,11280436,AGTCACCGAAGCCAAGAATC,AGGTATAGCTGCCAAGAATC,A11,Low_Phagocytosis_Erythrocytes,,,A11,A11,...,2458656,0,0,0,4970,0,0,0,A7-A12.csv,False
3,8E,11280436,AGTCACCGAAAGCAGTTAGC,AGGTATAGCTAGCAGTTAGC,A12,Low_Phagocytosis_Erythrocytes,,,A12,A12,...,269849,0,0,0,6386,0,0,0,A7-A12.csv,False
4,8E,11280436,AGTCACCGAAAAGCTGACCA,AGGTATAGCTAAGCTGACCA,A13,Low_Phagocytosis_Erythrocytes,,,A13,A13,...,2104402,0,0,0,7044,0,0,0,A13-A24.csv,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
381,8E,11280436,GTAGACGAAGGCAAGACTCA,TGCGATTATGGCAAGACTCA,P5,High_Phagocytosis_Leukocytes2,,,P05,P05,...,2503783,0,0,0,5149,0,0,0,O4-P23.csv,False
382,8E,11280436,GTAGACGAAGGACATCACCT,TGCGATTATGGACATCACCT,P6,High_Phagocytosis_Leukocytes2,,,P06,P06,...,2012759,0,0,0,5076,0,0,0,O4-P23.csv,False
383,8E,11280436,GTAGACGAAGCATCAGGATC,TGCGATTATGCATCAGGATC,P7,High_Phagocytosis_Leukocytes2,,,P07,P07,...,3821502,0,0,0,5141,0,0,0,O4-P23.csv,False
384,8E,11280436,GTAGACGAAGACGTCTGAAC,TGCGATTATGACGTCTGAAC,P8,High_Phagocytosis_Leukocytes2,,,P08,P08,...,2390977,0,0,0,4903,0,0,0,O4-P23.csv,False


In [15]:
merged = merged [ merged['double_sorted'] == False]

In [16]:
adata = anndata.read_h5ad( ADATA_PATH )
adata

AnnData object with n_obs × n_vars = 354 × 12323
    obs: 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'geneSum', 'RP[LS]sum', 'n_genes', 'leiden', 'leiden_10_overclustered', 'leiden_10_0.85', 'leiden_10_0.85_renamed', 'celltype'
    var: 'gene_ids', 'feature_types', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'n_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'celltype_colors', 'hvg', 'leiden', 'leiden_10_0.85_renamed_colors', 'leiden_10_0.85_renamed_stats', 'leiden_10_overclustered', 'leiden_10_overclustered_colors', 'leiden_colors', 'leiden_stats', 'log1p', 'neighbors', 'pca', 'umap'
    obsm: 'X_pca', 'X_umap'
    varm: 'PCs'
    obsp: 'connectivities', 'distances'

In [17]:
adata.obs

Unnamed: 0,n_genes_by_counts,log1p_n_genes_by_counts,total_counts,log1p_total_counts,pct_counts_in_top_50_genes,pct_counts_in_top_100_genes,pct_counts_in_top_200_genes,pct_counts_in_top_500_genes,geneSum,RP[LS]sum,n_genes,leiden,leiden_10_overclustered,leiden_10_0.85,leiden_10_0.85_renamed,celltype
AACTGAACACAAGCTGACCA,5318,8.579041,23883.0,10.080964,15.534062,21.525771,29.518905,44.747310,23627.0,0,5266,0,23,23,4,Myeloid / DC-like
AACTGAACACACGTCTGAAC,6768,8.820109,61701.0,11.030071,25.881266,39.524481,51.655565,65.246917,49501.0,0,6710,0,40,45,5,Macrophage
AACTGAACACAGCAGTTAGC,3591,8.186464,13561.0,9.515027,29.311998,37.209645,47.157289,62.030824,12657.0,0,3533,5,12,12,1,Megakaryocyte / Platelet
AACTGAACACAGCCGAGTTA,2710,7.905073,7207.0,8.882947,18.828916,25.697239,35.063133,53.364784,7121.0,0,2671,5,1,1,10,Megakaryocyte (mature)
AACTGAACACAGCGATAACG,5201,8.556798,24477.0,10.105530,18.135392,25.840585,36.258528,52.400212,22100.0,0,5143,5,1,1,10,Megakaryocyte (mature)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTCCTGTGACTCTGGACAGT,3009,8.009695,13992.0,9.546312,35.484563,50.371641,63.221841,74.835620,10894.0,0,2951,6,2,8,0,Hematopoietic progenitor
TTCCTGTGACTGCTTGTCGT,2436,7.798523,10681.0,9.276316,39.134912,53.599850,65.518210,77.062073,8432.0,0,2378,6,2,8,0,Hematopoietic progenitor
TTCCTGTGACTGTCTGAAGG,5410,8.596189,37165.0,10.523149,22.233284,31.564644,43.446791,59.537199,33689.0,0,5352,8,13,13,13,Granulocyte / Neutrophil
TTCCTGTGACTGTCTGACCA,5080,8.533263,30644.0,10.330225,24.928208,33.331158,44.054301,59.939956,28262.0,0,5022,8,13,13,13,Granulocyte / Neutrophil


In [20]:
adata.obs = adata.obs.merge(
    merged,
    right_on="XC_DNBPE_TS",
left_index= True,
    how="left"
).set_index(adata.obs.index)

In [21]:
adata 

AnnData object with n_obs × n_vars = 354 × 12323
    obs: 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'geneSum', 'RP[LS]sum', 'n_genes', 'leiden', 'leiden_10_overclustered', 'leiden_10_0.85', 'leiden_10_0.85_renamed', 'celltype', 'BCset', 'plateBC', 'XC_DNBPE', 'XC_DNBPE_TS', 'WellID', 'S8_Sort', 'gate', 'phagocytosing no phagocytosing', 'Well_norm', 'Well', 'EventID', 'Sort Population', 'ImagePath', 'EventIndex', 'Time', 'FSC-A', 'FSC-H', 'FSC-W', 'SSC (Violet)-A', 'SSC (Violet)-H', 'SSC (Violet)-W', 'LightLoss (Imaging)-A', 'LightLoss (Imaging)-H', 'LightLoss (Imaging)-W', 'SSC (Imaging)-A', 'SSC (Imaging)-H', 'SSC (Imaging)-W', 'LightLoss (Violet)-A', 'LightLoss (Violet)-H', 'LightLoss (Violet)-W', 'E.coli Lysosom Alexa Fluor 488-A', 'L-D eF780-right*-A', 'Size (LightLoss (Imaging))', 'Size (FSC)', 'Size (SSC (Imaging))'

In [22]:
adata = adata[ adata.obs['double_sorted'] ==False].copy()
adata

AnnData object with n_obs × n_vars = 352 × 12323
    obs: 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'geneSum', 'RP[LS]sum', 'n_genes', 'leiden', 'leiden_10_overclustered', 'leiden_10_0.85', 'leiden_10_0.85_renamed', 'celltype', 'BCset', 'plateBC', 'XC_DNBPE', 'XC_DNBPE_TS', 'WellID', 'S8_Sort', 'gate', 'phagocytosing no phagocytosing', 'Well_norm', 'Well', 'EventID', 'Sort Population', 'ImagePath', 'EventIndex', 'Time', 'FSC-A', 'FSC-H', 'FSC-W', 'SSC (Violet)-A', 'SSC (Violet)-H', 'SSC (Violet)-W', 'LightLoss (Imaging)-A', 'LightLoss (Imaging)-H', 'LightLoss (Imaging)-W', 'SSC (Imaging)-A', 'SSC (Imaging)-H', 'SSC (Imaging)-W', 'LightLoss (Violet)-A', 'LightLoss (Violet)-H', 'LightLoss (Violet)-W', 'E.coli Lysosom Alexa Fluor 488-A', 'L-D eF780-right*-A', 'Size (LightLoss (Imaging))', 'Size (FSC)', 'Size (SSC (Imaging))'

In [23]:
adata.obs["double_sorted"] = (
    adata.obs["double_sorted"]
    .map({True: "True", False: "False"})
    .astype("category")
)

In [24]:
def sanitize_adata_keys(adata):
    adata.obs.columns = (
        adata.obs.columns
        .str.replace("/", "_", regex=False)
        .str.replace(" ", "_", regex=False)
    )

    adata.var.columns = (
        adata.var.columns
        .str.replace("/", "_", regex=False)
        .str.replace(" ", "_", regex=False)
    )

    return adata

adata = sanitize_adata_keys(adata)

In [25]:
adata.write_h5ad( OUT_ADATA )
! ls -lh {OUT_ADATA}

-rw-r--r-- 1 ubuntu ubuntu 31M Nov 28 14:51 First_salamander_batch_analyzed_with_facs.h5ad


In [117]:
adata

AnnData object with n_obs × n_vars = 352 × 12323
    obs: 'n_genes_by_counts_x', 'log1p_n_genes_by_counts_x', 'total_counts_x', 'log1p_total_counts_x', 'pct_counts_in_top_50_genes_x', 'pct_counts_in_top_100_genes_x', 'pct_counts_in_top_200_genes_x', 'pct_counts_in_top_500_genes_x', 'geneSum_x', 'RP[LS]sum_x', 'n_genes_x', 'leiden_x', 'leiden_10_overclustered_x', 'leiden_10_0.85_x', 'leiden_10_0.85_renamed_x', 'leiden_10_0.85_0_x', 'leiden_10_0.85_0_renamed_x', 'leiden_10_0.85_1_x', 'leiden_10_0.85_1_renamed_x', 'leiden_10_0.85_2_x', 'leiden_10_0.85_2_renamed_x', 'leiden_10_0.85_3_x', 'leiden_10_0.85_3_renamed_x', 'leiden_10_0.85_4_x', 'leiden_10_0.85_4_renamed_x', 'celltype_x', 'n_genes_by_counts_y', 'log1p_n_genes_by_counts_y', 'total_counts_y', 'log1p_total_counts_y', 'pct_counts_in_top_50_genes_y', 'pct_counts_in_top_100_genes_y', 'pct_counts_in_top_200_genes_y', 'pct_counts_in_top_500_genes_y', 'geneSum_y', 'RP[LS]sum_y', 'n_genes_y', 'leiden_y', 'leiden_10_overclustered_y', 'leide