In [None]:
# Copyright 2025 Sony Corporation

# Notebook for Jaccard evaluation: runs FlowSOM over multiple seeds, computes the Jaccard index, and creates the Table 3 data.

### Create data for multiple SEEDs

In [None]:
import subprocess
import os
import glob
import shutil
import sys
import pandas as pd

In [None]:
# Directory the kernel was started in. Later cells build absolute paths from it
# and chdir back to it after running the evaluation tools.
workdir = os.getcwd()

In [None]:
# File names of the R evaluation scripts. do_evaluate_fsom() changes into its
# eval_tool_path argument before launching, so the name is resolved relative to it.
BL_iter_hang_ev = "evaluate_BL_hang.R"  # NOTE(review): not referenced in the visible cells; confirm it is still needed
FSOM_no_hang_seed_ev = "evaluate_FlowSOM_hang.R"  # script launched by do_evaluate_fsom(); NOTE(review): name says "no_hang" but the file is "..._hang.R"

In [None]:
# --- Experiment configuration ---------------------------------------------
Outdir = "Jaccard_Eval"                       # root folder (under workdir) for the per-seed Jaccard input CSVs
fsom_experiment_name = "jaccard-fsom-result"  # root folder (under workdir) for the FlowSOM label / evaluation files
os.chdir(workdir)
seed_num = 10        # seeds 1..seed_num are run for every (meta, iteration) pair
max_process = 30     # number of RScript processes launched before the batch is awaited
Samusik_div_num = 38 # meta value used for the Samusik_all sample

# sample name -> list of meta values to run; the "-pca" keys are looked up by
# the PCA-initialised cells, the plain keys by the random-initialised cells.
meta_s = {"FP7000_34c":[20],
          "FP7000_34c-pca":[20],
          "Samusik_all":[Samusik_div_num],
          "Samusik_all-pca":[Samusik_div_num],
          }

# Iteration values forwarded to the FlowSOM run script.
it_list = [10]
# Samples for which the Jaccard index is calculated and summarised.
# NOTE(review): "Samusik_all-pca" has an entry in meta_s but is not listed here; confirm this is intentional.
clust_sample_list =["FP7000_34c","FP7000_34c-pca","Samusik_all"]

### set sample

In [None]:
# Input .fcs files, one per benchmark sample, located under
# <workdir>/benchmark_data_sets/<sample>/data/.
Samplefcses = [
    os.path.join(workdir,"benchmark_data_sets","Samusik_all","data","Samusik_all.fcs"),
    os.path.join(workdir,"benchmark_data_sets","FP7000_34c","data","FP7000_34c.fcs"),
]
# Keep the scratch name bound to the last path, as the append-based version did.
Samplefcs = Samplefcses[-1]

## experiment of flowsom

In [None]:
def do_experiment_flowsom(fsom_exp_path, mode, sample_name,Samplefcs,meta_list, it_list,seed_num):
    """Run the FlowSOM R script once per (meta, iteration, seed) combination.

    Output directories are created as ``fsom_exp_path/<meta>/<iteration>``.
    Processes are started in batches of ``max_process`` (module-level setting);
    every process of a batch is awaited before the next batch starts. A warning
    is printed for each process that exits with a non-zero code.

    The R script path is relative, so the current working directory must be the
    directory that contains ``cytometry-clustering-comparison``.

    Args:
        fsom_exp_path: root directory for the results (created if missing).
        mode: forwarded to the R script as its last argument (the callers in
            this notebook pass "RAND" or "PCA").
        sample_name: sample label forwarded to the R script.
        Samplefcs: path of the input .fcs file.
        meta_list: integer meta values to run.
        it_list: integer iteration values to run.
        seed_num: seeds 1..seed_num are run for every (meta, iteration) pair.
    """
    r_script = "cytometry-clustering-comparison/run_methods/run_FlowSOM2_variable_iter_seed_meta_sample_jaccard.R"
    os.makedirs(fsom_exp_path,exist_ok=True)
    total_jobs = len(meta_list) * seed_num * len(it_list)
    running = []  # (argv, process) pairs of the batch currently in flight
    launched = 0
    for meta in meta_list:
        print(meta)
        meta_dir = os.path.join(fsom_exp_path, "%d"%meta)  # exp_path/{meta}
        os.makedirs(meta_dir,exist_ok=True)
        for it in it_list:
            it_dir = os.path.join(meta_dir, "%d"%it)  # exp_path/{meta}/{iteration}
            os.makedirs(it_dir,exist_ok=True)
            for seed in range(1,seed_num+1):
                # Build the argument vector directly instead of splitting a
                # formatted string, so paths containing spaces stay intact.
                argv = [
                    "RScript.exe",
                    r_script,
                    "%s"%Samplefcs,
                    it_dir,
                    "%d"%it,
                    "%d"%seed,
                    "%d"%meta,
                    "%s"%sample_name,
                    "%s"%mode,
                ]
                running.append((argv, subprocess.Popen(argv)))
                launched += 1
                # Drain the batch when it is full or when the last job was started.
                if launched % max_process == 0 or launched == total_jobs:
                    for job_argv, proc in running:
                        if proc.wait() != 0:
                            print("WARNING: command failed (exit code %s): %s"%(proc.returncode, " ".join(job_argv)))
                    running = []

In [None]:
def do_evaluate_fsom(fsom_exp_path,eval_tool_path, Samplefcs,meta_list, it_list,seed_num):
    """Run the FlowSOM evaluation R script for every (meta, iteration, seed) label file.

    The function changes the working directory to ``eval_tool_path`` (where the
    script named by the module-level ``FSOM_no_hang_seed_ev`` is looked up) and
    does NOT change it back; callers restore it themselves. Processes are
    started in batches of ``max_process`` (module-level setting) and each batch
    is awaited before the next one starts. A warning is printed for each
    process that exits with a non-zero code.

    Args:
        fsom_exp_path: root directory holding ``<meta>/<iteration>/`` label files.
        eval_tool_path: directory to chdir into before launching the script.
        Samplefcs: path of the .fcs file passed to the script.
        meta_list: integer meta values to evaluate.
        it_list: integer iteration values to evaluate.
        seed_num: seeds 1..seed_num are evaluated for every (meta, iteration) pair.
    """
    os.chdir(eval_tool_path)
    total_jobs = len(meta_list)*len(it_list)*seed_num
    running = []  # (argv, process) pairs of the batch currently in flight
    launched = 0
    for meta in meta_list:
        print(meta)
        meta_dir = os.path.join(fsom_exp_path, "%d"%meta)  # exp_path/{meta}
        for it in it_list:
            it_dir = os.path.join(meta_dir, "%d"%it)
            for seed in range(1,seed_num + 1):
                # Build the argument vector directly instead of splitting a
                # formatted string, so paths containing spaces stay intact.
                argv = [
                    "RScript.exe",
                    "%s"%FSOM_no_hang_seed_ev,
                    "%s"%Samplefcs,
                    "%s/FlowSOM_labels_seed_%d_iter_%d_meta_%d.txt"%(it_dir,seed,it,meta),
                    # presumably the result file written by the script; verify against the R code
                    "%s/vseed_f1hang_%03d_iter_%d_meta_%d.resh"%(it_dir,seed,it,meta),
                ]
                running.append((argv, subprocess.Popen(argv)))
                launched += 1
                # Drain the batch when it is full or when the last job was started.
                if launched % max_process == 0 or launched == total_jobs:
                    for job_argv, proc in running:
                        if proc.wait() != 0:
                            print("WARNING: command failed (exit code %s): %s"%(proc.returncode, " ".join(job_argv)))
                    running = []
    print("done")

In [None]:
os.chdir(workdir)

#do with RAND init
# For every sample: run FlowSOM with random initialisation, then evaluate the
# resulting label files.
eval_tool_path = os.path.join(workdir,"cytometry-clustering-comparison/evaluate_results")
for Samplefcs in Samplefcses:
    mode = "RAND" #RAND/PCA
    sample_name = os.path.splitext(os.path.basename(Samplefcs))[0]
    print(sample_name)
    # Index instead of .get(): an unknown sample fails here with a clear
    # KeyError rather than later with a TypeError on None.
    meta_list = meta_s[sample_name]
    fsom_exp_path = os.path.join(workdir,fsom_experiment_name,sample_name)
    try:
        # do BL-flowsom experiments
        do_experiment_flowsom(fsom_exp_path,mode, sample_name,Samplefcs,meta_list,it_list,seed_num)
        do_evaluate_fsom(fsom_exp_path,eval_tool_path,Samplefcs,meta_list,it_list,seed_num)
    finally:
        # do_evaluate_fsom() changes the working directory. Restore it even on
        # failure, because the run script is addressed relative to workdir.
        os.chdir(workdir)

In [None]:
#do with PCA init
# For every sample: run FlowSOM with PCA initialisation, then evaluate the
# resulting label files. Results are stored under "<sample>-pca".
# Start from workdir explicitly so this cell does not depend on the working
# directory left behind by whichever cell ran before it.
os.chdir(workdir)
eval_tool_path = os.path.join(workdir,"cytometry-clustering-comparison/evaluate_results")
for Samplefcs in Samplefcses:
    mode = "PCA" #RAND/PCA
    sample_name = os.path.splitext(os.path.basename(Samplefcs))[0]
    sample_name = sample_name + "-pca"
    # Index instead of .get(): an unknown sample fails here with a clear
    # KeyError rather than later with a TypeError on None.
    meta_list = meta_s[sample_name]
    fsom_exp_path = os.path.join(workdir,fsom_experiment_name,sample_name)
    try:
        # do BL-flowsom experiments
        do_experiment_flowsom(fsom_exp_path,mode, sample_name,Samplefcs,meta_list,it_list,seed_num)
        do_evaluate_fsom(fsom_exp_path,eval_tool_path,Samplefcs,meta_list,it_list,seed_num)
    finally:
        # do_evaluate_fsom() changes the working directory. Restore it even on
        # failure, because the run script is addressed relative to workdir.
        os.chdir(workdir)

### make jaccard file

In [None]:
def df2jacdf(indf):
    """Convert a one-label-per-row frame into a one-cluster-per-row membership frame.

    Column ``0`` of ``indf`` holds the cluster label of each row. The result
    has one row per label ``1..max_label`` (row index = label). Each row lists
    the index values of ``indf`` that carry that label, shifted by +1 (so
    1-based row numbers for a default index), padded with NaN to the size of
    the largest cluster. Labels with no members yield an all-NaN row. Labels
    below 1 and missing labels (NaN) are ignored.

    Args:
        indf: DataFrame whose column ``0`` contains the cluster labels.

    Returns:
        DataFrame of member indices, indexed by cluster label starting at 1.
        An empty DataFrame when ``indf`` has no (non-missing) labels.
    """
    labels = indf[0].dropna()
    if labels.empty:
        # Nothing to group; mirrors the empty result produced when no label is >= 1.
        return pd.DataFrame()
    # int() keeps range() working when NaN rows forced the column to float.
    max_label = int(labels.max())
    # Single pass over the frame: label -> Index of the rows carrying it.
    members = indf.groupby(indf[0]).groups
    index_list = [
        (members[label] + 1).tolist() if label in members else []
        for label in range(1, max_label + 1)
    ]
    df = pd.DataFrame(index_list)
    df.index = df.index + 1  # row index = cluster label (1-based)
    return df

In [None]:
# random init
# Convert every per-seed FlowSOM label file (cluster level first, then meta
# level) into a CSV with one column per cluster listing its member indices.
for Sample in Samplefcses:
    sample_name = os.path.splitext(os.path.basename(Sample))[0]
    print(sample_name)
    meta_list = meta_s.get(sample_name)
    # Only the first configured meta / iteration value is converted.
    meta = str(meta_list[0])
    it = str(it_list[0])

    base_dir = os.path.join(workdir,fsom_experiment_name,sample_name,meta,it)
    out_dir = os.path.join(workdir, Outdir,"FSOM",sample_name)
    os.makedirs(out_dir,exist_ok=True)

    for level in ("clust", "meta"):
        for seed in range(1,seed_num+1):
            if level == "clust":
                in_name = "FlowSOM_labels_seed_%d_iter_%s_clust.txt"%(seed,it)
                out_filename = "fsom_%s_jaccard_%d_clust.csv"%(sample_name,seed)
            else:
                in_name = "FlowSOM_labels_seed_%d_iter_%s_meta_%s.txt"%(seed,it,meta)
                out_filename = "fsom_%s_jaccard_%d_meta.csv"%(sample_name,seed)
            in_file = os.path.join(base_dir, in_name)
            out_path = os.path.join(out_dir, out_filename)

            # The first line of the label file is skipped (presumably a header).
            labels = pd.read_csv(in_file, header=None,skiprows=1)
            # Transpose so that each cluster becomes a column of the CSV.
            df2jacdf(labels).T.to_csv(out_path, index=False, header=True)
print("done")

In [None]:
# PCA init
# Same conversion as for the random-init results, applied to the "<sample>-pca"
# result folders: cluster-level files first, then meta-level files.
for Sample in Samplefcses:
    sample_name = os.path.splitext(os.path.basename(Sample))[0] + "-pca"
    print(sample_name)
    meta_list = meta_s.get(sample_name)
    # Only the first configured meta / iteration value is converted.
    meta = str(meta_list[0])
    it = str(it_list[0])

    base_dir = os.path.join(workdir,fsom_experiment_name,sample_name,meta,it)
    out_dir = os.path.join(workdir, Outdir,"FSOM",sample_name)
    os.makedirs(out_dir,exist_ok=True)

    for level in ("clust", "meta"):
        for seed in range(1,seed_num+1):
            if level == "clust":
                in_name = "FlowSOM_labels_seed_%d_iter_%s_clust.txt"%(seed,it)
                out_filename = "fsom_%s_jaccard_%d_clust.csv"%(sample_name,seed)
            else:
                in_name = "FlowSOM_labels_seed_%d_iter_%s_meta_%s.txt"%(seed,it,meta)
                out_filename = "fsom_%s_jaccard_%d_meta.csv"%(sample_name,seed)
            in_file = os.path.join(base_dir, in_name)
            out_path = os.path.join(out_dir, out_filename)

            # The first line of the label file is skipped (presumably a header).
            labels = pd.read_csv(in_file, header=None,skiprows=1)
            # Transpose so that each cluster becomes a column of the CSV.
            df2jacdf(labels).T.to_csv(out_path, index=False, header=True)
print("done")

# Calculate the Jaccard index

In [None]:
from multiprocessing import Process

In [None]:
from workers import calc_jaccard_clust,calc_jaccard_meta # import from workers for multi-process

In [None]:
# Start one worker process per sample for the cluster-level Jaccard index and
# one per sample for the meta-level Jaccard index, then wait for all of them.
fold_num = 10  # same value for every worker; NOTE(review): meaning is defined in workers.py
proc_list = []
for sample_name in clust_sample_list:
    div_num = 100  # cluster-level runs always use 100; NOTE(review): presumably the number of SOM clusters, confirm in workers.py
    proc = Process(target=calc_jaccard_clust,
                   args=(Outdir,workdir,fold_num, div_num, sample_name),
                   name="calc_jaccard_clust[%s]"%sample_name)
    proc.start()
    proc_list.append(proc)

for sample_name in clust_sample_list:
    if sample_name == "Samusik_all":
        div_num = Samusik_div_num
    else:
        div_num = 20 # for FP7000_34c
    proc = Process(target=calc_jaccard_meta,
                   args=(Outdir,workdir,fold_num, div_num, sample_name),
                   name="calc_jaccard_meta[%s]"%sample_name)
    proc.start()
    proc_list.append(proc)

for p in proc_list:
    p.join()
    # Report a failed worker so a missing result file is not first noticed
    # when a later cell tries to read it.
    if p.exitcode != 0:
        print("WARNING: %s exited with code %s"%(p.name, p.exitcode))

## Calculate the mean and standard deviation of the Jaccard index for each sample

In [None]:
# Print summary statistics of the Jaccard index for each sample: cluster-level
# results first, then meta-level results. The files are read relative to the
# current working directory.
for sample_name in clust_sample_list:
    for suffix in ("_jaccard_fsom_clust.csv", "_jaccard_fsom_meta.csv"):
        file = sample_name + suffix
        dat = pd.read_csv(file)
        print(file)
        print(dat["4"].describe()) #4 is jaccard index