### Nonredundant set of 1065 clones - validation of RF models
- Three RF models were trained (in a separate Slurm job) on the 1065 clones; each model was trained on two measurements and tested on the remaining one.
- Here we quantify the accuracy of each model on its held-out measurement.

In [35]:
import pandas as pd
from sklearn.metrics import accuracy_score, confusion_matrix

In [36]:
# Output location of the melted confusion matrix table
# (presumably source data for manuscript Fig. S6, judging by the file name).
savepath_confusion_matrix = (
    "/nobackup/lab_kubicek/jreinis/2023-10-17_manuscript_figures_source_data/"
    "figS6_1065_clones_confusion_matrix.tsv"
)

In [37]:
# Measurement ids; each one serves in turn as the validation measurement
# while the other two are the training measurements.
measurements = [1, 2, 3]

In [38]:
# Leave-one-measurement-out validation summary.
# For every validation measurement: load the per-cell predictions, show a random sample,
# print the overall accuracy, and collect the confusion matrix in long ("melted") format.

# Clone labels are assumed to be the integers 0 .. n_clones-1 (the long-format table
# identifies clones by these ids) -- TODO confirm against the training job.
n_clones = 1065
clone_labels = list(range(n_clones))

cf_collector = []  # one long-format confusion matrix per validation measurement
preds_dict = {}    # validation measurement -> DataFrame of per-cell predictions

for measurement in measurements:
    # The two measurements that were used for training when `measurement` is held out.
    meas_train = [x for x in measurements if x != measurement]
    preds_path = f"/nobackup/lab_kubicek/jreinis/2023-09-18_train_RF_as_many_clones_as_possible/RF_preds_1measurement_validation/1065_nonredundant_clones_measurement{measurement}_predictions.pickle"
    preds = pd.read_pickle(preds_path)
    preds_dict[measurement] = preds
    display(preds.sample(n=10))
    print(accuracy_score(preds["clone_label"], preds["clone_label_pred"]))

    # Fail loudly if a label falls outside the expected id range; with an explicit
    # `labels=` argument such cells would otherwise be dropped from the matrix silently.
    observed_labels = set(preds["clone_label"]).union(preds["clone_label_pred"])
    unexpected_labels = observed_labels.difference(clone_labels)
    if unexpected_labels:
        raise ValueError(
            f"measurement {measurement}: labels outside 0..{n_clones - 1}: {sorted(unexpected_labels)[:10]}"
        )

    # Pass `labels` explicitly so that row/column i always corresponds to clone i.
    # Without it, sklearn orders the matrix by the labels that happen to occur in the
    # data, so a clone missing from a measurement would shift every following id
    # (or make the melt below fail on missing columns).
    cf_wide = pd.DataFrame(
        confusion_matrix(preds["clone_label"], preds["clone_label_pred"], labels=clone_labels),
        index=clone_labels,
        columns=clone_labels,
    )
    cf = (
        cf_wide
        .reset_index()
        .rename(columns={"index": "label_clone_true"})
        .melt(
            id_vars=["label_clone_true"],
            value_vars=clone_labels,
            var_name="label_clone_predicted",
            value_name="n_cells",
        )
    )
    cf["train_measurements"] = ", ".join(str(x) for x in meas_train)
    cf["validate_measurement"] = measurement
    cf_collector.append(cf[["train_measurements", "validate_measurement", "label_clone_true", "label_clone_predicted", "n_cells"]])

Unnamed: 0,plate,measurement,image,row,column,FOV,ImageNumber_cell,ObjectNumber_cell,clone_label,clone_label_pred
567467,JR_20220119_HAP1cancer_plate1,1,r16c24f02p01,16,24,2,2683,12,104,104
200042,JR_20200420_HAP1_cancer_plate5,1,r09c04f04p01,9,4,4,979,6,608,608
557985,JR_20220119_HAP1cancer_plate1,1,r14c18f03p01,14,18,3,2306,17,91,91
518289,JR_20220119_HAP1cancer_plate1,1,r06c20f04p01,6,20,4,977,4,42,42
379722,JR_20210630_38_multicolor_plate6,1,r14c06f05p01,14,6,5,1907,50,881,881
435221,JR_20211021_doubleHEKs_plate2,1,r15c04f03p01,15,4,3,2037,16,977,977
332118,JR_20210630_38_multicolor_plate4,1,r06c10f06p01,6,10,6,780,9,802,802
270172,JR_20210630_38_multicolor_plate1,1,r12c20f06p01,12,20,6,1704,52,700,700
775295,JR_20220119_HAP1cancer_plate3,1,r15c10f02p01,15,10,2,2417,3,339,339
503877,JR_20220119_HAP1cancer_plate1,1,r04c16f03p01,4,16,3,612,18,25,25


0.9474662992262552


Unnamed: 0,plate,measurement,image,row,column,FOV,ImageNumber_cell,ObjectNumber_cell,clone_label,clone_label_pred
593225,JR_20220119_HAP1cancer_plate2,2,r05c07f04p01,5,7,4,718,47,133,133
636292,JR_20220119_HAP1cancer_plate2,2,r10c23f05p01,10,23,5,1671,51,180,180
439132,JR_20211021_doubleHEKs_plate2,2,r16c16f07p01,16,16,7,2632,57,983,983
762009,JR_20220119_HAP1cancer_plate3,2,r13c04f07p01,13,4,7,2044,17,323,323
422507,JR_20211021_doubleHEKs_plate2,2,r09c20f03p01,9,20,3,1480,55,958,958
357781,JR_20210630_38_multicolor_plate5,2,r13c01f03p01,13,1,3,867,122,843,843
33507,JR_20200420_HAP1_cancer_plate4,2,r05c01f02p01,5,1,2,578,49,392,392
316090,JR_20210630_38_multicolor_plate3,2,r14c14f01p01,14,14,1,1301,6,775,775
534827,JR_20220119_HAP1cancer_plate1,2,r10c04f07p01,10,4,7,1540,8,62,62
349658,JR_20210630_38_multicolor_plate5,2,r04c17f02p01,4,17,2,266,43,829,829


0.9612556456901189


Unnamed: 0,plate,measurement,image,row,column,FOV,ImageNumber_cell,ObjectNumber_cell,clone_label,clone_label_pred
515021,JR_20220119_HAP1cancer_plate1,3,r06c05f05p01,6,5,5,749,97,37,37
664811,JR_20220119_HAP1cancer_plate2,3,r14c11f01p01,14,11,1,1933,49,211,211
470993,JR_20211116_doubleHEKs2_plate2,3,r03c12f02p01,3,12,2,356,35,1038,1038
582981,JR_20220119_HAP1cancer_plate2,3,r03c18f01p01,3,18,1,391,36,120,120
193154,JR_20200420_HAP1_cancer_plate5,3,r08c06f05p01,8,6,5,1043,7,599,599
521595,JR_20220119_HAP1cancer_plate1,3,r07c11f04p01,7,11,4,928,96,45,45
218442,JR_20200420_HAP1_cancer_plate5,3,r11c04f04p01,11,4,4,1462,57,630,630
786019,JR_20220119_HAP1cancer_plate3,3,r16c19f03p01,16,19,3,2271,42,351,351
112317,JR_20200420_HAP1_cancer_plate4,3,r14c13f03p01,14,13,3,1947,7,492,492
120582,JR_20200420_HAP1_cancer_plate4,3,r16c16f04p01,16,16,4,2254,28,503,503


0.9345917151127435


### Cell counts

In [39]:
# Per validation measurement, print: measurement id, number of distinct true clone
# labels, and the median number of cells per clone.
for meas in measurements:
    true_labels = preds_dict[meas]["clone_label"]
    print(meas, true_labels.nunique(), true_labels.value_counts().median())

1 1063 78.0
2 1065 215.0
3 1065 360.0


### Save melted confusion matrix for all settings

In [45]:
# Stack the per-measurement confusion matrices with a fresh 0..N-1 index, keep only the
# (true, predicted) pairs that contain at least one cell, and write them as a TSV table.
cf_merged = pd.concat(cf_collector, ignore_index=True).loc[
    lambda frame: frame["n_cells"] > 0
]
cf_merged.to_csv(savepath_confusion_matrix, sep="\t", index=False)
cf_merged

Unnamed: 0,train_measurements,validate_measurement,label_clone_true,label_clone_predicted,n_cells
0,"2, 3",1,0,0,151
25,"2, 3",1,25,0,1
70,"2, 3",1,70,0,1
111,"2, 3",1,111,0,2
145,"2, 3",1,145,0,5
...,...,...,...,...,...
3402597,"1, 2",3,987,1064,10
3402616,"1, 2",3,1006,1064,1
3402640,"1, 2",3,1030,1064,3
3402651,"1, 2",3,1041,1064,1
