In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import treeswift
import math
import re

In [2]:
%load_ext autoreload
%autoreload 2
    
import helpers.utils
from helpers.utils import build_summary_df
from helpers.utils import plot_genotype_confidence
from helpers.utils import clustermap_genos
from helpers.utils import distdict_to_df, leaf_pairs, get_geno_dict
from helpers.utils import im_ehd, empirical_site_dists, ehd, sm_ehd, pair_metrics
from helpers.utils import plot_concordance_scatterplot, plot_concordance_distribution
from helpers.utils import plot_state_counts, report_genotype_call_stats, save_df_to_pdf
from helpers.utils import plot_correlation, plot_bl_variance, add_internal_labels, branch_table
from helpers.utils import plot_tree_3d, edge_ratio_table, plot_genotypecall_summary

Remember not to re-run the process petracer code... it reshuffles the cell names!

The data were subset to clone 2 after `target_idx` had been assigned, so `character_idx` and `target_idx` have to be remapped.

In [4]:
# import sys
# sys.path.insert(0, "/Users/gc3045/git/fast-laml/scripts") 
# import euclidean_solver as es

In [45]:
# Directory prefix for the input files; it ends with "/" so file names can be appended directly.
# NOTE(review): absolute, machine-specific path — adjust before running on another machine.
inputs_basename = "/Users/gc3045/git/laml2-experiments/real_data/PEtracer/inputs/"
# Common filename prefix of the run outputs; suffixes such as "_tree.newick" are appended to it.
lamlpro_prefix = "/Users/gc3045/git/laml2-experiments/real_data/PEtracer/runjobs/outputs_petracer_seq_validation_clone2_zrot/fastlaml_clone2.neighbor_joining"

In [46]:
# File paths derived from inputs_basename (input tables) and lamlpro_prefix (run outputs).
# NOTE(review): the "lp_" / "bm_" prefixes presumably distinguish the two methods being compared — confirm.
lp_input_argmax = inputs_basename + "petracer_clone2_kde_character_matrix.csv"
bm_input_geno = inputs_basename + "petracer_clone2_petracer_genotypes.csv"
all_bm_input_geno = inputs_basename + "petracer_full_training.csv"
lp_map_geno = lamlpro_prefix + "_posterior_argmax.csv"

# Tree files (.nwk / .newick): one under the inputs directory, one written next to the run outputs.
bm_treefile = inputs_basename + "trees/seq_validation/clone2.neighbor_joining.nwk"
lp_treefile = lamlpro_prefix + "_tree.newick"

In [47]:
# Path of the clone-2 KDE score table.
# inputs_basename already ends with "/", so the file name is appended without a
# leading slash (avoids the "inputs//petracer_..." double separator).
lp_input_geno = inputs_basename + "petracer_clone2_kde_scores.csv"

#### Read inputs

In [48]:
lp_input_argmax_df = pd.read_csv(lp_input_argmax)

In [122]:
# Codebook table with columns target_idx, kde_pred_string and kde_pred_label.
# inputs_basename already ends with "/", so the file name is appended without a
# leading slash (avoids the "inputs//lookup_codebook.csv" double separator).
lookup_codebook = pd.read_csv(inputs_basename + "lookup_codebook.csv")

In [123]:
lookup_codebook

Unnamed: 0,target_idx,kde_pred_string,kde_pred_label
0,0,ACTCC,2
1,0,TATAT,5
2,1,unedited,0
3,1,CTCTC,4
4,1,CTTTG,5
...,...,...,...
200,59,unedited,0
201,59,ACAAT,1
202,59,ATTCG,4
203,59,CCCTA,5


In [50]:
petracer_full_traindf = pd.read_csv(all_bm_input_geno)

In [51]:
lp_input_geno_df = pd.read_csv(lp_input_geno)

In [144]:
# Clone-2 genotype table, read with pandas defaults.
bm_input_geno_df = pd.read_csv(bm_input_geno)
# Posterior-argmax matrix: the first two lines of the file are skipped and the
# first column becomes the row index.
# NOTE(review): presumably the skipped lines are a preamble written by the solver — confirm.
lp_map_geno_df = pd.read_csv(lp_map_geno, skiprows=2, index_col=0)

In [145]:
# target_idx was not mapped to enumerated values upstream, so rank the distinct
# target_idx values in ascending order (0, 1, 2, ...) and keep that rank in a
# separate column, leaving the original target_idx untouched.
unique_indices = list(bm_input_geno_df['target_idx'].unique())
unique_indices.sort()
lookup_map = dict(zip(unique_indices, range(len(unique_indices))))

bm_input_geno_df['lookup_target_idx'] = bm_input_geno_df['target_idx'].map(lookup_map)

In [146]:
bm_input_geno_df['seq_state'].isna().sum()

np.int64(0)

#### Inspect a single cell as sanity check

In [147]:
lp_map_geno_df.loc['4T1_preedited-1420']

character_0    -1
character_1    -1
character_2    -1
character_3    -1
character_4    -1
character_5    -1
character_6    -1
character_7    -1
character_8    -1
character_9    -1
character_10   -1
character_11   -1
character_12   -1
character_13   -1
character_14   -1
character_15    1
character_16    7
character_17    8
character_18    1
character_19    4
character_20    8
character_21    8
character_22    5
character_23    1
character_24   -1
character_25   -1
character_26   -1
character_27   -1
character_28   -1
character_29   -1
character_30    3
character_31    4
character_32    2
character_33   -1
character_34   -1
character_35   -1
character_36    1
character_37    2
character_38    6
character_39   -1
character_40   -1
character_41   -1
character_42   -1
character_43   -1
character_44   -1
character_45    7
character_46    1
character_47    1
character_48   -1
character_49   -1
character_50   -1
character_51   -1
character_52   -1
character_53   -1
character_54    3
character_

In [152]:
# Sanity check for one cell: decode its kde_pred_label at every target site into
# an edit string via the codebook and show it next to the other state calls.
mask = bm_input_geno_df['cellBC'] == '4T1_preedited-1420'
bm_input_geno_df['code'] = bm_input_geno_df['kde_pred_label']

# Build the (target_idx, code) -> lp_string lookup here from lookup_codebook
# instead of relying on `code_map`: that name is only created in a later cell,
# so it is undefined when the notebook is run top to bottom on a fresh kernel.
cell_code_map = (lookup_codebook
                 .rename(columns={'kde_pred_label': 'code', 'kde_pred_string': 'lp_string'})
                 [['target_idx', 'code', 'lp_string']]
                 .copy())
# Same nullable-integer key types that the later code_map construction enforces.
cell_code_map['target_idx'] = pd.to_numeric(cell_code_map['target_idx'], errors='coerce').astype('Int64')
cell_code_map['code'] = pd.to_numeric(cell_code_map['code'], errors='coerce').astype('Int64')

tmp = bm_input_geno_df.loc[mask]
# Left join keeps every row of the selected cell; unmatched codes get NaN in lp_string.
tmp = tmp.merge(cell_code_map, on=['target_idx', 'code'], how='left')
tmp.sort_values('target_idx')

Unnamed: 0,cellBC,intID,clone,target_site,pet_state,seq_state,brightest_state,pet_prob,feature_0,feature_1,...,state5_prob,state6_prob,state7_prob,state8_prob,cassette_idx,target_idx,kde_pred_label,lookup_target_idx,code,lp_string
14,4T1_preedited-1420,intID1427,2,RNF2,ACAGT,ACAGT,ACAGT,0.999709,1238.569,168.804,...,-1.963672,-3.60859,-14.709714,-1.68307,5,15,1,15,1,ACAGT
0,4T1_preedited-1420,intID1427,2,HEK3,GCAAG,GCAAG,GCAAG,0.991975,215.313,128.766,...,-3.2332,-3.895794,-0.789175,-9.728599,5,16,7,16,7,GCAAG
7,4T1_preedited-1420,intID1427,2,EMX1,GGACA,GGACA,GGACA,0.996284,624.457,147.204,...,-2.901614,-6.748714,-3.927557,0.181983,5,17,8,17,8,GGACA
15,4T1_preedited-1420,intID1469,2,RNF2,ACAGT,ACAGT,ACAGT,0.999899,2367.686,262.06,...,-6.994211,-13.082957,-72.687009,-8.686134,6,18,1,18,1,ACAGT
1,4T1_preedited-1420,intID1469,2,HEK3,CTCTC,CTCTC,CTCTC,0.760447,208.722,403.199,...,-5.315813,-5.038218,-3.66065,-4.871158,6,19,4,19,4,CTCTC
8,4T1_preedited-1420,intID1469,2,EMX1,GGACA,GGACA,GGACA,0.997755,997.479,250.467,...,-14.311987,-10.380438,-18.15437,0.038359,6,20,8,20,8,GGACA
16,4T1_preedited-1420,intID1584,2,RNF2,TTCCT,TTCCT,TTCCT,0.984526,470.801,185.33,...,-1.527905,-5.171654,-10.674471,1.418322,7,21,8,21,8,TTCCT
2,4T1_preedited-1420,intID1584,2,HEK3,CTTTG,CTTTG,CTTTG,0.999422,166.978,148.886,...,3.336114,-4.322587,-3.700636,-10.80035,7,22,5,22,5,CTTTG
9,4T1_preedited-1420,intID1584,2,EMX1,ACAAT,ACAAT,ACAAT,0.97738,227.827,1339.118,...,-4.880692,-5.961624,-0.819885,-2.844193,7,23,1,23,1,ACAAT
17,4T1_preedited-1420,intID1882,2,RNF2,ACTTA,ACTTA,ACTTA,0.997825,212.464,1090.734,...,-2.975991,-1.651909,-6.449576,-1.430001,10,30,1,30,1,ACAGT


In [150]:
bm_input_geno_df['seq_state'].unique()

array(['CTCTC', 'ATCAA', 'AATCG', 'CTTTG', 'GCAAG', 'ATTTA', 'GCGCC',
       'AGTAC', 'ACAAT', 'CCTTT', 'GGACA', 'CCCTA', 'CCGAT', 'ATTCG',
       'ACTCC', 'TCCAA', 'TTCCT', 'GTTCA', 'ACAGT', 'ACTTA', 'TATAT',
       'TGCCA'], dtype=object)

In [151]:
pd.read_csv("/Users/gc3045/git/laml2-experiments/real_data/PEtracer/PEtracer raw data/edit_codebook.csv")

Unnamed: 0,site,edit,bit
0,HEK3,GATAG,r25
1,HEK3,AATCG,r26
2,HEK3,GCAAG,r27
3,HEK3,GCGCC,r28
4,HEK3,CTTTG,r29
5,HEK3,ATCAA,r30
6,HEK3,CTCTC,r31
7,HEK3,ATTTA,r32
8,EMX1,GGACA,r33
9,EMX1,ACAAT,r34


In [156]:
# --- 1) Codebook renamed to the (target_idx, code, lp_string) vocabulary ---
code_map = lookup_codebook.rename(
    columns={'kde_pred_label': 'code', 'kde_pred_string': 'lp_string'}
)
# Coerce both key columns to nullable integers (unparseable values become <NA>).
code_map = code_map.assign(
    target_idx=pd.to_numeric(code_map['target_idx'], errors='coerce').astype('Int64'),
    code=pd.to_numeric(code_map['code'], errors='coerce').astype('Int64'),
)

# --- 2) lp_map_geno_df: wide -> long, keyed by an integer target_idx ---
lp = lp_map_geno_df.copy()
# String row labels, with the index always named 'node'.
lp.index = lp.index.astype(str).rename('node')

# 'character_k' -> k: the first run of digits in each column label, as an int.
col_map = dict(zip(
    lp.columns,
    (int(re.search(r'(\d+)', str(name)).group(1)) for name in lp.columns),
))
lp = lp.rename(columns=col_map)

# One row per (node, column) pair; the stacked column level comes out as 'level_1'.
lp_long = lp.stack().rename('code').reset_index()
lp_long = lp_long.rename(columns={'node': 'cell_name', 'level_1': 'target_idx'})
lp_long = lp_long.assign(
    target_idx=pd.to_numeric(lp_long['target_idx'], errors='coerce').astype('Int64'),
    code=pd.to_numeric(lp_long['code'], errors='coerce').astype('Int64'),
)

# --- 3) Numeric code -> edit string via the codebook ---
# Left join: (target_idx, code) pairs absent from the codebook keep NaN in lp_string.
lp_long = lp_long.merge(
    code_map[['target_idx', 'code', 'lp_string']],
    how='left',
    on=['target_idx', 'code'],
)

# A code of -1 is labelled 'missing' explicitly.
lp_long.loc[lp_long['code'] == -1, 'lp_string'] = 'missing'

In [160]:
mask = lp_long['lp_string'].isna()
lp_long.loc[mask]

Unnamed: 0,cell_name,target_idx,code,lp_string
1619,4T1_preedited-2154,59,8,
1679,4T1_preedited-8365,59,8,
13929,internal_232,9,0,
13933,internal_232,13,0,
13935,internal_232,15,0,
...,...,...,...,...
24853,internal_414,13,0,
25273,internal_421,13,0,
25393,internal_423,13,0,
25513,internal_425,13,0,


In [162]:
lookup_codebook.loc[lookup_codebook['target_idx'] == 59]

Unnamed: 0,target_idx,kde_pred_string,kde_pred_label
200,59,unedited,0
201,59,ACAAT,1
202,59,ATTCG,4
203,59,CCCTA,5
204,59,CCTTT,7


In [99]:


# Any lp_string that is still NaN (no mapping found for its code) is labelled 'unknown'.
lp_long.loc[lp_long['lp_string'].isna(), 'lp_string'] = 'unknown'

# --- 4) Long and wide outputs ---
# Long form: one row per (cell_name, target_idx) with its decoded string.
lp_long_out = lp_long.loc[:, ['cell_name', 'target_idx', 'lp_string']].copy()

# Wide form: one row per cell_name, one column per target_idx, columns in sorted order.
lp_wide_out = lp_long_out.pivot(index='cell_name', columns='target_idx', values='lp_string')
lp_wide_out = lp_wide_out.sort_index(axis=1)


In [100]:
# Keep only rows whose cell_name does not start with 'internal'
# (presumably internal tree nodes rather than observed cells — confirm).
mask = ~lp_long_out['cell_name'].astype(str).str.startswith('internal', na=False)
# .copy() so lp_out is an independent frame rather than a view of lp_long_out.
lp_out = lp_long_out.loc[mask].copy()

In [101]:
lp_out

Unnamed: 0,cell_name,target_idx,lp_string
0,4T1_preedited-6320,0,missing
1,4T1_preedited-6320,1,missing
2,4T1_preedited-6320,2,missing
3,4T1_preedited-6320,3,TCCAA
4,4T1_preedited-6320,4,ATCAA
...,...,...,...
13915,4T1_preedited-19,55,GCGCC
13916,4T1_preedited-19,56,CCCTA
13917,4T1_preedited-19,57,TTCCT
13918,4T1_preedited-19,58,GCGCC


In [102]:
#mask = bm_input_geno_df['pet_prob'] >= 0.70
#bm_input_geno_df = bm_input_geno_df.loc[mask]

In [103]:
# Four-column view of the genotype table, renamed to share the
# (cell_name, target_idx) key names used by lp_out.
tmp_bm = (
    bm_input_geno_df[['cellBC', 'lookup_target_idx', 'pet_state', 'seq_state']]
    .rename(columns={'cellBC': 'cell_name', 'lookup_target_idx': 'target_idx'})
)

In [104]:
tmp_bm.shape

(10770, 4)

In [105]:
tmp_bm

Unnamed: 0,cell_name,target_idx,pet_state,seq_state
0,4T1_preedited-1153,1,CTCTC,CTCTC
1,4T1_preedited-1215,1,CTCTC,CTCTC
2,4T1_preedited-1280,1,CTCTC,CTCTC
3,4T1_preedited-1335,1,CTCTC,CTCTC
4,4T1_preedited-1349,1,CTCTC,CTCTC
...,...,...,...,...
10765,4T1_preedited-8440,57,TTCCT,TTCCT
10766,4T1_preedited-8557,57,TTCCT,TTCCT
10767,4T1_preedited-8611,57,TTCCT,TTCCT
10768,4T1_preedited-8623,57,TTCCT,TTCCT


In [106]:
# Inner join on (cell_name, target_idx): only pairs present in both tmp_bm and lp_out are kept,
# giving pet_state, seq_state and lp_string side by side for each pair.
merged = tmp_bm.merge(lp_out, on=["cell_name", "target_idx"], how="inner")

In [107]:
merged.isna().sum().sum()

np.int64(0)

In [108]:
merged.shape

(10770, 5)

In [109]:
(merged == 'missing').sum().sum()

np.int64(0)

In [110]:
merged['pet_state'].nunique()

23

In [111]:
merged['seq_state'].nunique()

22

In [112]:
merged['lp_string'].nunique()

24

When using the PCA-rotated Z matrix, we get the following accuracy: 

```
Accuracy (lp_string vs seq_state), ignoring ONLY both-missing:
  n_evaluated = 10770
  n_agree     = 10548
  accuracy    = 0.9794
✅ All LP-missing spots are also BM-missing.
```

In [113]:
merged["seq_state"].unique()

array(['CTCTC', 'ATCAA', 'AATCG', 'CTTTG', 'GCAAG', 'ATTTA', 'GCGCC',
       'AGTAC', 'ACAAT', 'CCTTT', 'GGACA', 'CCCTA', 'CCGAT', 'ATTCG',
       'ACTCC', 'TCCAA', 'TTCCT', 'GTTCA', 'ACAGT', 'ACTTA', 'TATAT',
       'TGCCA'], dtype=object)

In [114]:
merged["lp_string"].unique()

array(['CTCTC', 'ATCAA', 'AATCG', 'CTTTG', 'GCAAG', 'ATTTA', 'GCGCC',
       'unedited', 'AGTAC', 'ACAAT', 'CCTTT', 'GGACA', 'CCCTA', 'CCGAT',
       'ATTCG', 'unknown', 'ACTCC', 'TCCAA', 'TTCCT', 'GTTCA', 'TATAT',
       'ACAGT', 'TGCCA', 'ACTTA'], dtype=object)

In [115]:
np.mean(merged["seq_state"] == merged["lp_string"])

np.float64(0.9793871866295265)

In [116]:
np.mean(merged["seq_state"] == merged["pet_state"])

np.float64(0.9847725162488393)

In [117]:
merged

Unnamed: 0,cell_name,target_idx,pet_state,seq_state,lp_string
0,4T1_preedited-1153,1,CTCTC,CTCTC,CTCTC
1,4T1_preedited-1215,1,CTCTC,CTCTC,CTCTC
2,4T1_preedited-1280,1,CTCTC,CTCTC,CTCTC
3,4T1_preedited-1335,1,CTCTC,CTCTC,CTCTC
4,4T1_preedited-1349,1,CTCTC,CTCTC,CTCTC
...,...,...,...,...,...
10765,4T1_preedited-8440,57,TTCCT,TTCCT,TTCCT
10766,4T1_preedited-8557,57,TTCCT,TTCCT,ACTCC
10767,4T1_preedited-8611,57,TTCCT,TTCCT,TTCCT
10768,4T1_preedited-8623,57,TTCCT,TTCCT,TTCCT


In [154]:
# Rows where lp_string disagrees with seq_state, reduced to the distinct
# (pet_state, seq_state, lp_string) combinations that occur among them.
mask = merged["seq_state"] != merged["lp_string"]
merged.loc[mask, ['pet_state', 'seq_state', 'lp_string']].drop_duplicates()

Unnamed: 0,pet_state,seq_state,lp_string
705,GCAAG,GCAAG,CTTTG
846,AATCG,GCAAG,CTTTG
1467,CTTTG,CTTTG,ATTTA
3212,GCGCC,GCGCC,CTTTG
3230,GCAAG,GCGCC,CTTTG
3422,AATCG,GCGCC,CTTTG
3437,CTTTG,GCGCC,CTTTG
3449,CTCTC,GCGCC,CTCTC
3454,ATCAA,GCGCC,ATCAA
3458,ATCAA,GCGCC,CTTTG
