# Check cell matching

Highlights:
- the **agreement score matrix** is the key dataset for all cell-level sorting error analyses 
- We selected the best Ground truth match for each sorted unit and discarded other ground truth units as misses, because there were fewer sorted units than ground truth units. But we cannot rely on this cell matching to claim that some ground truth units were entirely missed.
- One must look at the matrix of agreement scores.

Setup python environment `env_kilosort_silico`

see related pipeline "match_sorted_to_true_neuropixels_2023_02_19.py"

In [2]:
import logging
import logging.config
import os

import numpy as np
import pandas as pd
import spikeinterface as si
import yaml
from spikeinterface import comparison

# SET PROJECT PATH
PROJ_PATH = "/gpfs/bbp.cscs.ch/project/proj68/home/laquitai/spike-sorting"
os.chdir(PROJ_PATH)

# import custom package
from src.nodes.utils import get_config_silico_neuropixels

# SETUP RUN CONFIG
SIMULATION_DATE = "2023_02_19"
WRITE = False

# setup logging
with open("conf/logging.yml", "r", encoding="utf-8") as logging_conf:
    LOG_CONF = yaml.load(logging_conf, Loader=yaml.FullLoader)
logging.config.dictConfig(LOG_CONF)
logger = logging.getLogger("root")

# get config
data_conf, param_conf = get_config_silico_neuropixels(SIMULATION_DATE).values()

# SET PATHS
# set Kilosort sorted spikes and cells path
KS3_SORTING_PATH = data_conf["sorting"]["sorters"]["kilosort3"]["output"]

# set ground truth spikes and cells path
GT_SORTING_PATH = data_conf["sorting"]["simulation"]["ground_truth"]["output"]

# set ground truth spikes and cells path
CELL_MATCHING_PATH = data_conf["postprocessing"]["cell_matching"]

## Create SpikeInterface comparison object

In [3]:
# load Kilosort3 Sorted spikes and cells
SortingExtractorKS3 = si.load_extractor(KS3_SORTING_PATH)

# load ground truth spikes and cells
GTSortingExtractor = si.load_extractor(GT_SORTING_PATH)

# agreement score between sorted and true cells
MatchingObject = comparison.compare_sorter_to_ground_truth(
    GTSortingExtractor, SortingExtractorKS3, exhaustive_gt=True
)

## Matching method 1: get sorted unit's max-agreement Ground truth unit

Ground truth unit 165135 (values) does not have a sorted unit's (index) match with this method. It selects only the best ground truth match for each sorted unit, but a sorted unit can have many ground truth matches. We know that some of unit 165135's spikes were matched with one of the sorted unit, so unit 165135 was "missed" with this method because it was not the best match for any of the sorted units.

In [4]:
# list the matching
cell_matching = MatchingObject.agreement_scores.idxmax().to_frame()
cell_matching

Unnamed: 0,0
0,1981516
1,3427256
2,2145865
3,1931679
4,1816718
...,...
298,499827
299,281388
300,141668
301,394076


In [5]:
# list the match between
MatchingObject.agreement_scores.idxmax()

0      1981516
1      3427256
2      2145865
3      1931679
4      1816718
        ...   
298     499827
299     281388
300     141668
301     394076
302     355657
Length: 303, dtype: int64

In [6]:
any(cell_matching==165135)

False

Issue found ! : "Hit score near chance levels are set to zero" https://spikeinterface.readthedocs.io/en/0.96.1/module_comparison.html#more-information-about-hungarian-or-best-match-methods

In [7]:
# list the sorted units that actually match the target ground truth unit 165135
agreement_scores_165135 = MatchingObject.agreement_scores.loc[165135]
agreement_scores_165135[agreement_scores_165135!=0]

14     0.001401
16     0.001908
38     0.000417
54     0.003448
96     0.001650
115    0.000552
197    0.000956
204    0.002611
274    0.001190
295    0.002632
Name: 165135, dtype: float64

In [8]:
# get true cell best matches based on max accuracy (Hungarian Method)
cell_matching = MatchingObject.agreement_scores.idxmax().to_frame()
cell_matching.columns = ["true_cell_match"]
cell_matching.index.name = "sorted_cell"
cell_matching = cell_matching.reset_index()

# DETECT SORTING ERRORS:
# ---------------------

# detect cases of "oversplit": when a true cell is matched with many sorted cells
cell_matching["oversplit_true_cell"] = False
oversplit = cell_matching[
    cell_matching["true_cell_match"].duplicated(keep=False)
]
cell_matching["oversplit_true_cell"].iloc[oversplit.index] = True
logger.info("Detecting true cell oversplit - done")

# detect cases of "cell misses": when a true cell is not matched with any sorted cell
true_cells = MatchingObject.agreement_scores.index
matched_true_cells = np.unique(cell_matching["true_cell_match"])
missed_true_cells = set(true_cells) - set(matched_true_cells)
missed_true_cells = pd.DataFrame(
    tuple(missed_true_cells), columns=["true_cell_match"]
)
# record missed cells into dataset
if not len(missed_true_cells) == 0:
    cell_matching = cell_matching.merge(
        missed_true_cells, on="true_cell_match", how="outer"
    )
    logger.info("Detecting true cell misses - done")

# Add agreement scores
# --------------------
# add max agreement score to dataframe
sorted_cells = cell_matching["sorted_cell"].dropna().astype("int")
max_agreement_scores = []

# get the agreement scores of the matched sorted-true pairs
for s_i in sorted_cells:
    max_agreement_scores.append(
        MatchingObject.agreement_scores.loc[
            cell_matching["true_cell_match"][s_i], s_i
        ]
    )

# add agreement scores to dataset
max_agreement_scores = pd.DataFrame(
    max_agreement_scores, columns=["agreement_score"]
)
cell_matching = cell_matching.join(max_agreement_scores, how="outer")

# write to .parquet
if WRITE:
    parent_path = os.path.dirname(CELL_MATCHING_PATH)
    if not os.path.isdir(parent_path):
        os.makedirs(parent_path)
    cell_matching.to_parquet(CELL_MATCHING_PATH)

2023-05-31 13:37:11,999 - root - 3143256996.py - <module> - INFO - Detecting true cell oversplit - done
2023-05-31 13:37:12,009 - root - 3143256996.py - <module> - INFO - Detecting true cell misses - done


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cell_matching["oversplit_true_cell"].iloc[oversplit.index] = True


Consequently, the "cell_matching" dataframe does not record a sorted unit match for 165135.

In [9]:
# list sorted unit match to ground truth unit
cell_matching[cell_matching["true_cell_match"]==165135]

Unnamed: 0,sorted_cell,true_cell_match,oversplit_true_cell,agreement_score
517,,165135,,


We will store cell matching and the agreement score matrix for future analyses.

In [10]:
# display cell_matching dataframe
cell_matching

  output = repr(obj)
  return method()


Unnamed: 0,sorted_cell,true_cell_match,oversplit_true_cell,agreement_score
0,0.0,1981516,True,0.430918
1,215.0,1981516,True,0.000717
2,1.0,3427256,True,0.000000
3,45.0,3427256,True,0.000000
4,2.0,2145865,False,0.000880
...,...,...,...,...
640,,4046827,,
641,,4138989,,
642,,4063216,,
643,,2623483,,


In [11]:
# display all agreement scores
MatchingObject.agreement_scores

sorted_cell,0,1,2,3,4,5,6,7,8,9,...,293,294,295,296,297,298,299,300,301,302
19690,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.000000,0.000000,0.000000,0.000898,0.0,0.00000,0.0,0.0
24768,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.00000,0.0,0.0
37423,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.020000,0.0,0.009464,0.000000,0.000000,0.000898,0.0,0.00000,0.0,0.0
39862,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.052000,0.0,0.000000,0.004090,0.003630,0.000000,0.0,0.00000,0.0,0.0
45637,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.002551,0.000000,...,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.00000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4216128,0.001008,0.000000,0.000497,0.000000,0.0,0.001078,0.000000,0.000000,0.001412,0.001269,...,0.000000,0.0,0.000000,0.000000,0.001274,0.000935,0.0,0.00076,0.0,0.0
4217493,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.00000,0.0,0.0
4221920,0.002813,0.002271,0.002699,0.000546,0.0,0.002080,0.000535,0.000569,0.000000,0.002212,...,0.000536,0.0,0.000518,0.000477,0.000928,0.000000,0.0,0.00000,0.0,0.0
4228700,0.001629,0.002335,0.000474,0.000000,0.0,0.000347,0.000725,0.000000,0.000000,0.000991,...,0.000726,0.0,0.000693,0.000000,0.000600,0.000000,0.0,0.00000,0.0,0.0
