# Match ground truth to sorted units

Setup python environment `env_kilosort_silico`

see related pipeline "match_sorted_to_true_neuropixels_2023_02_19.py"

In [1]:
import logging
import logging.config
import os
from time import time

import numpy as np
import pandas as pd
import spikeinterface as si
import yaml
from spikeinterface import comparison

# SET PROJECT PATH
PROJ_PATH = "/gpfs/bbp.cscs.ch/project/proj68/home/laquitai/spike-sorting"
os.chdir(PROJ_PATH)

# import custom package
from src.nodes.utils import get_config_silico_neuropixels

# SETUP RUN CONFIG
SIMULATION_DATE = "2023_02_19"
WRITE = True

# setup logging
with open("conf/logging.yml", "r", encoding="utf-8") as logging_conf:
    LOG_CONF = yaml.load(logging_conf, Loader=yaml.FullLoader)
logging.config.dictConfig(LOG_CONF)
logger = logging.getLogger("root")

# get config
data_conf, param_conf = get_config_silico_neuropixels(SIMULATION_DATE).values()

# SET PATHS
# set Kilosort sorted spikes and cells path
KS3_SORTING_PATH = data_conf["sorting"]["sorters"]["kilosort3"]["output"]

# set ground truth spikes and cells path
GT_SORTING_PATH = data_conf["sorting"]["simulation"]["ground_truth"]["output"]

# set ground truth spikes and cells path
CELL_MATCHING_PATH = data_conf["postprocessing"]["cell_matching"]

## Get spikeinterface comparison object

In [2]:
# load Kilosort3 Sorted spikes and cells
SortingExtractorKS3 = si.load_extractor(KS3_SORTING_PATH)

# load ground truth spikes and cells
GTSortingExtractor = si.load_extractor(GT_SORTING_PATH)

# agreement score between sorted and true cells
MatchingObject = comparison.compare_sorter_to_ground_truth(
    GTSortingExtractor, SortingExtractorKS3, exhaustive_gt=True
)

## Get true unit's sorted match via "Hungarian Method"

In [3]:
# get true cell best matches based on max accuracy (Hungarian Method)
cell_matching = MatchingObject.agreement_scores.idxmax().to_frame()

In [19]:
MatchingObject.agreement_scores.idxmax()

0      1981516
1      3427256
2      2145865
3      1931679
4      1816718
        ...   
298     499827
299     281388
300     141668
301     394076
302     355657
Length: 303, dtype: int64

TODO: Why doesn' 165135 have any match, while my analysis show some of its spikes were detected? Check its agreement scores with sorted cells

In [4]:
any(cell_matching==165135)

False

Why doesn't the Hungarian methid find a match while it has non-null agreement scores with many sorted units. Somethin's wrong with the "Hungarian Method"....!!

- issue found ! : "Hit score near chance levels are set to zero" https://spikeinterface.readthedocs.io/en/0.96.1/module_comparison.html#more-information-about-hungarian-or-best-match-methods

In [8]:
# sorted units that match the test true unit
agreement_scores_165135 = MatchingObject.agreement_scores.loc[165135]
agreement_scores_165135[agreement_scores_165135!=0]

14     0.001401
16     0.001908
38     0.000417
54     0.003448
96     0.001650
115    0.000552
197    0.000956
204    0.002611
274    0.001190
295    0.002632
Name: 165135, dtype: float64

In [3]:
# get true cell best matches based on max accuracy (Hungarian Method)
cell_matching = MatchingObject.agreement_scores.idxmax().to_frame()
cell_matching.columns = ["true_cell_match"]
cell_matching.index.name = "sorted_cell"
cell_matching = cell_matching.reset_index()

# DETECT SORTING ERRORS:
# ---------------------

# detect cases of "oversplit": when a true cell is matched with many sorted cells
cell_matching["oversplit_true_cell"] = False
oversplit = cell_matching[
    cell_matching["true_cell_match"].duplicated(keep=False)
]
cell_matching["oversplit_true_cell"].iloc[oversplit.index] = True
logger.info("Detecting true cell oversplit - done")

# detect cases of "cell misses": when a true cell is not matched with any sorted cell
true_cells = MatchingObject.agreement_scores.index
matched_true_cells = np.unique(cell_matching["true_cell_match"])
missed_true_cells = set(true_cells) - set(matched_true_cells)
missed_true_cells = pd.DataFrame(
    tuple(missed_true_cells), columns=["true_cell_match"]
)
# record missed cells into dataset
if not len(missed_true_cells) == 0:
    cell_matching = cell_matching.merge(
        missed_true_cells, on="true_cell_match", how="outer"
    )
    logger.info("Detecting true cell misses - done")

# Add agreement scores
# --------------------
# add max agreement score to dataframe
sorted_cells = cell_matching["sorted_cell"].dropna().astype("int")
max_agreement_scores = []

# get the agreement scores of the matched sorted-true pairs
for s_i in sorted_cells:
    max_agreement_scores.append(
        MatchingObject.agreement_scores.loc[
            cell_matching["true_cell_match"][s_i], s_i
        ]
    )

# add agreement scores to dataset
max_agreement_scores = pd.DataFrame(
    max_agreement_scores, columns=["agreement_score"]
)
cell_matching = cell_matching.join(max_agreement_scores, how="outer")

# write to .parquet
if WRITE;
    parent_path = os.path.dirname(CELL_MATCHING_PATH)
    if not os.path.isdir(parent_path):
        os.makedirs(parent_path)
    cell_matching.to_parquet(CELL_MATCHING_PATH)

2023-05-26 15:15:23,148 - root - 3666794839.py - <module> - INFO - loading kilosort3 Sorting Extractor
2023-05-26 15:15:23,166 - root - 3666794839.py - <module> - INFO - loading kilosort3 Sorting Extractor - done: 0.0
2023-05-26 15:15:23,167 - root - 3666794839.py - <module> - INFO - loading Ground truth Sorting Extractor
2023-05-26 15:15:23,177 - root - 3666794839.py - <module> - INFO - loading Ground truth Sorting Extractor - done: 0.0
2023-05-26 15:15:40,504 - root - 3666794839.py - <module> - INFO - Detecting true cell oversplit - done
2023-05-26 15:15:40,511 - root - 3666794839.py - <module> - INFO - Detecting true cell misses - done


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cell_matching["oversplit_true_cell"].iloc[oversplit.index] = True


In [6]:
cell_matching[cell_matching["true_cell_match"]==165135]

Unnamed: 0,sorted_cell,true_cell_match,oversplit_true_cell,agreement_score
517,,165135,,
