# Run mAP line-by-line
**Author:** Jessica Ewald <br>

The purpose of this script is to run through the mAP calculation line-by-line for a few cells that return a mAP of 1, to try to understand why this happens. Code chunks are copied from the copairs repo to accomplish this.

The first step is to choose some cells.

In [17]:
# general imports
import pathlib
import pandas as pd
import numpy as np
import polars as pl
import seaborn as sns
import black
import jupyter_black
jupyter_black.load(
    lab=False,
    line_length=79,
    verbosity="DEBUG",
    target_version=black.TargetVersion.PY310,
)
import warnings
warnings.filterwarnings("ignore")

# copairs imports
import itertools
import copairs as cps
from copairs.matching import Matcher
from copairs import compute
from typing import List, Tuple


Below are copies of all of the internal copairs functions that are not exported, but that are needed to compute average precision.

In [18]:
# define functions here

def prep_for_map(
    df_path: str,
    map_cols: List[str],
    sample_col: List[str],
    sample_n: int = 5,
    seed: "int | None" = None,
):
    """Filter, optionally subsample, and split a profile parquet file.

    Parameters
    ----------
    df_path
        Path of the parquet file handed to ``pl.scan_parquet``.
    map_cols
        Metadata columns needed downstream. Rows with a null in any of these
        columns are dropped, and these columns (plus ``Metadata_CellID``)
        make up the returned metadata frame.
    sample_col
        Columns whose concatenated values define the sampling groups. When
        empty or ``None`` no subsampling is performed.
    sample_n
        Maximum number of rows kept per sampling group.
    seed
        Optional seed forwarded to the shuffle that picks the sampled rows.
        ``None`` (the default) keeps the sampling random.

    Returns
    -------
    dict
        ``{"meta": <pandas.DataFrame>, "feats": <pandas.DataFrame>}`` where
        ``feats`` holds every column whose name does not contain
        ``"Metadata_"``. Both frames come from the same collected table, so
        their rows are aligned.
    """
    # define filters
    q = pl.scan_parquet(df_path).filter(
        (pl.col("Metadata_node_type") != "TC")  # remove transfection controls
        & (pl.col("Metadata_node_type") != "NC")
        & (pl.col("Metadata_node_type") != "PC")
        & (pl.col("Metadata_node_type") != "CC")
        & (pl.col("Metadata_allele") != "_NA")
        # remove any row with missing values for selected meta columns
        & (pl.sum_horizontal(pl.col(map_cols).is_null()) == 0)
    )

    # Only build the grouping column when sampling was requested; building it
    # unconditionally fails when no sample columns are given.
    if sample_col:
        q = q.with_columns(
            pl.concat_str(sample_col).alias("Metadata_samplecol")
        ).filter(
            pl.int_range(0, pl.len())
            .shuffle(seed=seed)
            .over("Metadata_samplecol")
            < sample_n
        )

    # Collect exactly once: the query contains a random shuffle, so collecting
    # metadata and features separately could yield two different samples.
    df = q.collect()

    # different data frames for metadata and profiling data
    meta_df = df.select([*map_cols, "Metadata_CellID"]).to_pandas()
    feat_col = [name for name in df.columns if "Metadata_" not in name]
    feat_df = df.select(feat_col).to_pandas()

    return {"meta": meta_df, "feats": feat_df}

def flatten_str_list(*args):
    """Merge all arguments into a single de-duplicated list.

    Each argument may be a plain string (added as-is), a dict (every item of
    every value is added), or any other iterable (every item is added).
    The order of the returned list follows set iteration order.
    """
    unique = set()
    for item in args:
        if isinstance(item, str):
            # a bare string is one entry, not an iterable of characters
            unique.add(item)
            continue
        if isinstance(item, dict):
            for group in item.values():
                unique.update(group)
            continue
        unique.update(item)
    return list(unique)

def evaluate_and_filter(df, columns) -> Tuple[pd.DataFrame, List[str]]:
    """Resolve column names / query expressions and filter the dataframe.

    Each entry of ``columns`` is either an existing column name, which is
    kept as-is, or a query expression such as ``"a > 1"``. A query expression
    is applied to ``df`` with ``DataFrame.query`` and the existing columns it
    compares are recorded.

    Parameters
    ----------
    df
        Dataframe to filter.
    columns
        Iterable of column names and/or query expressions.

    Returns
    -------
    tuple
        The (possibly filtered) dataframe and the list of column names found,
        in the order they were encountered.

    Raises
    ------
    ValueError
        If an entry is neither a column name nor an expression that compares
        an existing column, or if the expression cannot be evaluated.
    """
    # Local import: `re` is not among the notebook's top-level imports.
    import re

    parsed_cols = []
    for col in columns:
        if col in df.columns:
            parsed_cols.append(col)
            continue

        # Identifiers immediately followed by a comparison operator.
        column_names = re.findall(r"(\w+)\s*[=<>!]+", col)
        valid_column_names = [
            name for name in column_names if name in df.columns
        ]
        if not valid_column_names:
            raise ValueError(f"Invalid query or column name: {col}")

        try:
            df = df.query(col)
        except Exception as err:
            # Keep the original failure attached for easier debugging.
            raise ValueError(f"Invalid query expression: {col}") from err
        parsed_cols.extend(valid_column_names)

    return df, parsed_cols

def build_rank_lists(pos_pairs, neg_pairs, pos_sims, neg_sims):
    """Build per-index relevance lists ordered by decreasing similarity.

    Every pair contributes one entry to each of its two members. Entries are
    grouped by member index and, within a group, sorted by ascending
    ``1 - similarity``. Positive-pair entries are labelled 1 and
    negative-pair entries 0.

    Returns the unique member indices, the flat label array in sorted order,
    and the number of entries per member index.
    """
    flat_pos = pos_pairs.ravel()
    flat_neg = neg_pairs.ravel()

    member_ix = np.concatenate((flat_pos, flat_neg))
    relevance = np.concatenate(
        (
            np.ones(flat_pos.size, dtype=np.int32),
            np.zeros(flat_neg.size, dtype=np.int32),
        )
    )
    # each similarity is repeated so that both members of a pair receive it
    similarity = np.concatenate(
        (np.repeat(pos_sims, 2), np.repeat(neg_sims, 2))
    )

    # lexsort uses the LAST key as primary: group by member, then by 1 - sim
    order = np.lexsort((1 - similarity, member_ix))
    paired_ix, counts = np.unique(member_ix, return_counts=True)
    return paired_ix, relevance[order], counts

In [3]:
# Locate the annotated single-cell profiles for the chosen batch
batch_name = "B1A1R1"
data_dir = pathlib.Path(
    "/dgx1nas1/storage/data/jess/varchamp/sc_data/processed_profiles"
).resolve(strict=True)  # strict=True raises if the directory is missing
anno_cellID = data_dir / f"{batch_name}_annotated_cellID.parquet"

# Set parameters for mAP
pos_sameby = ["Metadata_allele"]
pos_diffby = ["Metadata_Plate"]
neg_sameby = ["Metadata_Plate"]
neg_diffby = ["Metadata_allele"]
batch_size = 20000
sample_n_cells = 5
sample_neg = True
# every metadata column referenced by any of the sameby/diffby lists
map_cols = list({*pos_sameby, *pos_diffby, *neg_sameby, *neg_diffby})


In [None]:

# Filter the profiles, keep at most `sample_n_cells` rows per well/plate
# combination, and split the result into metadata and feature tables
map_input = prep_for_map(
    anno_cellID,
    map_cols,
    sample_col=["Metadata_Well", "Metadata_Plate"],
    sample_n=sample_n_cells,
)

In [7]:
# Unpack the prepared inputs for the mAP computation
sample_factor = 10  # negatives sampled = sample_factor * number of positive pairs
feats = map_input["feats"].values  # feature table as a NumPy array
meta = map_input["meta"]


The "average_precision" function starts here. It is broken down into chunks, with some extra plots and checks added to further investigate the results.

In [13]:
# Give the metadata a clean 0..n-1 index
meta = meta.reset_index(drop=True).copy()

# Collect every column used for pairing, resolve them against the metadata,
# and build the matcher from the result
columns = flatten_str_list(pos_sameby, pos_diffby, neg_sameby, neg_diffby)
filtered_meta, matcher_columns = evaluate_and_filter(meta, columns)
matcher = Matcher(filtered_meta, matcher_columns, seed=0)


In [15]:
# Positive pairs: ask the matcher for all pairs, then flatten the per-key
# groups into a single (n_pairs, 2) int32 array
pos_pairs_by_key = matcher.get_all_pairs(sameby=pos_sameby, diffby=pos_diffby)
pos_total = sum(map(len, pos_pairs_by_key.values()))

pos_pairs = np.fromiter(
    itertools.chain.from_iterable(pos_pairs_by_key.values()),
    dtype=np.dtype((np.int32, 2)),
    count=pos_total,  # pre-size the output array
)

In [16]:
# Negative pairs: ask the matcher for all pairs, then flatten the per-key
# groups into a single (n_pairs, 2) int32 array
neg_pairs_by_key = matcher.get_all_pairs(sameby=neg_sameby, diffby=neg_diffby)
neg_total = sum(map(len, neg_pairs_by_key.values()))

neg_pairs = np.fromiter(
    itertools.chain.from_iterable(neg_pairs_by_key.values()),
    dtype=np.dtype((np.int32, 2)),
    count=neg_total,  # pre-size the output array
)

In [None]:
# If sample_neg, randomly keep at most `sample_factor` negative pairs per
# positive pair. The generator is seeded (like the Matcher, seed=0) so that
# re-running the notebook investigates the same negative pairs each time.
if sample_neg:
    sample_size = pos_pairs.shape[0] * sample_factor
    # only subsample when there are more negatives than requested
    if sample_size < neg_pairs.shape[0]:
        rng = np.random.default_rng(0)
        sampled_rows = rng.choice(
            neg_pairs.shape[0], size=sample_size, replace=False
        )
        neg_pairs = neg_pairs[sampled_rows]

Before running the next steps, I should further filter the pos & neg pairs to focus on a few examples where AP equals one and where AP equals something else.

In [None]:
# Compute one cosine score per positive pair from the feature matrix
# (`batch_size` is passed through to copairs).
# NOTE(review): the result is named *sims* and is ranked via `1 - sim` in
# build_rank_lists, so these appear to be similarities rather than
# distances - confirm against the copairs documentation.
pos_sims = compute.pairwise_cosine(feats, pos_pairs, batch_size)

In [None]:
# Compute one cosine score per negative pair from the feature matrix
# (`batch_size` is passed through to copairs).
# NOTE(review): the result is named *sims* and is ranked via `1 - sim` in
# build_rank_lists, so these appear to be similarities rather than
# distances - confirm against the copairs documentation.
neg_sims = compute.pairwise_cosine(feats, neg_pairs, batch_size)

In [None]:
# Build, for every paired row index, its list of 1/0 relevance labels
# ordered from most to least similar partner
paired_ix, rel_k_list, counts = build_rank_lists(
    pos_pairs=pos_pairs,
    neg_pairs=neg_pairs,
    pos_sims=pos_sims,
    neg_sims=neg_sims,
)

In [None]:
# Compute average precision from the flat relevance list and the number of
# entries per paired index.
# NOTE(review): `null_confs` is used below as a 2-column array
# (column 0 -> n_pos_pairs, column 1 -> n_total_pairs) - confirm against the
# copairs documentation.
ap_scores, null_confs = compute.ap_contiguous(rel_k_list, counts)

In [None]:
# Attach the results to the metadata table.
# Pair counts default to 0 for rows that ended up in no pair; their
# average_precision is left unset.
for count_col in ("n_pos_pairs", "n_total_pairs"):
    meta[count_col] = 0

meta.loc[paired_ix, "average_precision"] = ap_scores
# null_confs columns: 0 -> number of positive pairs, 1 -> total pairs
meta.loc[paired_ix, "n_pos_pairs"] = null_confs[:, 0]
meta.loc[paired_ix, "n_total_pairs"] = null_confs[:, 1]