In [1]:
import pprint
import string
import itertools

import datasets
import pandas as pd
import numpy as np
import torch
import joblib
import scipy.sparse
import sklearn.feature_extraction


# Dataset identifiers that are later passed to `datasets.load_dataset`.
# Every one of them shares the "MU-NLPC/Calc-" prefix, so only the suffixes are spelled out.
ds_names = [
    f"MU-NLPC/Calc-{suffix}"
    for suffix in ("gsm8k", "aqua_rat", "math_qa", "ape210k", "mawps", "svamp")
]

In [2]:
# Characters that survive simplification: lowercase ASCII letters and the space.
keep_symbols = set(string.ascii_lowercase + " ")
# Maps (dataset name, split name) -> DataFrame with the question/chain/result columns
# plus the derived "question_simplified" column.
dss = {}
# Every split name encountered across all datasets.
split_names = set()

for full_name in ds_names:
    ds = datasets.load_dataset(full_name)
    # "MU-NLPC/Calc-gsm8k" -> "calc-gsm8k"
    ds_name = full_name.split("/")[-1].lower()
    for split_name, split in ds.items():
        split_names.add(split_name)
        key = ds_name, split_name
        # Take an explicit copy: adding a column to the result of a column selection is the
        # chained-assignment pattern that pandas flags with SettingWithCopyWarning.
        questions = split.to_pandas()[["question", "chain", "result"]].copy()
        questions["question_simplified"] = (
            questions["question"]
            # drop every non-ASCII character
            .str.encode("ascii", errors="ignore")
            .str.decode("ascii")
            .str.lower()
            # collapse all whitespace runs into single spaces
            .str.split()
            .str.join(" ")
            # keep only letters and spaces (digits, punctuation and operators are removed)
            .apply(lambda text: "".join(c for c in text if c in keep_symbols))
            # removing symbols can leave doubled spaces behind, so normalize once more
            .str.split()
            .str.join(" ")
        )
        dss[key] = questions

In [3]:
# Binary (presence/absence) bag of unigrams and bigrams, stored as int32.
bow_ngrams_vectorizer = sklearn.feature_extraction.text.CountVectorizer(
    ngram_range=(1, 2),
    binary=True,
    dtype=np.int32,
)

# Fit a single shared vocabulary on the simplified questions of every dataset split,
# so that the vectors of different splits live in the same feature space.
all_questions = (
    question
    for frame in dss.values()
    for question in frame["question_simplified"]
)
bow_ngrams_vectorizer.fit(all_questions)

# (dataset name, split name) -> bag-of-n-grams matrix, one row per question.
bows = {
    key: bow_ngrams_vectorizer.transform(frame["question_simplified"])
    for key, frame in dss.items()
}

In [4]:
def pairwise_jaccard_sim(bows_1: scipy.sparse.csr_matrix, bows_2: scipy.sparse.csr_matrix) -> np.ndarray:
    """
    Computes the Jaccard similarity between each row of `bows_1` and each row of `bows_2`.

    The set size of a row is taken as its number of stored entries and the intersection
    size as the dot product of two rows, so the result is a true Jaccard similarity only
    for binary (0/1) matrices without explicitly stored zeros.

    Returns a dense float32 array of shape (rows of bows_1, rows of bows_2).
    Pairs whose union is empty (both rows empty) get similarity 0.
    """
    # Column vector of set sizes for bows_1, row vector of set sizes for bows_2.
    row_sizes = bows_1.getnnz(axis=1).astype(np.float32)[:, np.newaxis]
    col_sizes = bows_2.getnnz(axis=1).astype(np.float32)[np.newaxis, :]

    shared = (bows_1 @ bows_2.T).toarray().astype(np.float32)
    # |A or B| = |A| + |B| - |A and B|, broadcast to the full pairwise grid.
    combined = row_sizes + col_sizes - shared

    # Division by an empty union yields NaN/inf; silence the warnings and zero those cells.
    with np.errstate(divide="ignore", invalid="ignore"):
        similarity = shared / combined
    np.nan_to_num(similarity, copy=False, nan=0, posinf=0, neginf=0)
    return similarity


def get_highest_k_matches(scores: torch.Tensor, k: int):
    """
    Returns the (up to) `k` highest scores per row and per column of a 2-D score matrix.

    Args:
        scores: 2-D tensor of shape (n_rows, n_cols).
        k: requested number of best matches. It is clamped to the number of available
            candidates in each direction, because `torch.topk` raises when `k` is larger
            than the size of the dimension it selects from.

    Returns:
        A pair `(top_in_rows, top_in_cols)` of `torch.topk` results (`.values`, `.indices`):
        - `top_in_rows` has shape (n_rows, min(k, n_cols)); indices point into the columns.
        - `top_in_cols` has shape (n_cols, min(k, n_rows)); indices point into the rows.
        The k entries within each row are not guaranteed to be sorted (`sorted=False`).
    """
    n_rows, n_cols = scores.shape
    top_in_rows = torch.topk(scores, k=min(k, n_cols), dim=1, largest=True, sorted=False)
    # Transposing turns "best rows for each column" into the same per-row top-k problem.
    top_in_cols = torch.topk(scores.T, k=min(k, n_rows), dim=1, largest=True, sorted=False)
    return top_in_rows, top_in_cols


def check_leak(bows_1, bows_2, top_k=10):
    """
    Scores every row of `bows_1` against every row of `bows_2` with `pairwise_jaccard_sim`
    and returns the `top_k` best matches in both directions, as produced by
    `get_highest_k_matches`.
    """
    similarity = torch.tensor(pairwise_jaccard_sim(bows_1, bows_2))
    return get_highest_k_matches(scores=similarity, k=top_k)

In [5]:
# All (train split, non-train split) pairs to compare, across every combination of datasets.
# Keys of `dss` are (dataset name, split name) tuples, so element [1] is the split name.
check_leaks = [
    (train_key, eval_key)
    for train_key in dss
    for eval_key in dss
    if train_key[1] == "train" and eval_key[1] != "train"
]


pprint.pprint(check_leaks)
print(len(check_leaks))

[(('calc-gsm8k', 'train'), ('calc-gsm8k', 'test')),
 (('calc-gsm8k', 'train'), ('calc-aqua_rat', 'test')),
 (('calc-gsm8k', 'train'), ('calc-aqua_rat', 'validation')),
 (('calc-gsm8k', 'train'), ('calc-math_qa', 'test')),
 (('calc-gsm8k', 'train'), ('calc-math_qa', 'validation')),
 (('calc-gsm8k', 'train'), ('calc-ape210k', 'test')),
 (('calc-gsm8k', 'train'), ('calc-ape210k', 'validation')),
 (('calc-gsm8k', 'train'), ('calc-mawps', 'validation')),
 (('calc-gsm8k', 'train'), ('calc-mawps', 'test')),
 (('calc-gsm8k', 'train'), ('calc-svamp', 'test')),
 (('calc-aqua_rat', 'train'), ('calc-gsm8k', 'test')),
 (('calc-aqua_rat', 'train'), ('calc-aqua_rat', 'test')),
 (('calc-aqua_rat', 'train'), ('calc-aqua_rat', 'validation')),
 (('calc-aqua_rat', 'train'), ('calc-math_qa', 'test')),
 (('calc-aqua_rat', 'train'), ('calc-math_qa', 'validation')),
 (('calc-aqua_rat', 'train'), ('calc-ape210k', 'test')),
 (('calc-aqua_rat', 'train'), ('calc-ape210k', 'validation')),
 (('calc-aqua_rat', 'trai

In [6]:
# Run the similarity check for every (train, eval) pair in parallel on all available cores.
with joblib.Parallel(n_jobs=-1) as pool:
    results = pool(
        joblib.delayed(check_leak)(bows[train_key], bows[eval_key])
        for train_key, eval_key in check_leaks
    )

# (train key, eval key) -> result of `check_leak` for that pair.
candidates = {
    (train_key, eval_key): leak_candidates
    for (train_key, eval_key), leak_candidates in zip(check_leaks, results)
}


In [7]:
threshold = 0.5  # Jaccard similarity above which a train/eval pair counts as a leak candidate
print_examples = False  # set to True to also print sampled suspicious question pairs

min_text_ratio = 0.5  # simplified/original length ratio below which a question is "mostly formula"
min_reported_frac = 0.05  # only report dataset pairs with more than this fraction of suspicious eval rows
n_examples = 10  # number of suspicious pairs sampled (with replacement) for printing
# Dedicated seeded generator: printed examples are reproducible and the global RNG is untouched.
example_rng = torch.Generator().manual_seed(0)

for (ds_train, ds_eval), (train_sim, eval_sim) in candidates.items():
    eval_frame = dss[ds_eval]
    text_ratio = eval_frame["question_simplified"].apply(len) / eval_frame["question"].apply(len)
    # example of mostly_formula_problem is: Solve 2x + 3x^2 + 8/5 = 1295
    # on those examples, we don't want to check for similarity on words
    # Build an explicit bool tensor instead of indexing a tensor with a pandas Series.
    is_mostly_formula_problem = torch.tensor((text_ratio < min_text_ratio).to_numpy(dtype=bool))
    sus_mask = eval_sim.values > threshold  # has shape (len_eval, top_k)
    sus_mask[is_mostly_formula_problem] = False
    suspicious_frac = sus_mask.any(dim=1).float().mean().item()
    if suspicious_frac > min_reported_frac:
        print(f"{suspicious_frac:.2%} of {'/'.join(ds_eval):<30} examples appear similar to some examples in {'/'.join(ds_train)}")
        # Fraction of train rows that have at least one eval row above the threshold.
        train_drop_frac = (train_sim.values > threshold).any(dim=1).float().mean().item()
        print(f"-> {train_drop_frac:.2%} of {'/'.join(ds_train):<27} examples would have to be dropped")
        print()
        if not print_examples:
            continue
        # Row index = eval example, column index = rank slot among its top-k train matches.
        all_sus_eval_idxs, train_nth_similar = sus_mask.nonzero(as_tuple=True)
        sample = torch.randint(0, len(all_sus_eval_idxs), (n_examples,), generator=example_rng)
        sampled_sus_eval_idxs = all_sus_eval_idxs[sample]
        sampled_train_nth_similar = train_nth_similar[sample]
        # Translate the (eval row, rank slot) pairs into row positions of the train frame.
        sampled_train_idxs = eval_sim.indices[sampled_sus_eval_idxs, sampled_train_nth_similar]
        # Hand plain integer lists to .iloc rather than torch tensors.
        sampled_eval_questions = eval_frame["question"].iloc[sampled_sus_eval_idxs.tolist()]
        sampled_train_questions = dss[ds_train]["question"].iloc[sampled_train_idxs.tolist()]
        sampled_similarities = eval_sim.values[sampled_sus_eval_idxs, sampled_train_nth_similar]
        for eval_question, train_question, similarity in zip(sampled_eval_questions, sampled_train_questions, sampled_similarities):
            print("  eval: ", eval_question)
            print("  train:", train_question)
            print(f"  {similarity=:.2f}")
            print()

        print()
        print("-" * 100)


30.71% of calc-aqua_rat/test             examples appear similar to some examples in calc-aqua_rat/train
-> 1.36% of calc-aqua_rat/train         examples would have to be dropped

25.59% of calc-aqua_rat/validation       examples appear similar to some examples in calc-aqua_rat/train
-> 0.33% of calc-aqua_rat/train         examples would have to be dropped

97.49% of calc-math_qa/test              examples appear similar to some examples in calc-aqua_rat/train
-> 24.56% of calc-aqua_rat/train         examples would have to be dropped

97.00% of calc-math_qa/validation        examples appear similar to some examples in calc-aqua_rat/train
-> 30.43% of calc-aqua_rat/train         examples would have to be dropped

7.87% of calc-aqua_rat/test             examples appear similar to some examples in calc-math_qa/train
-> 0.99% of calc-math_qa/train          examples would have to be dropped

7.48% of calc-aqua_rat/validation       examples appear similar to some examples in calc-math_qa/tra

In [8]:
# Data leaks:
# aqua_rat train -> math_qa test + validation # math_qa is basically entirely a subset of the aqua_rat train split
# math_qa train -> math_qa test + validation
# ape210k train -> ape210k test + validation
# mawps train -> mawps test + validation

# Fair evaluation for models trained on aquarat+ape210k+gsm8k+mathqa:
# - don't eval on mathqa at all -> remove completely from latex table
# - evaluation on gsm8k is ok
# - need to evaluate on svamp and mawps -> we don't need to filter anything
# - drop a lot of ape210k eval samples
# - drop some aqua_rat eval samples