In [1]:
import numpy as np

In [2]:
# Load the pre-computed feature matrix and label array of every split.
# Each split is loaded as an (X, y) pair so the two always stay side by side.
X_train, y_train = (
    np.load("../concated_embs_npy/X_train.npy"),
    np.load("../concated_embs_npy/y_train.npy"),
)
X_val, y_val = (
    np.load("../concated_embs_npy/X_val.npy"),
    np.load("../concated_embs_npy/y_val.npy"),
)
X_test, y_test = (
    np.load("../concated_embs_npy/X_test.npy"),
    np.load("../concated_embs_npy/y_test.npy"),
)

In [6]:
# Scratch check: the 344-wide feature row split into four equal parts gives 86,
# the value passed as `tail` further down.
# NOTE(review): presumably each row is four concatenated 86-dim embeddings — confirm.
344/4

86.0

In [3]:
# Inspect the training matrix dimensions: (number of rows, features per row).
X_train.shape

(238673, 344)

In [5]:
# ---- fast "fingerprint" for exact, bit-level row equality ----
# Rows with identical raw bytes always get the same hash; rows that differ in
# any bit (including 0.0 vs -0.0) get different hashes up to 64-bit collisions.
def hash64_rows(X: np.ndarray) -> np.ndarray:
    """Return one 64-bit fingerprint per row of ``X``.

    Every row (all trailing axes flattened) is reinterpreted as a sequence of
    64-bit words and folded with an FNV-1a style xor/multiply mix, vectorised
    across rows.

    Parameters
    ----------
    X : np.ndarray
        Array whose first axis indexes rows. Any fixed-size dtype is accepted;
        a non-contiguous input is copied into C order first.

    Returns
    -------
    np.ndarray
        ``uint64`` array of shape ``(X.shape[0],)``.

    Notes
    -----
    A row whose byte width is not a multiple of 8 (e.g. float32 with an odd
    number of columns) is zero-padded to the next 64-bit boundary instead of
    raising. For rows that are already a multiple of 8 bytes wide the result
    is identical to hashing the plain ``uint64`` view.
    """
    Xc = np.ascontiguousarray(X)
    n_rows = Xc.shape[0]
    if n_rows == 0:
        # reshape(0, -1) is ambiguous for NumPy, so short-circuit the empty case.
        return np.empty((0,), dtype=np.uint64)

    # Work on raw bytes so the row width can be rounded up to whole words.
    raw = Xc.view(np.uint8).reshape(n_rows, -1)
    pad = (-raw.shape[1]) % 8
    if pad:
        padded = np.zeros((n_rows, raw.shape[1] + pad), dtype=np.uint8)
        padded[:, : raw.shape[1]] = raw
        raw = padded
    v = raw.view(np.uint64)

    # uint64 array arithmetic wraps modulo 2**64, which is what the mix relies on.
    h = np.uint64(1469598103934665603)  # FNV offset basis
    prime = np.uint64(1099511628211)    # FNV prime

    # FNV-1a over uint64 "words" (one vectorised step per word column).
    hv = np.full((n_rows,), h, dtype=np.uint64)
    for j in range(v.shape[1]):
        hv ^= v[:, j]
        hv *= prime
    return hv

def build_positive_hashset(*Xs: np.ndarray) -> set:
    """Gather the row fingerprints of all given arrays into one set.

    Each array is hashed row-wise with ``hash64_rows`` and the resulting
    values are stored as plain Python ints, so membership checks against the
    returned set work with ``.item()``-converted hashes.

    Parameters
    ----------
    *Xs : np.ndarray
        Any number of arrays accepted by ``hash64_rows``.

    Returns
    -------
    set
        Union of the row hashes of every array; empty when no array is given.
    """
    return {
        fingerprint
        for arr in Xs
        for fingerprint in hash64_rows(arr).tolist()
    }


def make_negatives_by_tail_shuffle(
    X: np.ndarray,
    pos_hashes: set,
    n_neg: int,
    tail: int = 86,
    max_tries_per_sample: int = 50,
    seed: int = 42,
):
    """Create synthetic negative rows by permuting the last ``tail`` features.

    For every requested negative a random row of ``X`` is drawn, its leading
    ``d - tail`` features are kept as they are and its trailing ``tail``
    features are randomly permuted among themselves. A candidate is accepted
    only if its ``hash64_rows`` fingerprint is neither in ``pos_hashes`` nor
    already produced by this call; otherwise a new permutation of the same
    base row is tried.

    Parameters
    ----------
    X : np.ndarray
        2-D array ``(n, d)`` that base rows are drawn from.
    pos_hashes : set
        Fingerprints (Python ints) that generated rows must not collide with.
    n_neg : int
        Number of negative rows to generate.
    tail : int
        Count of trailing features to permute; must not exceed ``d``.
    max_tries_per_sample : int
        Permutations attempted for one base row before giving up.
    seed : int
        Seed for ``np.random.default_rng``.

    Returns
    -------
    np.ndarray
        Array of shape ``(n_neg, d)`` with the dtype of ``X``.

    Raises
    ------
    RuntimeError
        If no acceptable permutation is found for some sample within
        ``max_tries_per_sample`` attempts.
    """
    rng = np.random.default_rng(seed)
    n, d = X.shape
    assert tail <= d

    tail_span = slice(d - tail, d)
    negatives = np.empty((n_neg, d), dtype=X.dtype)
    produced = set()

    for i in range(n_neg):
        # The base row is picked at random rather than by walking X in order.
        source = X[rng.integers(0, n)]
        candidate = source.copy()

        for _attempt in range(max_tries_per_sample):
            # Always permute the untouched source tail, not the previous attempt.
            candidate[tail_span] = source[tail_span][rng.permutation(tail)]
            fingerprint = hash64_rows(candidate[None, :])[0].item()

            if fingerprint in pos_hashes or fingerprint in produced:
                continue

            negatives[i] = candidate
            produced.add(fingerprint)
            break
        else:
            # Every attempt collided with a positive or an earlier negative.
            raise RuntimeError(f"Couldn't generate unique negative for i={i} in {max_tries_per_sample} tries")

    return negatives

In [9]:
# ------------------ example usage ------------------

# 1) Fingerprints of every positive row. train + val + test are all included
#    so a generated negative cannot coincide with a real sample of any split.
pos_hashes = build_positive_hashset(X_train, X_val, X_test)

# 2) How many negatives to generate.
n_neg_train = 2_000_000

X_neg_train = make_negatives_by_tail_shuffle(
    X_train,
    pos_hashes=pos_hashes,
    n_neg=n_neg_train,
    tail=86,
    max_tries_per_sample=50,
    seed=123,
)

# Zero labels for the negatives. The trailing shape is taken from y_train so
# the concatenation below works whether labels are stored as (N,) or (N, 1).
y_neg_train = np.zeros((n_neg_train,) + y_train.shape[1:], dtype=y_train.dtype)

# 3) Final training set: positives followed by negatives.
X_train_aug = np.concatenate([X_train, X_neg_train], axis=0)
y_train_aug = np.concatenate([y_train, y_neg_train], axis=0)

# Features and labels must stay row-aligned before they are shuffled together.
assert X_train_aug.shape[0] == y_train_aug.shape[0], "X/y row counts differ"

# 4) Shuffle features and labels with the same permutation.
rng = np.random.default_rng(999)
idx = rng.permutation(X_train_aug.shape[0])
X_train_aug = X_train_aug[idx]
y_train_aug = y_train_aug[idx]

In [10]:
# Check the augmented matrix: row count should equal positives + generated negatives.
X_train_aug.shape

(2238673, 344)

In [11]:
# Persist the shuffled, augmented training set next to the original splits.
# NOTE(review): the "1v10" suffix suggests a 1:10 positive:negative ratio, but
# 2,000,000 negatives for 238,673 positives is roughly 1:8.4 — confirm the
# intended ratio or file name.
for out_path, array in (
    ("../concated_embs_npy/X_train_aug_1v10.npy", X_train_aug),
    ("../concated_embs_npy/y_train_aug_1v10.npy", y_train_aug),
):
    np.save(out_path, array)