In [1]:
import numpy as np
import xgboost as xgb
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import pairwise_distances
import gurobipy as gp
from gurobipy import GRB

Note: You have installed the 'manylinux2014' variant of XGBoost. Certain features such as GPU algorithms or federated learning are not available. To use these features, please upgrade to a recent Linux distro with glibc 2.28+, and install the 'manylinux_2_28' variant.


In [2]:
# Load the full input table; later cells read its "breast_time_to_diagnosis"
# and "eid" columns (presumably one row per participant — TODO confirm).
# NOTE(review): the recorded run emitted a warning pointing at this line (its
# text is truncated in the saved output) — confirm whether it is a mixed-dtype
# warning and, if so, pass explicit dtypes here.
df = pd.read_csv('data/blood_protein_cancers_clean.csv')

  df = pd.read_csv('data/blood_protein_cancers_clean.csv')


In [3]:
# TODO
# 1. Split into train/test (80:20), stratified on breast cancer time to diagnosis.
# 2. In train, get the 100 nearest neighbors of breast cancer patients using demographic features and the most important features from XGBoost.
#     - Use this cohort as the input to the optimization (matching) algorithm.

In [4]:
def bin_breast_ttd(x):
    """Map a breast-cancer time-to-diagnosis value to its stratum label.

    Bins (upper edges are inclusive):
        missing    -> "NA"
        x <= 0     -> "<0"   (note: 0 itself lands here, not in "0-1")
        0 < x <= 1 -> "0-1"
        1 < x <= 5 -> "1-5"
        x > 5      -> ">5"
    """
    if pd.isna(x):
        return "NA"
    # Walk the inclusive upper edges in ascending order; first hit wins.
    for upper_edge, label in ((0, "<0"), (1, "0-1"), (5, "1-5")):
        if x <= upper_edge:
            return label
    return ">5"

def proportions(frame):
    """Return the share of rows per stratum in ``frame["_strata"]``.

    The result is a Series indexed in the fixed order
    "<0", "0-1", "1-5", ">5", "NA"; strata absent from ``frame`` get 0.
    """
    strata_order = ["<0", "0-1", "1-5", ">5", "NA"]
    shares = frame["_strata"].value_counts(normalize=True)
    shares = shares.reindex(strata_order)
    return shares.fillna(0)

In [5]:
# Label every row with its time-to-diagnosis stratum (element-wise binning).
df["_strata"] = df["breast_time_to_diagnosis"].map(bin_breast_ttd)

In [6]:
# Stratified 80/20 split so each time-to-diagnosis stratum keeps the same
# share in train and test; the seed is fixed for reproducibility.
train_df, test_df = train_test_split(
    df, test_size=0.2, random_state=42, stratify=df["_strata"]
)

# Print the stratum shares of the full table and of each split for comparison.
for split_name, split_frame in (("Overall", df), ("Train", train_df), ("Test", test_df)):
    print(f"{split_name}:\n", proportions(split_frame))

# "_strata" is deliberately left on train_df / test_df: the matching cells
# further down select positive rows by filtering train_df on it.

Overall:
 <0     0.018983
0-1    0.001642
1-5    0.007265
>5     0.012567
NA     0.959543
Name: _strata, dtype: float64
Train:
 <0     0.018988
0-1    0.001628
1-5    0.007265
>5     0.012572
NA     0.959548
Name: _strata, dtype: float64
Test:
 <0     0.018964
0-1    0.001698
1-5    0.007265
>5     0.012548
NA     0.959524
Name: _strata, dtype: float64


In [61]:
# Read the feature names (one per line) that later cells use to select columns.
# Blank / whitespace-only lines are dropped: an empty string would otherwise be
# treated as a column name and fail with a KeyError when indexing train_df.
with open("output/xgb_breast_cancer_top_100_features.txt", 'r') as f:
    top_feats = [line for line in f.read().splitlines() if line.strip()]

In [62]:
TIME_COL = "breast_time_to_diagnosis"
K = 5  # number of nearest negative neighbors retrieved per positive row

# --- Split top_feats into categorical and numeric columns by dtype ---
# Every object/category column is routed to the categorical pipeline; sending
# one to the numeric pipeline would make the median imputer fail on strings.
auto_cats = [c for c in top_feats if str(train_df[c].dtype) in ("object", "category")]
num_feats = [c for c in top_feats if c not in auto_cats]

# Kept for backward compatibility with code that expects a single categorical
# feature name; None when top_feats contains no categorical column.
CAT_FEAT = auto_cats[0] if auto_cats else None

# --- Preprocessing: impute+scale numeric, impute+onehot categorical ---
# Only register a sub-pipeline when it has columns to work on.
transformers = []
if num_feats:
    transformers.append(
        ("num", Pipeline([
            ("imputer", SimpleImputer(strategy="median")),
            ("scaler", StandardScaler()),  # zero mean / unit variance per feature
        ]), num_feats)
    )
if auto_cats:
    transformers.append(
        ("cat", Pipeline([
            ("imputer", SimpleImputer(strategy="most_frequent")),
            # No `sparse=` / `sparse_output=` keyword on purpose: the name differs
            # between scikit-learn versions, and sparse_threshold=0.0 below
            # already densifies whatever the encoder returns.
            ("onehot", OneHotEncoder(handle_unknown="ignore")),
        ]), auto_cats)
    )

preprocess = ColumnTransformer(
    transformers=transformers,
    remainder="drop",
    sparse_threshold=0.0,  # force dense output
)

# Fit the preprocessor on ALL train rows so positives and negatives share one feature space
preprocess.fit(train_df[top_feats])

# --- Nearest neighbors: fitted on negative rows and queried with positive rows in later cells ---
nn = NearestNeighbors(n_neighbors=K, metric="euclidean", algorithm="auto")



In [67]:
#### run this first for sanity check ####
# Single-stratum dry run of the matching pipeline: for the positives of one
# stratum, collect their K nearest negatives and assign two distinct negatives
# to every positive at minimum total Euclidean distance.
# NOTE(review): the recorded run of this cell stopped with
# "Model too large for size-limited license"; an unrestricted Gurobi license
# (or a smaller candidate pool) is needed for the model built here.

matched = []  # one Series of negative eids per matched positive row
strata = "0-1"

# --- Identify negative vs positive subsets on the target column ---
neg_mask = train_df[TIME_COL].isna()
strata_mask = train_df["_strata"] == strata
df_neg = train_df.loc[neg_mask].copy()
df_pos = train_df.loc[~neg_mask & strata_mask].copy()

X_neg = preprocess.transform(df_neg[top_feats])
X_pos = preprocess.transform(df_pos[top_feats])

nn.fit(X_neg)

# For each positive row, positions (in df_neg order) of its K nearest negative rows
distances, knn_indices = nn.kneighbors(X_pos, n_neighbors=K, return_distance=True)

# Union of all negative neighbor positions across all positive rows
neg_idx = np.unique(knn_indices)
# Positional lookup: neg_idx holds row positions in df_neg, not index labels.
selected_neg_eid = df_neg.iloc[neg_idx]["eid"]

# --- Candidate sets: A = positives, B = union of their nearest negatives ---
XA = X_pos
XB = X_neg[neg_idx]
nA, nB = XA.shape[0], XB.shape[0]

# --- Matching: 2 negatives per positive on the nearest-neighbor subset ---
cost = pairwise_distances(XA, XB, metric="euclidean")

m = gp.Model("two_to_one_matching")

# Binary decision vars: x[i,j] = 1 if negative B_j is assigned to positive A_i
x = m.addVars(nA, nB, vtype=GRB.BINARY, name="x")

# Objective: minimize total distance
m.setObjective(gp.quicksum(cost[i, j] * x[i, j] for i in range(nA) for j in range(nB)),
               GRB.MINIMIZE)

# Each positive A_i gets exactly two negatives
for i in range(nA):
    m.addConstr(gp.quicksum(x[i, j] for j in range(nB)) == 2, name=f"two_per_A[{i}]")

# Each negative B_j is used at most once
for j in range(nB):
    m.addConstr(gp.quicksum(x[i, j] for i in range(nA)) <= 1, name=f"at_most_one_per_B[{j}]")

m.optimize()

# Solution values (.X) only exist after a successful solve; fail with a clear message otherwise.
if m.Status != GRB.OPTIMAL:
    raise RuntimeError(
        f"Matching for stratum {strata!r} did not reach optimality (Gurobi status {m.Status})"
    )

for i in range(nA):
    matched_j = [j for j in range(nB) if x[i, j].X > 0.5]
    if len(matched_j) > 0:
        # matched_j are positions within neg_idx / selected_neg_eid, hence .iloc
        matched.append(selected_neg_eid.iloc[matched_j])

Gurobi Optimizer version 12.0.3 build v12.0.3rc0 (linux64 - "CentOS Linux 7 (Core)")

CPU model: AMD EPYC 7352 24-Core Processor, instruction set [SSE2|AVX|AVX2]
Thread count: 48 physical cores, 48 logical processors, using up to 32 threads



GurobiError: Model too large for size-limited license; visit https://gurobi.com/unrestricted for more information

In [None]:
# Per-stratum 2:1 matching: for the positives of each time-to-diagnosis
# stratum, pick two distinct negatives per positive (minimum total Euclidean
# distance) among the union of the positives' K nearest negatives.
matched = []  # positions (in df_neg order) of every selected negative, across strata

# --- Negative pool: identical for every stratum, so build and index it once ---
neg_mask = train_df[TIME_COL].isna()
df_neg = train_df.loc[neg_mask].copy()
X_neg = preprocess.transform(df_neg[top_feats])
nn.fit(X_neg)

# NOTE(review): the same negative pool is offered to every stratum and nothing
# removes already-chosen rows, so `matched` can contain a negative more than
# once across strata — confirm whether that is intended.
for strata in ["<0", "0-1", "1-5", ">5"]:

    # --- Positive rows of this stratum ---
    strata_mask = train_df["_strata"] == strata
    df_pos = train_df.loc[~neg_mask & strata_mask].copy()
    if df_pos.empty:
        # Nothing to match; transforming / querying zero rows would raise.
        print(f"Stratum {strata!r}: no positive rows, skipping")
        continue

    X_pos = preprocess.transform(df_pos[top_feats])

    # For each positive row, positions (in df_neg order) of its K nearest negative rows
    distances, knn_indices = nn.kneighbors(X_pos, n_neighbors=K, return_distance=True)

    # Union of all negative neighbor positions across all positive rows
    neg_idx = np.unique(knn_indices)

    # --- Candidate sets: A = positives, B = union of their nearest negatives ---
    XA = X_pos
    XB = X_neg[neg_idx]
    nA, nB = XA.shape[0], XB.shape[0]

    # --- Matching: 2 negatives per positive on the nearest-neighbor subset ---
    cost = pairwise_distances(XA, XB, metric="euclidean")

    m = gp.Model("two_to_one_matching")

    # Binary decision vars: x[i,j] = 1 if negative B_j is assigned to positive A_i
    x = m.addVars(nA, nB, vtype=GRB.BINARY, name="x")

    # Objective: minimize total distance
    m.setObjective(gp.quicksum(cost[i, j] * x[i, j] for i in range(nA) for j in range(nB)),
                   GRB.MINIMIZE)

    # Each positive A_i gets exactly two negatives
    for i in range(nA):
        m.addConstr(gp.quicksum(x[i, j] for j in range(nB)) == 2, name=f"two_per_A[{i}]")

    # Each negative B_j is used at most once
    for j in range(nB):
        m.addConstr(gp.quicksum(x[i, j] for i in range(nA)) <= 1, name=f"at_most_one_per_B[{j}]")

    m.optimize()

    # Solution values (.X) only exist after a successful solve; fail with a clear message otherwise.
    if m.Status != GRB.OPTIMAL:
        raise RuntimeError(
            f"Matching for stratum {strata!r} did not reach optimality (Gurobi status {m.Status})"
        )

    for i in range(nA):
        matched_j = [j for j in range(nB) if x[i, j].X > 0.5]
        # Translate positions within the candidate set back to positions in df_neg.
        matched.extend(neg_idx[matched_j].tolist())

# eids of all selected negatives (positional lookup into df_neg), built once after all strata
selected_eid = df_neg.iloc[matched, :][['eid']]

In [18]:
# Load the matched negative / positive training rows; the next cell reads their "eid" column.
# NOTE(review): no visible cell writes these two CSV files — presumably they
# were exported from the matching step above; document or add that export so
# the notebook survives Restart & Run All.
train_neg = pd.read_csv("data/breast_cancer_matched_negative.csv")
train_pos = pd.read_csv("data/breast_cancer_matched_positive.csv")

In [51]:
# Build the final eid table: matched train eids (train=1) plus a test set
# (train=0) with the same negative-to-positive ratio as the matched train set.
PROP = len(train_neg) / len(train_pos)  # negatives per positive in the matched train set

rng = np.random.default_rng(42)  # fixed seed so the sampled test negatives are reproducible

# --- Train set from successful matches (A and B) ---
# pd.concat replaces Series.append, which is deprecated and removed in pandas 2.0.
train_eids = pd.concat([train_pos["eid"], train_neg["eid"]], ignore_index=True).astype(int)
train_part = pd.DataFrame({"eid": train_eids, "train": 1})

# --- Test set: all NOT-NA rows, plus PROP times as many NA rows sampled at random ---
test_notna_eids = test_df.loc[test_df[TIME_COL].notna(), "eid"]
test_na_pool_eids = test_df.loc[test_df[TIME_COL].isna(), "eid"]

n_notna = len(test_notna_eids)

# Sampling without replacement: each NA eid appears at most once in the test set.
test_na_sampled = pd.Index(rng.choice(test_na_pool_eids.values, size=int(PROP*n_notna), replace=False))

# Cast both parts to the sampled eids' dtype so the concatenated column has one dtype.
test_eids_final = pd.concat([
    pd.Series(test_notna_eids, dtype=test_na_sampled.dtype),
    pd.Series(test_na_sampled, dtype=test_na_sampled.dtype)
], ignore_index=True)
test_part = pd.DataFrame({"eid": test_eids_final, "train": 0})

# --- Final dataframe ---
final_df = pd.concat([train_part, test_part], ignore_index=True)
final_df.to_csv("data/breast_cancer_matched_eids.csv", index=False)

  train_eids = (train_pos["eid"].append(train_neg["eid"])).astype(int)
