In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import torch
import xgboost as xgb
from sklearn.feature_selection import SelectKBest, chi2, f_classif, mutual_info_classif
from sklearn.linear_model import LogisticRegressionCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, average_precision_score, confusion_matrix, classification_report
from sklearn.model_selection import RandomizedSearchCV
from imblearn.over_sampling import SMOTE
import seaborn as sns
from scipy.stats import spearmanr
import warnings
warnings.filterwarnings('ignore')

In [5]:
# Read the pre-split UK Biobank cancer tables (train / validation / test).
split_paths = (
    '/orcd/pool/003/dbertsim_shared/ukb/ukb_cancer_train.csv',
    '/orcd/pool/003/dbertsim_shared/ukb/ukb_cancer_valid.csv',
    '/orcd/pool/003/dbertsim_shared/ukb/ukb_cancer_test.csv',
)
train_df, val_df, test_df = (pd.read_csv(path) for path in split_paths)

In [6]:
# Stack the three splits row-wise into a single frame. Row labels are carried
# over from each split unchanged (no ignore_index), so they are not unique here.
all_splits = [train_df, val_df, test_df]
df = pd.concat(all_splits, axis=0)

In [25]:
# Columns whose name contains "olink".
olink_cols = [name for name in df.columns if "olink" in name]
X = df[olink_cols]

# Fraction of missing values per Olink column; keep the 2000 columns with the
# lowest missing fraction.
missing_pct = X.isna().mean()
cols_to_keep = missing_pct.sort_values().index[:2000]  # keep least-missing
df_reduced = df[cols_to_keep]

In [3]:
# Cancer outcome columns referenced in this notebook:
#   bladder_cancer, breast_cancer, colorectal_cancer, liver_cancer,
#   lung_cancer, pancreatic_cancer, prostate_cancer

In [4]:
# Positive-label rate of several outcomes in the training split.
# Only the final expression is displayed by the notebook; the trailing comments
# record values noted for the other outcomes.
n_train_rows = len(train_df)
train_df['bladder_cancer'].sum() / n_train_rows     # 0.00085
train_df['liver_cancer'].sum() / n_train_rows       # 6.289901563040539e-5
train_df['lung_cancer'].sum() / n_train_rows        # 0.000817
train_df['pancreatic_cancer'].sum() / n_train_rows  # 0.0001572
train_df['prostate_cancer'].sum() / n_train_rows    # 0.006541

0.00654149762556216

In [None]:
# Feature column names are listed one per line in a text file; the "eid"
# identifier entry is excluded from the feature list.
X_cols_file = "cancer_ordinal_columns.txt"
with open(X_cols_file, "r") as handle:
    listed_cols = handle.read().splitlines()
X_cols = [name for name in listed_cols if name != "eid"]

# Columns that must be observed: make_X_y discards every row that is missing
# a value in any of them.
NA_COLS = [
    "Age at recruitment",
    "Sex_male",
    "Body mass index (BMI)",
    "Systolic blood pressure, automated reading",
    "Diastolic blood pressure, automated reading",
]

# Numeric value assigned to each "Alcohol intake frequency." answer
# (presumably approximate drinking occasions per year -- TODO confirm).
DRINK_MAP = {
    "Never": 0,
    "Special occasions only": 37,
    "One to three times a month": int(2 * 12),
    "Once or twice a week": int(1.5 * 52),
    "Three or four times a week": int(3.5 * 52),
    "Daily or almost daily": 365,
}


def make_X_y(df, X_cols, target_col):
    """Build the feature matrix and label vector for one data split.

    Parameters
    ----------
    df : pandas.DataFrame
        Split to process. Must contain every column in ``X_cols`` (which in
        turn must include "Smoking status", "Alcohol intake frequency." and
        all of ``NA_COLS``) as well as ``target_col``.
    X_cols : list of str
        Columns copied from ``df`` to form the raw features.
    target_col : str
        Name of the label column.

    Returns
    -------
    (X, y)
        ``X`` holds the selected columns with smoking status replaced by three
        0/1 indicator columns and alcohol frequency replaced by
        ``avg_drinks_per_year``; rows missing any ``NA_COLS`` value are removed
        and all remaining missing values are set to 0. ``y`` is ``target_col``
        restricted to the same rows.
    """
    features = df[X_cols].copy()

    # One 0/1 indicator per smoking level; any other / missing answer is all zeros.
    smoking = features["Smoking status"]
    for indicator, level in (
        ("current_smoker", "Current"),
        ("prev_smoker", "Previous"),
        ("never_smoker", "Never"),
    ):
        features[indicator] = (smoking == level).astype(int)

    # Answers absent from DRINK_MAP (including missing ones) are scored 0.
    features["avg_drinks_per_year"] = (
        features["Alcohol intake frequency."].map(DRINK_MAP).fillna(0)
    )

    features = features.drop(columns=["Alcohol intake frequency.", "Smoking status"])

    # Keep only rows where every required column is present, then zero-fill
    # whatever is still missing in the other columns.
    complete_rows = features[NA_COLS].notna().all(axis=1)
    features = features.loc[complete_rows].fillna(0)

    labels = df.loc[complete_rows, target_col]

    return features, labels

In [6]:
# Outcome modelled in the rest of the notebook. Set once here so the three
# splits below cannot end up with different targets.
target_col = "bladder_cancer"

# Build features/labels for each split with the same preprocessing.
X_train, y_train = make_X_y(train_df, X_cols, target_col)
X_val, y_val = make_X_y(val_df, X_cols, target_col)
X_test, y_test = make_X_y(test_df, X_cols, target_col)

In [7]:
# Positive rate and positive count per split.
train_positive_rate = y_train.sum() / len(y_train)
print('Train: ', train_positive_rate, y_train.sum())
for split_label, split_y in (('Val: ', y_val), ('Test: ', y_test)):
    print(split_label, split_y.sum() / len(split_y), ' ', split_y.sum())

Train:  0.0008750673128702208 26
Val:  0.0009134273825230895   9
Test:  0.0008091433195104683   8


In [8]:
N_TOP_FEATURES = 30        # how many top-ranked features are reported and kept
HIGH_CORR_THRESHOLD = 0.8  # |Spearman rho| above which two features count as redundant

# Spearman correlation of every feature with the target.
# spearmanr is evaluated once per column and both the statistic and the
# p-value are read from that single result.
y_train_flat = y_train.values.ravel()
spearman_results = [spearmanr(X_train[col], y_train_flat) for col in X_train.columns]

correlations = pd.DataFrame({
    'feature': X_train.columns,
    'correlation': [result[0] for result in spearman_results],
    'p_value': [result[1] for result in spearman_results],
})

# Rank by absolute correlation, strongest first.
correlations['abs_correlation'] = correlations['correlation'].abs()
correlations = correlations.sort_values('abs_correlation', ascending=False)

print(f"Top {N_TOP_FEATURES} features by correlation with target:")
print(correlations.head(N_TOP_FEATURES)[['feature', 'correlation', 'p_value']])

# Feature-feature correlations (to identify redundant features)
top_feats = correlations.head(N_TOP_FEATURES)['feature'].tolist()
X_top = X_train[top_feats]

feature_corr_matrix = X_top.corr(method='spearman')

# Find highly correlated feature pairs (upper triangle only, so each
# unordered pair is visited once).
top_cols = feature_corr_matrix.columns
high_corr_pairs = []
for i in range(len(top_cols)):
    for j in range(i + 1, len(top_cols)):
        pair_corr = feature_corr_matrix.iloc[i, j]
        if abs(pair_corr) > HIGH_CORR_THRESHOLD:
            high_corr_pairs.append({
                'feature1': top_cols[i],
                'feature2': top_cols[j],
                'correlation': pair_corr
            })

if high_corr_pairs:
    print(f"\nFound {len(high_corr_pairs)} highly correlated feature pairs (|r| > {HIGH_CORR_THRESHOLD}):")
    for pair in high_corr_pairs[:10]:  # Show first 10
        print(f"  {pair['feature1']} <-> {pair['feature2']}: {pair['correlation']:.3f}")

Top 20 features by correlation with target:
                          feature  correlation   p_value
0              Age at recruitment     0.025950  0.000008
499                   olink_cdhr2     0.025491  0.000011
632                 olink_col18a1     0.022813  0.000084
388                   olink_ccl16     0.022760  0.000087
1889                  olink_npdc1     0.022671  0.000093
2273                   olink_relt     0.022258  0.000125
1895                   olink_nppc     0.021819  0.000169
2721              olink_tnfrsf10b     0.021697  0.000184
348                   olink_calca     0.021484  0.000213
2969             blood_Cystatin C     0.021334  0.000235
409                    olink_ccn3     0.021164  0.000264
31                   olink_acvrl1     0.021138  0.000269
1100              olink_fut3_fut5     0.020960  0.000303
2165                  olink_prss8     0.020767  0.000344
396                   olink_ccl23     0.020633  0.000376
1737                  olink_mmp12     0.0205

In [9]:
# |correlation with the target| per feature, looked up by feature name
# (first row wins if a name were ever repeated).
target_strength = (
    correlations.drop_duplicates(subset='feature', keep='first')
    .set_index('feature')['correlation']
    .abs()
)

# For every highly correlated pair, discard the member that is less
# correlated with the target (ties keep feature1).
to_drop = set()
for pair in high_corr_pairs:
    f1, f2 = pair['feature1'], pair['feature2']
    weaker = f2 if target_strength[f1] >= target_strength[f2] else f1
    to_drop.add(weaker)

print(f"Dropping {len(to_drop)} redundant features")

# Filter the top_feats list, preserving its ranking order
top_feats_pruned = [f for f in top_feats if f not in to_drop]

print(f"Kept {len(top_feats_pruned)} features after redundancy pruning")

# Restrict every split to the surviving features.
# Note: this rebinds X_train / X_val / X_test to the reduced column set.
X_train, X_val, X_test = (
    split[top_feats_pruned] for split in (X_train, X_val, X_test)
)

Dropping 0 redundant features
Kept 30 features after redundancy pruning


In [10]:
# Labels as integer arrays, features as plain ndarrays.
y_train_np, y_val_np, y_test_np = (
    np.array(labels).astype(int) for labels in (y_train, y_val, y_test)
)
X_train_np, X_val_np, X_test_np = (
    np.array(frame) for frame in (X_train, X_val, X_test)
)

print("Original class distribution:")
print(f"Train: {np.mean(y_train_np):.4f} ({np.sum(y_train_np)} positive out of {len(y_train_np)})")
print(f"Val: {np.mean(y_val_np):.4f} ({np.sum(y_val_np)} positive out of {len(y_val_np)})")
print(f"Test: {np.mean(y_test_np):.4f} ({np.sum(y_test_np)} positive out of {len(y_test_np)})")

# desired_ratio is the target share of positives in the resampled training set.
# SMOTE's sampling_strategy is instead the minority/majority count ratio after
# resampling; the expression below equals desired_ratio / (1 - desired_ratio)
# up to the integer truncation.
desired_ratio = 0.01
n_positive = np.sum(y_train_np)
n_negative_needed = int(n_positive / desired_ratio - n_positive)
sampling_strategy = n_positive / n_negative_needed

print(f"\nApplying SMOTE with sampling_strategy={sampling_strategy:.4f}")

# Only the training split is oversampled; validation and test stay untouched.
smote = SMOTE(sampling_strategy=sampling_strategy, random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_np, y_train_np)

print(f"Resampled train set: {np.mean(y_train_resampled):.4f} ({np.sum(y_train_resampled)} positive out of {len(y_train_resampled)})")

Original class distribution:
Train: 0.0009 (26 positive out of 29712)
Val: 0.0009 (9 positive out of 9853)
Test: 0.0008 (8 positive out of 9887)

Applying SMOTE with sampling_strategy=0.0101
Resampled train set: 0.0100 (299 positive out of 29985)


In [11]:
print("XGB")

# Gradient-boosted trees on the SMOTE-resampled training data.
# `use_label_encoder` is intentionally not passed: it is a deprecated no-op in
# the xgboost releases that accept `early_stopping_rounds` in the constructor.
# Note that scale_pos_weight up-weights positives in addition to the SMOTE
# oversampling already applied to the training set.
simple_xgb = xgb.XGBClassifier(
    n_estimators=200,
    max_depth=4,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=100,
    objective='binary:logistic',
    eval_metric='auc',
    early_stopping_rounds=20,
    random_state=42
)

# Early stopping monitors AUC on the (un-resampled) validation split, so the
# validation AUC reported below was also used to choose the stopping point.
simple_xgb.fit(
    X_train_resampled,
    y_train_resampled,
    eval_set=[(X_val_np, y_val_np)],
    verbose=False
)

# Predicted probability of the positive class for each split.
y_train_pred = simple_xgb.predict_proba(X_train_resampled)[:, 1]
y_val_pred = simple_xgb.predict_proba(X_val_np)[:, 1]
y_test_pred = simple_xgb.predict_proba(X_test_np)[:, 1]

train_auc = roc_auc_score(y_train_resampled, y_train_pred)
val_auc = roc_auc_score(y_val_np, y_val_pred)
test_auc = roc_auc_score(y_test_np, y_test_pred)

print(f"\nTrain AUC (on resampled data): {train_auc:.4f}")
print(f"Validation AUC: {val_auc:.4f}")
print(f"Test AUC: {test_auc:.4f}")

XGB

Train AUC (on resampled data): 0.9791
Validation AUC: 0.8062
Test AUC: 0.7949
