In [4]:
import pandas as pd 
import numpy as np 
import os 
import warnings
import random
import json
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split, GridSearchCV, GroupKFold
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report, f1_score
from sklearn.utils import class_weight
import xgboost as xgb
import glob
import gc

In [5]:
# Root of the shared UK Biobank data; the chromosome-18 genotype files live
# under <data_path>/bgen/ch18.
data_path = "/orcd/pool/003/dbertsim_shared/ukb"

# Lookup tables for the first chr18 chunk: one for samples, one for variants.
sample_map = pd.read_csv(data_path + "/bgen/ch18/c18_b0_v1_samples.csv")
variant_map = pd.read_csv(data_path + "/bgen/ch18/c18_b0_v1_variants.csv")


In [6]:
# Inspect the raw sample table as read from c18_b0_v1_samples.csv.
sample_map

Unnamed: 0,ID_1,ID_2,missing,sex,sample_idx
0,4776604,4776604,0.013090,2,0
1,5258330,5258330,0.011615,1,1
2,1728654,1728654,0.011573,1,2
3,5928357,5928357,0.014106,2,3
4,3940511,3940511,0.011192,1,4
...,...,...,...,...,...
469829,3547010,3547010,0.012194,2,469829
469830,2264443,2264443,0.013056,1,469830
469831,1533729,1533729,0.011098,2,469831
469832,5030729,5030729,0.017895,1,469832


In [63]:
# Outcome table for the cohort being processed.
data_path = "/orcd/pool/003/dbertsim_shared/ukb"
# Defined here, before its first use, so this cell runs on a fresh kernel
# (it was previously only assigned in a later cell).
cols_to_read = ['eid', 'breast_cancer', 'breast_time_to_diagnosis']
df_diag = pd.read_csv(f"{data_path}/ukb_cancer_valid.csv", usecols=cols_to_read) # TRAINING to change

In [64]:
# Columns pulled from the outcome CSVs.
cols_to_read = ['eid', 'breast_cancer', 'breast_time_to_diagnosis']

# Reload the sample table from disk so this cell starts from a clean copy.
sample_map = pd.read_csv(f"{data_path}/bgen/ch18/c18_b0_v1_samples.csv")

# ID_1 and ID_2 must agree on every row before ID_1 is used as the participant key.
assert not (sample_map['ID_1'] != sample_map['ID_2']).any()

# Rename ID_1 to eid, the key used by the outcome table.
sample_map = sample_map.rename(columns={"ID_1": "eid"})

In [65]:
# Right join: keep every participant in df_diag. Participants without a
# genotype sample get NaN in the sample columns (including sample_idx).
# NOTE: this overwrites sample_map, so re-running the cell without first
# reloading sample_map merges the outcome columns a second time.
sample_map = sample_map.merge(df_diag, how='right', on='eid')

In [66]:
# Inspect the cohort-restricted sample table (sample columns + outcome columns).
sample_map

Unnamed: 0,eid,ID_2,missing,sex,sample_idx,breast_cancer,breast_time_to_diagnosis
0,1000380,1000380.0,0.012411,2.0,363605.0,0,
1,1001803,1001803.0,0.013524,2.0,20830.0,0,
2,1004280,1004280.0,0.012813,1.0,34934.0,0,
3,1006573,1006573.0,0.012181,2.0,125675.0,0,
4,1009015,,,,,0,
...,...,...,...,...,...,...,...
10594,6016298,6016298.0,0.011444,2.0,301820.0,0,
10595,6016959,6016959.0,0.011938,1.0,218821.0,0,
10596,6017829,6017829.0,0.012379,1.0,175039.0,0,
10597,6020293,6020293.0,0.011804,2.0,147792.0,0,


In [67]:
def clean_chrom(path, samples=None, prefix="c18_"):
    """Load one long-format genotype chunk and return it wide, joined to the cohort.

    The parquet file at ``path`` is read as a long table with columns
    ``sample_idx``, ``variant_idx`` and ``dosage``. It is restricted to the
    samples in the cohort, pivoted to one row per sample and one column per
    variant, and inner-joined onto the cohort table.

    Missing-value convention (unchanged from the original):
    * values stored as NaN in the file become ``-1``;
    * (sample, variant) pairs absent from the file become ``0``.

    Parameters
    ----------
    path : str
        Path of the parquet chunk to read.
    samples : pandas.DataFrame, optional
        Cohort table containing a ``sample_idx`` column. Defaults to the
        notebook-level ``sample_map``.
    prefix : str, optional
        Prefix prepended to every variant column name (default ``"c18_"``).

    Returns
    -------
    pandas.DataFrame
        All columns of ``samples`` followed by one ``<prefix><variant_idx>``
        column per variant, for the samples present in both tables.

    Raises
    ------
    KeyError
        If the parquet file lacks one of the required columns. The message
        names the offending file.
    """
    if samples is None:
        # Fall back to the notebook-level cohort table, as the original did.
        samples = sample_map

    df = pd.read_parquet(path)

    # Fail early and name the file. The previous try/except printed the path,
    # discarded the exception, and let the pivot below fail with a less
    # helpful error.
    missing = {'sample_idx', 'variant_idx', 'dosage'} - set(df.columns)
    if missing:
        raise KeyError(f"{path}: missing required column(s) {sorted(missing)}")

    df = df.fillna(-1)

    # Pre-filter to cohort samples so the pivot stays small; the inner merge
    # at the end enforces the same restriction.
    df = df.loc[df['sample_idx'].isin(samples['sample_idx'])]

    df_pivot = df.pivot(index='sample_idx', columns='variant_idx', values='dosage')
    df_pivot = df_pivot.fillna(0)
    df_pivot = df_pivot.add_prefix(prefix).reset_index()
    df_pivot = pd.merge(samples, df_pivot, on='sample_idx', how='inner')

    return df_pivot

In [14]:
def sis(X, y, k=None, frac=None):
    """Score features by marginal correlation with ``y`` (Sure Independence Screening).

    Each column of ``X`` is scored by the absolute value of its Pearson
    correlation with ``y``. A tiny constant (1e-12) is added to the
    denominator, so a constant column scores 0 instead of dividing by zero.

    Parameters
    ----------
    X : pandas.DataFrame or 2-D array-like, shape (n, p)
        Feature matrix, one column per feature.
    y : 1-D array-like of length n
        Response; cast to float.
    k : int, optional
        Number of top-scoring features to return. Clipped to ``[0, p]``.
    frac : float, optional
        Used only when ``k`` is None: keep ``max(1, floor(frac * p))`` features.

    Returns
    -------
    dict
        Maps feature name to score for the ``k`` best features, listed in
        their original column order. Names are the column labels for a
        DataFrame and integer column positions otherwise. With ``k`` and
        ``frac`` both None, every feature is returned.

    Notes
    -----
    Earlier versions computed the top-``k`` ranking but discarded it, so ``k``
    and ``frac`` had no effect, and non-DataFrame input raised ``TypeError``.
    """
    colnames = list(X.columns) if isinstance(X, pd.DataFrame) else None

    # convert to array
    X = np.asarray(X)
    y = np.asarray(y).astype(float)

    p = X.shape[1]
    if colnames is None:
        # Plain arrays have no labels; identify features by column position.
        colnames = list(range(p))

    # determine number of features to keep
    if k is None:
        k = p if frac is None else max(1, int(np.floor(frac * p)))
    k = max(0, min(int(k), p))

    # center
    Xc = X - X.mean(axis=0, keepdims=True)
    yc = y - y.mean()

    # correlation numerator
    cov = np.sum(Xc * yc[:, None], axis=0)

    # unnormalised standard deviations (the 1/n factors cancel in the ratio)
    X_std = np.sqrt(np.sum(Xc**2, axis=0))
    y_std = np.sqrt(np.sum(yc**2))

    # SIS scores
    scores = np.abs(cov / (X_std * y_std + 1e-12))

    # Top-k by score. A stable sort breaks ties in favour of earlier columns,
    # and argsort places NaN scores last.
    ranked_idx = np.argsort(-scores, kind="stable")[:k]

    # Report survivors in original column order, so the default call yields
    # the same dict, in the same order, as before.
    return {colnames[i]: scores[i] for i in np.sort(ranked_idx)}

In [17]:
# Screen every chromosome-18 genotype chunk with SIS and pool the per-variant scores.
all_scores = {}
files = sorted(
    p for p in glob.glob(f"{data_path}/bgen/ch18/c18_*parquet")
    # c18_sis.parquet is this notebook's own output and also matches the glob;
    # without this filter a re-run would hand it to clean_chrom.
    if "sis" not in os.path.basename(p)
)
count = len(files)
for path in files:
    df = clean_chrom(path)
    # Positive label: diagnosed (breast_cancer == 1) with
    # breast_time_to_diagnosis <= 30. The time unit is not visible in this
    # notebook - TODO confirm.
    df['breast_cancer_now'] = ((df['breast_cancer'] == 1) & (df['breast_time_to_diagnosis'] <= 30)).astype(int)
    X_train = df.loc[:, df.columns.str.startswith("c18_")]
    print(path, "number of columns: ", len(X_train.columns))
    y_train = df['breast_cancer_now']
    scores = sis(X_train, y_train)
    # In-place update; rebuilding the dict with {**a, **b} recopied every
    # accumulated score on each iteration.
    all_scores.update(scores)
    del df
    gc.collect()

/orcd/pool/003/dbertsim_shared/ukb/bgen/ch18/c18_b0_v1_0_500.parquet number of columns:  382
/orcd/pool/003/dbertsim_shared/ukb/bgen/ch18/c18_b0_v1_100000_101000.parquet number of columns:  789
/orcd/pool/003/dbertsim_shared/ukb/bgen/ch18/c18_b0_v1_10000_10500.parquet number of columns:  366
/orcd/pool/003/dbertsim_shared/ukb/bgen/ch18/c18_b0_v1_101000_102000.parquet number of columns:  795
/orcd/pool/003/dbertsim_shared/ukb/bgen/ch18/c18_b0_v1_102000_103000.parquet number of columns:  517
/orcd/pool/003/dbertsim_shared/ukb/bgen/ch18/c18_b0_v1_103000_104000.parquet number of columns:  764
/orcd/pool/003/dbertsim_shared/ukb/bgen/ch18/c18_b0_v1_104000_105000.parquet number of columns:  854
/orcd/pool/003/dbertsim_shared/ukb/bgen/ch18/c18_b0_v1_105000_106000.parquet number of columns:  628
/orcd/pool/003/dbertsim_shared/ukb/bgen/ch18/c18_b0_v1_10500_11000.parquet number of columns:  249
/orcd/pool/003/dbertsim_shared/ukb/bgen/ch18/c18_b0_v1_106000_107000.parquet number of columns:  519
/o

In [18]:
# Two-column table of the pooled scores: one row per variant column.
df_sis = pd.DataFrame(
    {
        "feature": list(all_scores.keys()),
        "score": list(all_scores.values()),
    }
)

In [22]:
# Rank variants from strongest to weakest score and renumber the rows 0..n-1.
df_sis = df_sis.sort_values("score", ascending=False, ignore_index=True)

In [24]:
# Persist the ranked scores next to the genotype chunks. The file name matches
# the c18_*parquet glob used to list chunks, so loops over that glob must skip it.
df_sis.to_parquet(f"{data_path}/bgen/ch18/c18_sis.parquet")

In [62]:
# Reload the persisted SIS ranking and keep the variants scoring above 0.03.
# The same threshold appears in the output file names written further down.
df_sis = pd.read_parquet(f"{data_path}/bgen/ch18/c18_sis.parquet")
cols_to_keep = df_sis.loc[df_sis['score'].gt(0.03), 'feature'].tolist()
len(cols_to_keep)

1132

In [68]:
# Select top columns only
# Builds one wide table keyed by eid that holds just the variants listed in
# cols_to_keep, gathered chunk by chunk.
df_selected = pd.DataFrame(columns=["eid"])
files = sorted(glob.glob(f"{data_path}/bgen/ch18/c18_*parquet"))

for path in files:
    # Skip the SIS score file, which also matches the glob. Test the file name
    # rather than the full path, so a directory whose name contains "sis"
    # cannot silently skip every chunk.
    if "sis" in os.path.basename(path):
        continue
    df = clean_chrom(path)
    # Only the selected variants that live in this particular chunk.
    valid_cols = [c for c in cols_to_keep if c in df.columns]
    df = df[valid_cols + ['eid']]
    # Outer join keeps participants that appear in any chunk.
    df_selected = pd.merge(df_selected, df, how='outer', on='eid')
    del df
    gc.collect()

In [69]:
# Sanity check: (rows, columns) of the selected-feature table; columns include eid.
df_selected.shape

(10157, 363)

In [70]:
# Write the selected features for the validation cohort. The cohort ("valid")
# and the score threshold (0.03) are hard-coded into the file name.
df_selected.to_csv('/orcd/pool/003/dbertsim_shared/ukb/bgen/ch18/valid_c18_selected_features_score0.03.csv', index =False)

In [53]:
# Loads the validation-cohort outcome table. The next cell assigns df_diag
# again from ukb_cancer_{cohort}.csv, so this read is redundant when both run.
df_diag = pd.read_csv(f"{data_path}/ukb_cancer_valid.csv", usecols=cols_to_read) # TRAINING to change

In [71]:
# Attach the outcome columns to the selected genotype features for one cohort
# and write the combined table back to disk.
cohort = "valid"
df_diag = pd.read_csv(f"{data_path}/ukb_cancer_{cohort}.csv", usecols=cols_to_read)
df = pd.read_csv(f'/orcd/pool/003/dbertsim_shared/ukb/bgen/ch18/{cohort}_c18_selected_features_score0.03.csv')

# Outer join on eid: participants present in only one of the two tables are kept.
df_merged = df_diag.merge(df, on='eid', how='outer')
df_merged.to_csv(
    f'/orcd/pool/003/dbertsim_shared/ukb/bgen/ch18/{cohort}_c18_selected_features_score0.03_with_outcome.csv',
    index=False,
)

In [72]:
# Sanity check: (rows, columns) of the outcome + selected-feature table.
df_merged.shape

(10599, 365)