In [1]:
import numpy as np
import pandas as pd
import pyarrow.parquet as pq

# ---- Run configuration ----
# START / END / CHROM are interpolated into the file names read below.
START, END = 500, 1000
CHROM = 18
file = f"c{CHROM}_b0_v1"
filepath = f"/../../orcd/pool/003/dbertsim_shared/ukb/bgen/ch{CHROM}"

# ---- 1. Load the dosage parquet and the two lookup tables ----
tbl = pq.read_table(f"{filepath}/bgen/{file}_{START}_{END}.parquet")
dosage_df = tbl.to_pandas()

# Sample lookup: keep only the rows whose ID_1 is non-negative.
samples = (
    pd.read_csv(f"{filepath}/{file}_samples.csv")
    .loc[lambda tab: tab["ID_1"] >= 0]
)

# Variant lookup: rename "variant_i" to "variant_idx" and keep only the
# two columns selected here.
variants = (
    pd.read_csv(f"{filepath}/{file}_variants.csv")
    .rename(columns={"variant_i": "variant_idx"})
    .loc[:, ["variant_idx", "rsid"]]
)

In [5]:
# ---- 2. Attach eid (ID_1) and rsid to the sparse long table ----
# Both merges are inner joins, so rows of `dosage_df` whose sample_idx /
# variant_idx has no match in `samples` / `variants` are dropped here.
merged = (
    dosage_df
    .merge(samples, on="sample_idx", how="inner")
    .merge(variants, on="variant_idx", how="inner")
)
# merged columns: those of dosage_df, plus every column of `samples` and `rsid`.

# A row that is present in the long table but has a NaN dosage is a missing call.
merged["is_missing"] = merged["dosage"].isna()

# ---- 3. Pivot to wide matrices: dosage and missingness ----
# pivot() raises ValueError if any (ID_1, rsid) pair occurs more than once.
# NOTE(review): only ID_1 / rsid values that occur in `merged` get a row /
# column; a sample or variant with no stored entry at all does not appear in
# the wide matrix — confirm this is intended.
dosage_wide = merged.pivot(
    index="ID_1",
    columns="rsid",
    values="dosage"
)

# miss_wide: True where genotype is missing, False otherwise.
# Pivoting a bool column leaves NaN in cells that have no row in `merged`,
# which makes the frame object-dtype. `.eq(True)` turns it into a real
# boolean frame (NaN -> False) without relying on `fillna` silently
# downcasting object columns, so `~miss_wide` below is always a logical NOT.
miss_wide = merged.pivot(
    index="ID_1",
    columns="rsid",
    values="is_missing"
).eq(True)

# ---- 4. Fill only true zeros; keep missing as NaN ----
# A NaN in dosage_wide is either a pair absent from the sparse table (treated
# as dosage 0) or a stored-but-missing call (kept as NaN).
zero_mask = ~miss_wide & dosage_wide.isna()

final = (
    dosage_wide
    .mask(zero_mask, 0)
    .reset_index()
    .rename(columns={"ID_1": "eid"})
)
# final.to_csv(f"{filepath}/{file}_{START}_{END}.csv", index=False)

In [10]:
# Number of NaN cells in each column of `final` (the `eid` column is counted too).
missing_df = final.isna().sum()

In [12]:
# Show only the columns that have at least one NaN.
missing_df.loc[missing_df.gt(0)]

rsid
18:180439:T:C     526
18:192743:G:A    1110
18:192744:T:C    2596
18:192748:G:A     965
18:192749:G:A    1130
                 ... 
18:202978:C:T      38
18:202979:G:A      33
18:202979:G:T      33
18:202990:A:G      18
18:202991:T:A       9
Length: 466, dtype: int64

In [None]:
# FIXME: unfinished draft. The original line stops in the middle of a string
# literal and is a SyntaxError if executed, so it is kept commented out until
# the intended filter on `missing_df` is written.
# missing_df.loc[missing_df['

In [13]:
# Directory holding the train / valid / test CSVs read below.
DIR = "/../../orcd/pool/003/dbertsim_shared/ukb/"

# Load the three splits; each one is read in full.
train_df = pd.read_csv(f'{DIR}ukb_cancer_train.csv')
valid_df = pd.read_csv(f'{DIR}ukb_cancer_valid.csv')
test_df = pd.read_csv(f'{DIR}ukb_cancer_test.csv')

# One flat list of every eid, in train -> valid -> test order (duplicates are kept).
eids = [*train_df['eid'], *valid_df['eid'], *test_df['eid']]

  train_df = pd.read_csv(f'{DIR}ukb_cancer_train.csv')
  valid_df = pd.read_csv(f'{DIR}ukb_cancer_valid.csv')
  test_df = pd.read_csv(f'{DIR}ukb_cancer_test.csv')


In [17]:
# Flat list of every eid across the three splits, in train -> valid -> test order.
eids = [*train_df['eid'], *valid_df['eid'], *test_df['eid']]

In [21]:
# Keep only the rows of `final` whose eid appears in `eids`.
filtered_df = final[final['eid'].isin(eids)]

In [29]:
# Per-column NaN counts over the filtered rows. This rebinds `missing_df`.
missing_df = filtered_df.isna().sum()
# Display the columns with more than 1000 NaN cells.
missing_df.loc[missing_df.gt(1000)]

rsid
18:197741:A:AT            7137
18:197741:A:ATT           7137
18:197741:AT:A            7137
18:197741:ATT:A           7137
18:197741:ATTTTT:A        7137
18:197743:T:C             7137
18:197753:A:AT            1049
18:197753:A:T             1049
18:197753:AT:A            1049
18:197754:T:C             1049
18:197955:TA:T           50735
18:199366:GAGTAAGCC:G    50735
dtype: int64

In [26]:
# Largest per-column NaN count currently held in `missing_df`.
max(missing_df.tolist())

50735