**Input:** 
- ACCEL-UKBB merger leftover IDs

**Process:**
 - Attempt merging the leftover entries, based on .cwa files' sha256 sum

**Output:**
 - Newly paired IDs

# Preparation (Execute all in this section!)

## Import libraries & set environment variables

In [1]:
import collections
import csv
from datetime import datetime
import os
import numpy as np
from pathlib import Path
import polars as pl
import re

# Draw flowcharts using Mermaid
import base64
from IPython.display import Image, display

def mm(graph):
  graphbytes = graph.encode("ascii")
  base64_bytes = base64.b64encode(graphbytes)
  base64_string = base64_bytes.decode("ascii")
  display(Image(url="https://mermaid.ink/img/" + base64_string))

dir_home = Path(os.getcwd()).parent.parent
os.chdir(dir_home)
print("Current directory (check that it's your home directory):", os.getcwd())

Current directory (check that it's your home directory): J:\sugai\UKBiobank


In [4]:
mm("""
graph LR;
    classDef sourcedata fill:#FFFFFF
    classDef final fill:#BBBBBB
    
    New[New dataset: 671006] --> Pair[ID pairs];
    Old[Old dataset: 34134] --> Pair
    New --> Left1[Leftover IDs]
    Old --> Left2[Leftover IDs]
    Left1 --> Merge[Attempt merge]
    Left2 --> Merge
""")


In [45]:
# Input
DIR_SOURCE_IDS = os.path.join("data", "accel_ukbb", "merging")
FILE_IDS_PAIRED = os.path.join(DIR_SOURCE, "pair_ids_20230518.csv")
FILE_IDS_OLD = os.path.join(DIR_SOURCE_IDS, "nonpaired_ids_leftover_old_20230518.csv")
FILE_IDS_NEW = os.path.join(DIR_SOURCE_IDS, "nonpaired_ids_leftover_new_20230518.csv")

df_paired_id = (pl.read_csv(FILE_IDS_PAIRED)
                .rename({"eid": "eid_new"}))
df_nonpaired_id_old = (pl.read_csv(FILE_IDS_OLD))
df_nonpaired_id_new = (pl.read_csv(FILE_IDS_NEW))

print(df_paired_id.head)
print(df_nonpaired_id_old.head)
print(df_nonpaired_id_new.head)

<bound method DataFrame.head of shape: (488_160, 2)
┌─────────┬─────────┐
│ eid_old ┆ eid_new │
│ ---     ┆ ---     │
│ i64     ┆ i64     │
╞═════════╪═════════╡
│ 1327806 ┆ 1000010 │
│ 2071371 ┆ 1000028 │
│ 3017169 ┆ 1000034 │
│ 2373493 ┆ 1000045 │
│ …       ┆ …       │
│ 2350278 ┆ 6024784 │
│ 5200659 ┆ 6024795 │
│ 5061586 ┆ 6024804 │
│ 6016103 ┆ 6024818 │
└─────────┴─────────┘>
<bound method DataFrame.head of shape: (14_367, 1)
┌─────────┐
│ eid_old │
│ ---     │
│ i64     │
╞═════════╡
│ 1000360 │
│ 1000438 │
│ 1001101 │
│ 1001649 │
│ …       │
│ 6023841 │
│ 6023896 │
│ 6024689 │
│ 6025116 │
└─────────┘>
<bound method DataFrame.head of shape: (14_226, 1)
┌─────────┐
│ eid_new │
│ ---     │
│ i64     │
╞═════════╡
│ 1000104 │
│ 1000557 │
│ 1000719 │
│ 1001090 │
│ …       │
│ 6024409 │
│ 6024677 │
│ 6024820 │
│ 6024833 │
└─────────┘>


# Process

## Get the corresponding cwa sha256

In [18]:
DIR_SOURCE_CWA_DIGEST = os.path.join("data", "accel_ukbb", "cwa_merge")
FILE_DIGEST_OLD = os.path.join(DIR_SOURCE_CWA_DIGEST, "file_list_old_sorted.txt")
FILE_DIGEST_NEW = os.path.join(DIR_SOURCE_CWA_DIGEST, "file_list_new_sorted.txt")

df_digest_old = (pl.read_csv(FILE_DIGEST_OLD, separator="\t")
                .filter(pl.col("eid_old").is_in(df_nonpaired_id_old["eid_old"])))
df_digest_new = (pl.read_csv(FILE_DIGEST_NEW, separator="\t")
                .filter(pl.col("eid_new").is_in(df_nonpaired_id_new["eid_new"])))

print(df_digest_old.head)
print(df_digest_new.head)

<bound method DataFrame.head of shape: (60, 2)
┌───────────────────────────────────┬─────────┐
│ sha256                            ┆ eid_old │
│ ---                               ┆ ---     │
│ str                               ┆ i64     │
╞═══════════════════════════════════╪═════════╡
│ 073ef6d938573107bd445c8c6eb8f24e… ┆ 4458619 │
│ 0c2785b32c53ab29febb6374d4492eea… ┆ 3872217 │
│ 10186064a6d202e0da2fade338d681e5… ┆ 5130042 │
│ 16364a94fa701e2cd04068fa729b9e6c… ┆ 1082096 │
│ …                                 ┆ …       │
│ f2372e766d168737fe43c6297d53af4a… ┆ 2341658 │
│ f27f21f80d334c5357aa68a8e10e0f41… ┆ 4037308 │
│ f7b85ae25975336e675c649fbdf648da… ┆ 5629432 │
│ ff66b0a3c41d49173d8ee80ccef91deb… ┆ 3967016 │
└───────────────────────────────────┴─────────┘>
<bound method DataFrame.head of shape: (19, 2)
┌───────────────────────────────────┬─────────┐
│ sha256                            ┆ eid_new │
│ ---                               ┆ ---     │
│ str                               ┆ i64

## Merge
- Check that no entries are duplicated
- After merge, check that the number of joined entries are the same as the number of new dataset IDs

In [40]:
df = df_digest_old.select(pl.col("sha256").is_duplicated())
count = df.filter(pl.col("sha256")).shape[0]
print(count, "entries in the old dataset are duplicated")

df = df_digest_new.select(pl.col("sha256").is_duplicated())
count = df.filter(pl.col("sha256")).shape[0]
print(count, "entries in the new dataset are duplicated")

0 entries in the old dataset are duplicated
0 entries in the new dataset are duplicated


In [43]:
df_joined = df_digest_new.join(df_digest_old, on="sha256", how="outer").drop_nulls()
df_joined

sha256,eid_new,eid_old
str,i64,i64
"""0c2785b32c53ab…",1993670,3872217
"""1659f1e3add502…",5880336,5335805
"""179ada25506c6f…",5642658,2836584
"""2273920a69ca77…",4783491,4560985
"""30727d02983b4d…",5389871,1533850
"""3766d1b02bc05f…",1866543,3146475
"""3cdc34d7b4bf26…",5755710,1886308
"""4537c1cf76b88e…",2834670,3947128
"""6173c9af945c50…",2465163,1467128
"""80c9619dde4811…",4993070,4050460


## Merge to the already paired fraction and export

In [52]:
df_paired_id_all = pl.concat([
    df_paired_id.select(pl.col(["eid_new", "eid_old"])),
    df_joined.select(pl.col(["eid_new", "eid_old"]))
])

print(df_paired_id_all.head)

<bound method DataFrame.head of shape: (488_179, 2)
┌─────────┬─────────┐
│ eid_new ┆ eid_old │
│ ---     ┆ ---     │
│ i64     ┆ i64     │
╞═════════╪═════════╡
│ 1000010 ┆ 1327806 │
│ 1000028 ┆ 2071371 │
│ 1000034 ┆ 3017169 │
│ 1000045 ┆ 2373493 │
│ …       ┆ …       │
│ 3412694 ┆ 4769313 │
│ 2028177 ┆ 1470562 │
│ 5838377 ┆ 2657355 │
│ 2878903 ┆ 3967016 │
└─────────┴─────────┘>


In [55]:
FILE_OUT = os.path.join(DIR_SOURCE_IDS, "pair_ids_sorted_old.csv")
df_paired_id_all.sort(by="eid_old").select(["eid_old", "eid_new"]).write_csv(FILE_OUT)

FILE_OUT = os.path.join(DIR_SOURCE_IDS, "pair_ids_sorted_new.csv")
df_paired_id_all.sort(by="eid_new").write_csv(FILE_OUT)