**Input:** 
- ACCEL-UKBB merged data (per-field Age and Sex files)
- PLINK2 PCA eigenvectors (`white_british.pca.eigenvec`)

**Process:**
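 - Merge the Age and Sex fields, recode Sex to -1/1, derive Age^2, Age x Sex and Age^2 x Sex, and join them with the PLINK2 PCs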

**Output:**
 - Covariate files as PLINK2 inputs

# Preparation (Execute all in this section!)

## Import libraries & set environment variables

In [1]:
import collections
import csv
from datetime import datetime
import os
import numpy as np
from pathlib import Path
import polars as pl
import re
from matplotlib import pyplot as plt

from scipy import stats

import statsmodels.api as sm

from contextlib import redirect_stdout
from io import StringIO

import warnings
# suppress DeprecationWarning messages
warnings.filterwarnings("ignore", category=DeprecationWarning)


# Draw flowcharts using Mermaid
import base64
from IPython.display import Image, display

def mm(graph):
  """Display a Mermaid diagram as an image served by mermaid.ink.

  Args:
    graph: Mermaid source text (for example a ``graph LR; ...`` definition).

  The text is UTF-8 encoded, base64 encoded with the URL-safe alphabet,
  appended to ``https://mermaid.ink/img/`` and handed to IPython's
  ``display`` as an ``Image`` referencing that URL. Returns None.
  """
  # UTF-8 instead of ASCII so that labels containing non-ASCII characters do
  # not raise UnicodeEncodeError.
  graphbytes = graph.encode("utf8")
  # The standard base64 alphabet can emit "+" and "/", which are not safe
  # inside a URL path segment; the URL-safe alphabet uses "-" and "_".
  base64_bytes = base64.urlsafe_b64encode(graphbytes)
  base64_string = base64_bytes.decode("ascii")
  display(Image(url="https://mermaid.ink/img/" + base64_string))


# Resolve the project home directory (two levels above the directory the
# kernel was started in) only once per kernel. os.chdir() below changes what
# os.getcwd() returns, so recomputing it when this cell is re-run would climb
# two more levels each time.
if "dir_home" not in globals():
    dir_home = Path(os.getcwd()).parent.parent
os.chdir(dir_home)
print("Current directory (check that it's your home directory):", os.getcwd())

Current directory (check that it's your home directory): J:\sugai\UKBiobank


In [11]:
# All paths are relative to the current working directory.

# Directory holding the per-field tab-separated UKBB-ACCEL files.
DIR_SOURCE_TABULAR = os.path.join("data", "accel_ukbb", "split")
# PCA eigenvector table that supplies the PC covariates.
FILE_SOURCE_PC = os.path.join("data", "ukbb", "4020457_671006_all", "genotype",
                             "processed", "22828_imp_gen_pgen", "white_british.pca.eigenvec")
# Directory that receives the covariate files written by this notebook.
DIR_OUT = os.path.join("data", "accel_ukbb", "plink_covar")

# exist_ok=True keeps the cell safe to re-run and removes the gap between
# checking for the directory and creating it.
os.makedirs(DIR_OUT, exist_ok=True)

In [32]:
# Flowchart of this notebook's data flow: Age and Sex come from the UKBB-ACCEL
# dataset, Sex is recoded, the Age/Sex combination terms are derived, and
# these plus the PLINK2 PCs feed the covariate files.
# The doubled backslash keeps "\n" as two literal characters in the text
# passed to mm().
mm("""
graph LR;
    classDef sourcedata fill:#FFFFFF
    classDef final fill:#BBBBBB
    
    UKBB(UKBB-ACCEL dataset):::sourcedata --> Age;
    UKBB --> Sex[Sex]:::sourcedata;

    Sex -.-> Sex2[Female=-1\\nMale=1]
    
    Age -..-> Comb1[Age x Sex];
    Sex2 -.-> Comb1[Age x Sex];

    Age -.-> Comb2[Age^2];
    
    Age -..-> Comb3[Age^2 x Sex];
    Sex2 -.-> Comb3[Age^2 x Sex];

    PLINK2(PLINK2):::sourcedata -----> PC[PCs];
    
    Age -----> Covar[Covariate\\nFiles]:::final;
    Sex2 ----> Covar;
    Comb1 --> Covar;
    Comb2 --> Covar;
    Comb3 --> Covar;
    PC --> Covar;

""")


## Functions

In [6]:
def merge_files(list_files, dir):
    """Outer-join several tab-separated files on their ``eid`` column.

    Args:
        list_files: File names, each resolved relative to ``dir``.
        dir: Directory that contains the files.

    Returns:
        A polars DataFrame. For every file, the second column is renamed to
        the file's base name without extension before joining. When
        ``list_files`` is empty, an empty DataFrame is returned.

    Progress is reported on stdout through ``func_print``.
    """
    paths = [os.path.join(dir, name) for name in list_files]

    print(datetime.now(), "Start processing", len(paths), "files")

    df_merged = pl.DataFrame()
    for position, path in enumerate(paths, start=1):
        func_print(position)

        df_current = pl.read_csv(path, separator="\t")

        # The value column (second column) takes the file's stem as its name.
        stem, _extension = os.path.splitext(os.path.basename(path))
        df_current = df_current.rename({df_current.columns[1]: stem})

        if df_merged.shape != (0, 0):
            # Accumulate: outer join keeps eids present in either side.
            df_merged = df_merged.join(df_current, on="eid", how="outer")
        else:
            # Nothing merged yet: the first table becomes the starting point.
            df_merged = df_current

    return df_merged
    
    

def func_print(i):
    """Print a timestamped progress line for selected counter values.

    A line is printed when ``i`` is below 10, or when every character of
    ``str(i)`` after the first is ``0`` (10, 20, ..., 90, 100, 200, ...).
    For any other value nothing is printed.
    """
    should_report = i < 10
    if not should_report:
        trailing = str(i)[1:]
        should_report = set(trailing) <= {'0'}

    if should_report:
        stamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        print(f"{stamp}: {i}")


# Process

## Load

### Load UKBB-ACCEL

In [41]:
# Per-field source files; the trailing comment names the field each one holds.
list_files = [
    "ukb671006_12163_21003-0.0.txt",  # Age
    "ukb671006_00026_31-0.0.txt",  # Sex
]

# Merge on eid, then switch to the column names used in the covariate files.
df_tabular = (
    merge_files(list_files, DIR_SOURCE_TABULAR)
    .rename({
        "eid": "#FID",
        "ukb671006_12163_21003-0.0": "Age",
        "ukb671006_00026_31-0.0": "Sex",
    })
)
df_tabular

2023-04-26 16:00:40.141888 Start processing 2 files
2023-04-26 16:00:40: 1
2023-04-26 16:00:40: 2


#FID,Age,Sex
i64,i64,i64
1000010,63,0
1000028,47,0
1000034,58,0
1000045,67,0
1000052,64,0
1000069,42,1
1000076,69,0
1000087,63,0
1000091,67,1
1000104,65,1


### Load PLINK2-PCs

In [45]:
# Optional practice run on a small file (kept disabled): uncomment the lines
# below to read "head.txt" instead of the full eigenvector file.
# FILE_SOURCE_PC_PRACTICE = os.path.join("data", "ukbb", "4020457_671006_all", "genotype",
#                              "processed", "22828_imp_gen_pgen", "head.txt")
# df_pc = pl.read_csv(FILE_SOURCE_PC_PRACTICE, separator="\t")

# Read the tab-separated PCA eigenvector table from FILE_SOURCE_PC.
df_pc = pl.read_csv(FILE_SOURCE_PC, separator="\t")
df_pc

#FID,IID,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,PC11,PC12,PC13,PC14,PC15,PC16,PC17,PC18,PC19,PC20,PC21,PC22,PC23,PC24,PC25,PC26,PC27,PC28,PC29,PC30
i64,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
1000010,1000010,-0.000892,0.002689,0.004079,-0.002191,-0.000797,0.001218,-0.001312,0.00066,-0.000703,0.000095,0.000706,0.00054,-0.00119,0.000471,-0.00098,-0.001153,-0.001408,0.000021,0.003236,-0.000909,0.00095,0.000093,0.003274,-0.000323,0.002166,0.000784,2.5917e-7,0.00018,0.001995,0.001329
1000028,1000028,-0.000471,-0.002047,-0.000748,0.000461,-0.000755,0.001297,-0.000282,0.000438,-0.000922,-0.000048,0.000907,-0.000802,-0.000513,-0.002508,-0.000634,0.001605,-0.001255,0.000877,-0.000742,0.0024112,-0.001834,-0.002736,-0.000683,0.000039,0.001607,-0.001381,0.001527,0.001939,-0.001501,-0.000491
1000034,1000034,0.0017603,0.000476,-0.001139,0.000192,-0.002018,0.00054,-0.001851,0.000626,-0.000226,-0.000451,0.000788,0.000938,-0.000036,0.001619,0.000266,-0.003187,-0.000586,-0.000459,0.000406,-0.001457,-0.001566,0.0007,-0.000044,-0.002162,0.000384,0.000239,-0.000186,0.0012559,0.000909,0.001271
1000045,1000045,0.000617,-0.002361,0.003893,0.002225,-0.001103,-0.000891,0.000084,-0.000536,0.000053,0.000022,-0.000753,0.001031,-0.003581,0.000342,0.001265,0.000315,0.000702,0.00045,-0.00089,-0.001404,0.000011,0.001237,0.00028,-0.000753,0.001129,0.001527,0.000172,0.000721,0.000452,0.000796
1000052,1000052,0.004033,-0.00088,-0.002209,-0.001374,-0.001971,-0.001118,-0.000997,0.00139,-0.001738,0.000704,0.000232,-0.000465,0.0012899,-0.00042,-0.000452,-0.00127,-0.000698,0.002015,-0.000757,0.000644,-0.001778,0.001365,0.001648,0.000344,-0.001475,0.00038,-0.000036,-0.000861,0.000304,-0.000256
1000076,1000076,-0.000789,0.000114,-0.000083,0.000623,-0.003142,0.000668,-0.00216,0.000372,-0.00023,0.000418,0.001531,-0.003507,-0.001884,0.001466,-0.000353,-0.000732,0.000536,-0.00172,-0.000313,-0.00041,-0.001283,-0.000732,-0.000024,0.001427,0.000081,-0.000619,-0.000538,-0.002038,-0.000164,0.001419
1000087,1000087,-0.001075,-0.001881,0.002837,-0.000307,-0.000096,0.001592,0.0015619,-0.002839,0.001233,-0.001343,-0.000163,0.001426,0.000223,-0.002026,0.000878,0.003561,-0.00061,0.001066,0.000048,-0.002325,-0.000491,-0.002953,-0.000025,-0.000293,-0.000798,-0.001131,0.000434,0.000332,-0.003127,0.001305
1000091,1000091,-0.001084,-0.002091,-0.000996,-0.000215,-0.002503,-0.001989,-0.002098,-0.000628,0.001017,0.000015,-0.00037,0.001749,0.000113,0.000438,-0.002894,-0.000704,-0.001561,0.001479,0.002289,-0.001179,0.0018,0.001725,0.000867,-0.002852,0.003718,0.0014467,-0.003404,-0.000912,0.001834,-0.002691
1000104,1000104,-0.001129,-0.002598,-0.000212,0.001835,-0.000825,0.002155,0.001301,-0.003622,0.001843,-0.00198,0.000375,0.0013351,0.002333,0.000826,0.00067,-0.001056,-0.000616,0.000934,0.00157,0.000495,0.000511,-0.001447,-0.001087,-0.004348,0.001088,-0.000894,-0.001209,0.001293,0.0012656,-0.002716
1000118,1000118,0.000631,0.000298,0.001954,0.006409,0.000856,0.000346,0.000626,0.001469,-0.001753,0.002845,-0.001838,0.001675,-0.000219,0.001595,-0.000769,-0.002683,0.001783,-0.002957,0.001839,-0.002299,-0.000798,-0.000908,0.000789,0.000411,-0.001799,0.000928,0.001702,-0.001282,-0.002072,0.000247


## Preprocess the UKBB-ACCEL data
- Sex: "Female=0, Male=1" -> "Female=-1, Male=1"
- Calculate combinations (Age^2, AgexSex, Age^2xSex)

In [42]:
# Recode Sex so that 0 becomes -1 and every other value is left unchanged.
# A conditional recode (rather than `Sex * 2 - 1`) keeps this cell idempotent:
# re-running it on already recoded data leaves -1/1 untouched instead of
# turning -1 into -3.
df_tabular = df_tabular.with_columns([
    pl.when(pl.col('Sex') == 0)
    .then(pl.lit(-1))
    .otherwise(pl.col('Sex'))
    .alias('Sex'),
])

# Derived terms. They are computed in a second step so that they use the
# recoded Sex column; re-running simply overwrites them with the same values.
df_tabular = df_tabular.with_columns([
    (pl.col('Age') ** 2).alias('Age^2'),
    (pl.col('Age') * pl.col('Sex')).alias('Age_Sex'),
    (pl.col('Age') ** 2 * pl.col('Sex')).alias('Age^2_Sex'),
])

df_tabular

#FID,Age,Sex,Age^2,Age_Sex,Age^2_Sex
i64,i64,i64,f64,i64,f64
1000010,63,-1,3969.0,-63,-3969.0
1000028,47,-1,2209.0,-47,-2209.0
1000034,58,-1,3364.0,-58,-3364.0
1000045,67,-1,4489.0,-67,-4489.0
1000052,64,-1,4096.0,-64,-4096.0
1000069,42,1,1764.0,42,1764.0
1000076,69,-1,4761.0,-69,-4761.0
1000087,63,-1,3969.0,-63,-3969.0
1000091,67,1,4489.0,67,4489.0
1000104,65,1,4225.0,65,4225.0


## Join

In [46]:
# Inner join on "#FID": only rows whose "#FID" occurs in both the PC table and
# the Age/Sex table are kept. The PC table's columns come first in the result.
df_joined = df_pc.join(df_tabular, on="#FID", how="inner")
df_joined

#FID,IID,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,PC11,PC12,PC13,PC14,PC15,PC16,PC17,PC18,PC19,PC20,PC21,PC22,PC23,PC24,PC25,PC26,PC27,PC28,PC29,PC30,Age,Sex,Age^2,Age_Sex,Age^2_Sex
i64,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i64,i64,f64,i64,f64
1000010,1000010,-0.000892,0.002689,0.004079,-0.002191,-0.000797,0.001218,-0.001312,0.00066,-0.000703,0.000095,0.000706,0.00054,-0.00119,0.000471,-0.00098,-0.001153,-0.001408,0.000021,0.003236,-0.000909,0.00095,0.000093,0.003274,-0.000323,0.002166,0.000784,2.5917e-7,0.00018,0.001995,0.001329,63,-1,3969.0,-63,-3969.0
1000028,1000028,-0.000471,-0.002047,-0.000748,0.000461,-0.000755,0.001297,-0.000282,0.000438,-0.000922,-0.000048,0.000907,-0.000802,-0.000513,-0.002508,-0.000634,0.001605,-0.001255,0.000877,-0.000742,0.0024112,-0.001834,-0.002736,-0.000683,0.000039,0.001607,-0.001381,0.001527,0.001939,-0.001501,-0.000491,47,-1,2209.0,-47,-2209.0
1000034,1000034,0.0017603,0.000476,-0.001139,0.000192,-0.002018,0.00054,-0.001851,0.000626,-0.000226,-0.000451,0.000788,0.000938,-0.000036,0.001619,0.000266,-0.003187,-0.000586,-0.000459,0.000406,-0.001457,-0.001566,0.0007,-0.000044,-0.002162,0.000384,0.000239,-0.000186,0.0012559,0.000909,0.001271,58,-1,3364.0,-58,-3364.0
1000045,1000045,0.000617,-0.002361,0.003893,0.002225,-0.001103,-0.000891,0.000084,-0.000536,0.000053,0.000022,-0.000753,0.001031,-0.003581,0.000342,0.001265,0.000315,0.000702,0.00045,-0.00089,-0.001404,0.000011,0.001237,0.00028,-0.000753,0.001129,0.001527,0.000172,0.000721,0.000452,0.000796,67,-1,4489.0,-67,-4489.0
1000052,1000052,0.004033,-0.00088,-0.002209,-0.001374,-0.001971,-0.001118,-0.000997,0.00139,-0.001738,0.000704,0.000232,-0.000465,0.0012899,-0.00042,-0.000452,-0.00127,-0.000698,0.002015,-0.000757,0.000644,-0.001778,0.001365,0.001648,0.000344,-0.001475,0.00038,-0.000036,-0.000861,0.000304,-0.000256,64,-1,4096.0,-64,-4096.0
1000076,1000076,-0.000789,0.000114,-0.000083,0.000623,-0.003142,0.000668,-0.00216,0.000372,-0.00023,0.000418,0.001531,-0.003507,-0.001884,0.001466,-0.000353,-0.000732,0.000536,-0.00172,-0.000313,-0.00041,-0.001283,-0.000732,-0.000024,0.001427,0.000081,-0.000619,-0.000538,-0.002038,-0.000164,0.001419,69,-1,4761.0,-69,-4761.0
1000087,1000087,-0.001075,-0.001881,0.002837,-0.000307,-0.000096,0.001592,0.0015619,-0.002839,0.001233,-0.001343,-0.000163,0.001426,0.000223,-0.002026,0.000878,0.003561,-0.00061,0.001066,0.000048,-0.002325,-0.000491,-0.002953,-0.000025,-0.000293,-0.000798,-0.001131,0.000434,0.000332,-0.003127,0.001305,63,-1,3969.0,-63,-3969.0
1000091,1000091,-0.001084,-0.002091,-0.000996,-0.000215,-0.002503,-0.001989,-0.002098,-0.000628,0.001017,0.000015,-0.00037,0.001749,0.000113,0.000438,-0.002894,-0.000704,-0.001561,0.001479,0.002289,-0.001179,0.0018,0.001725,0.000867,-0.002852,0.003718,0.0014467,-0.003404,-0.000912,0.001834,-0.002691,67,1,4489.0,67,4489.0
1000104,1000104,-0.001129,-0.002598,-0.000212,0.001835,-0.000825,0.002155,0.001301,-0.003622,0.001843,-0.00198,0.000375,0.0013351,0.002333,0.000826,0.00067,-0.001056,-0.000616,0.000934,0.00157,0.000495,0.000511,-0.001447,-0.001087,-0.004348,0.001088,-0.000894,-0.001209,0.001293,0.0012656,-0.002716,65,1,4225.0,65,4225.0
1000118,1000118,0.000631,0.000298,0.001954,0.006409,0.000856,0.000346,0.000626,0.001469,-0.001753,0.002845,-0.001838,0.001675,-0.000219,0.001595,-0.000769,-0.002683,0.001783,-0.002957,0.001839,-0.002299,-0.000798,-0.000908,0.000789,0.000411,-0.001799,0.000928,0.001702,-0.001282,-0.002072,0.000247,62,1,3844.0,62,3844.0


In [47]:
# Full set
# Write every column of df_joined as a tab-separated file in DIR_OUT.
# NOTE(review): any null cells would be written using write_csv's default
# null representation — confirm PLINK2 reads that as a missing value.
file_out = os.path.join(DIR_OUT, "covar_age_sex_combination_pc30.txt")
df_joined.write_csv(file_out, separator="\t")

In [48]:
# Same as PanUKBB
# Reduced set: remove PC11 through PC30 so only PC1-PC10 remain next to the
# Age/Sex covariates, then write it as a tab-separated file.
df_joined_10 = df_joined.drop([f"PC{number}" for number in range(11, 31)])

file_out = os.path.join(DIR_OUT, "covar_age_sex_combination_pc10.txt")
df_joined_10.write_csv(file_out, separator="\t")
df_joined_10

#FID,IID,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,Age,Sex,Age^2,Age_Sex,Age^2_Sex
i64,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i64,i64,f64,i64,f64
1000010,1000010,-0.000892,0.002689,0.004079,-0.002191,-0.000797,0.001218,-0.001312,0.00066,-0.000703,0.000095,63,-1,3969.0,-63,-3969.0
1000028,1000028,-0.000471,-0.002047,-0.000748,0.000461,-0.000755,0.001297,-0.000282,0.000438,-0.000922,-0.000048,47,-1,2209.0,-47,-2209.0
1000034,1000034,0.0017603,0.000476,-0.001139,0.000192,-0.002018,0.00054,-0.001851,0.000626,-0.000226,-0.000451,58,-1,3364.0,-58,-3364.0
1000045,1000045,0.000617,-0.002361,0.003893,0.002225,-0.001103,-0.000891,0.000084,-0.000536,0.000053,0.000022,67,-1,4489.0,-67,-4489.0
1000052,1000052,0.004033,-0.00088,-0.002209,-0.001374,-0.001971,-0.001118,-0.000997,0.00139,-0.001738,0.000704,64,-1,4096.0,-64,-4096.0
1000076,1000076,-0.000789,0.000114,-0.000083,0.000623,-0.003142,0.000668,-0.00216,0.000372,-0.00023,0.000418,69,-1,4761.0,-69,-4761.0
1000087,1000087,-0.001075,-0.001881,0.002837,-0.000307,-0.000096,0.001592,0.0015619,-0.002839,0.001233,-0.001343,63,-1,3969.0,-63,-3969.0
1000091,1000091,-0.001084,-0.002091,-0.000996,-0.000215,-0.002503,-0.001989,-0.002098,-0.000628,0.001017,0.000015,67,1,4489.0,67,4489.0
1000104,1000104,-0.001129,-0.002598,-0.000212,0.001835,-0.000825,0.002155,0.001301,-0.003622,0.001843,-0.00198,65,1,4225.0,65,4225.0
1000118,1000118,0.000631,0.000298,0.001954,0.006409,0.000856,0.000346,0.000626,0.001469,-0.001753,0.002845,62,1,3844.0,62,3844.0
