In [1]:
library("data.table")

# Drop close relatives

In [2]:
# Read the close-relatives drop list, keep only its ID column (renamed to
# `eid`) and tag every listed ID with drop = TRUE.
close_rel <- fread("/mnt/project/genotypes/ids_to_drop_close_relatives.txt")
close_rel <- close_rel[, list(eid = ID, drop = TRUE)]

In [3]:
# Load the phenotype / covariate table.
pheno_cov <- fread(
    file = "/mnt/project/pheno_cov/smoke_lung_cancer_copd.csv"
)

In [4]:
# Left join on eid: every row of pheno_cov is kept; `drop` is TRUE for eids
# present in close_rel and NA for all others.
df <- merge(
    x = pheno_cov,
    y = close_rel,
    by = "eid",
    all.x = TRUE
)

In [5]:
# Rows with a missing `drop` flag are set to FALSE, in place.
set(df, i = which(is.na(df[["drop"]])), j = "drop", value = FALSE)

In [6]:
# Number of rows flagged for removal.
sum(df[["drop"]] %in% TRUE)

In [7]:
# Keep only the rows whose `drop` flag is FALSE.
df <- df[drop %in% FALSE]

# Format fields

In [8]:
# Principal-component columns: every column whose name starts with "22009-0.".
pc_names <- grep("^22009-0\\.", colnames(df), value = TRUE)

# Label each PC from the array index embedded in its own column name
# ("22009-0.<k>" -> "pc<k>") instead of from its position, so that a file whose
# columns are not in ascending order cannot be silently mislabelled. This also
# avoids 1:ncol(), which yields c(1, 0) when no column matches.
pc_index <- as.integer(sub("^22009-0\\.", "", pc_names))
stopifnot(!anyNA(pc_index))

# Arrange the columns in ascending PC order before renaming.
pc_names <- pc_names[order(pc_index)]
df_pc <- df[, ..pc_names]
setnames(df_pc, sprintf("pc%d", sort(pc_index)))
head(df_pc)

pc1,pc2,pc3,pc4,pc5,pc6,pc7,pc8,pc9,pc10,pc11,pc12,pc13,pc14,pc15,pc16,pc17,pc18,pc19,pc20
<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
-10.7447,4.69757,-2.42724,2.85883,-1.74129,0.119959,2.76502,-1.7934,-11.8407,2.52803,1.91106,2.04605,-0.130913,-1.36422,3.48293,2.45149,0.780783,-1.85919,1.23869,4.45295
-14.6587,3.07706,-0.386946,1.63055,-0.579281,1.33932,2.71481,0.103133,-0.496704,1.20349,3.19874,0.614306,-0.198499,-3.43991,1.5524,0.889537,-2.21544,-2.63729,0.31653,1.37628
-11.1732,5.37414,-1.76331,0.999203,-4.46673,-0.915626,1.42504,-0.900987,-1.20553,3.36004,2.69228,-1.30167,-2.00032,-1.72457,-1.26217,-3.55559,0.243108,-1.12578,-6.97157,3.34912
-13.5205,6.94157,-3.6157,4.09157,3.63115,-2.79123,1.06405,-3.69868,0.612475,-2.3315,-2.43125,2.34491,1.27926,0.0081137,-0.856328,-0.155175,-2.39929,-2.5625,-0.927864,-0.163535
-13.8896,4.77904,-4.00437,4.64834,4.3298,2.23795,0.803446,3.65078,-2.87814,0.260571,1.46552,1.95843,-0.192651,6.08176,1.40899,1.38562,1.09385,-2.42232,-4.44264,-0.185869
-13.2771,7.10413,-3.61512,3.30126,3.54118,-0.453903,0.726986,0.601553,5.9943,0.464687,-2.10885,3.29011,-1.08941,4.82561,-0.872009,-0.0443844,3.21077,-0.812609,0.078049,2.89079


In [9]:
# Columns whose name starts with "40006".
cancer_icd10_names <- grep("^40006", colnames(df), value = TRUE)
df_cancer_icd10 <- df[, ..cancer_icd10_names]

# One logical column per input column: TRUE where the value contains "C34".
has_lung_cancer_mat <- as.data.table(
    lapply(df_cancer_icd10, grepl, pattern = "C34")
)

# A row is TRUE when any of its columns is TRUE (element-wise OR over columns).
has_lung_cancer <- Reduce(`|`, has_lung_cancer_mat)

In [10]:
# COPD status derived from column 131492-0.0 (an event date).
# Coerce to character so the comparisons below behave the same whether fread
# delivered the column as text or parsed it as a date.
copd_raw <- as.character(df[["131492-0.0"]])

# Special date values that do not represent a usable event date.
copd_special_dates <- c(
    "1900-01-01", # Code has no event date
    "1901-01-01", # Code has event date before participant's date of birth
    "1902-02-02", # Code has event date matching participant's date of birth
    "1903-03-03", # Code has event date after participant's date of birth and falls in the same calendar year as date of birth
    "1909-09-09", # Code has event date in the future and is presumed to be a place-holder or other system default
    "2037-07-07"  # Code has event date in the future and is presumed to be a place-holder or other system default
)

# Start from TRUE (an event date is present), then overwrite the exceptions:
#   - special placeholder dates -> NA
#   - empty or missing value    -> FALSE
# NA is handled like "" because a plain `copd_raw == ""` test returns NA for
# such rows and would turn them into missing values instead of FALSE.
# NOTE(review): assumes an NA here comes from an empty field - confirm.
copd <- rep(TRUE, length(copd_raw))
copd[copd_raw %in% copd_special_dates] <- NA
copd[is.na(copd_raw) | copd_raw == ""] <- FALSE

In [11]:
# Assemble the non-PC analysis columns under readable names.
# `has_lung_cancer` and `copd` are vectors computed earlier, one entry per row
# of df.
df_others <- df[
    , .(
        eid,
        bmi = `21001-0.0`,
        age = `21022-0.0`,
        lung_cancer = has_lung_cancer,
        # Only "Previous" / "Current" count as smokers and only "Never" counts
        # as a non-smoker. Any other value (empty, NA, or a non-answer such as
        # "Prefer not to answer") becomes NA; a plain `!= "Never"` test would
        # label those rows as smokers.
        # NOTE(review): assumes the column holds these text labels - confirm.
        previous_or_current_smoker = ifelse(
            `20116-0.0` %in% c("Previous", "Current"),
            TRUE,
            ifelse(`20116-0.0` %in% "Never", FALSE, NA)
        ),
        smoking_pack_years = `20161-0.0`,
        copd = copd
    )
]

In [12]:
# Place the PC columns to the right of the phenotype / covariate columns.
ret <- data.table(df_others, df_pc)

In [13]:
# Write the combined table as CSV into the working directory.
fwrite(
    x = ret,
    file = "smoke_lung_cancer_copd_analysis_ready.csv"
)

In [14]:
# Upload the CSV with the `dx` command-line tool and fail loudly if the
# command reports a non-zero exit status, rather than ignoring it.
upload_status <- system("dx upload smoke_lung_cancer_copd_analysis_ready.csv --path pheno_cov/")
if (upload_status != 0) {
    stop(
        "dx upload of smoke_lung_cancer_copd_analysis_ready.csv failed with exit status ",
        upload_status,
        call. = FALSE
    )
}

# For plink2 sample filtering

In [15]:
# Two-column sample list: the eid is used for both the #FID and IID columns.
sample_ids <- ret[["eid"]]
ret2 <- data.table(`#FID` = sample_ids, IID = sample_ids)

In [16]:
# Write the sample list as a tab-separated file.
fwrite(
    x = ret2,
    file = "smoke_lung_cancer_copd_sample_list.txt",
    sep = "\t"
)

In [17]:
# Upload the sample list with the `dx` command-line tool and fail loudly if
# the command reports a non-zero exit status, rather than ignoring it.
upload_status <- system("dx upload smoke_lung_cancer_copd_sample_list.txt --path pheno_cov/")
if (upload_status != 0) {
    stop(
        "dx upload of smoke_lung_cancer_copd_sample_list.txt failed with exit status ",
        upload_status,
        call. = FALSE
    )
}