## 1. Setup

In [None]:
# Start from an empty workspace. Note that this removes objects only; attached
# packages and options are left untouched.
rm(list = ls())

# Install each dependency only when it is not already available, so that
# re-running the notebook does not re-download packages every time.
if (!requireNamespace("Rcpp", quietly = TRUE)) {
  install.packages("Rcpp")
}
library(Rcpp)

if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr")
}
library(dplyr)

# Short codes of the endpoints.
# NOTE(review): how these codes are used is not visible in this part of the file.
endpoint_names <- c("AD", "CVD", "DM", "LD", "RD", "AF", "HF", "CAD", "VT", "ISS", "AAA", "PAD", "AS", "COPD", "LC", "MEL", "CRC", "PC", "BC", "PD", "OP", "CAT", "POAG", "HT")

## 2. Read in required files

In [None]:
# Download the imputed dataset (overwriting any local copy) and load it as `df`.
dl_cmd <- "dx download 'UKBRISK_Imputed/final/imputed_data_02092024.tsv' --overwrite"
system(dl_cmd, intern = TRUE)
df <- read.delim(file = "imputed_data_02092024.tsv", sep = "\t")


In [None]:
# Fetch both mapping tables from the project storage.
system("dx download 'UKBRISK_Processed/mapping_ts.txt'")
system("dx download 'UKBRISK_Processed/Clinicalrisk_mapping_v2.tsv'")

# Time-series mapping: drop the first data row and prefix the column names so
# that they line up with the "ts_" columns of df.
mapping_ts <- read.delim("mapping_ts.txt", sep = "\t")[-1, ]
mapping_ts$Column.name <- paste0("ts_", mapping_ts$Column.name)
table(as.factor(mapping_ts$Data.type))
# How many "ts_" columns of df are covered by the mapping?
table(grep("^ts_", names(df), value = TRUE) %in% mapping_ts$Column.name) #top

# Clinical-risk mapping: same treatment with the "clinicalrisk_" prefix.
mapping_clinicalrisk <- read.delim("Clinicalrisk_mapping_v2.tsv", sep = "\t")[-1, ]
mapping_clinicalrisk$Column.name <- paste0("clinicalrisk_", mapping_clinicalrisk$Column.name)
table(as.factor(mapping_clinicalrisk$Data.type))
# The systolic blood pressure entry is renamed to the SBP_mean column name.
mapping_clinicalrisk$Column.name[
  mapping_clinicalrisk$Column.name == "clinicalrisk_Systolic.blood.pressure"
] <- "clinicalrisk_SBP_mean"
# How many "clinicalrisk_" columns of df are covered by the mapping?
table(grep("^clinicalrisk_", names(df), value = TRUE) %in% mapping_clinicalrisk$Column.name) #top

## 3. Calc Risk Scores for NHC

### 3.1 GPPAQ

In [None]:
# Weekly activity hours: (duration * days per week) summed over moderate and
# vigorous activity, divided by 60.
# NOTE(review): assumes the duration fields are minutes per day - confirm.
df$nhc_Activity.hours <- ((df$nhc_Duration.of.moderate.activity...Instance.0 *
                             df$nhc_Number.of.days.week.of.moderate.physical.activity.10..minutes...Instance.0) +
                            (df$nhc_Duration.of.vigorous.activity...Instance.0 *
                               df$nhc_Number.of.days.week.of.vigorous.physical.activity.10..minutes...Instance.0)) / 60

plot(density(df$nhc_Activity.hours))

# Occupation groups, built from the one-hot job columns (suffix _1 to _4).
gppaq_heavy_12 <- df$nhc_Job.involves.heavy.manual.or.physical.work...Instance.0_1 == TRUE |
  df$nhc_Job.involves.heavy.manual.or.physical.work...Instance.0_2 == TRUE
gppaq_heavy_34 <- df$nhc_Job.involves.heavy.manual.or.physical.work...Instance.0_3 == TRUE |
  df$nhc_Job.involves.heavy.manual.or.physical.work...Instance.0_4 == TRUE
gppaq_walk_12 <- df$nhc_Job.involves.mainly.walking.or.standing...Instance.0_1 == TRUE |
  df$nhc_Job.involves.mainly.walking.or.standing...Instance.0_2 == TRUE
gppaq_walk_34 <- df$nhc_Job.involves.mainly.walking.or.standing...Instance.0_3 == TRUE |
  df$nhc_Job.involves.mainly.walking.or.standing...Instance.0_4 == TRUE

gppaq_job_sedentary <- gppaq_heavy_12 & gppaq_walk_12
gppaq_job_standing <- !gppaq_heavy_34 & gppaq_walk_34
gppaq_job_physical <- df$nhc_Job.involves.heavy.manual.or.physical.work...Instance.0_3 == TRUE
gppaq_job_heavy <- df$nhc_Job.involves.heavy.manual.or.physical.work...Instance.0_4 == TRUE

# Activity bands. The bands are contiguous: exactly 1 hour belongs to the
# 1 to <3 band and exactly 3 hours to the top band, so no value >= 0 is left
# without a band.
gppaq_hrs_none <- df$nhc_Activity.hours == 0
gppaq_hrs_low <- df$nhc_Activity.hours > 0 & df$nhc_Activity.hours < 1
gppaq_hrs_mid <- df$nhc_Activity.hours >= 1 & df$nhc_Activity.hours < 3
gppaq_hrs_high <- df$nhc_Activity.hours >= 3

# TRUE where the condition holds, NA otherwise (the flag columns use NA, not
# FALSE, for "not in this category"). `%in% TRUE` treats an NA condition as
# "no hit" instead of letting it wipe out the flag.
gppaq_flag <- function(hit) ifelse(hit %in% TRUE, TRUE, NA)

# Occupation x activity lookup:
#              0 h           >0 to <1 h     1 to <3 h      >=3 h
#  sedentary   inactive      mod. inactive  mod. active    active
#  standing    mod. inactive mod. active    active         active
#  physical    mod. active   active         active         active
#  heavy       active        active         active         active
df$nhc_Inactive <- gppaq_flag(gppaq_job_sedentary & gppaq_hrs_none)

table(df$nhc_Inactive)

df$nhc_Moderately.inactive <- gppaq_flag(
  (gppaq_job_sedentary & gppaq_hrs_low) |
    (gppaq_job_standing & gppaq_hrs_none)
)
table(df$nhc_Moderately.inactive)

df$nhc_Moderately.active <- gppaq_flag(
  (gppaq_job_sedentary & gppaq_hrs_mid) |
    (gppaq_job_standing & gppaq_hrs_low) |
    (gppaq_job_physical & gppaq_hrs_none)
)
table(df$nhc_Moderately.active)

df$nhc_Active <- gppaq_flag(
  (gppaq_job_sedentary & gppaq_hrs_high) |
    (gppaq_job_standing & (gppaq_hrs_mid | gppaq_hrs_high)) |
    (gppaq_job_physical & (gppaq_hrs_low | gppaq_hrs_mid | gppaq_hrs_high)) |
    gppaq_job_heavy
)
table(df$nhc_Active)

# Rows flagged in more than one category arise from the imputed one-hot job
# columns; such individuals are treated as moderately active.
gppaq_cols <- c("nhc_Inactive", "nhc_Moderately.inactive", "nhc_Moderately.active", "nhc_Active")
more_than_one_true <- rowSums(df[, gppaq_cols], na.rm = TRUE) > 1
df[more_than_one_true, gppaq_cols] <- NA
df$nhc_Moderately.active[more_than_one_true] <- TRUE

#merge for GPPAQ
# Name of the (single) flagged category per row, NA when no category applies.
df$nhc_GPPAQ <- apply(df[, gppaq_cols], 1, function(flags) {
  if (any(flags, na.rm = TRUE)) {
    names(flags)[which.max(flags)]
  } else {
    NA_character_
  }
})


In [None]:
# One-hot encode the GPPAQ category; rows without a category are FALSE in all
# four columns. The helper column nhc_GPPAQ is dropped afterwards.
levels(as.factor(df$nhc_GPPAQ))
df$nhc_GPPAQ_active <- df$nhc_GPPAQ %in% "nhc_Active"
df$nhc_GPPAQ_inactive <- df$nhc_GPPAQ %in% "nhc_Inactive"
df$nhc_GPPAQ_moderatelyactive <- df$nhc_GPPAQ %in% "nhc_Moderately.active"
df$nhc_GPPAQ_moderatelyinactive <- df$nhc_GPPAQ %in% "nhc_Moderately.inactive"
df <- df[, names(df) != "nhc_GPPAQ"]

### 3.2 AUDIT-C

In [None]:
# ---- AUDIT-C question 1: drinking frequency ----
table(df$nhc_Alcohol.intake.frequency....Instance.0)
# Frequency code -> Q1 points; codes outside 1-6 (and NA) stay NA.
audit_q1_points <- c("6" = "0", "5" = "1", "4" = "2", "3" = "3", "2" = "4", "1" = "4")
df$nhc_AUDIT.C.Q1 <- unname(
  audit_q1_points[as.character(df$nhc_Alcohol.intake.frequency....Instance.0)]
)
table(df$nhc_AUDIT.C.Q1)

# ---- AUDIT-C question 2: units per day ----
table(df$nhc_Average.weekly.champagne.plus.white.wine.intake...Instance.0)

# Weekly intake per beverage weighted by 1.5 (wines), 2 (beer/cider) or
# 1 (spirits, fortified wine), then averaged over 7 days.
df$nhc_Units.per.day <- (df$nhc_Average.weekly.red.wine.intake...Instance.0 * 1.5 +
                           df$nhc_Average.weekly.champagne.plus.white.wine.intake...Instance.0 * 1.5 +
                           df$nhc_Average.weekly.beer.plus.cider.intake...Instance.0 * 2 +
                           df$nhc_Average.weekly.spirits.intake...Instance.0 +
                           df$nhc_Average.weekly.fortified.wine.intake...Instance.0) / 7
plot(density(df$nhc_Units.per.day))

# Bands [0, 2], (2, 4], (4, 6], (6, 9], (9, Inf] -> 0..4 points; negative or
# missing values stay NA.
audit_q2_band <- cut(
  df$nhc_Units.per.day,
  breaks = c(0, 2, 4, 6, 9, Inf),
  labels = c("0", "1", "2", "3", "4"),
  right = TRUE,
  include.lowest = TRUE
)

# Both question scores are stored as numbers.
df$nhc_AUDIT.C.Q1 <- as.numeric(df$nhc_AUDIT.C.Q1)
df$nhc_AUDIT.C.Q2 <- as.numeric(as.character(audit_q2_band))

# ---- Total score and screening result ----
# The sum of Q1 and Q2 is multiplied by 1.5; a score of 5 or more is "Positive".
# NOTE(review): the 1.5 factor presumably compensates for a missing third
# AUDIT-C question - confirm.
df$nhc_AUDIT.C.score <- (df$nhc_AUDIT.C.Q1 + df$nhc_AUDIT.C.Q2) * 1.5
df$nhc_AUDIT.C.result <- ifelse(df$nhc_AUDIT.C.score >= 5, "Positive", "Negative")
table(df$nhc_AUDIT.C.result)


In [None]:
# Visual check of the AUDIT-C score distribution.
# NOTE(review): density() stops on missing values, so this assumes the score
# contains no NA - confirm.
plot(density(df$nhc_AUDIT.C.score))

### 3.3 QRISK3

In [None]:
# QRISK ethnicity category (1-9) from the one-hot ethnic-background columns
table(df$nhc_Ethnic.background...Instance.0_1)
table(is.na(df$nhc_Ethnic.background...Instance.0_1))

# One mask per QRISK category; each is TRUE when any contributing column is TRUE.
# Category 1 ("White or not stated") is built from codes 1, 1001-1003, 2 and
# 2001-2004.
qrisk_eth_white <- df$nhc_Ethnic.background...Instance.0_1 == TRUE |
  df$nhc_Ethnic.background...Instance.0_1001 == TRUE |
  df$nhc_Ethnic.background...Instance.0_1002 == TRUE |
  df$nhc_Ethnic.background...Instance.0_1003 == TRUE |
  df$nhc_Ethnic.background...Instance.0_2 == TRUE |
  df$nhc_Ethnic.background...Instance.0_2001 == TRUE |
  df$nhc_Ethnic.background...Instance.0_2002 == TRUE |
  df$nhc_Ethnic.background...Instance.0_2003 == TRUE |
  df$nhc_Ethnic.background...Instance.0_2004 == TRUE
qrisk_eth_indian <- df$nhc_Ethnic.background...Instance.0_3001 == TRUE
qrisk_eth_pakistani <- df$nhc_Ethnic.background...Instance.0_3002 == TRUE
qrisk_eth_bangladeshi <- df$nhc_Ethnic.background...Instance.0_3003 == TRUE
qrisk_eth_other_asian <- df$nhc_Ethnic.background...Instance.0_3004 == TRUE |
  df$nhc_Ethnic.background...Instance.0_3 == TRUE
qrisk_eth_black_caribbean <- df$nhc_Ethnic.background...Instance.0_4001 == TRUE
qrisk_eth_black_african <- df$nhc_Ethnic.background...Instance.0_4002 == TRUE
qrisk_eth_chinese <- df$nhc_Ethnic.background...Instance.0_5 == TRUE
qrisk_eth_other <- df$nhc_Ethnic.background...Instance.0_6 == TRUE |
  df$nhc_Ethnic.background...Instance.0_4 == TRUE |
  df$nhc_Ethnic.background...Instance.0_4003 == TRUE

# The first matching mask wins; rows matching none of them become NA.
df$nhc_qrisk.ethnic.background <- case_when(
  qrisk_eth_white ~ 1,
  qrisk_eth_indian ~ 2,
  qrisk_eth_pakistani ~ 3,
  qrisk_eth_bangladeshi ~ 4,
  qrisk_eth_other_asian ~ 5,
  qrisk_eth_black_caribbean ~ 6,
  qrisk_eth_black_african ~ 7,
  qrisk_eth_chinese ~ 8,
  qrisk_eth_other ~ 9,
  TRUE ~ NA_real_
)

table(df$nhc_qrisk.ethnic.background)
table(is.na(df$nhc_qrisk.ethnic.background))


In [None]:
# Smoking status conditional array
# Category codes: 1 = Smoking.status _0 column TRUE, 2 = _1 column TRUE,
# otherwise split by cigarettes per day: 3 = fewer than 10, 4 = 10 to <20,
# 5 = 20 or more. Conditions are evaluated in order and the first match wins,
# so every non-missing cigarette count falls into exactly one band (values
# between 19 and 20 can occur after imputation and must not become NA).
qrisk_cigs_per_day <- df$nhc_Number.of.cigarettes.currently.smoked.daily..current.cigarette.smokers....Instance.0

df$nhc_qrisk.smoking.status <- case_when(
  df$nhc_Smoking.status...Instance.0_0 == TRUE ~ 1,
  df$nhc_Smoking.status...Instance.0_1 == TRUE ~ 2,
  qrisk_cigs_per_day < 10 ~ 3,
  qrisk_cigs_per_day < 20 ~ 4,
  qrisk_cigs_per_day >= 20 ~ 5,
  TRUE ~ NA_real_
)

table(df$nhc_qrisk.smoking.status)
table(is.na(df$nhc_qrisk.smoking.status))


In [None]:
# Function to calculate QRISK3 for females
#
# Builds a linear predictor `a` from category offsets, centred continuous
# terms, binary indicators and age interactions, then returns
# 100 * (1 - survivor[surv] ^ exp(a)).
# Every step is element-wise, so the arguments may be scalars or equal-length
# vectors. All inputs are coerced with as.numeric() first.
#
# Arguments (meanings taken from how the caller in this file fills them):
#   age                age at recruitment
#   b_AF               atrial fibrillation indicator
#   b_atypicalantipsy  atypical antipsychotics indicator
#   b_corticosteroids  corticosteroids indicator
#   b_migraine         migraine indicator
#   b_ra               rheumatoid arthritis indicator
#   b_renal            chronic kidney disease indicator
#   b_semi             mental illness indicator
#   b_sle              systemic lupus erythematosus indicator
#   b_treatedhyp       antihypertensives indicator
#   b_type1, b_type2   type 1 / type 2 diabetes indicators
#   bmi                body mass index
#   ethrisk            ethnicity category, 1..9 (index into Iethrisk)
#   fh_cvd             illness-of-relatives indicator
#   rati               cholesterol to HDL ratio
#   sbp                mean systolic blood pressure
#   sbps5              systolic blood pressure standard deviation
#   smoke_cat          smoking category, 1..5 (index into Ismoke)
#   surv               index into the survivor array; only 10 is populated
#   town               Townsend value
#
# Returns the score on a 0-100 scale; NA inputs or out-of-range category
# codes propagate to NA.
# NOTE(review): the constants are presumably transcribed from the published
# QRISK3 algorithm - verify against the reference implementation.
cvd_female_raw <- function(age, b_AF, b_atypicalantipsy, b_corticosteroids, b_migraine, b_ra, b_renal, 
                           b_semi, b_sle, b_treatedhyp, b_type1, b_type2, bmi, ethrisk, fh_cvd, 
                           rati, sbp, sbps5, smoke_cat, surv, town) {
  # Convert all inputs to numeric at the start
  age <- as.numeric(age)
  b_AF <- as.numeric(b_AF)
  b_atypicalantipsy <- as.numeric(b_atypicalantipsy)
  b_corticosteroids <- as.numeric(b_corticosteroids)
  b_migraine <- as.numeric(b_migraine)
  b_ra <- as.numeric(b_ra)
  b_renal <- as.numeric(b_renal)
  b_semi <- as.numeric(b_semi)
  b_sle <- as.numeric(b_sle)
  b_treatedhyp <- as.numeric(b_treatedhyp)
  b_type1 <- as.numeric(b_type1)
  b_type2 <- as.numeric(b_type2)
  bmi <- as.numeric(bmi)
  ethrisk <- as.numeric(ethrisk)
  fh_cvd <- as.numeric(fh_cvd)
  rati <- as.numeric(rati)
  sbp <- as.numeric(sbp)
  sbps5 <- as.numeric(sbps5)
  smoke_cat <- as.numeric(smoke_cat)
  surv <- as.numeric(surv)
  town <- as.numeric(town)
  
  # Survivor array: baseline survival indexed by `surv`. Only position 10 holds
  # a value; any other index gives 0 and therefore a score of 100.
  survivor <- c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0.988876402378082, 0, 0, 0, 0, 0)
  
  # Conditional arrays: additive offsets per ethnicity (9 entries) and per
  # smoking category (5 entries); the first entry of each is the 0 reference.
  Iethrisk <- c(0, 0.280403143329954250, 0.562989941420753980, 0.295900008511165160, 0.072785379877982545, 
                -0.170721355088573170, -0.393710433148749710, -0.326324952835302720, -0.171270568832417840)
  
  Ismoke <- c(0, 0.133868337865462620, 0.562008580124385370, 0.667495933775025470, 0.849481776448308470)
  
  # Apply fractional polynomial transforms (female model: age^-2 and age,
  # both on age / 10; BMI terms on bmi / 10)
  dage <- age / 10
  age_1 <- dage^(-2)
  age_2 <- dage
  dbmi <- bmi / 10
  bmi_1 <- dbmi^(-2)
  bmi_2 <- (dbmi^(-2)) * log(dbmi)
  
  # Centre the continuous variables
  age_1 <- age_1 - 0.053274843841791
  age_2 <- age_2 - 4.332503318786621
  bmi_1 <- bmi_1 - 0.154946178197861
  bmi_2 <- bmi_2 - 0.144462317228317
  rati <- rati - 3.476326465606690
  sbp <- sbp - 123.130012512207030
  sbps5 <- sbps5 - 9.002537727355957
  town <- town - 0.392308831214905
  
  # Start of Sum
  a <- 0
  
  # Conditional sums
  a <- a + Iethrisk[ethrisk]  # ethrisk is used directly as a 1-based index; no offset is added
  a <- a + Ismoke[smoke_cat]
  
  # Sum from continuous values
  a <- a + age_1 * -8.1388109247726188
  a <- a + age_2 * 0.79733376689699098
  a <- a + bmi_1 * 0.29236092275460052
  a <- a + bmi_2 * -4.1513300213837665
  a <- a + rati * 0.15338035820802554
  a <- a + sbp * 0.013131488407103424
  a <- a + sbps5 * 0.0078894541014586095
  a <- a + town * 0.077223790588590108
  
  # Sum from boolean values
  a <- a + b_AF * 1.5923354969269663
  a <- a + b_atypicalantipsy * 0.25237642070115557
  a <- a + b_corticosteroids * 0.59520725304601851
  a <- a + b_migraine * 0.301267260870345
  a <- a + b_ra * 0.21364803435181942
  a <- a + b_renal * 0.65194569493845833
  a <- a + b_semi * 0.12555308058820178
  a <- a + b_sle * 0.75880938654267693
  a <- a + b_treatedhyp * 0.50931593683423004
  a <- a + b_type1 * 1.7267977510537347
  a <- a + b_type2 * 1.0688773244615468
  a <- a + fh_cvd * 0.45445319020896213
  
  # Sum from interaction terms (smoking categories 2..5 each have their own
  # age interaction; category 1 is the reference and has none)
  a <- a + age_1 * (smoke_cat == 2) * -4.7057161785851891
  a <- a + age_1 * (smoke_cat == 3) * -2.7430383403573337
  a <- a + age_1 * (smoke_cat == 4) * -0.86608088829392182
  a <- a + age_1 * (smoke_cat == 5) * 0.90241562369710648
  a <- a + age_1 * b_AF * 19.938034889546561
  a <- a + age_1 * b_corticosteroids * -0.98408045235936281
  a <- a + age_1 * b_migraine * 1.7634979587872999
  a <- a + age_1 * b_renal * -3.5874047731694114
  a <- a + age_1 * b_sle * 19.690303738638292
  a <- a + age_1 * b_treatedhyp * 11.872809733921812
  a <- a + age_1 * b_type1 * -1.2444332714320747
  a <- a + age_1 * b_type2 * 6.8652342000009599
  a <- a + age_1 * bmi_1 * 23.802623412141742
  a <- a + age_1 * bmi_2 * -71.184947692087007
  a <- a + age_1 * fh_cvd * 0.99467807940435127
  a <- a + age_1 * sbp * 0.034131842338615485
  a <- a + age_1 * town * -1.0301180802035639
  a <- a + age_2 * (smoke_cat == 2) * -0.075589244643193026
  a <- a + age_2 * (smoke_cat == 3) * -0.119511928748670740
  a <- a + age_2 * (smoke_cat == 4) * -0.103663063975719230
  a <- a + age_2 * (smoke_cat == 5) * -0.139918535917183890
  a <- a + age_2 * b_AF * -0.076182651011162505
  a <- a + age_2 * b_corticosteroids * -0.120053649467424720
  a <- a + age_2 * b_migraine * -0.065586917898699859
  a <- a + age_2 * b_renal * -0.226888730864425070
  a <- a + age_2 * b_sle * 0.077347949679016273
  a <- a + age_2 * b_treatedhyp * 0.00096857823588174436
  a <- a + age_2 * b_type1 * -0.287240646244889490
  a <- a + age_2 * b_type2 * -0.097112252590695489
  a <- a + age_2 * bmi_1 * 0.523699589336644290
  a <- a + age_2 * bmi_2 * 0.045744190122323759
  a <- a + age_2 * fh_cvd * -0.076885051698423038
  a <- a + age_2 * sbp * -0.001508250142327236
  a <- a + age_2 * town * -0.031593414674962329
  
  
  # Final calculation for the score: risk in percent from the baseline
  # survival raised to exp(linear predictor)
  score <- 100 * (1 - (survivor[surv]) ^ exp(a))
  
  return(score)
}

In [None]:
# Apply QRISK3 calculation for females
# cvd_female_raw() consists of element-wise arithmetic and vector indexing
# only, so it is evaluated once on whole columns instead of once per row.
# Arguments are passed by name to rule out positional mix-ups; `surv = 10`
# selects the only populated entry of the survivor array.
female_qrisk_all <- cvd_female_raw(
  age = df$nhc_Age.at.recruitment,
  b_AF = df$nhc_Atrial.fibrillation,
  b_atypicalantipsy = df$nhc_atypical.antipsychotics,
  b_corticosteroids = df$nhc_corticosteroids,
  b_migraine = df$nhc_Migraine,
  b_ra = df$nhc_Rheumatoid.arthritis,
  b_renal = df$nhc_Chronic.kidney.disease,
  b_semi = df$nhc_Mental.illness,
  b_sle = df$nhc_Systemic.lupus.erythematosus,
  b_treatedhyp = df$nhc_antihypertensives,
  b_type1 = df$nhc_Type.1.Diabetes,
  b_type2 = df$nhc_Type.2.diabetes,
  bmi = df$nhc_BMI,
  ethrisk = df$nhc_qrisk.ethnic.background,
  fh_cvd = df$nhc_Illnesses.of.relatives.0_1,
  rati = df$nhc_Cholesterol.to.HDL.ratio,
  sbp = df$nhc_SBP_mean,
  sbps5 = df$nhc_SBP_sd,
  smoke_cat = df$nhc_qrisk.smoking.status,
  surv = 10,
  town = df$nhc_Townsend
)

# A missing input column (NULL) would collapse the result to length zero;
# stop here rather than silently filling the column with NA.
stopifnot(length(female_qrisk_all) == nrow(df))

# Keep the score only where nhc_Sex_0 equals 1 (assumed to mark female rows);
# all other rows, including rows with a missing sex flag, are NA.
df$nhc_female_qrisk <- ifelse(df$nhc_Sex_0 == 1, female_qrisk_all, NA_real_)

# Check the summary
summary(df$nhc_female_qrisk)
table(is.na(df$nhc_female_qrisk))


In [None]:
# Function to calculate QRISK3 for males
#
# Builds a linear predictor `a` from category offsets, centred continuous
# terms, binary indicators and age interactions, then returns
# 100 * (1 - survivor[surv] ^ exp(a)).
# Every step is element-wise, so the arguments may be scalars or equal-length
# vectors. All inputs are coerced with as.numeric() first.
#
# Arguments (meanings taken from how the caller in this file fills them):
#   age                age at recruitment
#   b_AF               atrial fibrillation indicator
#   b_atypicalantipsy  atypical antipsychotics indicator
#   b_corticosteroids  corticosteroids indicator
#   b_impotence2       erectile dysfunction indicator
#   b_migraine         migraine indicator
#   b_ra               rheumatoid arthritis indicator
#   b_renal            chronic kidney disease indicator
#   b_semi             mental illness indicator
#   b_sle              systemic lupus erythematosus indicator
#   b_treatedhyp       antihypertensives indicator
#   b_type1, b_type2   type 1 / type 2 diabetes indicators
#   bmi                body mass index
#   ethrisk            ethnicity category, 1..9 (index into Iethrisk)
#   fh_cvd             illness-of-relatives indicator
#   rati               cholesterol to HDL ratio
#   sbp                mean systolic blood pressure
#   sbps5              systolic blood pressure standard deviation
#   smoke_cat          smoking category, 1..5 (index into Ismoke)
#   surv               index into the survivor array; only 10 is populated
#   town               Townsend value
#
# Returns the score on a 0-100 scale; NA inputs or out-of-range category
# codes propagate to NA.
# NOTE(review): the constants are presumably transcribed from the published
# QRISK3 algorithm - verify against the reference implementation.
cvd_male_raw <- function(age, b_AF, b_atypicalantipsy, b_corticosteroids, b_impotence2, b_migraine, 
                         b_ra, b_renal, b_semi, b_sle, b_treatedhyp, b_type1, b_type2, bmi, 
                         ethrisk, fh_cvd, rati, sbp, sbps5, smoke_cat, surv, town) {
  # Convert all inputs to numeric at the start
  age <- as.numeric(age)
  b_AF <- as.numeric(b_AF)
  b_atypicalantipsy <- as.numeric(b_atypicalantipsy)
  b_corticosteroids <- as.numeric(b_corticosteroids)
  b_impotence2 <- as.numeric(b_impotence2)
  b_migraine <- as.numeric(b_migraine)
  b_ra <- as.numeric(b_ra)
  b_renal <- as.numeric(b_renal)
  b_semi <- as.numeric(b_semi)
  b_sle <- as.numeric(b_sle)
  b_treatedhyp <- as.numeric(b_treatedhyp)
  b_type1 <- as.numeric(b_type1)
  b_type2 <- as.numeric(b_type2)
  bmi <- as.numeric(bmi)
  ethrisk <- as.numeric(ethrisk)
  fh_cvd <- as.numeric(fh_cvd)
  rati <- as.numeric(rati)
  sbp <- as.numeric(sbp)
  sbps5 <- as.numeric(sbps5)
  smoke_cat <- as.numeric(smoke_cat)
  surv <- as.numeric(surv)
  town <- as.numeric(town)
  
  # Survivor array: baseline survival indexed by `surv`. Only position 10 holds
  # a value; any other index gives 0 and therefore a score of 100.
  survivor <- c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0.977268040180206, 0, 0, 0, 0, 0)
  
  # Conditional arrays: additive offsets per ethnicity (9 entries) and per
  # smoking category (5 entries); the first entry of each is the 0 reference.
  Iethrisk <- c(0, 0.277192487603082790, 0.474463607149312680, 0.529617299196893710, 0.035100159186299017, 
                -0.358078996693279190, -0.400564852321651400, -0.415227928898301730, -0.263213481347499670)
  
  Ismoke <- c(0, 0.191282228633889830, 0.552415881926455520, 0.638350530275060720, 0.789838198818580190)
  
  # Apply fractional polynomial transforms (male model: age^-1 and age^3,
  # both on age / 10; BMI terms on bmi / 10)
  dage <- age / 10
  age_1 <- dage^(-1)
  age_2 <- dage^3
  dbmi <- bmi / 10
  bmi_1 <- dbmi^(-2)
  bmi_2 <- (dbmi^(-2)) * log(dbmi)
  
  # Centre the continuous variables
  age_1 <- age_1 - 0.234766781330109
  age_2 <- age_2 - 77.284080505371094
  bmi_1 <- bmi_1 - 0.149176135659218
  bmi_2 <- bmi_2 - 0.141913309693336
  rati <- rati - 4.300998687744141
  sbp <- sbp - 128.571578979492190
  sbps5 <- sbps5 - 8.756621360778809
  town <- town - 0.526304900646210
  
  # Start of Sum
  a <- 0
  
  # Conditional sums (category codes are used directly as 1-based indices)
  a <- a + Iethrisk[ethrisk]
  a <- a + Ismoke[smoke_cat]
  
  # Sum from continuous values
  a <- a + age_1 * -17.839781666005575
  a <- a + age_2 * 0.0022964880605765492
  a <- a + bmi_1 * 2.4562776660536358
  a <- a + bmi_2 * -8.3011122314711354
  a <- a + rati * 0.17340196856327111
  a <- a + sbp * 0.012910126542553305
  a <- a + sbps5 * 0.010251914291290456
  a <- a + town * 0.033268201277287295
  
  # Sum from boolean values
  a <- a + b_AF * 0.88209236928054657
  a <- a + b_atypicalantipsy * 0.13046879855173513
  a <- a + b_corticosteroids * 0.45485399750445543
  a <- a + b_impotence2 * 0.22251859086705383
  a <- a + b_migraine * 0.25584178074159913
  a <- a + b_ra * 0.20970658013956567
  a <- a + b_renal * 0.71853261288274384
  a <- a + b_semi * 0.12133039882047164
  a <- a + b_sle * 0.4401572174457522
  a <- a + b_treatedhyp * 0.51659871082695474
  a <- a + b_type1 * 1.2343425521675175
  a <- a + b_type2 * 0.85942071430932221
  a <- a + fh_cvd * 0.54055469009390156
  
  # Sum from interaction terms (smoking categories 2..5 each have their own
  # age interaction; category 1 is the reference and has none)
  a <- a + age_1 * (smoke_cat == 2) * -0.21011133933516346
  a <- a + age_1 * (smoke_cat == 3) * 0.75268676447503191
  a <- a + age_1 * (smoke_cat == 4) * 0.99315887556405791
  a <- a + age_1 * (smoke_cat == 5) * 2.1331163414389076
  a <- a + age_1 * b_AF * 3.4896675530623207
  a <- a + age_1 * b_corticosteroids * 1.1708133653489108
  a <- a + age_1 * b_impotence2 * -1.506400985745431
  a <- a + age_1 * b_migraine * 2.3491159871402441
  a <- a + age_1 * b_renal * -0.50656716327223694
  a <- a + age_1 * b_treatedhyp * 6.5114581098532671
  a <- a + age_1 * b_type1 * 5.3379864878006531
  a <- a + age_1 * b_type2 * 3.6461817406221311
  a <- a + age_1 * bmi_1 * 31.004952956033886
  a <- a + age_1 * bmi_2 * -111.29157184391643
  a <- a + age_1 * fh_cvd * 2.7808628508531887
  a <- a + age_1 * sbp * 0.018858524469865853
  a <- a + age_1 * town * -0.1007554870063731
  a <- a + age_2 * (smoke_cat == 2) * -0.00049854870275326121
  a <- a + age_2 * (smoke_cat == 3) * -0.00079875633317385414
  a <- a + age_2 * (smoke_cat == 4) * -0.00083706184266251296
  a <- a + age_2 * (smoke_cat == 5) * -0.00078400319155637289
  a <- a + age_2 * b_AF * -0.00034995608340636049
  a <- a + age_2 * b_corticosteroids * -0.0002496045095297166
  a <- a + age_2 * b_impotence2 * -0.0011058218441227373
  a <- a + age_2 * b_migraine * 0.00019896446041478631
  a <- a + age_2 * b_renal * -0.0018325930166498813
  a <- a + age_2 * b_treatedhyp * 0.00063838053104165013
  a <- a + age_2 * b_type1 * 0.0006409780808752897
  a <- a + age_2 * b_type2 * -0.00024695695588868315
  a <- a + age_2 * bmi_1 * 0.0050380102356322029
  a <- a + age_2 * bmi_2 * -0.013074483002524319
  a <- a + age_2 * fh_cvd * -0.00024791809907396037
  a <- a + age_2 * sbp * -0.00001271874191588457
  a <- a + age_2 * town * -0.000093299642323272888
  
  # Final calculation for the score: risk in percent from the baseline
  # survival raised to exp(linear predictor)
  score <- 100 * (1 - (survivor[surv]) ^ exp(a))
  
  return(score)
}

In [None]:
# Apply QRISK3 calculation for males
# cvd_male_raw() consists of element-wise arithmetic and vector indexing only,
# so it is evaluated once on whole columns instead of once per row.
# Arguments are passed by name to rule out positional mix-ups; `surv = 10`
# selects the only populated entry of the survivor array.
male_qrisk_all <- cvd_male_raw(
  age = df$nhc_Age.at.recruitment,
  b_AF = df$nhc_Atrial.fibrillation,
  b_atypicalantipsy = df$nhc_atypical.antipsychotics,
  b_corticosteroids = df$nhc_corticosteroids,
  b_impotence2 = df$nhc_Erectile.dysfunction,
  b_migraine = df$nhc_Migraine,
  b_ra = df$nhc_Rheumatoid.arthritis,
  b_renal = df$nhc_Chronic.kidney.disease,
  b_semi = df$nhc_Mental.illness,
  b_sle = df$nhc_Systemic.lupus.erythematosus,
  b_treatedhyp = df$nhc_antihypertensives,
  b_type1 = df$nhc_Type.1.Diabetes,
  b_type2 = df$nhc_Type.2.diabetes,
  bmi = df$nhc_BMI,
  ethrisk = df$nhc_qrisk.ethnic.background,
  fh_cvd = df$nhc_Illnesses.of.relatives.0_1,
  rati = df$nhc_Cholesterol.to.HDL.ratio,
  sbp = df$nhc_SBP_mean,
  sbps5 = df$nhc_SBP_sd,
  smoke_cat = df$nhc_qrisk.smoking.status,
  surv = 10,
  town = df$nhc_Townsend
)

# A missing input column (NULL) would collapse the result to length zero;
# stop here rather than silently filling the column with NA.
stopifnot(length(male_qrisk_all) == nrow(df))

# Keep the score only where nhc_Sex_1 equals 1 (assumed to mark male rows);
# all other rows, including rows with a missing sex flag, are NA.
df$nhc_male_qrisk <- ifelse(df$nhc_Sex_1 == 1, male_qrisk_all, NA_real_)

# Check the summary of the results
summary(df$nhc_male_qrisk)
table(is.na(df$nhc_male_qrisk))


In [None]:
# Combined QRISK score: the female score where present, otherwise the male score.
qrisk_combined <- df$nhc_female_qrisk
qrisk_use_male <- is.na(qrisk_combined)
qrisk_combined[qrisk_use_male] <- df$nhc_male_qrisk[qrisk_use_male]
df$nhc_qrisk_score <- qrisk_combined

# Inspect the combined score numerically and visually.
summary(df$nhc_qrisk_score)
plot(density(df$nhc_qrisk_score))


### 3.4 Subset to only relevant cols

In [None]:
#pass to qrisk, processed below
# Keep a copy of the summary ethnicity category under the qrisk_ prefix; the
# nhc_ original is one-hot encoded and then removed in the next cell.
df$qrisk_ethnic.background <- df$nhc_qrisk.ethnic.background

In [None]:
# One indicator column (nhc_ethnic_<level>) per ethnicity category present in
# the data; rows with a missing category stay NA in every indicator.
for (level in levels(as.factor(df$nhc_qrisk.ethnic.background))) {
  df[[paste0("nhc_ethnic_", level)]] <- df$nhc_qrisk.ethnic.background == level
}

# The summary column itself is no longer needed.
df$nhc_qrisk.ethnic.background <- NULL

In [None]:
# nhc_ columns kept for modelling: baseline variables, the derived GPPAQ,
# AUDIT-C and QRISK features, and the nine ethnicity indicators.
final_columns <- c(
  "nhc_Age.at.recruitment",
  "nhc_Sex_0",
  "nhc_Sex_1",
  "nhc_Smoking.status...Instance.0_2",
  "nhc_Illnesses.of.relatives.0_1",
  "nhc_BMI",
  "nhc_Cholesterol...Instance.0",
  "nhc_SBP_mean",
  paste0("nhc_GPPAQ_", c("active", "inactive", "moderatelyactive", "moderatelyinactive")),
  "nhc_AUDIT.C.score",
  "nhc_qrisk_score",
  paste0("nhc_ethnic_", 1:9)
)

In [None]:
# Drop every nhc_ column that is not listed in final_columns; the dimensions
# are shown before and after.
dim(df)
columns_to_remove <- setdiff(grep("^nhc_", colnames(df), value = TRUE), final_columns)
df <- df[, !(colnames(df) %in% columns_to_remove)]

dim(df)

## 4. Adjust Data types

### 4.1. Generate missing mapping files

#### 4.1.1 NHC

In [None]:
# Mapping table for the nhc_ columns: every column is a "Factor" unless it is
# listed as continuous below.
# The names are taken directly from names(df): extracting them through
# df[, idx] yields NULL when exactly one column matches (the single column is
# dropped to a vector), and rep() keeps the table valid when none match.
nhc_columns <- grep("^nhc_", names(df), value = TRUE)
mapping_nhc <- data.frame(
  "Column.name" = nhc_columns,
  "Data.type" = rep("Factor", length(nhc_columns)),
  stringsAsFactors = FALSE
)
continous_nhc <- c("nhc_Age.at.recruitment", "nhc_BMI", "nhc_Cholesterol...Instance.0", "nhc_SBP_mean", "nhc_AUDIT.C.score", 
                   "nhc_qrisk_score")

mapping_nhc[mapping_nhc$Column.name %in% continous_nhc, "Data.type"] <- "Continuous"

In [None]:
# Print the nhc mapping so the assigned data types can be checked by eye.
mapping_nhc

#### 4.1.2 SCORE-2

In [None]:
# Mapping table for the score_ columns: every column is a "Factor" unless it
# is listed as continuous below.
# The names are taken directly from names(df): extracting them through
# df[, idx] yields NULL when exactly one column matches (the single column is
# dropped to a vector), and rep() keeps the table valid when none match.
score_columns <- grep("^score_", names(df), value = TRUE)
mapping_score <- data.frame(
  "Column.name" = score_columns,
  "Data.type" = rep("Factor", length(score_columns)),
  stringsAsFactors = FALSE
)

continous_score <- c("score_Age.at.recruitment", "score_SBP_mean", "score_Cholesterol...Instance.0", "score_HDL.cholesterol...Instance.0")

mapping_score[mapping_score$Column.name %in% continous_score, "Data.type"] <- "Continuous"

In [None]:
# Print the score mapping so the assigned data types can be checked by eye.
mapping_score

#### 4.1.3 PREVENT

In [None]:
# Mapping table for the prevent_ columns: every column is a "Factor" unless it
# is listed as continuous below.
# The names are taken directly from names(df): extracting them through
# df[, idx] yields NULL when exactly one column matches (the single column is
# dropped to a vector), and rep() keeps the table valid when none match.
prevent_columns <- grep("^prevent_", names(df), value = TRUE)
mapping_prevent <- data.frame(
  "Column.name" = prevent_columns,
  "Data.type" = rep("Factor", length(prevent_columns)),
  stringsAsFactors = FALSE
)

continous_prevent <- c("prevent_Age.at.recruitment", "prevent_Cholesterol...Instance.0", "prevent_HDL.cholesterol...Instance.0", "prevent_SBP_mean",
                      "prevent_BMI", "prevent_eGFR", "prevent_UACR", "prevent_Glycated.haemoglobin..HbA1c....Instance.0", "prevent_Townsend.deprivation.index.at.recruitment")

mapping_prevent[mapping_prevent$Column.name %in% continous_prevent, "Data.type"] <- "Continuous"

In [None]:
# Print the prevent mapping so the assigned data types can be checked by eye.
mapping_prevent

#### 4.1.4 QRISK3 

In [None]:
# The detailed qrisk_ ethnic-background indicators are dropped here; the
# summary ethnicity column generated for the NHS health check above replaces
# them.
qrisk_ethnic_codes <- c(
  "1", "1001", "1002", "1003",
  "2", "2001", "2002", "2003", "2004",
  "3", "3001", "3002", "3003", "3004",
  "4", "4001", "4002", "4003",
  "5", "6"
)
cols_to_remove <- paste0("qrisk_Ethnic.background...Instance.0_", qrisk_ethnic_codes)

df <- df[, !(colnames(df) %in% cols_to_remove)]

In [None]:
# One indicator column (qrisk_ethnic_<level>) per ethnicity category present
# in the data; rows with a missing category stay NA in every indicator.
for (level in levels(as.factor(df$qrisk_ethnic.background))) {
  df[[paste0("qrisk_ethnic_", level)]] <- df$qrisk_ethnic.background == level
}

# The summary column itself is no longer needed.
df$qrisk_ethnic.background <- NULL

In [None]:
# Mapping table for the qrisk_ columns: every column is a "Factor" unless it
# is listed as continuous below.
# The names are taken directly from names(df): extracting them through
# df[, idx] yields NULL when exactly one column matches (the single column is
# dropped to a vector), and rep() keeps the table valid when none match.
qrisk_columns <- grep("^qrisk_", names(df), value = TRUE)
mapping_qrisk <- data.frame(
  "Column.name" = qrisk_columns,
  "Data.type" = rep("Factor", length(qrisk_columns)),
  stringsAsFactors = FALSE
)

continous_qrisk <- c("qrisk_Age.at.recruitment", "qrisk_Townsend.deprivation.index.at.recruitment", "qrisk_Number.of.cigarettes.currently.smoked.daily..current.cigarette.smokers....Instance.0",
                     "qrisk_Cholesterol.to.HDL.ratio", "qrisk_SBP_mean", "qrisk_SBP_sd")

mapping_qrisk[mapping_qrisk$Column.name %in% continous_qrisk, "Data.type"] <- "Continuous"

In [None]:
# Print the qrisk mapping so the assigned data types can be checked by eye.
mapping_qrisk

### 4.2 Adjust data types

In [None]:
#ts
# Coerce each mapped column that exists in df: "Continuous" -> numeric,
# anything else -> factor. seq_len() makes an empty mapping a no-op, whereas
# 1:nrow() would iterate over c(1, 0) and fail.
for (i in seq_len(nrow(mapping_ts))) {
  column_name <- mapping_ts$Column.name[i]
  data_type <- mapping_ts$Data.type[i]
  if (column_name %in% names(df)) {
    if (data_type == "Continuous") {
      df[[column_name]] <- as.numeric(df[[column_name]])
    } else {
      df[[column_name]] <- as.factor(df[[column_name]])
    }
  }
}

In [None]:
#clinicalrisk
# Coerce each mapped column that exists in df: "Continuous" -> numeric,
# anything else -> factor. seq_len() makes an empty mapping a no-op, whereas
# 1:nrow() would iterate over c(1, 0) and fail.
for (i in seq_len(nrow(mapping_clinicalrisk))) {
  column_name <- mapping_clinicalrisk$Column.name[i]
  data_type <- mapping_clinicalrisk$Data.type[i]
  if (column_name %in% names(df)) {
    if (data_type == "Continuous") {
      df[[column_name]] <- as.numeric(df[[column_name]])
    } else {
      df[[column_name]] <- as.factor(df[[column_name]])
    }
  }
}

In [None]:
#nhc
# Coerce each mapped column that exists in df: "Continuous" -> numeric,
# anything else -> factor. seq_len() makes an empty mapping a no-op, whereas
# 1:nrow() would iterate over c(1, 0) and fail.
for (i in seq_len(nrow(mapping_nhc))) {
  column_name <- mapping_nhc$Column.name[i]
  data_type <- mapping_nhc$Data.type[i]
  if (column_name %in% names(df)) {
    if (data_type == "Continuous") {
      df[[column_name]] <- as.numeric(df[[column_name]])
    } else {
      df[[column_name]] <- as.factor(df[[column_name]])
    }
  }
}

In [None]:
#score
# Coerce each mapped column that exists in df: "Continuous" -> numeric,
# anything else -> factor. seq_len() makes an empty mapping a no-op, whereas
# 1:nrow() would iterate over c(1, 0) and fail.
for (i in seq_len(nrow(mapping_score))) {
  column_name <- mapping_score$Column.name[i]
  data_type <- mapping_score$Data.type[i]
  if (column_name %in% names(df)) {
    if (data_type == "Continuous") {
      df[[column_name]] <- as.numeric(df[[column_name]])
    } else {
      df[[column_name]] <- as.factor(df[[column_name]])
    }
  }
}

In [None]:
#prevent
# Coerce each mapped column that exists in df: "Continuous" -> numeric,
# anything else -> factor. seq_len() makes an empty mapping a no-op, whereas
# 1:nrow() would iterate over c(1, 0) and fail.
for (i in seq_len(nrow(mapping_prevent))) {
  column_name <- mapping_prevent$Column.name[i]
  data_type <- mapping_prevent$Data.type[i]
  if (column_name %in% names(df)) {
    if (data_type == "Continuous") {
      df[[column_name]] <- as.numeric(df[[column_name]])
    } else {
      df[[column_name]] <- as.factor(df[[column_name]])
    }
  }
}

In [None]:
#qrisk
# Coerce each column named in mapping_qrisk to its declared Data.type:
# "Continuous" -> numeric, anything else -> factor.
# Mapping entries that are not columns of df are skipped.
# seq_len() gives zero iterations for an empty mapping, whereas 1:nrow()
# would iterate over c(1, 0) and fail on a zero-length if() condition.
for (i in seq_len(nrow(mapping_qrisk))) {
  column_name <- mapping_qrisk$Column.name[i]
  data_type <- mapping_qrisk$Data.type[i]
  if (!column_name %in% names(df)) {
    next
  }
  if (data_type == "Continuous") {
    df[[column_name]] <- as.numeric(df[[column_name]])
  } else {
    df[[column_name]] <- as.factor(df[[column_name]])
  }
}

## 5. Post-Processing

### 5.1 Check for negative values

In [None]:
# Numeric feature columns to screen for negative values.
# The alternation is wrapped in a group so that "^" anchors every prefix;
# without the group only "metabolomics_" is anchored and e.g. "ts_" would
# match anywhere inside a column name.
cols_to_check <- grep("^(metabolomics_|ts_|clinicalrisk_|nhc_|score_|prevent_|qrisk_)", names(df), value = TRUE)
# vapply() always returns a logical vector, also when no column matched
cols_to_check <- cols_to_check[vapply(df[cols_to_check], is.numeric, logical(1))]

In [None]:
#check for neg cols, should only be townsend
# df[cols_to_check] stays a data frame even when a single column is selected
# (df[, cols_to_check] would collapse to a vector), so the check always runs
# once per column and the reported names line up with neg_cols.
neg_cols <- vapply(df[cols_to_check], function(col) any(col < 0, na.rm = TRUE), logical(1))
if (any(neg_cols)) {
    cat("df contains neg cols: ", paste(cols_to_check[neg_cols], collapse = ", "), "\n")
} else {
    cat("No negative columns found in df.\n")
}

In [None]:
#townsend is already standardized and partially log-normalized, no need to do further log-transf or scaling
#NOTE(review): the scaling cell further down operates on cols_to_transform, which still contains the Townsend columns - confirm whether scaling them is intended
#source: https://www.restore.ac.uk/geo-refer/36229dtuks00y19810000.php#:~:text=These%20four%20standardized%20scores%20are,area%20with%20overall%20mean%20values.
#quote: "The unemployment and overcrowding percentages (+1) are then subjected to a log transformation in order to normalise the raw values, which tend to be highly skewed.
#  All four variables are then standardized using a Z-score (subtract the mean value and divide by the standard deviation).
#  These four standardized scores are then summed to obtain a single value which is the Townsend deprivation index.
#  Positive values of the index will indicate areas with high material deprivation, whereas those with negative values will indicate relative affluence.
#  A score of 0 represents an area with overall mean values."

#plot the density of the qrisk Townsend column to assess how close it is to a normal distribution
plot(density(df$qrisk_Townsend.deprivation.index.at.recruitment))

In [None]:
# def all continuous cols but townsend for further processing
# cols_to_transform: every numeric/integer column carrying one of the feature prefixes.
# vapply(is.numeric) yields exactly one logical per column; the previous
# sapply(class(x) %in% ...) returns a list as soon as a column has more than
# one class (e.g. an ordered factor), which breaks the element-wise `&`.
cols_to_transform <- names(df)[
  grepl("^(metabolomics_|ts_|clinicalrisk_|nhc_|score_|prevent_|qrisk_)", names(df)) &
    vapply(df, is.numeric, logical(1))
]
# cols_to_transform2: the same set without the two Townsend deprivation index columns
cols_to_transform2 <- setdiff(
  cols_to_transform,
  c("qrisk_Townsend.deprivation.index.at.recruitment", "prevent_Townsend.deprivation.index.at.recruitment")
)

### 5.2 Replace 0s with 1/10th of the median of the column's non-zero values

In [None]:
# For every column in cols_to_transform2, overwrite exact zeros with one tenth
# of the median of that column's non-zero values (NAs ignored for the median).
df[cols_to_transform2] <- lapply(df[cols_to_transform2], function(values) {
  nonzero_median <- median(values[values != 0], na.rm = TRUE)
  values[values == 0] <- nonzero_median / 10
  values
})

### 5.3 Log-transform and scale

In [None]:
#log
# natural log of every column in cols_to_transform2 (Townsend columns are not in that set)
df <- mutate(df, across(.cols = all_of(cols_to_transform2), .fns = log))

In [None]:
#scale
# Standardise the columns in cols_to_transform: the mean and SD are estimated
# on the training rows only (testtrain == 'train') and then applied to all rows.
# NOTE(review): cols_to_transform (not cols_to_transform2) is used here, so the
# Townsend columns are scaled as well - confirm this is intended.

# drop = FALSE keeps a data frame even if only one column is selected, so the
# per-column statistics below are always computed column by column.
# which() skips rows whose testtrain is NA.
train_part <- df[which(df$testtrain == 'train'), cols_to_transform, drop = FALSE]
means <- vapply(train_part, mean, numeric(1), na.rm = TRUE)
sds <- vapply(train_part, sd, numeric(1), na.rm = TRUE)

# (x - mean) / sd for one column; mean and sd are scalars
center_scale <- function(x, mean, sd) {
  (x - mean) / sd
}

# Map() pairs each column with its own training mean and SD and returns a list
df[cols_to_transform] <- Map(center_scale, df[cols_to_transform], means, sds)

In [None]:
#check for NaN or -Inf
# Only numeric columns are tested for NaN or -Inf. Checking is.numeric() first
# avoids comparing factor/character columns with 0, so there are no warnings
# that would have to be suppressed. Both results are named logical vectors
# with one entry per column of df.
nan_exists <- vapply(df, function(col) is.numeric(col) && any(is.nan(col)), logical(1))
inf_exists <- vapply(df, function(col) is.numeric(col) && any(is.infinite(col) & col < 0), logical(1))

if (any(nan_exists)) {
    cat("Columns with NaN values:", paste(names(df)[nan_exists], collapse = ", "), "\n")
} else {
    cat("No NaN values found in the dataframe.\n")
}

if (any(inf_exists)) {
    cat("Columns with -Inf values:", paste(names(df)[inf_exists], collapse = ", "), "\n")
} else {
    cat("No -Inf values found in the dataframe.\n")
}

### 5.4 Exclude outlier blood measurements > 5 SD

#### 5.4.1 blood measurements mapping

In [None]:
#panel
# rows 22 to 44 of the clinical-risk mapping are taken as blood measurements
# NOTE(review): the selection is positional and depends on the row order of mapping_clinicalrisk - confirm
bloods_cr <- mapping_clinicalrisk[["Column.name"]][seq(22, 44)]
bloods_cr

In [None]:
#nhc
# row 7 of the nhc mapping is taken as the blood measurement (positional selection)
bloods_nhc <- mapping_nhc[["Column.name"]][7]
bloods_nhc

In [None]:
#score
# rows 6 and 7 of the score mapping are taken as blood measurements (positional selection)
bloods_score <- mapping_score[["Column.name"]][c(6, 7)]
bloods_score

In [None]:
#prevent
# rows 4, 5, 8, 13 and 14 of the prevent mapping are taken as blood measurements (positional selection)
prevent_blood_rows <- c(4, 5, 8, 13, 14)
bloods_prevent <- mapping_prevent[["Column.name"]][prevent_blood_rows]
bloods_prevent

In [None]:
#qrisk
# row 21 of the qrisk mapping is taken as the blood measurement (positional selection)
bloods_qrisk <- mapping_qrisk[["Column.name"]][21]
bloods_qrisk

In [None]:
# all blood-measurement column names, in the order panel, nhc, score, prevent, qrisk
bloods <- c(
  bloods_cr,
  bloods_nhc,
  bloods_score,
  bloods_prevent,
  bloods_qrisk
)
bloods

#### 5.4.2 Filtering

In [None]:
#filtering bloods (standard clinical chem & metabolomics) outlier measurements > 5 SD
# Keep a row only if every metabolomics_ column and every column listed in
# bloods has an absolute value <= 5. Rows for which the condition is NA are
# dropped by filter(). any_of() ignores names in bloods that are not in df.
# if_all() is the dplyr replacement for the deprecated across() inside filter().
dim(df)
df <- df %>%
  filter(if_all(c(starts_with("metabolomics_"), any_of(bloods)), ~ abs(.x) <= 5))
dim(df)

## 6. Save

In [None]:
#save df
# Write df as a tab-separated file into the working directory, then upload
# that file to UKBRISK_Processed/ with the dx command-line tool.
filename_save <- "Processed_final_04092024.tsv"

# apart from sep, write.table() runs with its defaults (row names, quoting)
write.table(df, file = filename_save, sep = "\t")

upl_cmd <- paste0(
  "dx upload ", filename_save,
  " --path UKBRISK_Processed/", filename_save
)
system(upl_cmd, intern = TRUE)