## Setup environment

In [None]:
# Import packages
from pyspark.sql import SparkSession
import pandas as pd
import subprocess
import dxpy
import numpy as np

## Initialize spark cluster

In [None]:
# Start (or reuse) the Spark session used by this notebook.
# Arrow is switched on for Spark <-> pandas conversion.
# NOTE(review): the Kryo buffer limit is given without a unit suffix; confirm
# how the cluster's Spark version interprets a bare "128".
spark = (
    SparkSession.builder
    .appName("Phenotype Analysis")
    .config("spark.sql.execution.arrow.pyspark.enabled", "true")
    .config("spark.kryoserializer.buffer.max", "128")
    .getOrCreate()
)

## Import data

In [None]:
import phenofhy
from phenofhy import utils
import dxdata

# Dataset handle obtained from phenofhy's helper; `retrieve_fields` is called on it below.
dataset = utils.connect_to_dataset()

# Monkey-patch TSSLSocket to inject CA certs
import ssl, certifi
from thrift.transport import TSSLSocket

# Keep a handle on the unpatched constructor so the wrapper can delegate to it.
original_init = TSSLSocket.TSSLSocket.__init__


def patched_init(self, host, port, *args, **kwargs):
    """Call the original ``TSSLSocket.__init__`` with TLS verification defaults.

    ``ca_certs`` is set to certifi's CA bundle when the caller passed nothing
    (or a falsy value) for it, and ``cert_reqs`` is set to
    ``ssl.CERT_REQUIRED`` when the caller did not pass it at all. Every other
    argument is forwarded untouched.
    """
    if not kwargs.get("ca_certs"):
        kwargs["ca_certs"] = certifi.where()
    kwargs.setdefault("cert_reqs", ssl.CERT_REQUIRED)
    return original_init(self, host, port, *args, **kwargs)


# Install the wrapper so every TSSLSocket created from here on uses it.
TSSLSocket.TSSLSocket.__init__ = patched_init

## Select phenotypes

In [None]:
# Fields requested from the dataset, one per line. The order is the order
# passed to `retrieve_fields`.
field_names = [
    "registration_year",
    "registration_month",
    "birth_year",
    "birth_month",
    "demog_sex_1_1",
    "demog_sex_2_1",
    "demog_ethnicity_1_1",
    "housing_income_1_1",
    "diag_cvd_1_m",
    "diag_neuro_1_m",
    "health_pain_chronic_1_m",
    "diag_neuro_dev_1_m",
    "diag_resp_1_m",
    "diag_psych_1_m",
    "diag_gastro_1_m",
    "alcohol_curr_1_1",
    "father_diag_psych_1_m",
    "mother_diag_psych_1_m",
    "sibling_diag_psych_1_m",
    "weight",
    "height",
    "demog_height_1_1",
    "demog_weight_1_1",
    "diag_osteo_1_m",
    "diag_cancer_1_m",
    "diag_opthal_1_m",
    "diag_2_m",
    "father_diag_cancer_1_m",
    "mother_diag_cancer_1_m",
    "sibling_diag_cancer_1_m",
    "smoke_status_1_1",
    "smoke_status_2_1",
    "smoke_tobacco_prev_1_1",
    "smoke_100_times_1_1",
    "smoke_100_times_2_1",
    "smoke_reg_1_m",
    "smoke_tobacco_type_1_m",
]

# Pull the selected fields from the dataset using a dxdata connection as the engine.
df = dataset.retrieve_fields(names=field_names, engine=dxdata.connect())

# Convert to Pandas
pdf = df.toPandas()

#sample_pdf = sampled_df.toPandas()

# Swap '$' for '.' in every column name; regex=False makes '$' a literal
# character rather than an end-of-string anchor. Later code addresses columns
# with the dotted form (e.g. 'participant.birth_year').
pdf.columns = pdf.columns.str.replace('$', '.', regex=False)

# Bare expression: only displays the column index when it is the last line of its cell.
pdf.columns

# Run phenofhy's questionnaire processing, requesting the two smoking-status derivations.
# NOTE(review): only `import phenofhy` and `from phenofhy import utils` are visible
# here; confirm that `phenofhy.process` is reachable as an attribute.
df_1 = phenofhy.process.questionnaire_fields(pdf, derive=['smoke_status_v1','smoke_status_v2'])

# From here on `pdf` refers to the processed frame.
pdf = df_1

#fraction_to_sample = 0.01

# Sample the Spark DataFrame
#sampled_df = df.sample(fraction=fraction_to_sample)

## Preprocess dataframe for analyses

In [None]:
### Exclusions

age
- exclude -999

# Exclude invalid birth year/month
# Only the literal code -999 is dropped; rows where the value is NaN pass the
# `!=` comparison and are kept.
pdf = pdf[pdf['participant.birth_year'] != -999]
pdf = pdf[pdf['participant.birth_month'] != -999]

sex
- exclude: 3 Intersex
- exclude: -3 Prefer not to answer
- exclude: nan

# Exclude sex values: 3 (Intersex), -3 (Prefer not to answer)
# The filter is applied to both sex columns. NaN is not in the exclusion list,
# so rows with a missing value in either column are retained at this step.
pdf = pdf[~pdf['participant.demog_sex_1_1'].isin([3, -3])]
pdf = pdf[~pdf['participant.demog_sex_2_1'].isin([3, -3])]

ethnicity
- exclude: -3
- exclude: 19

#pdf = pdf[~pdf['participant$demog_ethnicity_1_1'].isin([19, -3])]

income
- exclude: -1 (do not know)
- exclude: -3 (prefer not to answer)
- exclude nan

#pdf = pdf[~pdf['questionnaire$housing_income_1_1'].isin([-1, -3, np.nan])]

### Transformations

In [None]:
# Calculate age using datetime
# Only year and month are available, so both dates are pinned to day 1 of the month.
pdf['registration_date'] = pd.to_datetime(dict(year=pdf['participant.registration_year'], month=pdf['participant.registration_month'], day=1))
pdf['birth_date'] = pd.to_datetime(dict(year=pdf['participant.birth_year'], month=pdf['participant.birth_month'], day=1))
# Age at registration in years: day difference divided by 365.25.
pdf['datetime_age'] = (pdf['registration_date'] - pdf['birth_date']).dt.days / 365.25

## Sex variable transformation
# Take demog_sex_2_1 where it is present, otherwise fall back to demog_sex_1_1.
pdf['sex'] = np.where(pdf['participant.demog_sex_2_1'].notna(), pdf['participant.demog_sex_2_1'], pdf['participant.demog_sex_1_1'])
# Codes 1/2 become labels; any other value (including NaN) maps to NaN.
pdf['sex'] = pdf['sex'].map({1: 'Male', 2: 'Female'})

# Quick inspection of the processed frame. describe() and head() only render
# output when they are the last expression of their cell.
pdf.info()
pdf.describe()
pdf.head()

# Convert to csv
# Written to the working directory without the index; the R cells read this file back.
pdf.to_csv("df.csv", index=False)

Change the notebook kernel to R before running the cells below; they read the `df.csv` file written above.

# Load the CSV exported by the Python cells. read.csv() already defaults to a
# comma separator and a header row.
df <- read.csv("df.csv")

head(df)

## Further Transformations

In [None]:
# Ethnicity: keep the raw code and build a factor whose reference level is code "1".
df$ethnicity <- df$participant.demog_ethnicity_1_1
df$ethnicity_factor <- relevel(factor(df$ethnicity), ref = "1")

# Income: same treatment, reference level is code "1".
df$income <- df$questionnaire.housing_income_1_1
df$income_factor <- relevel(factor(df$income), ref = "1")

# Sex was already labelled before export.
df$sex_factor <- factor(df$sex)

# Smoking status with "Never" as the reference level.
# NOTE(review): the Python step requested derivations named 'smoke_status_v1'
# and 'smoke_status_v2'; confirm the exported column is really called
# `derived.smoke_status`.
df$smoke_factor <- relevel(factor(df$derived.smoke_status), ref = "Never")

# Alcohol: codes 1-5 -> TRUE, code 6 -> FALSE, anything else (including NA) -> NA.
df$alcohol <- ifelse(
  df$questionnaire.alcohol_curr_1_1 %in% 1:5,
  TRUE,
  ifelse(df$questionnaire.alcohol_curr_1_1 == 6, FALSE, NA)
)

# Age in years, as computed before export.
df$age <- df$datetime_age

# BMI = weight / (height / 100)^2
# NOTE(review): the formula treats height as centimetres and weight as
# kilograms; confirm the units of the clinic measurement fields.
df$bmi <- df$clinic_measurements.weight / ((df$clinic_measurements.height / 100)^2)

levels(df$smoke_factor)

# Translate one multi-select cell such as "[2 9]" into a label string such as
# "Diabetes, Hypertension".
#
# x   : a single cell; it is coerced to character before tokenising.
# map : named character vector whose names are the codes and whose values are
#       the labels.
#
# Returns the labels of every code found in `map`, joined with ", ". Codes
# that are not in `map` are dropped. NA_character_ is returned when the cell
# yields no tokens or none of its tokens is a known code.
map_multi <- function(x, map) {
  cell <- as.character(x)

  # A token is any run of letters, digits, underscore or hyphen, so "-1" and
  # "p20001_i0_a0" each stay in one piece.
  token_hits <- gregexpr("[A-Za-z0-9_\\-]+", cell)
  codes <- regmatches(cell, token_hits)[[1]]

  labels <- if (length(codes) > 0) map[codes] else character(0)
  labels <- labels[!is.na(labels)]

  if (length(labels) == 0) NA_character_ else paste(labels, collapse = ", ")
}


# ---------- OSTEO ----------
# Code -> label lookup for questionnaire.diag_osteo_1_m. Each cell is passed
# through map_multi, giving a comma-separated label string, or NA when no
# code in the cell is found in the map.
osteo_map <- c(
  "1"  = "Hip fracture",
  "2"  = "Osteoporosis",
  "3"  = "Osteoarthritis (arthritis)",
  "4"  = "Gout",
  "5"  = "Other (not listed)",
  "-7" = "None of the above",
  "-1" = "Do not know",
  "-3" = "Prefer not to answer"
)
df$osteo <- sapply(df$questionnaire.diag_osteo_1_m, map_multi, map = osteo_map)


# ---------- CANCER (self) ----------
# Code -> label lookup for the participant's own cancer diagnoses
# (questionnaire.diag_cancer_1_m). The "-1" label is worded in the first
# person; the family-member maps below use a third-person wording.
cancer_map <- c(
  "1"  = "Anal",
  "2"  = "Bladder",
  "3"  = "Brain",
  "4"  = "Breast",
  "5"  = "Cervical",
  "6"  = "Colon/rectal",
  "7"  = "Oesophageal",
  "8"  = "Head and neck (Including cancers of the mouth, sinuses, nose, or throat. Not including brain or skin cancers.)",
  "9"  = "Gastric",
  "10" = "Kidney",
  "11" = "Leukaemia (blood and bone marrow)",
  "12" = "Liver",
  "13" = "Lung or bronchial",
  "14" = "Lymphoma",
  "15" = "Ovarian",
  "16" = "Pancreatic",
  "17" = "Prostate",
  "18" = "Skin",
  "19" = "Stomach",
  "20" = "Testicular",
  "21" = "Thyroid",
  "22" = "Uterine (endometrial)",
  "23" = "Another type of cancer",
  "-1" = "I know I had cancer, but I don't know what type",
  "-7" = "None of the above",
  "-3" = "Prefer not to answer"
)
df$cancer <- sapply(df$questionnaire.diag_cancer_1_m, map_multi, map = cancer_map)


# ---------- FATHER (cancer) ----------
# Derived from cancer_map rather than kept as a hand-copied table, so the
# labels cannot drift apart. The father table is cancer_map without codes
# 5 (Cervical), 15 (Ovarian) and 22 (Uterine), with the "-1" label worded in
# the third person. setdiff() keeps the original code order.
father_cancer_map <- cancer_map[setdiff(names(cancer_map), c("5", "15", "22"))]
father_cancer_map["-1"] <- "I know they had cancer, but I don't know what type"
df$father_cancer <- sapply(df$questionnaire.father_diag_cancer_1_m, map_multi, map = father_cancer_map)


# ---------- MOTHER (cancer) ----------
# Derived from cancer_map rather than kept as a hand-copied table, so the
# labels cannot drift apart. The mother table is cancer_map without codes
# 17 (Prostate) and 20 (Testicular), with the "-1" label worded in the third
# person. setdiff() keeps the original code order.
mother_cancer_map <- cancer_map[setdiff(names(cancer_map), c("17", "20"))]
mother_cancer_map["-1"] <- "I know they had cancer, but I don't know what type"
df$mother_cancer <- sapply(df$questionnaire.mother_diag_cancer_1_m, map_multi, map = mother_cancer_map)


# ---------- SIBLING (cancer) ----------
# Derived from cancer_map rather than kept as a hand-copied table, so the
# labels cannot drift apart. The sibling table has every code of cancer_map;
# only the "-1" label differs (third-person wording).
sibling_cancer_map <- cancer_map
sibling_cancer_map["-1"] <- "I know they had cancer, but I don't know what type"
df$sibling_cancer <- sapply(df$questionnaire.sibling_diag_cancer_1_m, map_multi, map = sibling_cancer_map)


# ---------- OPTHAL ----------
# Code -> label lookup for eye conditions (questionnaire.diag_opthal_1_m).
# The resulting df$opthal holds comma-separated labels, or NA when no code
# in the cell is found in the map.
opthal_map <- c(
  "1"  = "Glaucoma",
  "2"  = "Visual impairment including blindness",
  "3"  = "Double vision",
  "4"  = "Night blindness",
  "5"  = "Colour blindness",
  "6"  = "Macular degeneration",
  "7"  = "Cataracts",
  "8"  = "Retinal detachment",
  "9"  = "Diabetic retinopathy",
  "10" = "Other (not listed)",
  "-7" = "None of the above",
  "-1" = "Do not know",
  "-3" = "Prefer not to answer"
)
df$opthal <- sapply(df$questionnaire.diag_opthal_1_m, map_multi, map = opthal_map)


# ---------- DIAG_2 (general) ----------
# Code -> label lookup for the broad diagnosis categories in
# questionnaire.diag_2_m. The resulting df$diag2 holds comma-separated
# labels, or NA when no code in the cell is found in the map.
diag2_map <- c(
  "1"  = "Autoimmune disorder",
  "2"  = "Blood disorders (Anaemia)",
  "3"  = "Cancer",
  "4"  = "Complications or difficulties in pregnancy or childbirth",
  "5"  = "Digestive system or liver problems",
  "6"  = "Endocrine, nutritional and metabolic disorders (e.g. diabetes, thyroid disorder, vitamin deficiencies)",
  "7"  = "Eye or visual problems",
  "8"  = "Fractures, breaks, or joint problems",
  "9"  = "Heart or circulatory disease (e.g. high blood pressure or stroke)",
  "10" = "Kidney or urinary system disorders",
  "11" = "Lung or respiratory problems",
  "12" = "Mental health conditions (e.g. depression, bipolar disorder)",
  "13" = "Neurodevelopmental conditions (e.g. Autism spectrum disorder, ADHD)",
  "14" = "Neurological disorders (things that affect the brain or nervous system. E.g., Epilepsy)",
  "15" = "Reproductive system problems",
  "16" = "Other not listed",
  "-7" = "None of the above",
  "-1" = "Do not know",
  "-3" = "Prefer not to answer"
)
df$diag2 <- sapply(df$questionnaire.diag_2_m, map_multi, map = diag2_map)


# ---------- ORGAN-SYSTEM DIAGNOSIS MAPS AND FAMILY PSYCHIATRIC HISTORY ----------

# cvd
# Code -> label lookup for questionnaire.diag_cvd_1_m. df$cvd is the column
# later searched for the hypertension, ihd and heartfailure flags.
cvd_map <- c(
  "1"  = "B-12 Deficiency (Pernicious Anaemia)",
  "2"  = "Coronary Artery/Coronary Heart Disease",
  "3"  = "Congestive Heart Failure",
  "4"  = "High Cholesterol",
  "5"  = "Heart Attack (Myocardial Infarction)",
  "6"  = "Abnormal Heart Rhythm (Arrhythmia)",
  "7"  = "Chest Pain (Angina)",
  "8"  = "Heart Valve Problems",
  "9"  = "High Blood Pressure (Hypertension)",
  "10" = "Blood Clots (DVT, Pulmonary Embolism)",
  "11" = "Stroke",
  "12" = "Other (not listed)",
  "-7" = "None of the above",
  "-1" = "Do not know",
  "-3" = "Prefer not to answer"
)
df$cvd <- sapply(df$questionnaire.diag_cvd_1_m, map_multi, map = cvd_map)


# neuro
# Code -> label lookup for questionnaire.diag_neuro_1_m. df$neuro is the
# column later searched for the dementia flag. Labels 3 and 4 use a
# typographic apostrophe, label 2 a plain one.
neuro_map <- c(
  "1"  = "Epilepsy",
  "2"  = "Parkinson's disease",
  "3"  = "Alzheimer’s disease/dementia",
  "4"  = "Early onset Alzheimer’s disease/dementia",
  "5"  = "Vascular dementia",
  "6"  = "Migraine with aura",
  "7"  = "Migraine without aura",
  "8"  = "Other (not listed)",
  "-7" = "None of the above",
  "-1" = "Do not know",
  "-3" = "Prefer not to answer"
)
df$neuro <- sapply(df$questionnaire.diag_neuro_1_m, map_multi, map = neuro_map)


# neurodev
# Code -> label lookup for questionnaire.diag_neuro_dev_1_m. df$neurodev is
# the column later searched for the adhd flag.
neurodev_map <- c(
  "1"  = "Autism spectrum disorder",
  "2"  = "Developmental learning disorders",
  "3"  = "Attention deficit hyperactivity disorder (ADHD)",
  "4"  = "Disorder of intellectual development",
  "5"  = "Developmental motor coordination disorder",
  "6"  = "Developmental speech or language disorders",
  "7"  = "Stereotyped movement disorder",
  "8"  = "Other (not listed)",
  "-7" = "None of the above",
  "-1" = "Do not know",
  "-3" = "Prefer not to answer"
)
df$neurodev <- sapply(df$questionnaire.diag_neuro_dev_1_m, map_multi, map = neurodev_map)


# pain
# Code -> label lookup for questionnaire.health_pain_chronic_1_m. Unlike most
# other maps here it has no "-1" (Do not know) entry. df$pain is the column
# later searched for the backpain flag.
pain_map <- c(
  "1"  = "Headache",
  "2"  = "Facial pain",
  "3"  = "Neck or shoulder pain",
  "4"  = "Back pain",
  "5"  = "Stomach or abdominal pain",
  "6"  = "Hip pain",
  "7"  = "Knee pain",
  "8"  = "Pain all over the body",
  "9"  = "Premenstrual pain",
  "-7" = "None of the above",
  "-3" = "Prefer not to answer"
)
df$pain <- sapply(df$questionnaire.health_pain_chronic_1_m, map_multi, map = pain_map)


# resp
# Code -> label lookup for questionnaire.diag_resp_1_m. df$resp is the
# column later searched for the asthma and copd flags.
resp_map <- c(
  "1"  = "Chronic Obstructive Pulmonary Disease (COPD)",
  "2"  = "Lung fibrosis",
  "3"  = "Bronchiectasis",
  "4"  = "Asthma",
  "5"  = "Hay Fever (Allergic Rhinitis)",
  "6"  = "Other (not listed)",
  "-7" = "None of the above",
  "-1" = "Do not know",
  "-3" = "Prefer not to answer"
)
df$resp <- sapply(df$questionnaire.diag_resp_1_m, map_multi, map = resp_map)


# psych
# Code -> label lookup for the participant's own psychiatric diagnoses
# (questionnaire.diag_psych_1_m). df$psych is the column later searched for
# the anxiety, schiz, depression and ocd flags.
psych_map <- c(
  "1"  = "Anxiety",
  "2"  = "Bipolar disorder",
  "3"  = "Body dysmorphia",
  "4"  = "Depression",
  "5"  = "Premenstrual dysphoric disorder",
  "6"  = "Post Traumatic Stress Disorder (PTSD)",
  "7"  = "Obsessive Compulsive Disorder",
  "8"  = "Eating disorder",
  "9"  = "Psychosis",
  "10" = "Schizophrenia",
  "11" = "Schizoaffective disorder",
  "12" = "Personality disorder",
  "13" = "Other (not listed)",
  "-7" = "None of the above",
  "-1" = "Do not know",
  "-3" = "Prefer not to answer"
)
df$psych <- sapply(df$questionnaire.diag_psych_1_m, map_multi, map = psych_map)


# gastro
# Code -> label lookup for questionnaire.diag_gastro_1_m. df$gastro is the
# column later searched for the gerd flag. Labels 2 and 7 use a typographic
# apostrophe.
gastro_map <- c(
  "1"  = "Gastro-oesophageal Acid Reflux (GORD)",
  "2"  = "Barrett’s Oesophagus",
  "3"  = "Irritable bowel syndrome (IBS)",
  "4"  = "Inflammatory Bowel Disease (IBD)",
  "5"  = "Diverticulitis or Diverticulosis",
  "6"  = "Ulcerative Colitis",
  "7"  = "Crohn’s Disease",
  "8"  = "Coeliac Disease (Gluten-Sensitive Enteropathy)",
  "9"  = "Gallstones (Biliary Stones)",
  "10" = "Fatty liver disease",
  "11" = "Liver Cirrhosis",
  "12" = "Hepatitis",
  "13" = "Pancreatitis",
  "-7" = "None of the above",
  "-1" = "Do not know",
  "-3" = "Prefer not to answer"
)
df$gastro <- sapply(df$questionnaire.diag_gastro_1_m, map_multi, map = gastro_map)


# father psych
# Derived from psych_map rather than kept as a hand-copied table, so the
# labels cannot drift apart. The father table is psych_map without code 5
# (Premenstrual dysphoric disorder); code order is unchanged.
father_psych_map <- psych_map[names(psych_map) != "5"]
df$father_psych <- sapply(df$questionnaire.father_diag_psych_1_m, map_multi, map = father_psych_map)


# mother psych
# Use the full psych_map, not father_psych_map: the father table has no entry
# for code 5 (Premenstrual dysphoric disorder), and map_multi silently drops
# codes it cannot find, so a mother reporting code 5 would lose that label.
# NOTE(review): confirm against the data dictionary that the mother and
# sibling questions offer code 5; if they do not, the extra entry is never hit.
mother_psych_map <- psych_map
df$mother_psych <- sapply(df$questionnaire.mother_diag_psych_1_m, map_multi, map = mother_psych_map)


# sibling psych
# Siblings can be of either sex, so the full psych_map applies here as well.
sibling_psych_map <- psych_map
df$sibling_psych <- sapply(df$questionnaire.sibling_diag_psych_1_m, map_multi, map = sibling_psych_map)


# Join the usable entries of one row: NA and empty strings are discarded and
# the rest is glued together with ", ". A row with nothing usable gives "".
collapse_family_row <- function(row) {
  usable <- !is.na(row) & nzchar(row)
  paste(row[usable], collapse = ", ")
}

# Family psychiatric history (mother, father, sibling) in one column.
df$combined_family_psych <- apply(
  df[, c("mother_psych", "father_psych", "sibling_psych")],
  1,
  collapse_family_row
)

# Family cancer history (mother, father, sibling) in one column.
df$combined_family_cancer <- apply(
  df[, c("mother_cancer", "father_cancer", "sibling_cancer")],
  1,
  collapse_family_row
)



# Word-based flagging function ------------------------------------------------
# Flag, for every cell of a label column, whether any target word occurs in it.
#
# x            : vector of label strings (e.g. the output of map_multi, or a
#                comma-joined combination of several such columns).
# target_words : character vector of patterns. They are matched
#                case-insensitively as regular expressions; NA and empty
#                entries are ignored.
#
# Returns an unnamed logical vector the same length as `x`:
#   TRUE  - at least one target word occurs in the cell;
#   NA    - no target word occurs, but the cell contains a "don't know" /
#           "prefer not to answer" style response, so absence is unknown;
#   FALSE - everything else, including NA or empty cells and
#           "None of the above".
#
# The positive match is tested first on purpose. Cells can combine several
# answers (multi-select questions, or mother + father + sibling joined into
# one string), and a reported diagnosis must not be cancelled by a
# "None of the above" or "Prefer not to answer" from another answer in the
# same cell.
diagnosis_flag_word <- function(x, target_words) {
  # Normalise the target words: lower-case, drop NA and empty entries.
  target_words <- tolower(as.character(target_words))
  target_words <- target_words[!is.na(target_words) & nzchar(target_words)]

  # Responses that make a non-match uninformative rather than negative.
  unknown_phrases <- c(
    "do not know",
    "don't know",
    "do not remember",
    "prefer not to answer",
    "prefer not to say"
  )

  flag_cell <- function(cell) {
    # Empty / NA cells count as "not reported".
    if (is.na(cell)) return(FALSE)
    text <- trimws(tolower(as.character(cell)))
    if (!nzchar(text)) return(FALSE)

    # 1. A reported diagnosis wins over everything else in the cell.
    matched <- vapply(
      target_words,
      function(w) grepl(w, text, ignore.case = TRUE, fixed = FALSE),
      logical(1)
    )
    if (any(matched)) return(TRUE)

    # 2. No match, but part of the answer is unknown -> NA.
    unknown <- vapply(
      unknown_phrases,
      function(p) grepl(p, text, fixed = TRUE),
      logical(1)
    )
    if (any(unknown)) return(NA)

    # 3. No match and nothing unknown ("None of the above", other diagnoses).
    FALSE
  }

  # vapply keeps the result a logical vector even when `x` is empty.
  vapply(x, flag_cell, logical(1), USE.NAMES = FALSE)
}

# Apply to your dataframe -----------------------------------------------------
# (Assumes your dataframe is named `df` and contains the columns you listed:
#  psych, neurodev, combined_family_psych, neuro, cvd, pain, bmi, resp, gastro,
#  cancer, combined_family_cancer, opthal, osteo, diag2)

# Mental health & family history
# Each flag searches one label column for the listed words.
# Note: "schizophrenia"/"schizophrenic" do not match the separate
# "Schizoaffective disorder" label, so that diagnosis is not counted in
# schiz or fam_schiz.
df$anxiety     <- diagnosis_flag_word(df$psych, c("anxiety"))
df$adhd        <- diagnosis_flag_word(df$neurodev, c("attention deficit", "adhd"))
df$schiz       <- diagnosis_flag_word(df$psych, c("schizophrenia", "schizophrenic"))
df$depression  <- diagnosis_flag_word(df$psych, c("depression", "depressive"))
df$ocd         <- diagnosis_flag_word(df$psych, c("obsessive", "ocd"))
df$fam_schiz   <- diagnosis_flag_word(df$combined_family_psych, c("schizophrenia", "schizophrenic"))

# Diagnoses
# Several word lists contain entries that a shorter entry already covers
# (e.g. "back pain" is covered by "back", "lung cancer" by "lung").
df$dementia        <- diagnosis_flag_word(df$neuro, c("dementia"))
df$hypertension    <- diagnosis_flag_word(df$cvd, c("hypertension", "high blood pressure", "raised blood pressure"))
df$ihd             <- diagnosis_flag_word(df$cvd, c("heart disease", "ischemic heart", "ischaemic heart", "coronary"))
df$heartfailure    <- diagnosis_flag_word(df$cvd, c("heart failure", "cardiac failure", "congestive heart failure"))
df$backpain        <- diagnosis_flag_word(df$pain, c("back", "back pain", "backpain"))
# Obesity is BMI-based (BMI > 30, NA when BMI is NA) whenever a bmi column
# exists; the word-based branch is only a fallback.
df$obesity         <- if ("bmi" %in% names(df)) { # keep BMI-based obesity as before
                       ifelse(is.na(df$bmi), NA, df$bmi > 30)
                     } else {
                       diagnosis_flag_word(df$diag2, c("obesity", "obese"))
                     }

df$asthma         <- diagnosis_flag_word(df$resp, c("asthma"))
df$copd           <- diagnosis_flag_word(df$resp, c("copd", "chronic obstructive"))
df$gerd           <- diagnosis_flag_word(df$gastro, c("gord", "gerd", "gastro-oesophageal", "gastroesophageal", "acid reflux"))
df$lungcancer     <- diagnosis_flag_word(df$cancer, c("lung", "lung cancer"))
df$coloncancer    <- diagnosis_flag_word(df$cancer, c("colon", "colorectal", "bowel cancer", "colon cancer"))
df$famlungcancer  <- diagnosis_flag_word(df$combined_family_cancer, c("lung", "lung cancer"))

df$cataracts       <- diagnosis_flag_word(df$opthal, c("cataract", "cataracts"))
df$osteoarthritis  <- diagnosis_flag_word(df$osteo, c("osteoarthritis", "degenerative joint", "wear-and-tear arthritis"))
df$anaemia         <- diagnosis_flag_word(df$diag2, c("anaemia", "anemia"))

### Prepare for analysis

## Analysis

In [None]:
library(dplyr)

# Lookup from detailed ethnicity code (as a string) to a broad group.
map_levels <- c(
  "1" = "White",
  "2" = "White",
  "3" = "Asian",
  "4" = "Asian",
  "5" = "Black",
  "6" = "Black",
  "7" = "Mixed",
  "8" = "Other",
  "9" = "White",
  "10" = "Asian",
  "11" = "Other",
  "12" = "Black",
  "13" = "Asian",
  "14" = "Other",
  "15" = "Mixed",
  "16" = "White",
  "17" = "Black",
  "18" = "Asian",
  "19" = "Other",
  "-3" = "Missing"   # code -3 is kept as its own "Missing" level
)

# Collapse the detailed codes held in ethnicity_factor into the broad groups.
# Codes that have no entry in map_levels (including NA) become NA.
df <- df %>%
  mutate(
    eth_code_chr = as.character(ethnicity_factor), # convert factor/number to string key
    ethnicity_broad = map_levels[eth_code_chr],
    ethnicity_broad = factor(ethnicity_broad, levels = c("Asian","Black","Mixed","Other","White","Missing"))
  )
# Make "White" the reference level for the regression models.
df$ethnicity_broad <- relevel(df$ethnicity_broad, ref = "White")

summary(df$ethnicity_broad)

# Prepare the analysis frame.
#
# `outcomes` and `predictors` are paired by position: model i regresses
# outcomes[i] on predictors[i] plus the shared covariates.
outcomes   <- c("ihd",
                "hypertension",
                "dementia",
                "dementia",
                "backpain",
                "copd",
                "anxiety",
                "anxiety",
                "adhd",
                "gerd",
                "lungcancer",
                "lungcancer",
                "coloncancer",
                "coloncancer",
                "osteoarthritis",
                "ihd",
                "cataracts",
                "schiz"
               )

predictors <- c("hypertension",
                "obesity",
                "heartfailure",
                "depression",
                "obesity",
                "asthma",
                "alcohol",
                "ocd",
                "anxiety",
                "obesity",
                "smoke_factor",
                "famlungcancer",
                "alcohol",
                "obesity",
                "obesity",
                "anaemia",
                "smoke_factor",
                "fam_schiz"
               )

# The pairing is positional, so a length mismatch would silently mis-pair models.
stopifnot(length(outcomes) == length(predictors))

covariates <- c("age", "sex_factor", "income_factor", "ethnicity_broad")

# Many variables occur several times across the three vectors. Selecting a
# column more than once makes data.frame indexing create renamed copies
# (e.g. "ihd.1"), so each variable is selected exactly once.
analysis_vars <- unique(c(outcomes, predictors, covariates))

df_analysis <- df[, analysis_vars, drop = FALSE]

# Missing values per analysis variable.
colSums(is.na(df_analysis))

# Cross-tabulation for the first outcome/predictor pair as a sanity check.
with(df_analysis, table(ihd, hypertension))

summary(df_analysis)

### Results

In [None]:
# Fit one adjusted logistic regression per outcome/predictor pair on
# df_analysis and collect the odds ratio with a Wald 95% confidence interval.
#
# Uses `outcomes`, `predictors`, `covariates` and `df_analysis` from the
# preceding cell. The result has one row per model; `term` names the model
# coefficient the odds ratio refers to. For a multi-level factor predictor
# (e.g. smoke_factor) that is the first non-reference level only.
results <- data.frame(
  model = character(),
  outcome = character(),
  predictor = character(),
  OR = numeric(),
  CI_lower = numeric(),
  CI_upper = numeric(),
  N = integer(),
  term = character(),
  stringsAsFactors = FALSE
)

# Rows are collected in a list and bound once, instead of growing the data
# frame with rbind() on every iteration.
result_rows <- vector("list", length(outcomes))

for (i in seq_along(outcomes)) {
  formula_str <- paste(outcomes[i], "~", predictors[i], "+", paste(covariates, collapse = " + "))
  model_formula <- as.formula(formula_str)

  model <- glm(model_formula, data = df_analysis, family = binomial)

  # The predictor is the first term on the right-hand side, so its (first)
  # coefficient directly follows the intercept.
  term <- names(coef(model))[2]
  log_or <- unname(coef(model)[term])

  # summary() leaves aliased (NA) coefficients out of its table, so the
  # standard error is looked up by name rather than by row position.
  coef_table <- summary(model)$coefficients
  se <- if (term %in% rownames(coef_table)) coef_table[term, "Std. Error"] else NA_real_

  or <- exp(log_or)
  ci_lower <- exp(log_or - 1.96 * se)
  ci_upper <- exp(log_or + 1.96 * se)
  n <- nobs(model)

  result_rows[[i]] <- data.frame(
    model = paste0("model_", i),
    outcome = outcomes[i],
    predictor = predictors[i],
    OR = round(or, 2),
    CI_lower = round(ci_lower, 2),
    CI_upper = round(ci_upper, 2),
    N = n,
    term = term,
    stringsAsFactors = FALSE
  )
}

# Binding onto the empty template keeps the column layout even with zero models.
results <- do.call(rbind, c(list(results), result_rows))

print(results)

write.csv(results, "mainresults.csv", row.names = FALSE)

### Robustness checks

In [None]:
Restrict to UK Biobank–like age band 

# Keep participants with a known age in [40, 70).
# The !is.na() guard matters: indexing rows with a logical vector that
# contains NA inserts an all-NA row for each NA, which would inflate
# nrow(df_subset) with rows for participants whose age is missing.
in_age_band <- !is.na(df_analysis$age) & df_analysis$age >= 40 & df_analysis$age < 70
df_subset <- df_analysis[in_age_band, , drop = FALSE]

nrow(df_subset)

# Robustness check: refit every outcome/predictor model on the age-restricted
# subset (df_subset) and collect the odds ratio with a Wald 95% confidence
# interval.
#
# Uses `outcomes`, `predictors` and `covariates` from the analysis setup
# cell. This overwrites `results` from the main analysis. The result has one
# row per model; `term` names the model coefficient the odds ratio refers to.
# For a multi-level factor predictor (e.g. smoke_factor) that is the first
# non-reference level only.
results <- data.frame(
  model = character(),
  outcome = character(),
  predictor = character(),
  OR = numeric(),
  CI_lower = numeric(),
  CI_upper = numeric(),
  N = integer(),
  term = character(),
  stringsAsFactors = FALSE
)

# Rows are collected in a list and bound once, instead of growing the data
# frame with rbind() on every iteration.
result_rows <- vector("list", length(outcomes))

for (i in seq_along(outcomes)) {
  formula_str <- paste(outcomes[i], "~", predictors[i], "+", paste(covariates, collapse = " + "))
  model_formula <- as.formula(formula_str)

  model <- glm(model_formula, data = df_subset, family = binomial)

  # The predictor is the first term on the right-hand side, so its (first)
  # coefficient directly follows the intercept.
  term <- names(coef(model))[2]
  log_or <- unname(coef(model)[term])

  # summary() leaves aliased (NA) coefficients out of its table, so the
  # standard error is looked up by name rather than by row position.
  coef_table <- summary(model)$coefficients
  se <- if (term %in% rownames(coef_table)) coef_table[term, "Std. Error"] else NA_real_

  or <- exp(log_or)
  ci_lower <- exp(log_or - 1.96 * se)
  ci_upper <- exp(log_or + 1.96 * se)
  n <- nobs(model)

  result_rows[[i]] <- data.frame(
    model = paste0("model_", i),
    outcome = outcomes[i],
    predictor = predictors[i],
    OR = round(or, 2),
    CI_lower = round(ci_lower, 2),
    CI_upper = round(ci_upper, 2),
    N = n,
    term = term,
    stringsAsFactors = FALSE
  )
}

# Binding onto the empty template keeps the column layout even with zero models.
results <- do.call(rbind, c(list(results), result_rows))

print(results)

write.csv(results, "age4069_results.csv", row.names = FALSE)

### Stratification by ethnicity

In [None]:
# ---------- Interaction LRTs for each outcome/predictor ----------
# For every outcome/predictor pair, compare a model with a
# predictor x ethnicity_broad interaction against the same model without it
# using a likelihood-ratio (Chisq) test, and record the test statistic,
# degrees of freedom, p-value and the number of observations in the full model.
# Any fit or test that fails raises a warning and leaves NA in that row, so
# the loop always produces one row per pair.
interaction_tests <- data.frame(
  model = character(),
  outcome = character(),
  predictor = character(),
  chisq = numeric(),
  df = integer(),
  p_value = numeric(),
  N = integer(),
  stringsAsFactors = FALSE
)

for (i in seq_along(outcomes)) {
  outcome_i <- outcomes[i]
  predictor_i <- predictors[i]

  # Build formulas
  # `covariates` already contains ethnicity_broad, so that name appears twice
  # in each formula string.
  # NOTE(review): R's formula handling is expected to collapse the repeated
  # term; confirm on one fitted model.
  full_formula_str <- paste0(outcome_i, " ~ ", predictor_i, " * ethnicity_broad + ",
                             paste(covariates, collapse = " + "))
  reduced_formula_str <- paste0(outcome_i, " ~ ", predictor_i, " + ethnicity_broad + ",
                                paste(covariates, collapse = " + "))

  full_formula <- as.formula(full_formula_str)
  reduced_formula <- as.formula(reduced_formula_str)

  # Fit models safely
  full <- tryCatch(
    glm(full_formula, data = df_analysis, family = binomial),
    error = function(e) {
      warning(sprintf("Full model failed for outcome '%s' predictor '%s': %s", outcome_i, predictor_i, e$message))
      NULL
    }
  )
  reduced <- tryCatch(
    glm(reduced_formula, data = df_analysis, family = binomial),
    error = function(e) {
      warning(sprintf("Reduced model failed for outcome '%s' predictor '%s': %s", outcome_i, predictor_i, e$message))
      NULL
    }
  )

  # Default values if something failed
  chisq_val <- NA_real_
  df_val <- NA_integer_
  p_val <- NA_real_
  N_val <- NA_integer_

  if (!is.null(full) && !is.null(reduced)) {
    # Use anova LRT (reduced vs full). Wrap in tryCatch as anova can fail if models aren't nested or degenerate
    anova_res <- tryCatch(
      anova(reduced, full, test = "Chisq"),
      error = function(e) {
        warning(sprintf("anova LRT failed for outcome '%s' predictor '%s': %s", outcome_i, predictor_i, e$message))
        NULL
      }
    )

    if (!is.null(anova_res) && nrow(anova_res) >= 2) {
      # Extract the LRT row (second row is the comparison reduced -> full)
      # Column name for p-value can be "Pr(>Chi)" depending on R; use exact column lookup
      pcol <- grep("Pr\\(>Chi\\)", colnames(anova_res), value = TRUE)
      if (length(pcol) == 1) {
        p_val <- as.numeric(anova_res[2, pcol])
      } else {
        # fallback: last column (usually p-value)
        p_val <- as.numeric(anova_res[2, ncol(anova_res)])
      }
      # deviance difference = Chi-square, df difference:
      if ("Deviance" %in% colnames(anova_res)) {
        chisq_val <- as.numeric(anova_res[2, "Deviance"])
      } else if ("Resid. Dev" %in% colnames(anova_res)) {
        chisq_val <- as.numeric(anova_res[1, "Resid. Dev"]) - as.numeric(anova_res[2, "Resid. Dev"])
      } else {
        chisq_val <- NA_real_
      }
      if ("Df" %in% colnames(anova_res)) {
        # the 'Df' column often shows the difference in df on the second row
        df_val <- as.integer(anova_res[2, "Df"])
      } else if ("Resid. df" %in% colnames(anova_res)) {
        df_val <- as.integer(as.numeric(anova_res[1, "Resid. df"]) - as.numeric(anova_res[2, "Resid. df"]))
      }

      # N: use number of observations used in the full model
      N_val <- tryCatch(nobs(full), error = function(e) NA_integer_)
    }
  }

  # One row per pair, appended even when the fit or the test failed (values stay NA).
  interaction_tests <- rbind(interaction_tests, data.frame(
    model = paste0("model_", i),
    outcome = outcome_i,
    predictor = predictor_i,
    chisq = chisq_val,
    df = df_val,
    p_value = p_val,
    N = N_val,
    stringsAsFactors = FALSE
  ))
}

# Inspect results
print(interaction_tests)


write.csv(interaction_tests, "interaction_results.csv", row.names = FALSE)



library(stringr)

# ---------- Prepare result containers ----------
# Empty, typed template for the ethnicity-specific estimates: one row per
# model x ethnicity level, holding the combined log-odds estimate, its odds
# ratio and interval, test statistics, and the subgroup size / event count.
results_long <- data.frame(
  model = character(),
  outcome = character(),
  predictor = character(),
  ethnicity = character(),
  logOR = numeric(),
  OR = numeric(),
  CI_lower = numeric(),
  CI_upper = numeric(),
  se = numeric(),
  z = numeric(),
  p_value = numeric(),
  N_eth = integer(),
  events_eth = integer(),
  stringsAsFactors = FALSE
)

# (sanity checks & releveling as before)...
# Re-applying factor() drops levels that no longer occur in df_analysis;
# "White" is then made the reference level if it is still present.
df_analysis$ethnicity_broad <- factor(df_analysis$ethnicity_broad)
if ("White" %in% levels(df_analysis$ethnicity_broad)) {
  df_analysis$ethnicity_broad <- relevel(df_analysis$ethnicity_broad, ref = "White")
}


# ---------- Helper: find coefficient name for main predictor ----------
# Locate the main-effect coefficient that belongs to `predictor_name`.
#
# coef_names     : character vector of model coefficient names.
# predictor_name : name of the predictor as written in the formula.
#
# Interaction terms (names containing ":") are never returned. Among the
# remaining names the first hit of the following is returned:
#   1. a name equal to `predictor_name` (numeric predictors);
#   2. a name starting with `predictor_name` (factor/logical predictors,
#      e.g. "obesityTRUE");
#   3. a name containing `predictor_name` anywhere.
# Returns NA_character_ when nothing matches.
#
# The name is matched literally, not as a regular expression, so predictors
# such as "log(x)" or "I(x^2)" are found, and an exact match cannot be
# shadowed by another variable that merely shares the prefix.
find_main_name <- function(coef_names, predictor_name) {
  main_effects <- coef_names[!grepl(":", coef_names, fixed = TRUE)]

  exact <- main_effects[main_effects == predictor_name]
  if (length(exact) >= 1) return(exact[1])

  prefixed <- main_effects[startsWith(main_effects, predictor_name)]
  if (length(prefixed) >= 1) return(prefixed[1])

  # Fallback: the predictor name appears somewhere inside the coefficient name.
  containing <- main_effects[grepl(predictor_name, main_effects, fixed = TRUE)]
  if (length(containing) >= 1) return(containing[1])

  NA_character_
}

# ---------- Main loop: fit models & compute ethnicity-specific effects & Ns ----------
# For every (outcome, predictor) pair this fits the logistic model
#   outcome ~ predictor * ethnicity_broad + covariates
# and appends one row per ethnicity level to `results_long`:
#   * reference level (first factor level): effect = main coefficient
#   * any other level: effect = main + interaction coefficient, with
#     Var = Var(main) + Var(interaction) + 2 * Cov(main, interaction)
# N_eth / events_eth are counted on the model frame, i.e. only on the rows glm() used.
z_crit <- 1.96  # normal quantile used for the two-sided 95% Wald interval

for (i in seq_along(outcomes)) {
  outcome_i <- outcomes[i]
  predictor_i <- predictors[i]

  # Build and fit model
  formula_str <- paste0(outcome_i, " ~ ", predictor_i, " * ethnicity_broad + ",
                        paste(covariates, collapse = " + "))
  model <- glm(as.formula(formula_str), data = df_analysis, family = binomial)

  # Model frame = rows actually used in the fit (rows with NA already dropped)
  mf <- model.frame(model)
  resp_vec <- model.response(mf)

  # Recode the response to numeric 0/1 once per model; it does not depend on ethnicity.
  if (is.factor(resp_vec)) {
    # Factor with numeric labels ("0"/"1"): convert directly. Coercion warnings are
    # expected for non-numeric labels and are handled by the fallback below.
    rv_num <- suppressWarnings(as.numeric(as.character(resp_vec)))
    if (all(is.na(rv_num))) {
      # Two-level factor with text labels: treat the second level as the event
      levs <- levels(resp_vec)
      rv_num <- if (length(levs) == 2) {
        as.numeric(resp_vec == levs[2])
      } else {
        rep(NA_real_, length(resp_vec))
      }
    }
  } else {
    rv_num <- as.numeric(resp_vec)
  }

  coefs <- coef(model)
  coef_names <- names(coefs)
  vc <- tryCatch(vcov(model), error = function(e) { stop("vcov failed: ", e$message) })

  # find main coefficient corresponding to predictor
  main_name <- find_main_name(coef_names, predictor_i)
  if (is.na(main_name)) stop(paste0("Could not find main coefficient for predictor '", predictor_i, "'. Check names: ", paste(coef_names, collapse = ", ")))

  # `[[` drops the coefficient name so it does not leak into data.frame row names
  beta_main <- coefs[[main_name]]
  var_main <- vc[main_name, main_name]

  # Levels come from the model frame so they match the coefficients that were fitted.
  # A character column has no levels(); factor() reproduces the level order the model used.
  eth_values <- mf$ethnicity_broad
  if (is.character(eth_values)) eth_values <- factor(eth_values)
  eth_levels <- levels(eth_values)

  eth_rows <- vector("list", length(eth_levels))
  for (k in seq_along(eth_levels)) {
    eth <- eth_levels[k]

    # N and number of events (response == 1) for this ethnicity within the model frame
    idx_eth <- which(eth_values == eth)
    N_eth <- length(idx_eth)
    events_eth <- sum(rv_num[idx_eth] == 1, na.rm = TRUE)

    if (k == 1) {
      # Reference level: the main coefficient is the effect
      beta_comb <- beta_main
      var_comb <- var_main
    } else {
      # Look the interaction up by its exact name. Regex matching would misread level
      # names containing metacharacters (e.g. parentheses) and could pick another level
      # that merely ends with the same text.
      int_candidates <- c(paste0(main_name, ":ethnicity_broad", eth),
                          paste0("ethnicity_broad", eth, ":", main_name))
      int_name <- int_candidates[int_candidates %in% coef_names]
      if (length(int_name) == 0) {
        # Do not pass the reference-group effect off as this group's effect
        warning(sprintf("No interaction coefficient found for predictor '%s' and ethnicity '%s' (model_%d); reporting NA",
                        predictor_i, eth, i), call. = FALSE)
        beta_comb <- NA_real_
        var_comb <- NA_real_
      } else {
        int_name <- int_name[1]
        beta_comb <- beta_main + coefs[[int_name]]
        var_comb <- var_main + vc[int_name, int_name] + 2 * vc[main_name, int_name]
      }
    }

    # Wald statistics; left as NA when the standard error is missing, zero or not finite
    se_comb <- if (is.na(var_comb) || var_comb < 0) NA_real_ else sqrt(var_comb)
    if (!is.finite(se_comb) || se_comb <= 0) {
      z_val <- NA_real_; p_val <- NA_real_; OR <- NA_real_; ci_low <- NA_real_; ci_high <- NA_real_
    } else {
      z_val <- beta_comb / se_comb
      p_val <- 2 * pnorm(-abs(z_val))
      OR <- exp(beta_comb)
      ci_low <- exp(beta_comb - z_crit * se_comb)
      ci_high <- exp(beta_comb + z_crit * se_comb)
    }

    # round()/signif() propagate NA, so no per-column NA guard is needed
    eth_rows[[k]] <- data.frame(
      model = paste0("model_", i),
      outcome = outcome_i,
      predictor = predictor_i,
      ethnicity = eth,
      logOR = round(beta_comb, 4),
      OR = round(OR, 2),
      CI_lower = round(ci_low, 2),
      CI_upper = round(ci_high, 2),
      se = if (is.finite(se_comb)) round(se_comb, 4) else NA_real_,
      z = round(z_val, 3),
      p_value = signif(p_val, 3),
      N_eth = N_eth,
      events_eth = events_eth,
      stringsAsFactors = FALSE
    )
  } # end eth loop

  # Bind this model's rows in one step instead of growing the table row by row
  results_long <- rbind(results_long, do.call(rbind, eth_rows))
} # end outcomes loop

# Sort rows by model, outcome, predictor and ethnicity, then show the table
results_long <- arrange(results_long, model, outcome, predictor, ethnicity)
print(results_long)

# Save the sorted table as CSV without a row-name column
write.csv(x = results_long, file = "ethnicity_specific_ORs_clinical.csv", row.names = FALSE)



ethnicity ukb age

# Analyses below run on the previously prepared subset
df_analysis <- df_subset

# ---------- Interaction LRTs for each outcome/predictor ----------
# For each (outcome, predictor) pair, the additive logistic model is compared with the
# model that adds the predictor x ethnicity_broad interaction, using anova(test = "Chisq").
# One row per pair is appended to `interaction_tests`; failed fits or tests leave NA.
interaction_tests <- data.frame(
  model = character(0),
  outcome = character(0),
  predictor = character(0),
  chisq = numeric(0),
  df = integer(0),
  p_value = numeric(0),
  N = integer(0),
  stringsAsFactors = FALSE
)

for (i in seq_along(outcomes)) {
  outcome_i <- outcomes[i]
  predictor_i <- predictors[i]

  # The two formulas differ only in "*" (with interaction) versus "+" (additive)
  covariate_terms <- paste(covariates, collapse = " + ")
  full_formula_str <- paste(outcome_i, "~", predictor_i, "* ethnicity_broad +", covariate_terms)
  reduced_formula_str <- paste(outcome_i, "~", predictor_i, "+ ethnicity_broad +", covariate_terms)

  full_formula <- as.formula(full_formula_str)
  reduced_formula <- as.formula(reduced_formula_str)

  # Fit both models; a fitting error becomes a warning and a NULL fit
  full <- tryCatch(
    glm(full_formula, data = df_analysis, family = binomial),
    error = function(e) {
      warning(sprintf("Full model failed for outcome '%s' predictor '%s': %s", outcome_i, predictor_i, e$message))
      NULL
    }
  )
  reduced <- tryCatch(
    glm(reduced_formula, data = df_analysis, family = binomial),
    error = function(e) {
      warning(sprintf("Reduced model failed for outcome '%s' predictor '%s': %s", outcome_i, predictor_i, e$message))
      NULL
    }
  )

  # Start from "missing"; values are filled in only when the comparison succeeds
  chisq_val <- NA_real_
  df_val <- NA_integer_
  p_val <- NA_real_
  N_val <- NA_integer_

  if (!(is.null(full) || is.null(reduced))) {
    # Likelihood-ratio test, reduced vs full; anova() errors are downgraded to a warning
    anova_res <- tryCatch(
      anova(reduced, full, test = "Chisq"),
      error = function(e) {
        warning(sprintf("anova LRT failed for outcome '%s' predictor '%s': %s", outcome_i, predictor_i, e$message))
        NULL
      }
    )

    if (!is.null(anova_res) && nrow(anova_res) >= 2) {
      table_cols <- colnames(anova_res)

      # p-value: the "Pr(>Chi)" column when it is found exactly once, otherwise the last column.
      # Row 2 of the table is the reduced -> full comparison.
      pcol <- grep("Pr\\(>Chi\\)", table_cols, value = TRUE)
      p_source <- if (length(pcol) == 1) pcol else ncol(anova_res)
      p_val <- as.numeric(anova_res[2, p_source])

      # Chi-square: the reported deviance drop, else the difference of residual deviances
      if ("Deviance" %in% table_cols) {
        chisq_val <- as.numeric(anova_res[2, "Deviance"])
      } else if ("Resid. Dev" %in% table_cols) {
        chisq_val <- as.numeric(anova_res[1, "Resid. Dev"]) - as.numeric(anova_res[2, "Resid. Dev"])
      }

      # Degrees of freedom: the reported difference, else the difference of residual df
      if ("Df" %in% table_cols) {
        df_val <- as.integer(anova_res[2, "Df"])
      } else if ("Resid. df" %in% table_cols) {
        df_val <- as.integer(as.numeric(anova_res[1, "Resid. df"]) - as.numeric(anova_res[2, "Resid. df"]))
      }

      # N: observations used by the full model
      N_val <- tryCatch(nobs(full), error = function(e) NA_integer_)
    }
  }

  interaction_tests <- rbind(interaction_tests, data.frame(
    model = paste0("model_", i),
    outcome = outcome_i,
    predictor = predictor_i,
    chisq = chisq_val,
    df = df_val,
    p_value = p_val,
    N = N_val,
    stringsAsFactors = FALSE
  ))
}

# Show the table of tests and save it as CSV without a row-name column
print(interaction_tests)

write.csv(x = interaction_tests, file = "ukbofh_interactiontest.csv", row.names = FALSE)

# ---------- Prepare result containers ----------
# Empty, typed table; the main loop appends one row per (model, ethnicity).
results_long <- data.frame(
  model = character(0),
  outcome = character(0),
  predictor = character(0),
  ethnicity = character(0),
  logOR = numeric(0),
  OR = numeric(0),
  CI_lower = numeric(0),
  CI_upper = numeric(0),
  se = numeric(0),
  z = numeric(0),
  p_value = numeric(0),
  N_eth = integer(0),
  events_eth = integer(0),
  stringsAsFactors = FALSE
)

# Store ethnicity as a factor and, when a "White" level exists, make it the reference
# level; otherwise the default level order is kept.
df_analysis$ethnicity_broad <- local({
  eth_factor <- factor(df_analysis$ethnicity_broad)
  if ("White" %in% levels(eth_factor)) relevel(eth_factor, ref = "White") else eth_factor
})


# ---------- Helper: find coefficient name for main predictor ----------
# Returns the name of the first non-interaction coefficient belonging to a predictor.
#
# Arguments:
#   coef_names     character vector of coefficient names (e.g. names(coef(model)))
#   predictor_name predictor as written in the model formula
# Value:
#   A single coefficient name, or NA_character_ when nothing matches.
#
# Matching is literal, not regex-based: a predictor such as "a.b" must not match
# "axb", and one such as "log(bmi)" must still match its own coefficient.
find_main_name <- function(coef_names, predictor_name) {
  coef_names <- as.character(coef_names)
  predictor_name <- as.character(predictor_name)

  # Interaction terms contain ":" and are never a main effect
  is_main <- !grepl(":", coef_names, fixed = TRUE)

  # Strict match: coefficient name starts with the predictor name
  prefix_hits <- coef_names[is_main & startsWith(coef_names, predictor_name)]
  if (length(prefix_hits) >= 1) return(prefix_hits[1])

  # Fallback: predictor name appears anywhere in the name (e.g. backtick-quoted names)
  substring_hits <- coef_names[is_main & grepl(predictor_name, coef_names, fixed = TRUE)]
  if (length(substring_hits) >= 1) return(substring_hits[1])

  NA_character_
}

# ---------- Main loop: fit models & compute ethnicity-specific effects & Ns ----------
# For every (outcome, predictor) pair this fits the logistic model
#   outcome ~ predictor * ethnicity_broad + covariates
# and appends one row per ethnicity level to `results_long`:
#   * reference level (first factor level): effect = main coefficient
#   * any other level: effect = main + interaction coefficient, with
#     Var = Var(main) + Var(interaction) + 2 * Cov(main, interaction)
# N_eth / events_eth are counted on the model frame, i.e. only on the rows glm() used.
z_crit <- 1.96  # normal quantile used for the two-sided 95% Wald interval

for (i in seq_along(outcomes)) {
  outcome_i <- outcomes[i]
  predictor_i <- predictors[i]

  # Build and fit model
  formula_str <- paste0(outcome_i, " ~ ", predictor_i, " * ethnicity_broad + ",
                        paste(covariates, collapse = " + "))
  model <- glm(as.formula(formula_str), data = df_analysis, family = binomial)

  # Model frame = rows actually used in the fit (rows with NA already dropped)
  mf <- model.frame(model)
  resp_vec <- model.response(mf)

  # Recode the response to numeric 0/1 once per model; it does not depend on ethnicity.
  if (is.factor(resp_vec)) {
    # Factor with numeric labels ("0"/"1"): convert directly. Coercion warnings are
    # expected for non-numeric labels and are handled by the fallback below.
    rv_num <- suppressWarnings(as.numeric(as.character(resp_vec)))
    if (all(is.na(rv_num))) {
      # Two-level factor with text labels: treat the second level as the event
      levs <- levels(resp_vec)
      rv_num <- if (length(levs) == 2) {
        as.numeric(resp_vec == levs[2])
      } else {
        rep(NA_real_, length(resp_vec))
      }
    }
  } else {
    rv_num <- as.numeric(resp_vec)
  }

  coefs <- coef(model)
  coef_names <- names(coefs)
  vc <- tryCatch(vcov(model), error = function(e) { stop("vcov failed: ", e$message) })

  # find main coefficient corresponding to predictor
  main_name <- find_main_name(coef_names, predictor_i)
  if (is.na(main_name)) stop(paste0("Could not find main coefficient for predictor '", predictor_i, "'. Check names: ", paste(coef_names, collapse = ", ")))

  # `[[` drops the coefficient name so it does not leak into data.frame row names
  beta_main <- coefs[[main_name]]
  var_main <- vc[main_name, main_name]

  # Levels come from the model frame so they match the coefficients that were fitted.
  # A character column has no levels(); factor() reproduces the level order the model used.
  eth_values <- mf$ethnicity_broad
  if (is.character(eth_values)) eth_values <- factor(eth_values)
  eth_levels <- levels(eth_values)

  eth_rows <- vector("list", length(eth_levels))
  for (k in seq_along(eth_levels)) {
    eth <- eth_levels[k]

    # N and number of events (response == 1) for this ethnicity within the model frame
    idx_eth <- which(eth_values == eth)
    N_eth <- length(idx_eth)
    events_eth <- sum(rv_num[idx_eth] == 1, na.rm = TRUE)

    if (k == 1) {
      # Reference level: the main coefficient is the effect
      beta_comb <- beta_main
      var_comb <- var_main
    } else {
      # Look the interaction up by its exact name. Regex matching would misread level
      # names containing metacharacters (e.g. parentheses) and could pick another level
      # that merely ends with the same text.
      int_candidates <- c(paste0(main_name, ":ethnicity_broad", eth),
                          paste0("ethnicity_broad", eth, ":", main_name))
      int_name <- int_candidates[int_candidates %in% coef_names]
      if (length(int_name) == 0) {
        # Do not pass the reference-group effect off as this group's effect
        warning(sprintf("No interaction coefficient found for predictor '%s' and ethnicity '%s' (model_%d); reporting NA",
                        predictor_i, eth, i), call. = FALSE)
        beta_comb <- NA_real_
        var_comb <- NA_real_
      } else {
        int_name <- int_name[1]
        beta_comb <- beta_main + coefs[[int_name]]
        var_comb <- var_main + vc[int_name, int_name] + 2 * vc[main_name, int_name]
      }
    }

    # Wald statistics; left as NA when the standard error is missing, zero or not finite
    se_comb <- if (is.na(var_comb) || var_comb < 0) NA_real_ else sqrt(var_comb)
    if (!is.finite(se_comb) || se_comb <= 0) {
      z_val <- NA_real_; p_val <- NA_real_; OR <- NA_real_; ci_low <- NA_real_; ci_high <- NA_real_
    } else {
      z_val <- beta_comb / se_comb
      p_val <- 2 * pnorm(-abs(z_val))
      OR <- exp(beta_comb)
      ci_low <- exp(beta_comb - z_crit * se_comb)
      ci_high <- exp(beta_comb + z_crit * se_comb)
    }

    # round()/signif() propagate NA, so no per-column NA guard is needed
    eth_rows[[k]] <- data.frame(
      model = paste0("model_", i),
      outcome = outcome_i,
      predictor = predictor_i,
      ethnicity = eth,
      logOR = round(beta_comb, 4),
      OR = round(OR, 2),
      CI_lower = round(ci_low, 2),
      CI_upper = round(ci_high, 2),
      se = if (is.finite(se_comb)) round(se_comb, 4) else NA_real_,
      z = round(z_val, 3),
      p_value = signif(p_val, 3),
      N_eth = N_eth,
      events_eth = events_eth,
      stringsAsFactors = FALSE
    )
  } # end eth loop

  # Bind this model's rows in one step instead of growing the table row by row
  results_long <- rbind(results_long, do.call(rbind, eth_rows))
} # end outcomes loop

# Sort rows by model, outcome, predictor and ethnicity, then show the table
results_long <- arrange(results_long, model, outcome, predictor, ethnicity)
print(results_long)

# Save the sorted table as CSV without a row-name column
write.csv(x = results_long, file = "ethnicity_specific_ORs_clinical_ukbofh.csv", row.names = FALSE)