# Helper

In [None]:
# Load "config.R" for utility functions. 
# This will also trigger loading of:
    
    # user_config.JSON (including key for project_config)
    # project_config.JSON
    # preprocessing_visualizations.R
    # preprocessing_functions.R

user <- "Jan" 
source("config.r")



#If certain packages not installed yet via requirements.txt, install them here via
# install.packages("package_name")

In [None]:
# Display the ICD codes of interest (IOIs) used for filtering further down.
# NOTE(review): IOIs is not defined in this file — presumably set by config.r; confirm.
IOIs

In [None]:
# Display the configured timeframe.
# NOTE(review): timeframe is not defined in this file — presumably set by config.r; confirm.
timeframe

# Dataloader

## Load Metadata

In [None]:
# Read the feature dictionary workbook (columns / ICD codes to investigate).
# Both sheets carry the helper columns "n" and "Summe", which are dropped.
icd_dict_path <- paste0(data_path, "/Features_AOU.xlsx")

ICD_Groups <- icd_dict_path %>%
  read_excel(sheet = "ICD_Groups") %>%
  select(-all_of(c("n", "Summe")))

ICD_Singles <- icd_dict_path %>%
  read_excel(sheet = "ICD_Singles") %>%
  select(-all_of(c("n", "Summe")))

## Load df_diagnosis (reduced to the first instance per diagnosis per patient)

In [None]:
# Read the reduced diagnosis table
df_diagnosis <- read_csv(paste0(data_path, "/dataframes/df_diagnosis_reduced.csv"))

# Normalise ICD codes by stripping every dot (e.g. "C22.0" becomes "C220")
df_diagnosis[["source_concept_code"]] <- gsub(
  "\\.", "", df_diagnosis[["source_concept_code"]]
)

# Quick look at the first rows and the table dimensions
head(df_diagnosis)
dim(df_diagnosis)




# Subset dataset 

## Filter dataset for all DOI combined

In [None]:
# =============================================================================
# STEP 1: KEEP ONLY DIAGNOSIS ROWS WHOSE ICD CODE STARTS WITH ONE OF THE IOIs
# =============================================================================
# NOTE(review): this is a prefix match, while subset_df() further down keeps
# exact matches only — child codes that pass this filter are dropped there.
# Confirm which behaviour is intended.

cat("\nOriginal diagnosis data dimensions:", nrow(df_diagnosis), "rows\n")

# Anchored alternation: matches any code that begins with one of the IOIs
pattern <- paste0("^(", paste(IOIs, collapse = "|"), ")")

# Apply the prefix filter
df_diagnosis_filtered <- filter(
  df_diagnosis,
  grepl(pattern, source_concept_code, perl = TRUE)
)

# Report how much the table shrank
n_kept <- nrow(df_diagnosis_filtered)
cat("Filtered diagnosis data dimensions:", n_kept, "rows\n")
pct_removed <- round((1 - n_kept / nrow(df_diagnosis)) * 100, 1)
cat("Reduction:", pct_removed, "% removed\n")

# Distinct ICD codes that survived the filter
filtered_codes <- unique(df_diagnosis_filtered$source_concept_code)
cat("Unique ICD codes found in filtered data:", length(filtered_codes), "\n")

df_diagnosis_filtered

## Create separate columns for DOIs

In [None]:
# Empty table with columns icd_code and amount.
# NOTE(review): it is passed to subset_df() below, but that function never
# fills or reads it — either populate it or drop it.
summary_df <- data.frame(icd_code = character(), amount = numeric())

#' Subset the filtered diagnosis table to a single ICD code of interest
#'
#' Keeps the rows whose `source_concept_code` equals `IOI` exactly, prints the
#' number of distinct `person_id` values among them, converts
#' `condition_start_datetime` to `Date`, and adds the columns `year` (numeric
#' calendar year) and `icd_code` (the value of `IOI`).
#'
#' @param df_diagnosis_filtered Data frame with the columns `person_id`,
#'   `source_concept_code` and `condition_start_datetime`.
#' @param IOI ICD code to match exactly.
#' @param DOI Unused; kept so existing calls keep working.
#' @param summary_df Unused; kept so existing calls keep working.
#' @return The matching rows with `condition_start_datetime` as `Date` plus the
#'   added `year` and `icd_code` columns. Zero matches give a zero-row result.
subset_df <- function(df_diagnosis_filtered, IOI, DOI, summary_df) {
  # Explicit logical mask instead of subset(): no non-standard evaluation, so
  # a column that happens to be called "IOI" cannot shadow the argument.
  codes <- df_diagnosis_filtered$source_concept_code
  keep <- !is.na(codes) & codes == IOI
  sub_icd <- df_diagnosis_filtered[keep, , drop = FALSE]
  print(paste("Amount of ", IOI, " in AOU: ", length(unique(sub_icd$person_id))))

  start <- sub_icd$condition_start_datetime
  if (is.character(start) || is.factor(start)) {
    # Text input: try ISO "YYYY-MM-DD[ hh:mm:ss]" first, then fall back to the
    # previously hard-coded "dd/mm/YYYY" layout. Using only the latter turned
    # every ISO timestamp into NA.
    start <- as.character(start)
    parsed <- as.Date(start, format = "%Y-%m-%d")
    legacy <- is.na(parsed)
    parsed[legacy] <- as.Date(start[legacy], format = "%d/%m/%Y")
    start <- parsed
  } else {
    # Already a Date / date-time (or numeric): plain conversion
    start <- as.Date(start)
  }

  sub_icd$condition_start_datetime <- start
  sub_icd$year <- as.numeric(format(start, "%Y"))
  # rep() keeps the assignment valid when no row matched; a bare scalar
  # errors on a zero-row base data.frame.
  sub_icd$icd_code <- rep(IOI, nrow(sub_icd))

  sub_icd
}

# Build one data frame per ICD code of interest, stored under "sub_<code>"
df_list <- list()
for (IOI in IOIs) {
  df_list[[paste0("sub_", IOI)]] <- subset_df(
    df_diagnosis_filtered, IOI, DOI, summary_df
  )
}


## Inspect dataframes

In [None]:
# df_list holds one data frame per ICD code in IOIs.
# Print the first rows of each entry under a header showing its list name.

for (idx in seq_along(df_list)) {
  cat("\n###", names(df_list)[idx], "###\n")
  print(head(df_list[[idx]]))
}

# Merge DOI with instance data

## Concatenate the df per DOI to one df and inspect duplicates

In [None]:
# 1) Stack the per-code data frames into one table
df_combined <- bind_rows(df_list)

# 2) Print every row that occurs more than once (compared across all columns);
#    the duplicates are only shown, not removed
dup_rows <- filter(
  df_combined,
  duplicated(df_combined) | duplicated(df_combined, fromLast = TRUE)
)
print(dup_rows)

# 3) Create df_y by flagging every diagnosis row as a case (status = 1);
#    person_id keeps its name
df_y <- mutate(df_combined, status = 1)

df_y

## Load all IDs

In [None]:
# Read the tab-separated covariate table and keep only the participant IDs
df_basic_id <- select(
  data.table::fread(
    file.path(data_path, "dataframes/df_covariates_1.1.txt"),
    sep = "\t"
  ),
  "person_id"
)

df_basic_id

## Merge IDs with status data

In [None]:
# Left-join the case rows onto the full ID list; participants without any
# matching diagnosis row get status 0
df_y <- mutate(
  merge(df_basic_id, df_y, by = "person_id", all.x = TRUE),
  status = coalesce(status, 0)
)

df_y

# Export

In [None]:
# Write the merged ID/status table to <data_path>/dataframes/df_y.csv
write_csv(df_y, paste0(data_path, "/dataframes/df_y.csv"))

# Plots (not copied over from ukb_scripts properly yet)

In [None]:
# Install the font packages on demand if they are not available yet
if (!requireNamespace("systemfonts", quietly = TRUE)) install.packages("systemfonts")
if (!requireNamespace("showtext", quietly = TRUE)) install.packages("showtext")

# Number of rows in df_y, passed to the plot as n_total.
# NOTE(review): df_y is built by merging diagnosis rows onto the ID list, so a
# participant with several matching diagnoses presumably appears several times
# and nrow() may exceed the number of participants — confirm.
n_total <- nrow(df_y) #Assess absolute number

# NOTE(review): cases_first_visit is not defined in this file, and
# plot_included_discarded_cases() presumably comes from
# preprocessing_visualizations.R — confirm before running.
plot_included_discarded_cases(cases_first_visit, base_size=30, n_total = n_total)



# HCC Only: Explore diseases underlying DOI cases

In [None]:



### Rank priority of diagnosis

# NOTE(review): this section still uses UK Biobank-style column names (eid,
# diag_icd9, diag_icd10, epistart) and objects that are not defined in this
# file (pat_icds, par_icd_codes, Patients_at_risk, par_subset). It needs to be
# ported before it can run on the data prepared above.

# Group levels, ordered from highest to lowest priority
priority_order <- c("Cirrhosis", "Viral Hepatitis", "CLD", "No Liver disease")

# Diagnoses with a code in par_icd_codes, annotated with their group and
# right-joined onto df_y so that cases without such a diagnosis are kept too;
# only cases (status == 1) remain.
pat_cld <- pat_icds[pat_icds$diag_icd10 %in% par_icd_codes | pat_icds$diag_icd9 %in% par_icd_codes, ] %>%
  select(c("eid", "diag_icd9", "diag_icd10", "epistart")) %>%
  left_join(Patients_at_risk, by = c("diag_icd10" = "ICD10")) %>%
  right_join(df_y, by = "eid") %>%
  subset(status==1) %>%
  select(-c("location_name", "location_code", "location_nr", "location_country", "country_code", "split_ext", "split_int"))

# Rows without a group have no episode date: use the date of diagnosis instead.
# The right-hand side must be indexed with the same mask; assigning the full
# column recycled its first values into the wrong rows and raised a length
# warning.
no_group <- is.na(pat_cld$Group)
pat_cld$epistart[no_group] <- as.Date(pat_cld$date_of_diag[no_group])

pat_cld$Group[is.na(pat_cld$Group)] <- "No Liver disease"
pat_cld$Group[!pat_cld$Group %in% par_subset] <- "No Liver disease" #Replace all non-matching groups with "No LD"
pat_cld$Group <- factor(pat_cld$Group, levels=priority_order)

summary(pat_cld$Group)

#' Map a diagnosis group to its numeric priority (1 = highest)
#'
#' @param diagnosis Vector of group labels (character or factor).
#' @return Numeric vector of the same length: 1 for "Cirrhosis", 2 for
#'   "Viral Hepatitis", 3 for "CLD", 4 for "No Liver disease", and 5 for
#'   anything else, including missing values.
priority <- function(diagnosis) {
  rank_by_group <- c(
    "Cirrhosis" = 1,
    "Viral Hepatitis" = 2,
    "CLD" = 3,
    "No Liver disease" = 4
  )
  ranks <- unname(rank_by_group[as.character(diagnosis)])
  # Labels without an entry (and NA) get the lowest priority
  ranks[is.na(ranks)] <- 5
  ranks
}


# Group by participant. Note that pat_cld stays grouped after this statement.
pat_cld <- pat_cld %>%
  group_by(eid)

# Node 0 represents first visit to hospital after assessment:
# per participant, keep the rows with the earliest epistart and, among those,
# the single row with the highest-priority group.
# NOTE(review): if a participant has an NA epistart, min() is NA and the
# filter drops all of that participant's rows — confirm this is acceptable.
pat_cld_node0 <- pat_cld %>%
  mutate(Priority = priority(Group)) %>%
  group_by(eid) %>%
  filter(epistart == min(epistart)) %>%
  arrange(eid, Priority) %>%
  filter(row_number() == 1) %>%
  ungroup() %>%
  select(-Priority)

# Count and rounded percentage of participants per group at node 0
summary_node0 <- pat_cld_node0 %>%
  group_by(Group) %>%
  summarize(Count = n(), .groups = 'drop') %>%
  mutate(Time = "First \nEHR") %>%
  mutate(Order = 1 ) %>%
  mutate(Priority = priority(Group)) %>%
  arrange(Priority) %>%
  mutate(Percentage = round(Count / sum(Count) * 100)) 



# Node 1: per participant, the highest-priority group across all records
# (no restriction on epistart).
pat_cld_node1 <- pat_cld %>%
  mutate(Priority = priority(Group)) %>%
  group_by(eid) %>%
  # Deliberately not restricted to the last episode: not every diagnosis is
  # coded at every visit, so all records are considered.
  #filter(epistart == max(epistart)) %>%   #better to take all incidents than just the last, as not all diags get coded everytime
  arrange(eid, Priority) %>%
  filter(row_number() == 1) %>%
  ungroup() %>%
  select(-Priority)

# Count and rounded percentage of participants per group at node 1
summary_node1 <- pat_cld_node1 %>%
  group_by(Group) %>%
  summarize(Count = n(), .groups = 'drop') %>%
  mutate(Time = paste0("Prior to\n", DOI)) %>%
  mutate(Order = 2 ) %>%
  mutate(Priority = priority(Group)) %>%
  arrange(Priority) %>%
  mutate(Percentage = round(Count / sum(Count) * 100))


# View the summaries
print(summary_node0)
print(summary_node1)


# Merge timepoints: stack both summaries into one table for plotting
combined_data <- rbind(summary_node0, summary_node1)


# NOTE(review): stacked_bars_time_comparison() is not defined in this file —
# presumably provided by preprocessing_visualizations.R; confirm.
stacked_bars_time_comparison(combined_data, base_size=22)




In [None]:
## Explore Cirrhosis

#' Print the row count and the number of distinct participants of a table
#'
#' @param df Data frame with a `person_id` column.
#' @param name Label printed in the "Checking ..." header line.
check_unique_participants <- function(df, name) {
  counts <- list(
    rows = nrow(df),
    participants = n_distinct(df$person_id)
  )
  print(paste("Checking", name))
  print(paste("Total rows:", counts$rows))
  print(paste("Unique participants:", counts$participants))
}


# Check cirrhosis_cases and HCC_cases.
# Both objects are only created further down in this cell, so in a fresh
# session an unguarded call stops with "object not found". Report them here
# only when they already exist (e.g. from an earlier run); the same checks run
# again right after the objects are built.
if (exists("cirrhosis_cases")) {
  check_unique_participants(cirrhosis_cases, "Cirrhosis Cases")
}
if (exists("HCC_cases")) {
  check_unique_participants(HCC_cases, "HCC Cases")
}


#' Extract the diagnoses for a set of ICD codes
#'
#' Keeps the rows of `icd_codes_subset` whose `source_concept_code` is exactly
#' one of `codes`, optionally reduces them to the earliest record per
#' `person_id`, prints a short summary, and returns a slim table.
#'
#' @param icd_codes_subset Data frame with the columns `person_id`,
#'   `source_concept_code` and `condition_start_datetime`.
#' @param codes Character vector of ICD codes, matched exactly via `%in%`.
#' @param disease_name Label stored in the `disease` column of the result.
#' @param select_visits `"first"` keeps the earliest record per person, `"all"`
#'   keeps every matching record. Anything else raises an error.
#' @return Data frame with the columns `person_id`, `source_concept_code`,
#'   `date_of_diag` (`Date`) and `disease`.
process_icd_codes <- function(icd_codes_subset, codes, disease_name, select_visits = "first") {
  # Validate the option up front so a typo fails before any work is done
  if (!select_visits %in% c("first", "all")) {
    stop("Invalid select_visits option. Use 'first' or 'all'.")
  }

  # Filter for the specified ICD codes
  cases <- icd_codes_subset[icd_codes_subset$source_concept_code %in% codes, ]

  # Parse text timestamps. Values that are already date-times are left alone:
  # ymd_hms() converts its input to character first, and a date-time at exactly
  # midnight can lose its time part in that round trip and then fail to parse.
  if (!inherits(cases$condition_start_datetime, "POSIXt")) {
    cases <- cases %>%
      mutate(condition_start_datetime = ymd_hms(condition_start_datetime))
  }

  if (select_visits == "first") {
    # Earliest record per person_id
    processed_cases <- cases %>%
      group_by(person_id) %>%
      arrange(condition_start_datetime) %>%
      slice(1) %>%
      ungroup()
  } else {
    # Keep all visits
    processed_cases <- cases
  }

  # Date of diagnosis. The former `year` column is no longer computed: it was
  # built element by element with sapply() and then dropped by the final
  # select(), so it never reached the caller.
  processed_cases <- processed_cases %>%
    mutate(date_of_diag = as.Date(condition_start_datetime))

  # rep_len() keeps the assignment valid when no row matched
  processed_cases$disease <- rep_len(disease_name, nrow(processed_cases))

  # Print summary information.
  # "Original number of rows" is the count after the ICD filter and before the
  # first-visit reduction, not the size of the unfiltered input.
  print(paste("Processing", disease_name, "cases:"))
  print(paste("ICD codes used:", paste(codes, collapse = ", ")))
  print(paste("Original number of rows:", nrow(cases)))
  print(paste("Number of rows after processing:", nrow(processed_cases)))
  print(paste("Number of unique person_ids:", n_distinct(processed_cases$person_id)))

  # Optional: count of cases per ICD code
  if (length(codes) > 1) {
    code_counts <- processed_cases %>%
      group_by(source_concept_code) %>%
      summarise(count = n()) %>%
      arrange(desc(count))
    print("Cases per ICD code:")
    print(code_counts)
  }

  processed_cases %>%
    select(all_of(c("person_id", "source_concept_code", "date_of_diag", "disease")))
}

# ICD codes (dots removed) counted as cirrhosis in this analysis.
# process_icd_codes() matches codes exactly, so "R18" only picks up records
# coded exactly "R18", not longer sub-codes.
cirrhosis_codes <- c("K703", "K743", "K745", "K746", "K767", "I850", "I859", "R18")

# Earliest cirrhosis record and earliest C220 record per participant.
# NOTE(review): icd_codes_subset is not defined in this file — confirm where it
# comes from (df_diagnosis_filtered?).
cirrhosis_cases <- process_icd_codes(icd_codes_subset, cirrhosis_codes, "Cirrhosis", "first")

HCC_cases <- process_icd_codes(icd_codes_subset, "C220", DOI, "first")


check_unique_participants(cirrhosis_cases, "Cirrhosis Cases")
check_unique_participants(HCC_cases, paste0("Cases with", DOI))


head(cirrhosis_cases)
head(HCC_cases)
dim(cirrhosis_cases)
dim(HCC_cases)


# Window in days within which cirrhosis and HCC count as simultaneous
time_threshold <- 90
print(time_threshold)

# date_of_diag is already a Date when it comes back from process_icd_codes(),
# so no further conversion is needed before merging.

# Full outer join: one row per participant with cirrhosis and/or HCC.
# The argument is spelled out as `suffixes`; `suffix` only worked through
# partial argument matching.
early_cirrhosis_cases <- merge(
  cirrhosis_cases, HCC_cases,
  by = "person_id",
  suffixes = c("_cirrhosis", "_HCC"),
  all = TRUE
)

# Number of participants without an HCC diagnosis date
sum(is.na(early_cirrhosis_cases$date_of_diag_HCC))

early_cirrhosis_cases <- early_cirrhosis_cases %>%
  mutate(
    # Days from the cirrhosis diagnosis to the HCC diagnosis; positive when
    # cirrhosis was recorded first, NA when either date is missing
    time_to_hcc = case_when(
      !is.na(date_of_diag_HCC) & !is.na(date_of_diag_cirrhosis) ~
        as.numeric(difftime(date_of_diag_HCC, date_of_diag_cirrhosis, units = "days")),
      TRUE ~ NA_real_
    ),
    cirrhosis_status = case_when(
      is.na(date_of_diag_cirrhosis) & is.na(date_of_diag_HCC) ~ "Neither Cirrhosis nor HCC",
      is.na(date_of_diag_HCC) ~ "Cirrhosis but No HCC",
      is.na(date_of_diag_cirrhosis) ~ "HCC but No Cirrhosis",
      time_to_hcc > time_threshold ~ "Cirrhosis prior to HCC",
      time_to_hcc >= -time_threshold & time_to_hcc <= time_threshold ~ "Simultaneous Cirrhosis + HCC",
      time_to_hcc < -time_threshold ~ "Cirrhosis after HCC",
      TRUE ~ "Error in date calculation"
    )
  )


early_cirrhosis_cases


# Count and percentage of participants per status
case_analysis <- early_cirrhosis_cases %>%
  group_by(cirrhosis_status) %>%
  summarize(count = n()) %>%
  mutate(percentage = count / sum(count) * 100)

print("Cirrhosis cases analysis:")
print(case_analysis)

# Participants with cirrhosis and either no HCC at all or HCC more than
# time_threshold days later
early_cirrhosis_only <- early_cirrhosis_cases %>%
  filter(cirrhosis_status %in% c("Cirrhosis but No HCC", "Cirrhosis prior to HCC"))

df_early_cirrhosis <- early_cirrhosis_only %>%
  select(person_id) %>%
  mutate(cirrhosis = 1) %>%
  distinct()

# NOTE(review): this writes to a relative "data/" folder, while the other
# export in this file goes to <data_path>/dataframes — confirm the location.
write_csv(df_early_cirrhosis, "data/df_early_cirrhosis.csv")

early_cirrhosis_only

df_cirrhosis <- early_cirrhosis_only %>% select("person_id")