In [25]:
library(tidyverse)
# Root directories used by the summary cells below.
# Both keep their trailing slash because file names are appended with paste0().
bugbank_data_dir <- "/well/bag/clme1992/bugbank_data/"
ukb_data_dir <- "/well/bag/wilson/ukb/"


-- Attaching core tidyverse packages ------------------------ tidyverse 2.0.0 --
v dplyr     1.1.3     v readr     2.1.4
v forcats   1.0.0     v stringr   1.5.0
v ggplot2   3.4.4     v tibble    3.2.1
v lubridate 1.9.3     v tidyr     1.3.0
v purrr     1.0.2     
-- Conflicts ------------------------------------------ tidyverse_conflicts() --
x dplyr::filter() masks stats::filter()
x dplyr::lag()    masks stats::lag()
i Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors


In [None]:
# At each step of QC, track the change in the infection cases

#' Resolve one level of symbolic link for each path in `x`.
#'
#' @param x Character vector of file paths.
#' @return Character vector the same length as `x`. Paths that are not
#'   symbolic links are returned unchanged. For a link, the link target is
#'   returned: a relative target is joined to the directory containing the
#'   link, an absolute target is returned as-is.
#' @details Stops with an error naming the offending path(s) when
#'   `Sys.readlink()` returns `NA` for any element of `x`.
resolve.symlink <- function(x) {
    y <- Sys.readlink(x)
    unresolved <- is.na(y)
    if (any(unresolved)) {
        # Report only the paths that failed, not the whole input vector.
        stop("Could not resolve symlink ", paste(x[unresolved], collapse = ", "))
    }
    # Sys.readlink() returns "" for paths that are not symbolic links.
    is_link <- y != ""
    # An absolute target must not be prefixed with the link's directory.
    is_abs <- startsWith(y, "/")
    is_rel <- is_link & !is_abs

    resolved <- as.character(x)
    resolved[is_link & is_abs] <- y[is_link & is_abs]
    resolved[is_rel] <- file.path(dirname(x[is_rel]), y[is_rel])
    resolved
}

# configuration file
source("~/.saige_pipe.config")

# Registry of input file locations, keyed by the name used when loading them.
# Every path is a directory taken from `config` (not assigned in this cell;
# presumably provided by the sourced configuration file) plus a fixed suffix.
lg <- list(
    hesin_diag_file = paste0(config$ukb.derived.dir, "/hes/hesin_diag.latest.txt.gz"),
    pathogen_icd10_file = paste0(config$bbdatadir, "/pathogen_to_unique_icd10.tsv"),
    bd_RDdata_file = paste0(config$ukb.derived.dir, "/ukb41482.ukb41376.fields.RData"),
    bd_not_lost2followup_file = paste0(config$ukb.derived.dir, "/ukb41482.English-not-lost-to-followup-8-April-2020.txt"),
    bed_sample_qc_file = paste0(config$ukbdir, "/v2/qc/ukb_sqc_v2.txt"),
    withdrawn_eid_file = paste0(config$bbdatadir, "/w53100_2023-04-25.csv"),
    # Pre-computed eids for the bed-format genotypes
    bed_eid_file = paste0(config$ukb.derived.dir, "/analysis.bed.eids.txt"),
    # Individuals with first degree relatives
    remrels_file = paste0(config$ukb.derived.dir, "/ukb41482.English-remove-first-degree-relatives.eids.txt"),
    # panUKB ancestral files
    pan_ukb_file = paste0(config$panukb.dir, "/Files for retman/all_pops_non_eur_pruned_within_pop_pc_covs.tsv"),
    pan_ukb_bridge_file = paste0(config$panukb.dir, "/ukb53100bridge31063.txt")
)

### load input files ###
# Fail early, with a readable message, when a path needed below was never
# registered in `lg`; reading from a NULL path otherwise gives an opaque error.
required_inputs <- c(
    "pathogen_file", "pathogen_taxonomy_file", "pathogen_icd10_file",
    "bugbank_file", "hesin_diag_file", "bd_RDdata_file",
    "bd_not_lost2followup_file", "withdrawn_eid_file", "bed_sample_qc_file",
    "bed_eid_file", "remrels_file", "pan_ukb_file", "pan_ukb_bridge_file"
)
missing_inputs <- setdiff(required_inputs, names(lg))
if (length(missing_inputs) > 0) {
    stop(
        "Input path(s) not defined in `lg`: ",
        paste(missing_inputs, collapse = ", "),
        call. = FALSE
    )
}

# Tab-separated tables
pathogen_tb <- read.csv(lg$pathogen_file, sep = "\t")
pathogen_taxonomy <- read.csv(lg$pathogen_taxonomy_file, sep = "\t")
pathogen_icd10 <- read.csv(lg$pathogen_icd10_file, sep = "\t")
bugbank_data <- read.csv(lg$bugbank_file, sep = "\t")
hes_diag <- read.csv(lg$hesin_diag_file, sep = "\t")
# `bd` is not assigned in this cell; presumably it is restored by this load().
system.time(load(lg$bd_RDdata_file))
all_eids <- bd[, "f.eid"]
# The flags are read as strings and turned into a logical vector by comparing
# against the literal "TRUE".
bd_not_lost2followup <- scan(lg$bd_not_lost2followup_file, what = character()) == "TRUE"
withdrawn_eid <- scan(lg$withdrawn_eid_file)
# Sample QC
bed_sample_qc <- read.csv(lg$bed_sample_qc_file, sep = " ")
# The corresponding eids
bed_eid <- scan(lg$bed_eid_file)
# Convert to bd_eid order (rows are NA where an eid has no genotype QC entry)
sample_qc <- bed_sample_qc[match(all_eids, bed_eid), ]
# Close (first degree) relatives
remrels <- scan(lg$remrels_file)
# load and match the panukb data
panukb <- read.csv(lg$pan_ukb_file, sep = "\t")[, c("s", "pop")]
panukb_bridge <- read.csv(lg$pan_ukb_bridge_file, sep = " ", header = FALSE)
# Bridge column 2 is matched against panukb$s; column 1 is taken as the eid.
bridge_matched <- panukb_bridge[match(panukb$s, panukb_bridge[, 2]), ]
panukb$eid <- bridge_matched[, 1]
# Re-order to follow all_eids
panukb_matched <- panukb[match(all_eids, panukb$eid), ]

### assign assessment centre data ###
# Name of the column holding the assessment-centre code.
f.assesscentre <- "f.54.0.0"
# Codes of the assessment centres treated as English. The labels only document
# each code and are dropped, leaving a plain numeric vector.
# Deliberately left out: 11003 Cardiff, 11005 Edinburgh, 11004 Glasgow,
# 11022 Swansea, 11023 Wrexham.
assess_centre_England <- unname(c(
    "Barts" = 11012,
    "Birmingham" = 11021,
    "Bristol" = 11011,
    "Bury" = 11008,
    "Cheadle (revisit)" = 11024,
    "Croydon" = 11020,
    "Hounslow" = 11018,
    "Leeds" = 11010,
    "Liverpool" = 11016,
    "Manchester" = 11001,
    "Middlesbrough" = 11017,
    "Newcastle" = 11009,
    "Nottingham" = 11013,
    "Oxford" = 11002,
    "Reading" = 11007,
    "Sheffield" = 11014,
    "Stockport (pilot)" = 10003,
    "Stoke" = 11006,
    "Cheadle (imaging)" = 11025,
    "Reading (imaging)" = 11026,
    "Newcastle (imaging)" = 11027,
    "Bristol (imaging)" = 11028
))

# Per-participant inclusion mask: every condition in the list must hold.
f_assesscentre <- "f.54.0.0"
filter <- Reduce(`&`, list(
    # attended one of the listed English assessment centres
    bd[, f_assesscentre] %in% assess_centre_England,
    # has not withdrawn
    !(all_eids %in% withdrawn_eid),
    bd_not_lost2followup,
    # genotype sample-QC flags
    sample_qc$het.missing.outliers == 0,
    sample_qc$putative.sex.chromosome.aneuploidy == 0,
    sample_qc$Submitted.Gender == sample_qc$Inferred.Gender,
    sample_qc$excluded.from.kinship.inference == 0,
    sample_qc$excess.relatives == 0,
    sample_qc$in.Phasing.Input.chr1_22 == 1,
    sample_qc$in.Phasing.Input.chrX == 1,
    sample_qc$in.Phasing.Input.chrXY == 1,
    # not in the relatives-removal list
    !(all_eids %in% remrels)
))
# A condition that could not be evaluated (NA) counts as a failure.
filter <- replace(filter, is.na(filter), FALSE)

In [6]:
# Summarise the SGSS extract: record, participant and organism-label counts.
sgss_file <- paste0(bugbank_data_dir, "ukb_sgss_extract_refined.csv")
# The file is tab-separated with a header row, which are read.delim()'s defaults.
sgss <- read.delim(sgss_file, stringsAsFactors = FALSE)

# total rows in the extract
print(paste0("Number of records: ", nrow(sgss)))

# distinct participant ids
print(paste0("Number of unique individuals: ", length(unique(sgss$UKB_EID))))

# distinct organism labels
print(paste0("Number of pathogen labels: ", length(unique(sgss$ORGANISM_SPECIES_NAME))))

[1] "Number of records: 350699"
[1] "Number of unique individuals: 114737"
[1] "Number of pathogen labels: 641"


In [19]:
# load HES (tab-separated diagnosis table) and report how long the read took
start_time <- Sys.time()
# NOTE(review): the QC cell reads "hes/hesin_diag.latest.txt.gz" from its own
# configured directory; confirm "hes_diag" is the intended file name here.
hes_file <- paste0(ukb_data_dir, "hes/hes_diag.latest.txt.gz")
hes <- read.table(hes_file, sep = "\t", header = TRUE, stringsAsFactors = FALSE)
end_time <- Sys.time()
# Subtracting two times picks its unit (secs/mins/...) automatically and
# paste0() drops it, so fix the unit and print it with the number.
elapsed <- difftime(end_time, start_time, units = "secs")
print(paste0("Time taken to load hes: ", format(elapsed)))

[1] "Time taken to load HESIN: 17.3452785015106"


In [12]:
# Read the pathogen / ICD-10 table (tab-separated, with a header row).
icd10_desc_file <- paste0(bugbank_data_dir, "pathogen_to_unique_icd10.tsv")
icd10_desc <- read.table(
  file = icd10_desc_file,
  header = TRUE,
  sep = "\t",
  stringsAsFactors = FALSE
)

In [33]:
# Subset HES to infection-related diagnoses and annotate each record with the
# pathogen name and taxonomic level taken from icd10_desc.

# Each icd10_desc row lists one or more comma-separated ICD-10 codes.
code_lists <- strsplit(icd10_desc$icd10, split = ",")
infect_icd10_codes <- unique(unlist(code_lists))
hes_infect <- hes[hes$diag_icd10 %in% infect_icd10_codes, ]

# number of records
print(paste0("Number of infection related records: ", nrow(hes_infect)))

# number of unique individuals
print(paste0("Number of unique individuals: ", length(unique(hes_infect$eid))))

# number of unique icd10 codes
print(paste0("Number of unique icd10 codes: ", length(unique(hes_infect$diag_icd10))))

# One lookup row per ICD-10 code, expanded from the comma-separated lists.
icd_lookup <- data.frame(
  icd10 = as.character(unlist(code_lists)),
  org_name = rep(icd10_desc$org_name, lengths(code_lists)),
  tax_lev = rep(icd10_desc$tax_lev, lengths(code_lists)),
  stringsAsFactors = FALSE
)
# If a code is listed for several pathogens, keep the last listing; this is
# what overwriting a dictionary entry in file order would produce.
icd_lookup <- icd_lookup[!duplicated(icd_lookup$icd10, fromLast = TRUE), ]

# Dictionary form (code -> c(org_name, tax_lev)), kept for any later cell
# that indexes it by code.
icd_to_pathogen <- setNames(
  Map(function(org, lev) c(org, lev), icd_lookup$org_name, icd_lookup$tax_lev),
  icd_lookup$icd10
)

# map hes icd10 codes to pathogen with a single vectorised match
lookup_idx <- match(hes_infect$diag_icd10, icd_lookup$icd10)
hes_infect$org_name <- icd_lookup$org_name[lookup_idx]
print(paste0("Number of unique pathogens: ", length(unique(hes_infect$org_name))))

# number of species level pathogen
hes_infect$tax_lev <- icd_lookup$tax_lev[lookup_idx]
# which() drops records whose tax_lev is NA instead of returning all-NA rows
hes_infect_species <- hes_infect[which(hes_infect$tax_lev == "species"), ]
print(x = paste0("Number of unique species level pathogens: ", length(unique(hes_infect_species$org_name))))


[1] "Number of infection related records: 188788"
[1] "Number of unique individuals: 69900"
[1] "Number of unique icd10 codes: 480"
[1] "Number of unique pathogens: 155"
[1] "Number of unique species level pathogens: 88"


In [32]:
# Preview the first six species-level infection records.
head(hes_infect_species, n = 6L)

Unnamed: 0_level_0,eid,ins_index,arr_index,level,diag_icd9,diag_icd9_nb,diag_icd10,diag_icd10_nb,org_name,tax_lev
Unnamed: 0_level_1,<int>,<int>,<int>,<int>,<chr>,<lgl>,<chr>,<int>,<chr>,<chr>
2750,1000906,6,5,2,,,B962,,Escherichia coli,species
4438,1001488,10,1,2,,,B961,,Klebsiella pneumoniae,species
4455,1001488,11,1,2,,,B961,,Klebsiella pneumoniae,species
4702,1001603,2,2,2,,,B962,,Escherichia coli,species
5375,1001970,8,4,2,,,B962,,Escherichia coli,species
5712,1002128,7,3,2,,,B171,,Hepacivirus C,species
