In [22]:
library(tidyverse)

-- Attaching packages --------------------------------------- tidyverse 1.3.1 --

v ggplot2 3.3.5     v purrr   0.3.4
v tibble  3.1.5     v dplyr   1.0.7
v tidyr   1.1.4     v stringr 1.4.0
v readr   2.0.2     v forcats 0.5.1

-- Conflicts ------------------------------------------ tidyverse_conflicts() --
x dplyr::filter() masks stats::filter()
x dplyr::lag()    masks stats::lag()



### Check that all HES diagnosis ICD-10 codes starting with A or B are included in the pathogen description file

In [15]:
# Work from the bugbank data directory; the relative path below resolves
# against it.
setwd("~/bugbank_data")

# Locations of the HES diagnosis table and the ICD-10 pathogen description
# table.
hes_diag_f <- "/well/bag/wilson/ukb/hes/hesin_diag.latest.txt.gz"
path_desc_f <- "./icd10_pathogen_description_13032023.tsv"

# Read both inputs as tab-separated files with a header row.
hes_diag <- read.delim(hes_diag_f)
path_desc <- read.delim(path_desc_f)

In [16]:
# Distinct ICD-10 codes found in the diag_icd10 column of the HES table.
hes_diag_icd10 <- unique(hes_diag$diag_icd10)

# Keep only the codes that start with "A" or "B".
hes_diag_icd10_infect <- hes_diag_icd10[grepl("^A|^B", hes_diag_icd10)]

# Distinct codes listed in the description table; each UKB_code entry is
# split on commas.
path_desc_icd10 <- unique(
    unlist(strsplit(path_desc$UKB_code, split = ","))
)

# A/B codes seen in HES that the description table does not list.
setdiff(
    hes_diag_icd10_infect,
    path_desc_icd10
)

Check whether any ICD-10 codes are assigned to multiple organisms in the raw icd10_pathogen_description_13032023.tsv

In [19]:
# input file
path_desc_f <- "./icd10_pathogen_description_13032023.tsv"

# load file
path_desc <- read.csv(path_desc_f, sep = "\t")

In [35]:
# Build two lookup lists so that ICD-10 codes can be mapped to their pathogens
# and vice versa:
#   path_to_icd10: pathogen name (org_name) -> ICD-10 codes listed for it
#   icd10_to_path: ICD-10 code              -> pathogen names that list it

path_to_icd10 <- list()
icd10_to_path <- list()

# seq_len() gives an empty sequence for a zero-row table, whereas 1:nrow()
# would iterate over c(1, 0).
for (i in seq_len(nrow(path_desc))) {
    pathogen <- path_desc$org_name[i]
    # UKB_code is split on commas into the individual codes for this row
    icd10s <- strsplit(path_desc$UKB_code[i], ",")[[1]]

    # pathogen -> codes (merged with codes from earlier rows of the same
    # pathogen, duplicates dropped)
    path_to_icd10[[pathogen]] <- unique(c(path_to_icd10[[pathogen]], icd10s))

    # code -> pathogens
    for (icd10 in icd10s) {
        icd10_to_path[[icd10]] <- unique(c(icd10_to_path[[icd10]], pathogen))
    }
}

# Check for multiple pathogen assignments to a single ICD-10 code.
# The printed value is the number of distinct pathogens that appear under at
# least one code that maps to more than one pathogen.
icd10_to_path_length <- lengths(icd10_to_path)
length(unique(unlist(icd10_to_path[which(icd10_to_path_length > 1)])))

# 184 (value recorded in the original notebook; presumably the output above)
# This seems to be caused by ... (TODO: the original note was left unfinished)

Check if there are duplicated taxonomies for different origin_name values

In [36]:
# Print the column names of `path`.
# NOTE(review): `path` is not assigned in any cell visible here — confirm it
# exists in the session, or whether a different table was intended.
colnames(path)

### Sanity check for sgss taxonomy assignment

In [31]:
# input file
# NOTE: setwd() changes the working directory for the whole session; the
# relative paths below resolve against it.
setwd("~/bugbank_data")
sgss_f <- "./ukb_sgss_extract_20211115.csv"
sgss_tax_f <- "./bb_pathogen_taxonomy_13032023.tsv"

# load input file
# The taxonomy table is tab-separated; the SGSS extract is comma-separated.
sgss_tax <- read.csv(sgss_tax_f, sep = "\t")
# Spell out TRUE: `T` is an ordinary variable and can be reassigned.
sgss <- read.csv(sgss_f, header = TRUE)

# Overwrite the column names read from the header with the names below,
# assigned in column order.
colnames(sgss) <- c(
    "SPECIMEN_NUMBER",
    "UKB_EID",
    "SPECIMEN_DATE",
    "LAB_REPORT_DATE",
    "REPORTING_LAB_NAME",
    "LAB_GEOG_NAME_CURRENT",
    "LOCAL_AUTHORITY_NAME",
    "SPECIMEN_GROUP_DESC",
    "SPECIMEN_TYPE_DESC",
    "ORGANISM_CATEGORY_DESC",
    "ORGANISM_GENUS_NAME",
    "ORGANISM_SPECIES_NAME",
    "ORGANISM_SUBSPECIES_NAME"
)

Check if all the pathogens in the bugbank file have been assigned a taxonomy

In [32]:
# Values of the ORGANISM_SPECIES_NAME column of the SGSS extract.
sgss_pathogens <- sgss$ORGANISM_SPECIES_NAME

# Names present in the extract but absent from the taxonomy table.
setdiff(
    sgss_pathogens,
    sgss_tax$origin_name
)

# Taxonomy origin_name entries that never occur in the extract.
setdiff(
    sgss_tax$origin_name,
    sgss_pathogens
)

### Sanity check for HES taxonomy assignment

In [33]:
# Paths to the HES pathogen taxonomy table and the ICD-10 pathogen
# description table.
hes_tax_f <- "./hes_pathogen_taxonomy_13032023.tsv"
path_desc_f <- "./icd10_pathogen_description_13032023.tsv"

# Read both as tab-separated files with a header row.
hes_tax <- read.delim(hes_tax_f)
path_desc <- read.delim(path_desc_f)

Check if all the pathogens from the ICD-10 pathogen designation have been assigned a taxonomy

In [34]:
# org_name values in the description table that have no matching origin_name
# in the HES taxonomy table.
setdiff(
    path_desc$org_name,
    hes_tax$origin_name
)