In [2]:
library(tidyverse)

-- Attaching packages --------------------------------------- tidyverse 1.3.1 --

v ggplot2 3.3.5     v purrr   0.3.4
v tibble  3.1.5     v dplyr   1.0.7
v tidyr   1.1.4     v stringr 1.4.0
v readr   2.0.2     v forcats 0.5.1

-- Conflicts ------------------------------------------ tidyverse_conflicts() --
x dplyr::filter() masks stats::filter()
x dplyr::lag()    masks stats::lag()



### Check that all HES diagnosis ICD-10 codes starting with A or B are included in the pathogen description file

In [23]:
# Relative paths used below are resolved against this working directory.
setwd("~/bugbank_data")

# Locations of the two inputs: the HES diagnosis table and the
# ICD-10 pathogen description table.
hes_diag_f <- "/well/bag/wilson/ukb/hes/hesin_diag.latest.txt.gz"
path_desc_f <- "./icd10_pathogen_description_13032023.tsv"

# Both inputs are tab-separated with a header row; read.delim() is read.csv()
# with the tab separator as its default.
hes_diag <- read.delim(hes_diag_f)
path_desc <- read.delim(path_desc_f)

In [24]:
# Distinct ICD-10 diagnosis codes present in the HES table, narrowed down to
# those whose code starts with "A" or "B".
hes_diag_icd10 <- unique(hes_diag$diag_icd10)
starts_with_a_or_b <- grepl("^A|^B", hes_diag_icd10)
hes_diag_icd10_infect <- hes_diag_icd10[starts_with_a_or_b]

# Every ICD-10 code listed in the pathogen description table. UKB_code is split
# on commas, so a single row can contribute several codes.
path_desc_codes_by_row <- strsplit(path_desc$UKB_code, ",")
path_desc_icd10 <- unique(unlist(path_desc_codes_by_row))

# HES A/B codes that do not appear in the pathogen description table.
setdiff(hes_diag_icd10_infect, path_desc_icd10)

Check if there are ICD10 codes assigned to multiple organisms based on the raw icd10_pathogen_description_13032023.tsv

In [4]:
# Raw ICD-10 pathogen description table (path is relative to the working directory).
path_desc_f <- "./icd10_pathogen_description_13032023.tsv"

# Tab-separated with a header row; read.delim() defaults to sep = "\t".
path_desc <- read.delim(path_desc_f)

In [5]:
# Build two lookup lists ("dictionaries") from the pathogen description table so
# that ICD-10 codes can be mapped to their pathogens and vice versa:
#   path_to_icd10: pathogen name (org_name) -> unique ICD-10 codes listed for it
#   icd10_to_path: ICD-10 code              -> unique pathogen names listing it

path_to_icd10 <- list()
icd10_to_path <- list()

# seq_len() gives an empty sequence for a zero-row table; 1:nrow() would instead
# iterate over c(1, 0) and index rows that do not exist.
for (i in seq_len(nrow(path_desc))) {
    pathogen <- path_desc$org_name[i]
    # UKB_code is split on commas, so one row can carry several ICD-10 codes
    icd10s <- strsplit(path_desc$UKB_code[i], ",")[[1]]

    # for path_to_icd10 (unique() collapses codes repeated across rows)
    path_to_icd10[[pathogen]] <- unique(c(path_to_icd10[[pathogen]], icd10s))

    # for icd10_to_path
    for (icd10 in icd10s) {
        icd10_to_path[[icd10]] <- unique(c(icd10_to_path[[icd10]], pathogen))
    }
}

# check for multiple pathogen assignment in a single icd10
icd10_to_path_length <- map_int(icd10_to_path, length)

# Pathogens attached to at least one ICD-10 code that maps to more than one
# pathogen; computed once and reused for both displayed results.
multi_assigned_pathogens <- unique(unlist(icd10_to_path[which(icd10_to_path_length > 1)]))
length(multi_assigned_pathogens)
multi_assigned_pathogens

# Count recorded by the original author: 184
# This seems to be caused by the same icd10 code being assigned to a species and then to a higher taxon level like the genus of the species

After refinement of the ICD-10 code assignment, is each ICD-10 code now assigned to a unique organism?

In [22]:
# input file
path_to_ic10_refined_f <- "./pathogen_to_unique_icd10.tsv"

# load file
path_to_icd10_refined <- read.csv(path_to_ic10_refined_f, sep = "\t")

# Create an ICD-10 code -> pathogen map from the refined table. The icd10 column
# is split on commas, so one row can contribute several codes.
icd10_to_path_map <- list()
# seq_len() gives an empty sequence for a zero-row table; 1:nrow() would instead
# iterate over c(1, 0) and index rows that do not exist.
for (i in seq_len(nrow(path_to_icd10_refined))) {
    pathogen <- path_to_icd10_refined$org_name[i]
    icd10s <- strsplit(path_to_icd10_refined$icd10[i], ",")[[1]]
    for (icd10 in icd10s) {
        # unique() keeps one entry per pathogen, so a code that is merely
        # repeated under the same organism is not reported as being assigned
        # to multiple organisms
        icd10_to_path_map[[icd10]] <- unique(c(icd10_to_path_map[[icd10]], pathogen))
    }
}

# from the icd10 to pathogen map, see which icd10 has more than 1 pathogen
# (an empty list means every code maps to exactly one organism)
icd10_to_path_map[which(map_int(icd10_to_path_map, length) > 1)]





### Sanity check for sgss taxonomy assignment

In [31]:
# input file
# Relative paths below are resolved against this working directory.
setwd("~/bugbank_data")
sgss_f <- "./ukb_sgss_extract_20211115.csv"
sgss_tax_f <- "./bb_pathogen_taxonomy_13032023.tsv"

# load input file
sgss_tax <- read.csv(sgss_tax_f, sep = "\t")
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
sgss <- read.csv(sgss_f, header = TRUE)

# Column names applied to the SGSS extract by position.
sgss_colnames <- c(
    "SPECIMEN_NUMBER",
    "UKB_EID",
    "SPECIMEN_DATE",
    "LAB_REPORT_DATE",
    "REPORTING_LAB_NAME",
    "LAB_GEOG_NAME_CURRENT",
    "LOCAL_AUTHORITY_NAME",
    "SPECIMEN_GROUP_DESC",
    "SPECIMEN_TYPE_DESC",
    "ORGANISM_CATEGORY_DESC",
    "ORGANISM_GENUS_NAME",
    "ORGANISM_SPECIES_NAME",
    "ORGANISM_SUBSPECIES_NAME"
)

# The renaming is positional. If the file had more columns than names, the
# extra columns would silently receive NA names, so fail loudly instead.
if (ncol(sgss) != length(sgss_colnames)) {
    stop(sprintf(
        "Expected %d columns in %s but found %d",
        length(sgss_colnames), sgss_f, ncol(sgss)
    ))
}
colnames(sgss) <- sgss_colnames

Check if all the pathogens in the bugbank file have been assigned a taxonomy

In [32]:
# Organism names as reported in the SGSS extract.
sgss_pathogens <- sgss$ORGANISM_SPECIES_NAME

# Names found in SGSS that have no entry in the taxonomy table ...
setdiff(sgss_pathogens, sgss_tax$origin_name)

# ... and taxonomy entries whose origin_name never occurs in SGSS.
setdiff(sgss_tax$origin_name, sgss_pathogens)

Check whether different origin_name values share the same (duplicated) taxonomy

In [43]:
# Collapse the first eight columns of every taxonomy row into a single
# colon-separated "taxonomy path" string.
# Vectorised paste() replaces the per-row loop: it returns character(0) for a
# zero-row table (1:nrow() would iterate over c(1, 0)) and avoids row-by-row
# data-frame indexing. unname() stops column names from being matched to
# paste()'s own arguments.
# NOTE(review): assumes columns 1-8 hold the taxonomy and exclude origin_name — confirm against the file header.
tax_paths <- do.call(paste, c(unname(as.list(sgss_tax[1:8])), sep = ":"))

# Taxonomy paths that occur on more than one row.
tax_path_freq <- table(tax_paths)
dup_tax_paths <- names(tax_path_freq)[which(tax_path_freq > 1)]

# For each repeated path, list the origin_name values that share it
# (print() is required because values are not auto-displayed inside a loop).
for (tax_path in dup_tax_paths) {
    dup_org_names <- sgss_tax$origin_name[which(tax_paths == tax_path)]
    print("duplicated organisms:")
    print(dup_org_names)
}

# it seems like all the origin_name values that share the same tax path have good reasons for it

[1] "duplicated organisms:"
[1] "ACTINOMYCES OTHER NAMED" "ACTINOMYCES SP"         
[1] "duplicated organisms:"
[1] "MICROCOCCUS SP"          "MICROCOCCUS OTHER NAMED"
[1] "duplicated organisms:"
[1] "DIPHTHEROIDS"                "CORYNEBACTERIUM SP"         
[3] "CORYNEBACTERIUM OTHER NAMED"
[1] "duplicated organisms:"
[1] "MYCOBACTERIUM AVIUM"         "MYCOBACTERIUM AVIUM COMPLEX"
[1] "duplicated organisms:"
[1] "MYCOBACTERIUM INTRACELLULARE" "MYCOBACTERIUM CHIMAERA"      
[1] "duplicated organisms:"
[1] "MYCOBACTERIUM TUBERCULOSIS"          
[2] "MYCOBACTERIUM BOVIS"                 
[3] "MYCOBACTERIUM AFRICANUM"             
[4] "MYCOBACTERIUM BOVIS (BCG STRAIN)"    
[5] "MYCOBACTERIUM BOVIS (NON BCG STRAIN)"
[1] "duplicated organisms:"
[1] "MYCOBACTERIUM OTHER NAMED"      "MYCOBACTERIUM SP"              
[3] "MYCOBACTERIUM NON-TUBERCULOSIS"
[1] "duplicated organisms:"
[1] "MYCOBACTERIUM ABSCESSUS"         "MYCOBACTERIUM ABSCESSUS COMPLEX"
[1] "duplicated organisms:"
[1] "NOCARDIA 

### Sanity check for HES taxonomy assignment

In [33]:
# Inputs for the HES check: the HES pathogen taxonomy table and the ICD-10
# pathogen description table (paths are relative to the working directory).
hes_tax_f <- "./hes_pathogen_taxonomy_13032023.tsv"
path_desc_f <- "./icd10_pathogen_description_13032023.tsv"

# Both are tab-separated with a header row; read.delim() defaults to sep = "\t".
hes_tax <- read.delim(hes_tax_f)
path_desc <- read.delim(path_desc_f)

Check if all the pathogens from the ICD-10 pathogen designation have been assigned a taxonomy

In [34]:
# Pathogen names from the ICD-10 description table that have no matching
# origin_name in the HES taxonomy table (an empty result means none are missing).
missing_from_hes_tax <- setdiff(path_desc$org_name, hes_tax$origin_name)
missing_from_hes_tax

Check whether different origin_name values share the same (duplicated) taxonomy

In [44]:
# Collapse the first eight columns of every taxonomy row into a single
# colon-separated "taxonomy path" string.
# Vectorised paste() replaces the per-row loop: it returns character(0) for a
# zero-row table (1:nrow() would iterate over c(1, 0)) and avoids row-by-row
# data-frame indexing. unname() stops column names from being matched to
# paste()'s own arguments.
# NOTE(review): assumes columns 1-8 hold the taxonomy and exclude origin_name — confirm against the file header.
tax_paths <- do.call(paste, c(unname(as.list(hes_tax[1:8])), sep = ":"))

# Taxonomy paths that occur on more than one row.
tax_path_freq <- table(tax_paths)
dup_tax_paths <- names(tax_path_freq)[which(tax_path_freq > 1)]

# For each repeated path, list the origin_name values that share it
# (print() is required because values are not auto-displayed inside a loop).
for (tax_path in dup_tax_paths) {
    dup_org_names <- hes_tax$origin_name[which(tax_paths == tax_path)]
    print("duplicated organisms:")
    print(dup_org_names)
}
# it seems like all the origin_name values that share the same tax path have good reasons for it
# NOTE(review): the output recorded for this cell pairs "Fungi" with "fungi", which
# looks like a letter-case inconsistency in origin_name rather than two organisms — confirm.

[1] "duplicated organisms:"
[1] "Salmonella Typhi"     "Salmonella Paratyphi"
[1] "duplicated organisms:"
[1] "Fungi" "fungi"
[1] "duplicated organisms:"
[1] "Betaherpesvirinae"  "Gammaherpesvirinae"
[1] "duplicated organisms:"
[1] "Hantavirus"      "Orthohantavirus"
