# Frequency matched cases-controls

# Aim

The aim of this pipeline is to generate a subset of cases and controls that are frequency-matched on user-selected covariates.

# Pre-requisites

To use this pipeline you must install the R package `ccoptimalmatch` in your library (the workflow also loads `dplyr`, and `ggplot2` for the plotting step):

`install.packages("ccoptimalmatch")`

# Input

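Phenotype-covariate file: this is the raw, whitespace-delimited file with a header row. It should include a patient identifier, the phenotype column (specifying cases and controls) and the variables that you want to use for matching. Note that the workflow currently refers to the identifier column as `IID` and also expects columns named `age` and `sex`.  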

The phenotype variable must be coded `1` for cases and `0` for controls.

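# Output

The `match` step declares two space-delimited output files in `cwd`, both named after the phenotype file: `<name>.ccmatched` (the matched case-control data set) and `<name>.ccmatched.pheno` (a phenotype file for the matched individuals).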

# Reference

In this case we will be using the R package [ccoptimalmatch](https://cran.r-project.org/web/packages/ccoptimalmatch/vignettes/ccoptimalmatching_vignette.html)

# Global parameters

In [None]:
[global]
# Workflow-wide parameters shared by the steps of this notebook.
# the output directory for generated files
parameter: cwd = path
# Phenotype file (read with a header row by the match step)
parameter: phenoFile = path
# Phenotype to be analyzed (specify the column)
parameter: phenoCol = str
# Covariates to be used in the matching procedure (list of column names)
parameter: covarCol = []
# Number of controls to match per case (default: 5 controls per case)
parameter: n_con = 5
# Specific number of threads to use (passed to the task as `cores`)
parameter: numThreads = 1
# For cluster jobs, number commands to run per job (passed to the task as `trunk_size`)
parameter: job_size = 1
# Memory requested for each task
parameter: mem = '20G'
# Wall-clock time limit requested for each task
parameter: walltime = '48h'

# Illustration of a minimal working example

To run the workflow on a minimal working example (MWE) dataset:

```
sos run frequency_match.ipynb match\
    --cwd output \
    --phenoFile ~/UKBiobank/phenotype_files/hearing_impairment/090321_UKBB_Hearing_aid_f3393_expandedwhite_6436cases_96601ctrl_PC1_2.tsv \
    --phenoCol f3393 \
    --covarCol sex age
```

# Case-control matching workflow implementation

In [None]:
[match]
# Build a matched case-control data set from phenoFile with the R package
# ccoptimalmatch: cases (phenoCol == 1) are grouped into subsets by exact
# values of covarCol, candidate controls (phenoCol == 0) are attached to each
# case, and up to n_con controls are selected per case without replacement.
# Outputs: <name>.ccmatched (full matched table) and <name>.ccmatched.pheno
# (FID, IID, sex, phenotype, age, cluster_case).
# NOTE(review): the script refers to columns named IID, age and sex directly,
# so the phenotype file must contain them and covarCol should include sex and age.
input: phenoFile
output: f'{cwd}/{_input:bnn}.ccmatched', f'{cwd}/{_input:bnn}.ccmatched.pheno'
task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output[0]:bn}'
R:  expand = "${ }", stderr = f'{_output[0]:n}.stderr', stdout = f'{_output[0]:n}.stdout'

    library('ccoptimalmatch')
    library('dplyr')
    print("Reading the phenotype file")
    raw <- read.table(${phenoFile:r}, header = TRUE)
    head(raw)
    # Show number of cases-controls
    print("The number of cases and controls in the data-set is:")
    table(raw$${phenoCol})

    # Step 1. Exact matching on several variables: every distinct combination
    # of the matching covariates observed among the cases defines one subset.
    print("Creating subsets based on matching variables")
    create_subset <- raw %>%
        filter(${phenoCol} == "1") %>%
        arrange(${paths(covarCol):,}) %>%
        distinct(${paths(covarCol):,}, .keep_all = TRUE) %>%
        mutate(subset = 1:n()) %>%
        select(${paths(covarCol):,}, subset)
    head(create_subset)
    # Merge the data that contains subset variable with the cases only
    print("Merging subset with cases")
    case_with_subset <- raw %>%
        filter(${phenoCol} == "1") %>%
        full_join(create_subset, by = c(${paths(covarCol):r,}))
    # Merge the data that contains subset variable with the controls only;
    # na.exclude() drops subsets without any control as well as any control
    # row that has a missing value in any column.
    print("Merging subset with controls")
    control_with_subset <- raw %>%
        filter(${phenoCol} == "0") %>%
        right_join(create_subset, by = c(${paths(covarCol):r,})) %>%
        na.exclude()
    # Bind the cases and controls with subset variable
    raw <- rbind(case_with_subset, control_with_subset)

    # Step 2. Create artificial observations and select range of variables.
    # Every case becomes its own cluster ("1_<row>"); controls start at 0 and
    # receive the cluster id of each case they are attached to.
    bdd_cases <- raw[raw$${phenoCol} == "1", ]
    bdd_cases$cluster_case <- paste("1", seq_len(nrow(bdd_cases)), sep = "_")
    bdd_control <- raw[raw$${phenoCol} == "0", ]
    bdd_control$cluster_case <- 0
    list_p <- unique(bdd_cases$cluster_case)
    # Generate pseudo-observations of controls: one copy of every control of
    # the case's subset per case, kept only if within 2 years of the case's age.
    # NOTE(review): when age is one of the exact-matching covariates, age_diff
    # is always 0 and this window never removes anything.
    pseudo_obs <- lapply(list_p, function(cluster_id) {
        case_row <- bdd_cases[bdd_cases$cluster_case == cluster_id, ]
        candidates <- bdd_control[bdd_control$subset == case_row$subset, ]
        cluster <- rbind(case_row, candidates)
        cluster$cluster_case <- cluster_id
        case_age <- cluster$age[cluster$${phenoCol} == "1"]
        cluster$age_diff <- abs(cluster$age - case_age)
        cluster[!is.na(cluster$age_diff) & cluster$age_diff <= 2, ]
    })
    # Bind once instead of growing a data frame inside the loop
    bdd_temp <- as.data.frame(bind_rows(pseudo_obs))

    # Step 3. Create the variables total control per case and frequency of controls
    bdd_temp1 <- bdd_temp %>%
        group_by(cluster_case) %>%
        mutate(total_control_per_case = n() - 1)
    bdd_temp1$case_ind <- bdd_temp1$${phenoCol}
    bdd_temp1$case_control <- ifelse(bdd_temp1$${phenoCol} == "1", "case", "control")
    bdd_temp1 <- subset(bdd_temp1, select = c(cluster_case, IID, ${phenoCol}, case_ind, case_control,
                     ${paths(covarCol):,}, age_diff, total_control_per_case))
    # Number of clusters each individual appears in
    bdd_temp1 <- bdd_temp1 %>% group_by(IID) %>% mutate(freq_of_controls = n())
    print("Visualizing data with number of total controls per case and frequency of controls")
    head(bdd_temp1)

    # Step 4. Ordering variables: rows that sort first within a cluster are
    # listed first when the data are handed to optimal_matching
    bdd_temp1 <- bdd_temp1[order(bdd_temp1$cluster_case, bdd_temp1$case_control, bdd_temp1$sex,
                                 bdd_temp1$age_diff, bdd_temp1$freq_of_controls), ]

    # Step 5. Case-control matching
    final_data <- optimal_matching(bdd_temp1, n_con = ${n_con}, cluster_case, IID,
                                   total_control_per_case, case_control, with_replacement = FALSE)
    # Look at how many controls are matched per case
    final_data <- final_data %>% group_by(cluster_case) %>% mutate(total_control_matched = n() - 1)
    print("Table of the number of controls matched per case")
    table(final_data$case_control, final_data$total_control_matched)
    print(paste("The total number of individuals in the final sample is:", nrow(final_data)))
    write.table(final_data, file = ${_output[0]:r}, quote = FALSE, row.names = FALSE, sep = " ", col.names = TRUE)

    # Phenotype file for the matched individuals (FID is a copy of IID)
    print("Creating output phenotype file")
    pheno_match <- final_data %>%
        ungroup() %>%
        mutate(FID = IID) %>%
        select(all_of(c('FID', 'IID', 'sex', '${phenoCol}', 'age', 'cluster_case')))
    write.table(pheno_match, file = ${_output[1]:r}, quote = FALSE, row.names = FALSE, sep = " ", col.names = TRUE)

In [None]:
[plots]
# Plot overlaid histograms of a continuous variable for matched cases (red)
# and matched controls (blue), read from the .ccmatched file written by the
# match step, and save the figure as a PNG.
# Continuous variables to plot (only the first entry is plotted at present)
parameter: cont_var = ['age']
# Discrete variables (declared for future use; not used by this step yet)
parameter: disc_var = []
input: f'{cwd}/{phenoFile:bnn}.ccmatched'
output: f'{cwd}/{_input:bnn}.ccmatched.hist.png'
task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output[0]:bn}'
R:  expand = "${ }", stderr = f'{_output[0]:n}.stderr', stdout = f'{_output[0]:n}.stdout'
    library('ggplot2')
    matched_data <- read.table(${_input[0]:r}, header = TRUE)
    print("Plotting distribution of ${cont_var[0]} for ${phenoCol} cases and controls")
    hist_plot <- ggplot(matched_data, aes(x = ${cont_var[0]})) +
        geom_histogram(data = subset(matched_data, ${phenoCol} == '1'), fill = "red", alpha = 0.2) +
        geom_histogram(data = subset(matched_data, ${phenoCol} == '0'), fill = "blue", alpha = 0.2)
    # Write the figure explicitly so that the declared output target exists
    ggsave(${_output[0]:r}, plot = hist_plot)

In [None]:
# Scratch cell: two ways of rendering a sequence as a list of double-quoted items.
# NOTE(review): `words` is not defined anywhere in the visible notebook.
# Variant 1: wrap each item in double quotes, then join with ", " (comma + space).
', '.join(['"%s"' % w for w in words])
# Variant 2: join with '","' (no space) and add the outer pair of quotes;
# unlike variant 1 this yields '""' rather than '' for an empty sequence.
'"' + '","'.join(words) + '"'


In [1]:
# Load the Singularity environment module (presumably needed by the workflow run in the next cell — TODO confirm)
module load Singularity

In [61]:
# Run the `match` step on the UK Biobank hearing-aid phenotype (column f3393),
# matching on sex and age and writing results to ~/matching_test.
sos run frequency_match.ipynb match\
    --cwd ~/matching_test\
    --phenoFile ~/UKBiobank/phenotype_files/hearing_impairment/090321_UKBB_Hearing_aid_f3393_expandedwhite_6436cases_96601ctrl_PC1_2.tsv \
    --phenoCol f3393 \
    --covarCol sex age

INFO: Running [32mmatch[0m: 
INFO: [32mmatch[0m is [32mcompleted[0m.
[91mERROR[0m: [91m[match]: [match]: Output target /home/dmc2245/matching_test/090321_UKBB_Hearing_aid_f3393_expandedwhite_6436cases_96601ctrl_PC1_2.ccmatched does not exist after the completion of step match[0m


: 1