# Project Description

This project involves heterochromatin profiling of plasma from lymphoma patients versus non-lymphoma controls. The primary goal is to identify a signature of lymphoma features from plasma H3K27me3 profiles that differ significantly from those of non-lymphoma controls. Additional data include clinical metadata for each lymphoma patient and other parallel plasma measurements, including CAPP-Seq (mutations called using a lymphoma-specific panel) and shallow WGS. Metabolic Tumor Volume quantified from PET-CT imaging is also included for these patients.

# Import libraries and set plot dimensions

In [None]:
# libraries
library(tidyverse)
library(edgeR)
library(sva)
library(reshape2)
library(pheatmap)
library(RColorBrewer)
library(MetBrewer)
library(caret)
library(glmnet)
library(e1071)
library(edgeR)
library(umap)
library(DMwR)
library(pROC)
library(gbm)
library(survival)
library(survminer)
library(GenomicRanges)
library(recipes)

In [None]:
# Default figure size for plots rendered in the notebook.
# The repr.* options are specific to the IRkernel.
options(
  repr.plot.height = 10,
  repr.plot.width = 10
)

# Data pre-processing + differential analysis (edgeR)

## Import counts data and compile into a numeric matrix

In [None]:
# Import the 10kb bin reference (regulatory features + bin IDs). The reference
# was created using annotatr. Note: reference generation requires an internet
# connection.
setwd("/path/to/reference/")
annotated_ref_no_X_Y_M_blacklist <- read.delim("10kb_bin_genome_wide_annotated_reference_with_regulatory_features_no_X_Y_M_ENCODEBlacklist.txt", header = TRUE, sep = "\t")

# Import the sample list, formatted beforehand.
setwd("/path/to/list/")
sample_list <- read.delim("sample_list_for_lymphoma_cohort.txt", header = TRUE, sep = "\t")

gc() # clear memory if memory is a bottleneck; the data frames in this script are large

# Locate the non-normalized counts output by MEDIPS (10kb bins).
# BAM files were generated with the MEDIPIPE pre-processing pipeline
# (10.1093/bioinformatics/btad423; originally intended for cfMeDIP-Seq data, but
# adapted for cfChIP-Seq).
setwd("/path/to/samples/")
import_counts <- list.files(path = ".", pattern = "counts_10kb.txt$", recursive = TRUE) # file paths, relative to the samples directory

# Samples that are not required for the following analyses; only T1 (baseline)
# samples should be included. Each entry is matched as a substring of the path.
exclude <- c("0018_T3", "0025_T4", "0027_T3", "0027_T4", "0101_T3", "0030_T1", "HUCON_37", "HUCON_43", "HUCON_44", "0036_C", "0247_T1_ABC", "T3", "T5", "pbmc_k27")

# Drop every path that matches any exclusion string ("|" = regex alternation).
# Filtering the character vector directly avoids a round trip through a
# one-column data frame.
import_counts <- import_counts[!grepl(paste(exclude, collapse = "|"), import_counts)]

# Column names are taken from sample_list below, so the number of count files
# must equal the number of listed samples. Fail before the expensive reads.
# NOTE(review): this only checks the count; it is assumed (not verified) that
# sample_list is in the same order as the files returned by list.files().
if (length(import_counts) != nrow(sample_list)) {
  stop("Found ", length(import_counts), " count files but sample_list has ",
       nrow(sample_list), " rows; cannot label columns.", call. = FALSE)
}

# Read every counts file into memory.
counts <- mapply(read.delim, import_counts)

# Bind the individual datasets across columns (bins x samples).
counts_matrix <- do.call(cbind, counts)

# Label columns with the sample names from the sample list.
colnames(counts_matrix) <- c(paste0(sample_list$sample_name))

# Index rows by their position in the MEDIPS output BEFORE removing NA rows.
# These indices are later matched against bin_id in the reference; numbering
# after na.omit() would shift every index following a removed row and silently
# misalign bins with their annotation.
rownames(counts_matrix) <- seq_len(nrow(counts_matrix))
counts_matrix <- na.omit(counts_matrix)

gc()

## Data cleaning, filter out low variance and low count features from the matrix

In [None]:
# Keep only bins present in the annotated reference (per its name, the reference
# excludes chrX, chrY, chrM and ENCODE Blacklist regions). The resulting matrix
# is the ground truth; refer back to it after feature selection.
all_filtered_features <- counts_matrix[as.integer(rownames(counts_matrix)) %in% annotated_ref_no_X_Y_M_blacklist$bin_id,]

# Row-wise variance of every feature, independent of sample origin.
# (The object name is historical: these values are variances, computed with
# var(), not coefficients of variation.)
cv_features <- apply(all_filtered_features, 1, var)

# Histogram of the feature variances; bimodal distribution, left skewed.
var_hist <- hist(cv_features[cv_features > 0 & cv_features < 40000], breaks = 500)

# Drop features whose variance lies below the valley of the histogram (~2400).
min_feature_variance <- 2400
all_filtered_features <- all_filtered_features[which(cv_features >= min_feature_variance),]

# Mean count per remaining feature; left skewed, slightly bimodal after the
# low-variance features have been removed.
counts_mean <- apply(all_filtered_features, 1, mean)
counts_hist <- hist(counts_mean[counts_mean > 0 & counts_mean < 1000], breaks = 500)

# Drop the lowest-count features.
min_feature_mean <- 50
all_filtered_features <- all_filtered_features[which(counts_mean >= min_feature_mean),]

gc() # clear memory

## Split dataset into train and test sets for feature selection / validation

In [None]:
# Partition the samples 70/30 into training and test sets. Differential analysis
# is performed on the training set only; the test set is locked away for later
# validation.

X <- all_filtered_features # features: numeric matrix with sample names as column names
y <- sample_list$timepoint # labels for lymphoma vs non-lymphoma classification

# Fixed seed so the partition is reproducible.
set.seed(23)
train_indices <- createDataPartition(y, p = 0.7, list = FALSE)

# Labels for each partition.
y_train <- y[train_indices]
y_test <- y[-train_indices]

# Matching feature matrices (samples are columns).
X_train <- X[, train_indices]
X_test <- X[, -train_indices]

# Check the class balance in each partition; it should be close to 70/30.
table(y_train)
table(y_test)

## Perform differential analysis using edgeR, using SVA to address unknown patterns in the data

In [None]:
# Differential analysis of the training set with edgeR, using SVA surrogate
# variables as covariates. This chunk takes time and computational power, so
# schedule the script as a job if working on a virtual machine.
# Adapted from the edgeR user guide
# (https://bioconductor.org/packages/release/bioc/vignettes/edgeR/inst/doc/edgeRUsersGuide.pdf)
gc()

# Initial design: lymphoma vs non-lymphoma only. Can change depending on the
# intended comparison.
design <- model.matrix(~y_train)

# DGEList holding the training counts and labels. Zeros are not removed here;
# low-count features were already filtered during matrix pre-processing.
# NOTE: the object is called `list` (shadowing base::list as a variable); the
# name is kept because other cells refer to `list$samples`.
list <- DGEList(counts = X_train, samples = y_train, remove.zeros = FALSE)

# Apply TMM normalization to the raw counts.
tmm_norm <- calcNormFactors(list, method = "TMM")

# Normalize counts before SVA; otherwise library size is the only surrogate variable.
cpm_norm <- cpm(tmm_norm, normalized.lib.sizes = TRUE, log = FALSE)

# Estimate surrogate variables to capture unknown/technical variation.
sva_fit <- sva(cpm_norm, design)

# IMPORTANT: add the SVA surrogate variables as covariates in the model design.
design <- model.matrix(~y_train + sva_fit$sv)

## save the workspace before the differential analysis, in case the analysis
## does not complete within the allocated wall time.
#save.image()

gc()

# Estimate dispersions under the design that includes the surrogate variables.
fit <- estimateDisp(tmm_norm, design = design, trend.method = "locfit", robust = TRUE)

gc()

# Fit the quasi-likelihood model, incorporating the surrogate variables.
QL <- glmQLFit(fit, design = design)

# Quasi-likelihood F-test on coefficient 2 (the group term from y_train).
# (The object keeps the name `lrt` although this is the QL F-test.)
lrt <- glmQLFTest(QL, coef = 2)

# Count features with BH-adjusted FDR < 0.05. This has to use the test result:
# the fitted model `QL` has no $table, so computing it from QL yields an empty
# table.
fdr <- table(p.adjust(lrt$table$PValue, method = "BH") < 0.05)

# Extract ALL tested features, sorted by p-value. The feature count is taken
# from the test result instead of being hard-coded.
res <- topTags(lrt, n = nrow(lrt$table), adjust.method = "BH", sort.by = "PValue")

# Save the workspace for the downstream cells.
setwd("path/to/working/directory/")
save.image("train_test_split_for_differential_analysis.RData")

## Subset to significant features from differential analysis, generate volcano plot of features as sanity check

In [None]:
# Reload the workspace saved after the differential analysis.
setwd("path/to/working/directory/")
load("train_test_split_for_differential_analysis.RData")

# Feature subset used downstream: raw p-value <= 0.05 (9370 features). This is
# deliberately the least stringent option; the Random Forest decides which
# features are important. Alternatives that were considered:
#de_features_subset <- res[(res$table$logFC >= 1 | res$table$logFC <= -1) & res$table$PValue <= 0.01,] # logFC >= 1 is 2x change; only 19 features.
#de_features_subset <- res[(res$table$logFC >= 0.25 | res$table$logFC <= -0.25) & res$table$PValue <= 0.05,] # 341 features.
#de_features_subset <- res[res$table$PValue <= 0.01,] # 1742 features
de_features_subset <- res[res$table$PValue <= 0.05,]

# Feature names (aka bin IDs) of all tested features and of the significant subset.
de_features_all <- rownames(res)
de_features <- rownames(de_features_subset)

# Statistics for every tested feature: logFC, p-value and -log10(p-value).
logFC <- res$table$logFC
p_value <- res$table$PValue
neg_log_p_value <- -log10(p_value)
df_feature_stats <- data.frame(logFC, neg_log_p_value, de_features_all)

# The same statistics restricted to the significant features.
logFC_signif <- de_features_subset$table$logFC
p_value_signif <- de_features_subset$table$PValue
neg_log_p_value_signif <- -log10(p_value_signif)
df_feature_stats_signif <- data.frame(logFC_signif, neg_log_p_value_signif, de_features)

# Volcano plot: all features in black, significant features overlaid in red.
# Dashed guides mark p = 0.05 (horizontal) and |logFC| = 1 (vertical).
v <- ggplot() +
  geom_point(
    data = df_feature_stats,
    mapping = aes(x = logFC, y = neg_log_p_value),
    colour = "black",
    alpha = 0.3
  ) +
  geom_point(
    data = df_feature_stats_signif,
    mapping = aes(x = logFC_signif, y = neg_log_p_value_signif),
    colour = "red",
    alpha = 0.3
  ) +
  geom_hline(yintercept = -log10(0.05), linetype = "dashed", colour = "red") +
  geom_vline(xintercept = c(-1, 1), linetype = "dashed", colour = "blue") +
  theme_minimal() +
  labs(title = "Volcano Plot", x = "Log Fold Change", y = "-log10(p-value)")
print(v)

## Perform Principal Component Analysis (PCA) on all significant differential features

In [None]:
# PCA of the significant differential features, to see whether they separate
# lymphoma profiles from non-lymphoma controls. Compare with the PCA of all
# features by switching the input below.

# prcomp() expects samples in rows, so the feature matrix is transposed.
# pca_data <- t(cpm_norm) # for all features
pca_data <- t(cpm_norm[de_features,]) # for feature subset

# PCA with z-scaling of every feature.
pca_res <- prcomp(pca_data, scale. = TRUE)

# Percentage of variance explained by each component, used in the axis labels.
pct_var <- round(pca_res$sdev^2 * 100 / sum(pca_res$sdev^2), 1)

# Scatter of the first two principal components, coloured by training label.
# The commented geoms are alternative layers for additional visualizations.
q <- as.data.frame(pca_res$x) %>%
  ggplot(aes(x = PC1, y = PC2)) +
  #geom_point(aes(color=list$samples$DIAGNOSIS_CLASS_SUMMARIZE_HEALTHY),size=5) +
  #geom_point(aes(color=list$samples$DIAGNOSIS_CLASS_SUMMARIZE_HEALTHY.DIAGNOSIS_SPECIFIC_DLBCL),size=5) +
  #geom_point(aes(color=list$samples$timepoint),size=5) +
  geom_point(aes(colour = y_train), size = 5) +
  #geom_point(aes(color=list$samples$batch),size=5) +
  #geom_text(aes(label=list$samples$sample_name)) +
  theme_minimal(base_size = 20) +
  theme(legend.position = "right") +
  labs(
    colour = "Group",
    x = paste0("PC1 (", pct_var[1], "%)"),
    y = paste0("PC2 (", pct_var[2], "%)")
  )

print(q)

# Random Forest of significant differential features (additional feature selection step)

In [None]:
# Random Forest on the significant differential features (additional feature
# selection step). This chunk requires at least 60GB of memory on a virtual
# machine; run it as a scheduled job on a compute cluster, requesting a compute
# node.

# Import the workspace containing the differential analysis results.
setwd("path/to/working/directory/")
load("train_test_split_for_differential_analysis.RData")

# Subset the differential results to the differential features.
de_features_subset <- res[res$table$PValue <= 0.05,] # 9370 features
# de_features_subset <- res[res$table$PValue <= 0.05 & res$table$logFC > 0,] # 5328 features; only hypermethylated features considered. Compare classification performance

# Bin IDs (aka rownames) of the differential features, depending on which set is used.
de_features <- rownames(de_features_subset)

# Subset the CPM-normalized training matrix to the differential features.
de_features_matrix <- cpm_norm[de_features,]

# Tune grid for hyperparameter tuning; the tunable hyperparameters depend on the
# model used.
tune_grid <- expand.grid(
  mtry = c(2, 5, 10) # tuning hyperparameter for rf. Can test a range of values here, or do a grid search in trainControl
)

# Resampling scheme for caret: 10-fold cross-validation repeated 5 times, with
# SMOTE sampling to address class imbalance.
ctrl <- trainControl(
  method = "repeatedcv", # repeated k-fold cross-validation
  repeats = 5, # number of times the whole cross-validation is repeated
  number = 10, # number of folds per cross-validation
  search = "grid",
  sampling = "smote",
  verbose = FALSE
)

# caret expects samples in rows, so the feature matrix is transposed.
t_de_features_matrix <- t(de_features_matrix)

# Fix the RNG state so the fold assignment, SMOTE sampling and the forest are
# reproducible, consistent with the seeded train/test split.
set.seed(23)

model <- train(t_de_features_matrix,
               as.factor(y_train),
               method = "rf", # for Random Forest
               # method = "nb", # naive bayes; turn off tuneGrid.
               # method = "glmnet", # try with specifying the tuneLength parameter next
               trControl = ctrl,
               tuneGrid = tune_grid, # turn off for nb
               metric = "Accuracy",
               #metric = "ROC",
               #metric = "Kappa", # for unbalanced sets; since we're using SMOTE, accuracy should be okay
               ntree = 500 # for rf
               #tuneLength = 8 # take random values for default tuning parameters and pick the best; alternative to tuneGrid
)

# print error log for model output, in case the model fails
# sink(rf_hyperparameter_tuning_accuracy_error_log.txt)

# Model results for the training data.
print(model)

# Predictors used by the model, for further analysis (aka feature validation).
# The variable shares its name with caret's predictors() function; function
# calls still resolve to the function.
predictors <- predictors(model)

# Save the workspace together with the model.
setwd("path/to/working/directory/")
save.image("rf_model_from_differential_features.RData")

# REFINE AND OPTIMIZE CODE FROM HERE

# Analyses incorporating discovered features

## Comparison of discovered features across prognostic groups in lymphoma

In [None]:
# Load the workspace that contains the trained model and the training matrices.
setwd("/cluster/projects/scottgroup/people/steven/human_plasma_cfChIP-Seq/liberate_plasma_cfchip/analysis/machine_learning_models/02_07_2023_RFE_RF_10kb_windows/differential_analysis_filter/")
load("10_09_2023_rf_10kb_with_hyperparameter_tuning_accuracyevaluation_withdifferentialfilter.RData")

# Clinical metadata (Excel sheet), as a plain data frame.
setwd("/cluster/home/sdemichi/steven_scottgroup.txt/human_plasma_cfChIP-Seq/liberate_plasma_cfchip/temp/")
clinical_metadata <- as.data.frame(readxl::read_xlsx("25_07_2023_cohort_metadata_labeling_UPDATED_WITH_CLINICAL_ANNOTATIONS_plus_library_size_cfdna_concn.xlsx"))

# Mean VAF information - UPDATED FOR BATCH 2 - using dcs.unique.burden.
setwd("/cluster/projects/scottgroup/people/steven/human_plasma_cfChIP-Seq/liberate_plasma_cfchip/other_profiles/cappseq_data_lymphoma/updated_batch_2_all_cases/")
mean_VAF_burden <- read.delim("Bratman.dcs.uniq.burden.txt", header = TRUE, sep = "\t") # for dcs.unique

# Replace spaces in "Donor ID (rep)" with underscores so the IDs can be matched
# directly against mean_VAF_burden$Sample_id.
clinical_metadata$`Donor ID (rep)` <- gsub(" ", "_", clinical_metadata$`Donor ID (rep)`)

# Attach the mean VAF of the matching CAPP-Seq sample; NA where there is no match.
vaf_rows <- match(clinical_metadata$`Donor ID (rep)`, mean_VAF_burden$Sample_id)
clinical_metadata$mean_VAF <- mean_VAF_burden$mean_VAF[vaf_rows]

# match annotations from X_train to the new clinical annotations. Append cfDNA concentrations.
#all_filtered_features_X_train <- all_filtered_features[,colnames(all_filtered_features) %in% colnames(X_train)]

# Sample-list rows belonging to the training samples (the columns of cpm_norm).
training_sample_metadata <- sample_list[sample_list$sample_name %in% colnames(cpm_norm),]

# Keep clinical records of training-set patients, excluding T3 and T5 samples.
is_training_patient <- clinical_metadata$PATIENT_ID %in% training_sample_metadata$PATIENT_ID
is_t3_or_t5 <- grepl("T3|T5", clinical_metadata$`Donor ID (rep)`)
clinical_metadata <- clinical_metadata[is_training_patient & !is_t3_or_t5, ]

# Order the clinical records like the training samples, matching on PATIENT_ID.
# Training samples without a clinical record yield an all-NA row.
row_order <- match(training_sample_metadata$PATIENT_ID, clinical_metadata$PATIENT_ID)
clinical_metadata <- clinical_metadata[row_order, ]

# From here on, the per-sample attributes ARE the aligned clinical metadata.
X_train_attributes <- clinical_metadata



# Inspect the model and pull out the predictors it uses.
print(model)
predictors_rf <- predictors(model)

# Reusable masks over the differential results table.
is_hyper <- res$table$logFC > 0
is_hypo <- res$table$logFC < 0
is_sig <- res$table$PValue < 0.05

# side; model predictors with a positive logFC
predictors_rf_hyper <- rownames(res[rownames(res) %in% predictors_rf & is_hyper,])

# side; all features with a positive logFC
all_hyper <- rownames(res[is_hyper,])

# side; all significant (p < 0.05) features with a positive logFC
all_sig_hyper <- rownames(res[is_hyper & is_sig,])

# side; all significant (p < 0.05) features with a negative logFC
all_sig_hypo <- rownames(res[is_hypo & is_sig,])

# SCALE FEATURES BEFORE SUBSETTING AND VISUALIZING
# cpm_norm_scaled <- scale(cpm_norm, center = TRUE, scale = TRUE)
# cpm_norm_scaled <- scale(cpm_norm, center = TRUE, scale = apply(cpm_norm, 2, function(x) max(x) - min(x)))

# Alternate normalization: log2(CPM + 1), applied before subsetting.
cpm_norm_log2 <- log2(cpm_norm + 1)

# Training matrix restricted to the chosen feature set.
X_train_filtered <- cpm_norm_log2[rownames(cpm_norm_log2) %in% predictors_rf_hyper,] # hypermethylated modeled features
# X_train_filtered <- cpm_norm_log2[(rownames(cpm_norm_log2)) %in% all_sig_hyper,] # all significant hypermethylated features





# ADDITIONAL STEP; multiply the normalized feature matrix by the weights from the model. Matrix multiplication, after sorting the weights.

# model_feature_weights <- model$finalModel$importance # weights

# # subset weights to features in X_train_filtered
# model_feature_weights <- model_feature_weights[rownames(model_feature_weights) %in% rownames(X_train_filtered),]
# model_feature_weights <- as.data.frame(model_feature_weights) # convert to data frame

# # make rownames of model_feature_weights a column, and sort in ascending order by row
# model_feature_weights$bin_id <- as.numeric(rownames(model_feature_weights))

# # sort feature weights by feature
# model_feature_weights <- model_feature_weights[order(model_feature_weights$bin_id, decreasing = FALSE),]

# # multiply weights from model_feature_weights column, with the normalized counts from X_train_filtered
# X_train_filtered <- model_feature_weights$model_feature_weights * X_train_filtered




# X_train_filtered <- cpm_norm_scaled[(rownames(cpm_norm_scaled)) %in% predictors_rf,]
# X_train_filtered <- cpm_norm_scaled[(rownames(cpm_norm_scaled)) %in% predictors_rf_hyper,] # for hypermethylated modeled features
# X_train_filtered <- cpm_norm_scaled[(rownames(cpm_norm_scaled)) %in% all_hyper,] # for all hypermethylated features
# X_train_filtered <- cpm_norm_scaled[(rownames(cpm_norm_scaled)) %in% all_sig_hyper,] # for all significant hypermethylated features
# X_train_filtered <- cpm_norm_scaled[(rownames(cpm_norm_scaled)) %in% all_sig_hypo,] # for all significant hypomethylated features

# add the lowest scaled count in the matrix to each of the counts to remove zeros
# test <- min(X_train_filtered)
# X_train_filtered <- X_train_filtered + abs(test)

# subset original training matrix to the features in chosen feature set
#X_train_filtered <- cpm_norm[(rownames(cpm_norm)) %in% predictors_rf,]

# Store the filtered matrix in a DGEList so that the counts and the sample
# attributes stay linked; features that are zero in every sample are dropped.
X_train_filtered_features_plus_attributes <- DGEList(counts = X_train_filtered, samples = X_train_attributes, remove.zeros = TRUE)

# Per-sample (column-wise) summaries over the retained features.
signature_values <- X_train_filtered_features_plus_attributes$counts
median_X_train_filtered <- apply(signature_values, 2, median)
mean_X_train_filtered <- apply(signature_values, 2, mean)
sum_X_train_filtered <- colSums(signature_values)

# Append the per-sample median and sum to the metadata; subset this data frame
# for plotting. (The mean is computed above but not appended.)
X_train_attributes <- as.data.frame(
  cbind(X_train_attributes, median_X_train_filtered, sum_X_train_filtered)
)

##### METADATA-SPECIFIC CHANGES BEFORE PLOTTING #####
X_train_attributes <- X_train_attributes %>%
  mutate(
    # the literal string "NA" in the stage column marks non-lymphoma samples
    ANN_ARBOR_STAGE = ifelse(ANN_ARBOR_STAGE == "NA", "non-lymphoma", ANN_ARBOR_STAGE),
    # IPI is treated as a category: convert from int to char
    IPI = as.character(IPI),
    # mean VAF is used as a continuous colour scale: convert to numeric
    mean_VAF = as.numeric(mean_VAF),
    # tumor diameter as a number, plus a binned copy in a separate column
    # (<5cm, 5-10cm, 10-15cm, >15cm)
    TUMOR_MASS_MAX_CM = as.numeric(TUMOR_MASS_MAX_CM),
    tumor_diameter_binned = cut(
      TUMOR_MASS_MAX_CM,
      breaks = c(0, 5, 10, 15, 30),
      labels = c("<5", "5-10", "10-15", ">15")
    ),
    # Stage I/II -> early, Stage III/IV -> late; any other value is kept as is.
    # Stored as a factor with a fixed level order for visualization.
    Stage_Group = factor(
      case_when(
        ANN_ARBOR_STAGE %in% c(1, 2) ~ "Early Stage",
        ANN_ARBOR_STAGE %in% c(3, 4) ~ "Late Stage",
        TRUE ~ as.character(ANN_ARBOR_STAGE)
      ),
      levels = c("non-lymphoma", "Early Stage", "Late Stage")
    ),
    # 24-10-2023; IPI with 0+1 and 4+5 grouped, as this is clinically relevant.
    # Any other value is kept as is.
    IPI_grouped = case_when(
      IPI %in% c(0, 1) ~ "0+1",
      IPI %in% c(4, 5) ~ "4+5",
      TRUE ~ as.character(IPI)
    )
  )

# 24-10-2023; subset X_train_attributes to FL cases, and visualize FLIPI
# X_train_attributes_FL <- X_train_attributes[X_train_attributes$DIAGNOSIS_CLASS_SUMMARIZE_HEALTHY == "FL", ]

# for grouped IPI, remove healthy cases from matrix
# X_train_attributes <- X_train_attributes[!(X_train_attributes$IPI_grouped == "NA"),] # for grouped IPI



##### PLOT #####

# Boxplot of the per-sample median over the modeled hypermethylated features,
# grouped by stage group, with individual samples overlaid and coloured by mean
# VAF.
#
# Alternatives (change the mapping AND the matching labels together):
#   y = sum_X_train_filtered / mean_X_train_filtered   -> sum / mean instead of median
#   x = IPI_grouped, fill = IPI_grouped                -> IPI groups; label "IPI Group"
#   x = as.character(NB_EN_SITES)                      -> number of extranodal sites
# X_train_attributes_FL %>%
X_train_attributes %>%
  ggplot(aes(x = Stage_Group, y = median_X_train_filtered)) +
  geom_boxplot(aes(fill = Stage_Group), alpha = 0.3) +
  geom_point(aes(colour = mean_VAF), position = position_jitterdodge(0.2), size = 7, alpha = 0.6) +
  # geom_label(label = X_train_attributes$PATIENT_ID, alpha = 0.3) + # label the points to identify outlier samples
  theme_minimal(base_size = 18) +
  #scale_color_gradient(low = "#185a9d", high = "#43cea2") +
  scale_color_gradient(low = "blue", high = "red") +
  # Labels describe what is actually mapped: x and fill are Stage_Group, colour
  # is mean_VAF.
  labs(
    x = "Stage Group",
    y = "Median of Modeled Hypermethylated Feature Expression",
    colour = "mean VAF",
    fill = "Stage Group"
  ) +
  # The title must be chained with `+`; as a separate statement it is never
  # added to the plot.
  #ggtitle("Median Hypermethylated Counts of Differential Features")
  #ggtitle("Median of Counts over Features")
  #ggtitle("Sum of Counts over Modeled Features")
  ggtitle("Median of Counts over Modeled Features")

### 23-09-2023 Boxplot visualizations for DLBCL and FL separate. RUN ABOVE CHUNK

In [None]:
# Boxplot of the per-sample median over the modeled features for ONE lymphoma
# subtype (plus every sample that is not of the excluded subtype).
#
# Choose the subtype to plot by excluding the other one:
#   "DLBCL" -> plot FL cases; "FL" -> plot DLBCL cases.
# Two consecutive assignments would leave only the last one in effect, so the
# choice is a single explicit setting. It defaults to the FL plot.
subtype_to_exclude <- "DLBCL"
X_train_attributes_subtype <- X_train_attributes[!(X_train_attributes$DIAGNOSIS_CLASS_SUMMARIZE_HEALTHY == subtype_to_exclude), ]

p <- X_train_attributes_subtype %>%
  ggplot(aes(x = Stage_Group, y = median_X_train_filtered)) + # change to sum or median, depending on what you're visualizing
  #ggplot(aes(x = Stage_Group, y = sum_X_train_filtered, colour = Stage_Group)) +
  #ggplot(aes(x = as.character(NB_EN_SITES), y = sum_X_train_filtered, colour = as.character(NB_EN_SITES))) +
  geom_boxplot(aes(fill = Stage_Group), alpha = 0.3) +
  geom_point(aes(colour = mean_VAF), position = position_jitterdodge(0.2), size = 7, alpha = 0.6) +
  theme_minimal(base_size = 18) +
  #scale_color_gradient(low = "#185a9d", high = "#43cea2") +
  scale_color_gradient(low = "blue", high = "red") +
  # Labels describe what is actually mapped: fill is Stage_Group, colour is
  # mean_VAF.
  labs(
    x = "Stage Group",
    y = "Median of MODELED Hypermethylated Feature Expression",
    colour = "mean VAF",
    fill = "Stage Group"
  ) +
  # The title must be chained with `+`; as a separate statement it is never
  # added to the plot.
  #ggtitle("Median Hypermethylated Counts of Differential Features")
  #ggtitle("Median of Counts over Features")
  #ggtitle("Sum of Counts over Modeled Features")
  ggtitle("Median of Counts over Modeled Features")
print(p)

- Directionality of the trend over modeled and random features changes depending on whether the sum or median of the features is used. This is an unreliable metric.
- Think about a better way to visualize these features.

### Stats for comparison between Stage Groups vs healthy and IPI vs healthy

In [None]:
# Welch t-tests (var.equal = FALSE) comparing the per-sample signature summaries
# (median_X_train_filtered / sum_X_train_filtered) between stage groups, and
# between each IPI score and the non-lymphoma samples.
#
# NOTE(review): several groups of calls below are textually identical and differ
# only in the recorded p-values. They appear to record re-runs after
# X_train_filtered was rebuilt from a different feature set (see the headings
# above each group) - confirm before relying on the annotations. The p-values in
# the trailing comments are hand-recorded, not computed here.

# t-test between the different stage groups and healthy, for hypermethylated features
#stats; stage
stats_healthy <- X_train_attributes[X_train_attributes$Stage_Group == "non-lymphoma",]
stats_early <- X_train_attributes[X_train_attributes$Stage_Group == "Early Stage",]
stats_late <- X_train_attributes[X_train_attributes$Stage_Group == "Late Stage",]

# sum (less significant (due to the size of the values), but cleaner trend)
# t.test(stats_healthy$sum_X_train_filtered, stats_early$sum_X_train_filtered, var.equal = FALSE) #significant; p = 4.322e-09
# t.test(stats_healthy$sum_X_train_filtered, stats_late$sum_X_train_filtered, var.equal = FALSE) #significant; p = 1.319e-09
# t.test(stats_early$sum_X_train_filtered, stats_late$sum_X_train_filtered, var.equal = FALSE) # not significant; p = 0.07981

# median (more significant)
t.test(stats_healthy$median_X_train_filtered, stats_early$median_X_train_filtered, var.equal = FALSE) #significant; p = 3.958e-09
t.test(stats_healthy$median_X_train_filtered, stats_late$median_X_train_filtered, var.equal = FALSE) #significant; p = 8.591e-13
t.test(stats_early$median_X_train_filtered, stats_late$median_X_train_filtered, var.equal = FALSE) #significant; p = 0.01535

# sum
t.test(stats_healthy$sum_X_train_filtered, stats_early$sum_X_train_filtered, var.equal = FALSE) #significant; p = 1.218e-11
t.test(stats_healthy$sum_X_train_filtered, stats_late$sum_X_train_filtered, var.equal = FALSE) #significant; p = < 2.2e-16
t.test(stats_early$sum_X_train_filtered, stats_late$sum_X_train_filtered, var.equal = FALSE) # not significant; p = 0.4273

# median, ALL HYPERMETHYLATED FEATURES, n = 129380
t.test(stats_healthy$median_X_train_filtered, stats_early$median_X_train_filtered, var.equal = FALSE) #significant; p = 0.001867
t.test(stats_healthy$median_X_train_filtered, stats_late$median_X_train_filtered, var.equal = FALSE) #significant; p = 1.226e-06

# median, ALL SIGNIFICANT HYPERMETHYLATED FEATURES, n = 5328
t.test(stats_healthy$median_X_train_filtered, stats_early$median_X_train_filtered, var.equal = FALSE) #significant; p = 5.592e-10
t.test(stats_healthy$median_X_train_filtered, stats_late$median_X_train_filtered, var.equal = FALSE) #significant; p = 1.493e-13




#stats; IPI
# One subset per IPI score (IPI is compared as a string here), plus the
# non-lymphoma samples as the reference group.
stats_IPI_0 <- X_train_attributes[X_train_attributes$IPI == "0",]
stats_IPI_1 <- X_train_attributes[X_train_attributes$IPI == "1",]
stats_IPI_2 <- X_train_attributes[X_train_attributes$IPI == "2",]
stats_IPI_3 <- X_train_attributes[X_train_attributes$IPI == "3",]
stats_IPI_4 <- X_train_attributes[X_train_attributes$IPI == "4",]
stats_IPI_5 <- X_train_attributes[X_train_attributes$IPI == "5",]
stats_IPI_healthy <- X_train_attributes[X_train_attributes$Stage_Group == "non-lymphoma",]

# sum (IPI 0 is not tested in this group)
t.test(stats_IPI_healthy$sum_X_train_filtered, stats_IPI_1$sum_X_train_filtered, var.equal = FALSE) #significant; p = 5.253e-08
t.test(stats_IPI_healthy$sum_X_train_filtered, stats_IPI_2$sum_X_train_filtered, var.equal = FALSE) #significant; p = 2.533e-08
t.test(stats_IPI_healthy$sum_X_train_filtered, stats_IPI_3$sum_X_train_filtered, var.equal = FALSE) #significant; p = 1.114e-08
t.test(stats_IPI_healthy$sum_X_train_filtered, stats_IPI_4$sum_X_train_filtered, var.equal = FALSE) #significant; p = 3.851e-08
t.test(stats_IPI_healthy$sum_X_train_filtered, stats_IPI_5$sum_X_train_filtered, var.equal = FALSE) #not significant; p = 0.1411

# median; MODELED FEATURES
t.test(stats_IPI_healthy$median_X_train_filtered, stats_IPI_0$median_X_train_filtered, var.equal = FALSE) # significant; p = 2.27e-07
t.test(stats_IPI_healthy$median_X_train_filtered, stats_IPI_1$median_X_train_filtered, var.equal = FALSE) # significant; p = 0.0006568
t.test(stats_IPI_healthy$median_X_train_filtered, stats_IPI_2$median_X_train_filtered, var.equal = FALSE) # significant; p = 5.456e-07
t.test(stats_IPI_healthy$median_X_train_filtered, stats_IPI_3$median_X_train_filtered, var.equal = FALSE) # significant; p = 3.037e-06
t.test(stats_IPI_healthy$median_X_train_filtered, stats_IPI_4$median_X_train_filtered, var.equal = FALSE) # significant; p = 0.0002235
t.test(stats_IPI_healthy$median_X_train_filtered, stats_IPI_5$median_X_train_filtered, var.equal = FALSE) # not significant; p = 0.1051

# median
# NOTE(review): same calls as the "MODELED FEATURES" group above; the heading
# does not say which feature set these recorded p-values belong to.
t.test(stats_IPI_healthy$median_X_train_filtered, stats_IPI_0$median_X_train_filtered, var.equal = FALSE) # significant; p = 4.951e-08
t.test(stats_IPI_healthy$median_X_train_filtered, stats_IPI_1$median_X_train_filtered, var.equal = FALSE) # significant; p = 0.001438
t.test(stats_IPI_healthy$median_X_train_filtered, stats_IPI_2$median_X_train_filtered, var.equal = FALSE) # significant; p = 3.861e-08
t.test(stats_IPI_healthy$median_X_train_filtered, stats_IPI_3$median_X_train_filtered, var.equal = FALSE) # significant; p = 5.936e-07
t.test(stats_IPI_healthy$median_X_train_filtered, stats_IPI_4$median_X_train_filtered, var.equal = FALSE) # significant; p = 0.0001509
t.test(stats_IPI_healthy$median_X_train_filtered, stats_IPI_5$median_X_train_filtered, var.equal = FALSE) # not significant; p = 0.0865


# FOR RANDOM SAMPLE
# NOTE(review): X_train_attributes_subset and its Mean_Median column are not
# created anywhere in the cells above; they presumably come from a
# random-feature analysis run elsewhere - confirm they exist before running.
# NOTE(review): the reference group is selected with Stage_Group == "healthy",
# whereas the groups above use "non-lymphoma"; verify which label
# X_train_attributes_subset actually carries, otherwise this subset is empty.
# NOTE(review): the recorded p-values below are identical to the "median" group
# above and look copy-pasted rather than re-recorded - TODO confirm.
#stats; IPI
stats_IPI_0 <- X_train_attributes_subset[X_train_attributes_subset$IPI == "0",]
stats_IPI_1 <- X_train_attributes_subset[X_train_attributes_subset$IPI == "1",]
stats_IPI_2 <- X_train_attributes_subset[X_train_attributes_subset$IPI == "2",]
stats_IPI_3 <- X_train_attributes_subset[X_train_attributes_subset$IPI == "3",]
stats_IPI_4 <- X_train_attributes_subset[X_train_attributes_subset$IPI == "4",]
stats_IPI_5 <- X_train_attributes_subset[X_train_attributes_subset$IPI == "5",]
stats_IPI_healthy <- X_train_attributes_subset[X_train_attributes_subset$Stage_Group == "healthy",]

# median
t.test(stats_IPI_healthy$Mean_Median, stats_IPI_0$Mean_Median, var.equal = FALSE) # significant; p = 4.951e-08
t.test(stats_IPI_healthy$Mean_Median, stats_IPI_1$Mean_Median, var.equal = FALSE) # significant; p = 0.001438
t.test(stats_IPI_healthy$Mean_Median, stats_IPI_2$Mean_Median, var.equal = FALSE) # significant; p = 3.861e-08
t.test(stats_IPI_healthy$Mean_Median, stats_IPI_3$Mean_Median, var.equal = FALSE) # significant; p = 5.936e-07
t.test(stats_IPI_healthy$Mean_Median, stats_IPI_4$Mean_Median, var.equal = FALSE) # significant; p = 0.0001509
t.test(stats_IPI_healthy$Mean_Median, stats_IPI_5$Mean_Median, var.equal = FALSE) # not significant; p = 0.0865

- Makes sense that the trends are not significant, as the directionality of the scaled values is in both directions; need a better way to look at this.
- Consider doing this analysis again but for only modeled features which have a positive logFC in the volcano plot.
- If this doesn't work, stick with the median because there is a significant difference between an IPI of 1 and 5 across the modeled features.

### ANOVA across correlation coefficients for the different IPI groups

In [None]:
# Compare median_X_train_filtered across (grouped) IPI categories with a one-way ANOVA and
# its non-parametric counterpart (Kruskal-Wallis).

# create matrix of samples from each IPI group
matrix_data <- X_train_attributes[, c("IPI_grouped", "median_X_train_filtered")] # for grouped IPI
# matrix_data <- X_train_attributes[,c("IPI","median_X_train_filtered")] # for individual IPI

# Keep lymphoma samples only. `%in% c(NA, "NA")` drops both real missing values and the
# literal string "NA"; the previous `== "NA"` comparison returned NA for real missing values,
# which produces all-NA rows when used as a row index.
matrix_data_lymphoma <- matrix_data[!(matrix_data$IPI_grouped %in% c(NA, "NA")), ] # for grouped IPI
# matrix_data_lymphoma <- matrix_data[!(matrix_data$IPI %in% c(NA, "NA")), ] # for individual IPI
# matrix_data_lymphoma_no_IPI5 <- matrix_data[!(matrix_data$IPI %in% c(NA, "NA", "5")), ]

# ANOVA
# The grouping column selected above is IPI_grouped; the former formula referenced `IPI`,
# which is not a column of matrix_data, and the third call used matrix_data_lymphoma_no_IPI5,
# which is only created in a commented-out line. Alternatives are kept commented for reference.
# result_anova <- aov(median_X_train_filtered ~ IPI_grouped, data = matrix_data) # all samples
result_anova <- aov(median_X_train_filtered ~ IPI_grouped, data = matrix_data_lymphoma)
# result_anova <- aov(median_X_train_filtered ~ IPI, data = matrix_data_lymphoma_no_IPI5) # needs the individual-IPI matrix
summary(result_anova)

# alternative; kruskal.test (non-parametric)
# kruskal.test(median_X_train_filtered ~ IPI, data = matrix_data_lymphoma)
kruskal.test(median_X_train_filtered ~ IPI_grouped, data = matrix_data_lymphoma)
# kruskal.test(median_X_train_filtered ~ IPI, data = matrix_data_lymphoma_no_IPI5)





# EXTRA
# # Kruskal-Wallis (doesn't work)
# matrix_data_lymphoma$IPI <- as_factor(matrix_data_lymphoma$IPI)
# result_kw <- kruskal.test(median_X_train_filtered ~ IPI, data = matrix_data_lymphoma)
# summary(result_kw)

# Fisher's Exact Test
# NOTE(review): this section does not look runnable as written --
#   * `df` is never assigned in the visible part of this notebook; unless it is created
#     elsewhere, `df` resolves to the F-distribution density function from stats.
#   * the table crosses IPI with median_X_train_filtered, which is a continuous value, so it
#     presumably has one column per distinct value rather than a small set of categories.
#   * `contingency_table[-ipi_group, ]` negates `ipi_group`; this only works if IPI is numeric
#     (then it is a positional index), and fails if IPI is character -- confirm.
#   * `c(current_group, other_groups)` is reshaped with nrow = 2 without checking that the
#     result is the intended 2x2 table.
# Create a table for the contingency data
contingency_table <- table(df$IPI, df$median_X_train_filtered)

# Perform Fisher's Exact Test for each IPI group against all others
results <- lapply(unique(df$IPI), function(ipi_group) {
  # Create a 2x2 contingency table for the current IPI group vs. all others
  current_group <- contingency_table[ipi_group, ]
  other_groups <- rowSums(contingency_table[-ipi_group, ])
  contingency <- matrix(c(current_group, other_groups), nrow = 2)
  
  # Perform Fisher's Exact Test
  fisher_result <- fisher.test(contingency)
  
  # Store the result along with the IPI group name
  result <- list(IPI_Group = ipi_group, Fisher_Test_Result = fisher_result)
  
  return(result)
})

# Combine results into a data frame
# (each element is a two-item list, so rbind yields a list-matrix with one row per IPI group)
results_df <- do.call(rbind, results)

# Print the results
print(results_df)

### Subset to features where a positive logFC was observed for differential analysis

In [None]:
# subset to the modeled features with a positive logFC in the volcano plot (labelled
# "hypermethylated" elsewhere in this notebook), measure sum and median of these features and
# see whether the trends improve

# SUBSET TO PREDICTORS WITH A POSITIVE logFC
predictors_rf_hyper <- res[rownames(res) %in% predictors_rf & res$table$logFC > 0, ]

# Keep only the feature IDs. Later cells use predictors_rf_hyper on the right-hand side of
# `%in%` against row names / bin IDs, which requires a vector of IDs rather than the subsetted
# result object.
predictors_rf_hyper <- rownames(predictors_rf_hyper)

# advance to first chunk in this heading


### Random sampling of X number of features over 1000 iterations, and visualization

In [None]:
# sub chunk of previous chunk, take 1000 random samples of the values from the matrix and visualize the sum of these. Compare to the modeled features.

# Number of random draws, and number of features per draw. Adjust n_random_features to the
# size of the feature set being compared against (previously used values: 5056, 5328).
n_iterations <- 1000
n_random_features <- 2850

# NOTE: sample() is not seeded here, so the draws differ between sessions; call set.seed()
# before this cell if the result needs to be reproducible.
# NOTE(review): features are drawn from rownames(X_train) but looked up in cpm_norm_log2;
# this assumes both objects share feature IDs as row names -- confirm.

# Preallocate the list that stores one data frame per iteration
results_list <- vector("list", n_iterations)

for (iteration in seq_len(n_iterations)) {
  # Randomly select the requested number of features
  selected <- sample(rownames(X_train), n_random_features, replace = FALSE)

  # Subset the matrix to the selected features (rows = features, columns = samples)
  subset_matrix <- cpm_norm_log2[rownames(cpm_norm_log2) %in% selected, ]

  # Add the lowest scaled count in the matrix to each of the counts to remove zeros
  # test <- min(subset_matrix)
  # subset_matrix <- subset_matrix + abs(test) + 0.001

  # Per-sample median and sum over the random subset of features
  median_X_train_filtered_subset <- apply(subset_matrix, MARGIN = 2, FUN = median)
  sum_X_train_filtered_subset <- colSums(subset_matrix)

  # Record the sample name explicitly. It used to be recovered afterwards from the row names
  # of the combined data frame, which depends on how bind_rows() de-duplicates row names.
  iteration_results <- data.frame(
    Median = median_X_train_filtered_subset,
    Sum = sum_X_train_filtered_subset,
    SampleName = str_extract(colnames(subset_matrix), ".*K27me3"),
    row.names = NULL
  )

  results_list[[iteration]] <- iteration_results
}

# Combine the results from all iterations into a single data frame
# (columns: Iteration, Median, Sum, SampleName)
combined_results <- bind_rows(results_list, .id = "Iteration")

# Mean of the per-iteration median and sum for every sample.
# The output has one row per sample, sorted by SampleName (not in matrix column order).
mean_values <- combined_results %>%
  group_by(SampleName) %>%
  summarise(Mean_Median = mean(Median),
            Mean_Sum = mean(Sum))




## append new values to metadata for visualization
# subset sample metadata to training samples, then visualize expression over the random features
X_train_attributes_subset <- sample_list[sample_list$sample_name %in% colnames(cpm_norm), ]

# Bind the averaged median / sum to the metadata, aligned by sample name.
# mean_values comes out of group_by() + summarise() sorted by SampleName, so a purely positional
# cbind() can attach values to the wrong sample whenever sample_list is not in that same order.
mean_value_rows <- match(X_train_attributes_subset$sample_name, mean_values$SampleName)
stopifnot(!anyNA(mean_value_rows)) # every training sample must have an averaged value
X_train_attributes_subset <- cbind(
  X_train_attributes_subset,
  as.data.frame(mean_values[mean_value_rows, ])
) # Subset this data frame for plotting.

# normalize data by cfDNA concentrations. Once data is normalized visualize variance, distribution of expression levels, etc.
# Uses Mean_Median (the average over all random iterations). The previous expression referenced
# median_X_train_filtered_subset, which is not a column of this data frame and therefore resolved
# to the vector left over from the final loop iteration only.
# NOTE(review): clinical_metadata is combined by position; this assumes its rows are in the same
# order as X_train_attributes_subset -- confirm.
X_train_attributes_subset <- X_train_attributes_subset %>%
  mutate(median_X_train_filtered_subset_cfDNA_adj = Mean_Median / clinical_metadata$`Plasma cfDNA concentration (ng/uL)`)

X_train_attributes_subset$ANN_ARBOR_STAGE[is.na(X_train_attributes_subset$ANN_ARBOR_STAGE)] <- "healthy" # make NA = healthy for Ann Arbor stage.
X_train_attributes_subset$IPI <- as.character(X_train_attributes_subset$IPI) # convert IPI from int to char

# For tumor diameter, create bins (<5cm, 5-10cm, 10-15cm, >15cm), then assign samples to those bins based off of tumor diameter; separate column
# (values of 0 or above 30 fall outside the breaks and become NA)
X_train_attributes_subset <- X_train_attributes_subset %>%
  mutate(tumor_diameter_binned = cut(TUMOR_MASS_MAX_CM, breaks = c(0, 5, 10, 15, 30), labels = c("<5", "5-10", "10-15", ">15")))

# combine Stage I/II and III/IV samples into early and late stage, respectively, for visualization
X_train_attributes_subset <- X_train_attributes_subset %>%
  mutate(Stage_Group = case_when(
    ANN_ARBOR_STAGE %in% c(1, 2) ~ "Early Stage",
    ANN_ARBOR_STAGE %in% c(3, 4) ~ "Late Stage",
    TRUE ~ as.character(ANN_ARBOR_STAGE) # Keep other values as is (e.g. "healthy")
  ))

# set factor level for visualization
X_train_attributes_subset$Stage_Group <- factor(X_train_attributes_subset$Stage_Group, levels = c("healthy", "Early Stage", "Late Stage"))






## RUN IF RANDOM SUBSET WAS ALREADY GENERATED, import
setwd("/cluster/projects/scottgroup/people/steven/human_plasma_cfChIP-Seq/liberate_plasma_cfchip/analysis/machine_learning_models/02_07_2023_RFE_RF_10kb_windows/differential_analysis_filter/r_objects")
load("26_09_2023_1000_iterations_random_subset_2850_features_after_scaling.RData")

# add mean VAF to this data frame from clinical_metadata. Sample order is assumed to be the same.
# The column is assigned by name rather than cbind()-ed: the object saved at the end of this cell
# already contains mean_VAF, so cbind() would add a second, duplicated column after re-loading it.
mean_VAF <- clinical_metadata$mean_VAF
X_train_attributes_subset$mean_VAF <- mean_VAF

# Boxplot of the averaged per-sample median over the random feature subsets, by stage group,
# with points coloured by mean VAF.
p <- X_train_attributes_subset %>%
  ggplot(aes(x = Stage_Group, y = Mean_Median)) + # change to sum or median, depending on what you're visualizing
  #ggplot(aes(x = Stage_Group, y = sum_X_train_filtered, colour = Stage_Group)) +
  #ggplot(aes(x = as.character(NB_EN_SITES), y = sum_X_train_filtered, colour = as.character(NB_EN_SITES))) +
  geom_boxplot(aes(fill = Stage_Group), alpha = 0.3) +
  geom_point(aes(colour = mean_VAF), position = position_jitterdodge(0.2), size = 7, alpha = 0.6) +
  theme_minimal(base_size = 18) +
  #scale_color_gradient(low = "#185a9d", high = "#43cea2") +
  scale_color_gradient(low = "blue", high = "red") +
  labs(x = "Stage Group",
       # x = "IPI",
       # the y aesthetic is Mean_Median, so the label names the median (it previously said "sum")
       y = "mean of 1000 iterations(median(Feature Counts of 2850 Random Features))",
       # y = "mean of 1000 iterations(sum(Feature Counts of 2850 Random Features))", # when plotting Mean_Sum
       colour = "mean VAF",
       fill = "Stage Group") +
  # The title is now chained with `+`; previously labs() ended the expression, so ggtitle() was
  # evaluated as a separate statement and never added to the plot.
  #ggtitle("Median Hypermethylated Counts of Differential Features")
  #ggtitle("Median of Counts over Features")
  #ggtitle("Sum of Counts over Modeled Features")
  ggtitle("Median of Counts over Random Features")
print(p)

# save this iteration as an R Object for visualization later, along with the individual values for mean and median.
# NOTE: this overwrites the .RData file loaded at the top of this cell.
setwd("/cluster/projects/scottgroup/people/steven/human_plasma_cfChIP-Seq/liberate_plasma_cfchip/analysis/machine_learning_models/02_07_2023_RFE_RF_10kb_windows/differential_analysis_filter/r_objects")
save(X_train_attributes_subset, file = "26_09_2023_1000_iterations_random_subset_2850_features_after_scaling.RData")
save(combined_results, file = "26_09_2023_1000_iterations_random_subset_2850_features_after_scaling_rawdata.RData")

### Incorporate expression of K27me3 over modeled features in T3 (post-treatment) samples

In [None]:
# incorporate T3 samples into the numeric matrix
# see introductory chunk in this document for details
# for this to work, you need to normalize all the data in the same way, and perform the same train/test split for visualization.
# CPM normalization and scaling to be performed on all values as well.
# scaling is applied by column, where the mean and SD is calculated per column.
# Same with CPM normalization. This is good, so you can apply the two primary normalizations this way.

# what you can do is 1) create the matrix w all samples, 2) subset to all features in original data matrix 
# after pre-processing steps, and 3) apply the two normalizations before subsetting to the hypermethylated 
# modeled features (since the normalizations are applied column-wise).


# load original dataset from modeling, for reference and subsetting to relevant samples
# setwd("/cluster/projects/scottgroup/people/steven/human_plasma_cfChIP-Seq/liberate_plasma_cfchip/analysis/machine_learning_models/02_07_2023_RFE_RF_10kb_windows/differential_analysis_filter/")
# load("10_09_2023_rf_10kb_with_hyperparameter_tuning_accuracyevaluation_withdifferentialfilter.RData")


# Navigate to directory with new annotated list. Import list to R session with differential features.
setwd("/cluster/projects/scottgroup/people/steven/reference_sets/other_references")
annotated_ref_no_X_Y_M_blacklist <- read.delim("10kb_bin_genome_wide_annotated_reference_with_regulatory_features_no_X_Y_M_ENCODEBlacklist.txt", header = TRUE, sep = "\t")
#annotated_ref_no_X_Y_M_blacklist <- read.delim("100kb_bin_genome_wide_annotated_reference_with_regulatory_features_no_X_Y_M_ENCODEBlacklist.txt", header = TRUE, sep = "\t") # for 100kb

# CREATE SAMPLE MATRIX (numeric counts matrix, one column per sample)
setwd("/cluster/home/sdemichi/steven_scottgroup.txt/human_plasma_cfChIP-Seq/liberate_plasma_cfchip/temp/")
# NOTE(review): the file name says T3 samples are included; an earlier comment here said "without T3".
sample_list <- read.delim("15_05_2023_sample_list_edgeR_without_basic_exclusions_including_T3_outliers_T5_no_pbmc.txt", header = TRUE, sep = "\t")

gc() # clear memory; these data frames are large
setwd("/cluster/home/sdemichi/steven_scottgroup.txt/human_plasma_cfChIP-Seq/liberate_plasma_cfchip/")
import_counts <- list.files(path = ".", pattern = "counts_10kb.txt$", recursive = TRUE) # list counts data, 10kb windows
exclude <- c("0018_T3", "0025_T4", "0027_T3", "0027_T4", "0101_T3", "0030_T1", "HUCON_37", "HUCON_43", "HUCON_44", "0036_C", "0247_T1_ABC", "pbmc_k27") # character vector of sample strings to be excluded.

# look into why you excluded "0036_C" and "0247_T1_ABC" from the prior analysis. It works without them which is good.
# for now, remove them from the analysis
sl_exclude <- c("liberate_0036_C_K27me3", "liberate_0247_T1_ABC_K27me3")
sample_list <- sample_list[!(sample_list$sample_name) %in% sl_exclude, ]

# Remove all excluded samples from the original list of files; these are samples without baseline profiles, and T5. Only T1 and T3 should be included.
# The exclusion strings are collapsed into one regular expression ("a|b|c") and matched against
# the file paths; import_counts stays a plain character vector (no data.frame round trip needed).
import_counts <- import_counts[!grepl(paste(exclude, collapse = "|"), import_counts)] # this will be a helpful line for the future.

counts <- mapply(read.delim, import_counts) # loop over files in import_counts, and read them into memory
counts_matrix <- do.call(cbind, counts) # bind individual datasets across columns

# Column names are assigned by position, so the number of count columns must equal the number of
# rows in sample_list.
# NOTE(review): this also assumes the files are listed in the same order as the rows of sample_list -- confirm.
if (ncol(counts_matrix) != nrow(sample_list)) {
  stop("Number of count columns (", ncol(counts_matrix), ") does not match the number of samples in sample_list (", nrow(sample_list), ")", call. = FALSE)
}
colnames(counts_matrix) <- as.character(sample_list$sample_name) # change column names to sample names from list

# index rows in counts_matrix (bin IDs 1..n); seq_len() is safe even for an empty matrix
rownames(counts_matrix) <- seq_len(nrow(counts_matrix))

gc()


## code from here
# Restrict the counts matrix to the features that remained in the original dataset after filtering.
all_filtered_features_including_T3 <- counts_matrix[
  is.element(rownames(counts_matrix), rownames(all_filtered_features)),
]

# Samples: keep only patients that are part of the training set (their T1 and matched T3 samples);
# no test-set T3 should be included. The sample list is filtered by PATIENT_ID first, then the
# matrix columns are filtered by the remaining sample names.
sample_list <- sample_list[
  is.element(sample_list$PATIENT_ID, X_train_attributes$PATIENT_ID),
]
all_filtered_features_including_T3 <- all_filtered_features_including_T3[
  ,
  is.element(colnames(all_filtered_features_including_T3), sample_list$sample_name)
]

# log2(CPM + 1): counts-per-million normalization followed by the log transform, in one step.
# NOTE(review): the input is presumably a plain matrix rather than a DGEList, in which case
# `normalized.lib.sizes` may have no effect -- confirm against the edgeR documentation.
all_filtered_features_including_T3 <- log2(
  cpm(all_filtered_features_including_T3, normalized.lib.sizes = TRUE, log = FALSE) + 1
)

#X_train_filtered <- cpm_norm_scaled[(rownames(cpm_norm_scaled)) %in% predictors_rf,]
# Keep the modeled features with positive logFC (hypermethylated features).
T3_modeled_features <- all_filtered_features_including_T3[
  is.element(rownames(all_filtered_features_including_T3), predictors_rf_hyper),
]



# ## SANITY CHECK; check that the counts over modeled features in T3_modeled_features, for samples shared with X_train_filtered
# X_train_filtered <- cpm_norm_scaled[(rownames(cpm_norm_scaled)) %in% predictors_rf_hyper,]

# test <- T3_modeled_features[,colnames(T3_modeled_features) %in% colnames(X_train_filtered)]
# identical(test, X_train_filtered) # there are differences. FALSE produced.

# # find values where differences occur between the two matrices.
# differences <- test - X_train_filtered # almost every value is different. Why? They should be the same since they were processed in the same way. There's a problem somewhere.

# identical(rownames(T3_modeled_features), rownames(T3_modeled_features)) # correct feature subset
# identical(colnames(T3_modeled_features), colnames(T3_modeled_features)) # correct sample subset

# identical(all_filtered_features_including_T3, tmm_norm$counts) # correct set of features, and feature expression before CPM normalization
# identical(all_filtered_features_including_T3, cpm_norm_scaled) # not the same, scaling may be different. But the matrices do look the same using head().

# identical(T3_modeled_features, X_train_filtered) # FALSE. Changes occuring during the scaling step, I believe.

# differences <- T3_modeled_features - X_train_filtered # the differences are at a rounding level, there are essentially no differences between the two matrices.

# max(differences) #1.4210854715202e-14, basically negligable.

# confirmed, difference has to be the addition of min to the matrix.














# import clinical metadata
setwd("/cluster/home/sdemichi/steven_scottgroup.txt/human_plasma_cfChIP-Seq/liberate_plasma_cfchip/temp/")
clinical_metadata <- readxl::read_xlsx("25_07_2023_cohort_metadata_labeling_UPDATED_WITH_CLINICAL_ANNOTATIONS_plus_library_size_cfdna_concn.xlsx") %>%
  as.data.frame()

# import mean VAF information
setwd("/cluster/projects/scottgroup/people/steven/human_plasma_cfChIP-Seq/liberate_plasma_cfchip/other_profiles/cappseq_data_lymphoma/updated_batch_2_all_cases/")
mean_VAF_burden <- read.delim("Bratman.dcs.uniq.burden.txt", header = TRUE, sep = "\t") # for dcs.unique

# Donor IDs use spaces where mean_VAF_burden$Sample_id uses underscores; harmonise them so the
# two tables can be matched directly.
clinical_metadata$`Donor ID (rep)` <- gsub(" ", "_", clinical_metadata$`Donor ID (rep)`, fixed = TRUE)

# Look up the mean VAF for every donor (NA where no burden entry exists).
# Hopefully some of the T3 burden data comes back soon from OICR.
clinical_metadata$mean_VAF <- mean_VAF_burden$mean_VAF[
  match(clinical_metadata$`Donor ID (rep)`, mean_VAF_burden$Sample_id)
]

# match annotations from X_train to the new clinical annotations. Append cfDNA concentrations.
#all_filtered_features_X_train <- all_filtered_features[,colnames(all_filtered_features) %in% colnames(X_train)]


## SUBSET TO ONLY SAMPLES IN TRAINING SET; continue from here

# Derive a `Bioinformatic Name` column for the sample list by stripping the "liberate_" prefix
# and "_K27me3" suffix, matching the format of clinical_metadata$`Bioinformatic Name`.
sample_list <- sample_list %>%
  mutate(`Bioinformatic Name` = sub("^liberate_(.*)_K27me3$", "\\1", sample_name))

# Sample metadata for the samples present in the modeled-feature matrix.
#### TROUBLESHOOT; ONE OF THESE STEPS IS CREATING DUPLICATE ENTRIES IN clinical_metadata$`Bioinformatic Name`
training_sample_metadata <- sample_list[
  is.element(sample_list$sample_name, colnames(T3_modeled_features)),
]

# Keep only the clinical metadata rows whose name appears in the training sample metadata.
clinical_metadata <- clinical_metadata[
  is.element(clinical_metadata$`Bioinformatic Name`, training_sample_metadata$`Bioinformatic Name`),
]

# Sample metadata for the samples present in the full normalized matrix.
X_train_attributes_plus_T3 <- sample_list[
  is.element(sample_list$sample_name, colnames(all_filtered_features_including_T3)),
]

# Reorder clinical_metadata to follow the sample order of X_train_attributes_plus_T3
# (match() picks the first hit, so only one row per name is retained).
clinical_metadata <- clinical_metadata[
  match(X_train_attributes_plus_T3$`Bioinformatic Name`, clinical_metadata$`Bioinformatic Name`),
]

# From here on the clinical metadata table replaces the sample-list metadata.
X_train_attributes_plus_T3 <- clinical_metadata











# calculate the median of each column (i.e. over the modeled features, for each sample)
median_X_train_filtered_plus_T3 <- apply(T3_modeled_features,
                                         MARGIN = 2,
                                         FUN = median)

# include the sums of the features per sample as an additional feature, and visualize
sum_X_train_filtered_plus_T3 <- colSums(T3_modeled_features)

# bind the newly calculated medians and sums to the pre-existing metadata
# NOTE(review): binding is positional; it assumes the metadata rows are in the same order as the
# columns of T3_modeled_features -- confirm.
X_train_attributes_plus_T3 <- as.data.frame(cbind(
  X_train_attributes_plus_T3,
  median_X_train_filtered_plus_T3 = median_X_train_filtered_plus_T3,
  sum_X_train_filtered_plus_T3 = sum_X_train_filtered_plus_T3
)) # Subset this data frame for plotting.

##### METADATA-SPECIFIC CHANGES BEFORE PLOTTING #####
# Samples without an Ann Arbor stage are the non-lymphoma controls. Both the literal string "NA"
# and real missing values are relabelled; `== "NA"` alone returns NA for real missing values,
# which ifelse() would pass through as NA.
X_train_attributes_plus_T3$ANN_ARBOR_STAGE <- ifelse(
  X_train_attributes_plus_T3$ANN_ARBOR_STAGE %in% c(NA, "NA"),
  "non-lymphoma",
  X_train_attributes_plus_T3$ANN_ARBOR_STAGE
)
X_train_attributes_plus_T3$IPI <- as.character(X_train_attributes_plus_T3$IPI) # convert IPI from int to char

X_train_attributes_plus_T3$mean_VAF <- as.numeric(X_train_attributes_plus_T3$mean_VAF) # make sure mean VAF is numeric (continuous colour scale)


# For tumor diameter, create bins (<5cm, 5-10cm, 10-15cm, >15cm), then assign samples to those bins based off of tumor diameter; separate column
X_train_attributes_plus_T3$TUMOR_MASS_MAX_CM <- as.numeric(X_train_attributes_plus_T3$TUMOR_MASS_MAX_CM)
X_train_attributes_plus_T3 <- X_train_attributes_plus_T3 %>%
  mutate(tumor_diameter_binned = cut(TUMOR_MASS_MAX_CM, breaks = c(0, 5, 10, 15, 30), labels = c("<5", "5-10", "10-15", ">15")))

# combine Stage I/II and III/IV samples into early and late stage, respectively, for visualization
X_train_attributes_plus_T3 <- X_train_attributes_plus_T3 %>%
  mutate(Stage_Group = case_when(
    ANN_ARBOR_STAGE %in% c(1, 2) ~ "Early Stage",
    ANN_ARBOR_STAGE %in% c(3, 4) ~ "Late Stage",
    TRUE ~ as.character(ANN_ARBOR_STAGE) # Keep other values as is
  ))

# set factor level for Stage_Group
X_train_attributes_plus_T3$Stage_Group <- factor(X_train_attributes_plus_T3$Stage_Group, levels = c("non-lymphoma", "Early Stage", "Late Stage"))

# add another column representing whether the samples are baseline or post-treatment
# (conditions are evaluated in order; donor IDs matching none of them become NA)
X_train_attributes_plus_T3 <- X_train_attributes_plus_T3 %>%
  mutate(time_of_collection = case_when(
    str_detect(`Donor ID (rep)`, "HUCON|_C") ~ "Non-Lymphoma",
    str_detect(`Donor ID (rep)`, "T1") ~ "Baseline",
    str_detect(`Donor ID (rep)`, "T3|T5") ~ "Post-Treatment"
  ))

# visualize with T3 samples
# Boxplot of the per-sample median over the modeled features by time of collection, with points
# coloured by mean VAF.
p <- X_train_attributes_plus_T3 %>%
  ggplot(aes(x = time_of_collection, y = median_X_train_filtered_plus_T3)) + # change to sum or median, depending on what you're visualizing
  #ggplot(aes(x = Stage_Group, y = sum_X_train_filtered, colour = Stage_Group)) +
  #ggplot(aes(x = as.character(NB_EN_SITES), y = sum_X_train_filtered, colour = as.character(NB_EN_SITES))) +
  geom_boxplot(aes(fill = time_of_collection), alpha = 0.3) +
  geom_point(aes(colour = mean_VAF), position = position_jitterdodge(0.2), size = 7, alpha = 0.6) +
  theme_minimal(base_size = 18) +
  #scale_color_gradient(low = "#185a9d", high = "#43cea2") +
  scale_color_gradient(low = "blue", high = "red") +
  # Labels follow the mapped variables: x and fill are time_of_collection, colour is mean_VAF
  # (both x and colour were previously labelled "Stage Group").
  labs(x = "Time of Collection",
       #x = "IPI",
       #y = "Sum of Scaled Feature Counts",
       y = "Median of Hypermethylated Feature Expression",
       colour = "mean VAF",
       fill = "Time of Collection") +
  # The title is now chained with `+`; previously labs() ended the expression, so ggtitle() was
  # evaluated as a separate statement and never added to the plot.
  #ggtitle("Median Hypermethylated Counts of Differential Features")
  #ggtitle("Median of Counts over Features")
  #ggtitle("Sum of Counts over Modeled Features")
  ggtitle("Median of Counts over Modeled Features")
print(p)

- 29-09-2023 work in progress. Figured out how to subset the matrix to all training T1 and associated T3 samples.
- Next step is to perform all the boxplot transformations as performed before, and visualizing non-lymphoma, T1 and T3.
- 04-01-2024; worked fine with log2(CPM+1).
- NEED TO SUBSET TO ONLY TRAINING SET CASES!

### 12-10-2023 Feature Importance for Modeled Features

In [None]:
# use the VarImp function to measure the importance of each of the variables on classification performance.
# see if there is a subset of features that are particularly important.

# import modeling data
setwd("/cluster/projects/scottgroup/people/steven/human_plasma_cfChIP-Seq/liberate_plasma_cfchip/analysis/machine_learning_models/02_07_2023_RFE_RF_10kb_windows/differential_analysis_filter/")
load("10_09_2023_rf_10kb_with_hyperparameter_tuning_accuracyevaluation_withdifferentialfilter.RData")

# in case you're interested in only the hypermethylated features for this; SUBSET TO PREDICTORS WITH A POSITIVE logFC
# predictors_rf_hyper <- res[rownames(res) %in% predictors_rf & res$table$logFC > 0,]
# predictors_rf_hyper <- rownames(predictors_rf_hyper)

# check feature importance. Visualize the most important features using a bar plot.
importance <- varImp(model, scale = FALSE) # try with and without scaling
# importance <- importance[order(importance$importance$Overall, decreasing = TRUE), ]

# one row per feature: feature ID and its overall importance
variable_importance <- data.frame(feature = row.names(importance$importance),
                                  importance = importance$importance$Overall)

# sort by decreasing variable importance
variable_importance <- variable_importance[order(variable_importance$importance, decreasing = TRUE), ]

# subset to top 5% of important features (9370*0.05 = 469)
# head() returns at most n_top_features rows; indexing with 1:469 would pad the result with
# all-NA rows whenever the model has fewer than 469 features.
n_top_features <- 469
variable_importance_subset <- head(variable_importance, n_top_features)

# plot variables (bars ordered by decreasing importance; legend hidden as every bar is its own feature)
p <- variable_importance_subset %>%
  ggplot(aes(x = reorder(feature, -importance), y = importance, fill = feature)) +
  geom_bar(stat = "identity") +
  labs(x = "Features", y = "Variable Importance") +
  # geom_text(aes(label = round(importance, 2)), vjust=1.6, color="white", size=4) +
  theme_minimal() +
  theme(legend.position = "none")
print(p)



# perform visualizations. Subset to top 5% by feature importance, and visualize these on the volcano plot. See if they are the
# features with higher effect size.
de_features_subset_rf_highimportance <- res[(rownames(res)) %in% variable_importance_subset$feature, ]



- Keep in mind, variable importance may not be additive. The sum of output of VarImp does not equal 1.

### Enrichment of modeled features in regulatory elements, genome wide

In [None]:
# See where modeled features are enriched
# load annotation set
setwd("/cluster/projects/scottgroup/people/steven/reference_sets/other_references")
annotated_ref_no_X_Y_M_blacklist <- read.delim("10kb_bin_genome_wide_annotated_reference_with_regulatory_features_no_X_Y_M_ENCODEBlacklist.txt", header = TRUE, sep = "\t")

# subset to modeled features

rf_hyper_annotations <- annotated_ref_no_X_Y_M_blacklist[annotated_ref_no_X_Y_M_blacklist$bin_id %in% predictors_rf_hyper, ]
# test <- table(rf_hyper_annotations)

# Count how many modeled features fall into each category of the three annotation sets, and
# stack the counts into one long data frame (Var1 = category, Freq = count, annotation = set).
# The generic "other" category is renamed per set by value. It used to be renamed by fixed row
# position (row 5 / row 2), which relabels the wrong category -- or appends a new NA row --
# whenever a category is absent from the subset.
cpg_annot <- as.data.frame(table(rf_hyper_annotations$cpg_annotations))
cpg_annot$annotation <- c("cpg_annotations")
cpg_annot$Var1 <- as.character(cpg_annot$Var1)
cpg_annot$Var1[cpg_annot$Var1 == "other"] <- "not_cpg" # change "other" annotation to "not_cpg"

gene_annot <- as.data.frame(table(rf_hyper_annotations$genic_annotations))
gene_annot$annotation <- c("genic_annotations")

enhancer_annot <- as.data.frame(table(rf_hyper_annotations$enhancer_annotations))
enhancer_annot$annotation <- c("enhancer_annotations")
enhancer_annot$Var1 <- as.character(enhancer_annot$Var1)
enhancer_annot$Var1[enhancer_annot$Var1 == "other"] <- "not_enhancer" # change "other" annotation to "not_enhancer"

# combine into single annotation summary set
annotation_summary <- rbind(cpg_annot, gene_annot, enhancer_annot)

# set factor level for annotations
annotation_summary$annotation <- as.factor(annotation_summary$annotation)
# annotation_summary$Var1 <- unique(annotation_summary$Var1)

# visualize as a dot plot (one point per annotation category, coloured by annotation set)
annotation_summary %>%
  ggplot(aes(x = Var1, y = Freq, colour = annotation)) +
  geom_point(size = 10) +
  theme_minimal()



# Background distribution: draw random subsets of 2850 features and count how many fall into
# each annotation category, compiled in the same long format as the modeled-feature summary.
n_iterations <- 20 # increase (e.g. to 100) for a smoother background
n_random_features <- 2850 # adjust according to the number of desired features (previously: 5056, 5328)
annotation_columns <- c("cpg_annotations", "genic_annotations", "enhancer_annotations")

# Category levels are taken from the full reference, so a category that happens to be absent
# from a random draw is still reported with a count of zero and every iteration yields the
# same set of rows.
annotation_levels <- lapply(
  annotated_ref_no_X_Y_M_blacklist[annotation_columns],
  function(values) sort(unique(as.character(values)))
)

# Preallocate the list that stores one summary data frame per iteration
results_list <- vector("list", n_iterations)

for (iteration in seq_len(n_iterations)) {
  # Randomly select the requested number of features
  selected <- sample(rownames(X_train), n_random_features, replace = FALSE)

  # Subset the annotation reference to the selected bins
  subset_matrix <- annotated_ref_no_X_Y_M_blacklist[annotated_ref_no_X_Y_M_blacklist$bin_id %in% selected, ]

  # Count features per category for each annotation set (Var1 = category, Freq = count)
  annotation_summary_random <- bind_rows(lapply(annotation_columns, function(annotation_column) {
    category_counts <- table(factor(as.character(subset_matrix[[annotation_column]]),
                                    levels = annotation_levels[[annotation_column]]))
    data.frame(Var1 = names(category_counts),
               Freq = as.vector(category_counts),
               annotation = annotation_column)
  }))

  # Rename the generic "other" category per annotation set, by value rather than by row position
  is_other <- annotation_summary_random$Var1 == "other"
  annotation_summary_random$Var1[is_other & annotation_summary_random$annotation == "cpg_annotations"] <- "not_cpg"
  annotation_summary_random$Var1[is_other & annotation_summary_random$annotation == "enhancer_annotations"] <- "not_enhancer"

  # Keep the whole summary; storing only the Freq column loses the category labels and cannot
  # be combined across iterations.
  results_list[[iteration]] <- annotation_summary_random
}

# Combine the results from all iterations into a single data frame
# (columns: Iteration, Var1, Freq, annotation)
combined_results <- bind_rows(results_list, .id = "Iteration")

# Mean and standard deviation of the count per annotation category across all iterations
mean_values <- combined_results %>%
  group_by(annotation, Var1) %>%
  summarise(Mean_Freq = mean(Freq),
            SD_Freq = sd(Freq)) %>%
  ungroup()

















# visualize
hist(rf_hyper_annotations$bin_id, breaks = 22) # genome wide distribution of the modeled features (not scaled for genome size). Quick vis
# random iterations

# 20-10-2023 additional visualization; look at the genome-wide distribution of features by logFC using geom_segment.
# de_features_subset <- res[res$table$PValue <= 0.05,] # 9370 features
de_features_subset <- res[rownames(res) %in% predictors_rf, ] # all modeled features

de_features_subset_rownames <- rownames(de_features_subset)

# need to import annotation set without regions removed, so full length segments can be created. Remove X, Y, and M from the annotation set.
setwd("/cluster/projects/scottgroup/people/steven/reference_sets/other_references")
annotated_ref_no_X_Y_M <- read.delim("10kb_bin_genome_wide_annotated_reference_with_regulatory_features_no_X_Y_M.bed", header = TRUE, sep = "\t")

# add chromosomal lengths to the annotation set by taking the sum of the widths over each chromosome.
annotated_ref_no_X_Y_M <- annotated_ref_no_X_Y_M %>%
  group_by(chrom) %>%
  mutate(chrom_lengths = sum(width)) %>%
  ungroup()

# subset to ranges of these features in the annotation set.
all_sig_annotations <- annotated_ref_no_X_Y_M[annotated_ref_no_X_Y_M$bin_id %in% de_features_subset_rownames, ]

# sort de_features_subset by rowname, or order of bin ID.
de_features_subset <- de_features_subset[order(as.numeric(rownames(de_features_subset))), ]

# Align the annotation rows to the sorted features by bin ID before binding. A positional cbind()
# is only correct if the reference happens to be sorted by bin_id and contains every feature;
# the explicit match makes that requirement checked instead of assumed.
annotation_rows <- match(as.numeric(rownames(de_features_subset)), all_sig_annotations$bin_id)
stopifnot(!anyNA(annotation_rows)) # every modeled feature must have an annotation row
all_sig_annotations <- all_sig_annotations[annotation_rows, ]

combined_feature_expression_annotations <- cbind(de_features_subset, all_sig_annotations)

# set factor level for chromosome number.
combined_feature_expression_annotations$chrom <- factor(combined_feature_expression_annotations$chrom,
                                                        levels = paste0("chr", 1:22))

# add term for hypermethylated and hypomethylated features to a new column
# (features with logFC exactly 0 match neither condition and get NA)
combined_feature_expression_annotations <- combined_feature_expression_annotations %>%
  mutate(status = case_when(
    logFC > 0 ~ "hypermethylated",
    logFC < 0 ~ "hypomethylated"
  ))


# TROUBLESHOOT; TRY TO MAKE START A NEW VARIABLE

# Genome-wide view of the modeled features: one tile per feature, placed at its chromosome (x)
# and start coordinate (y), outlined by logFC on a blue-white-red diverging scale.
genome_base <- ggplot(combined_feature_expression_annotations) +
  # Alternative layers tried previously:
  # geom_bar(aes(x = chrom, y = chrom_lengths), # background; faulty, y-axis way too  big
  #         stat='identity',
  #         fill='grey80',
  #         colour='grey80',
  #         width=.2) +
  # geom_point(aes(x=chrom, y=start, colour = logFC), # Vis 1 (primary)
  #     size=1,
  #     alpha = 0.8,
  #     position=position_jitter(.4)) +
  # geom_segment(data = combined_feature_expression_annotations %>%
  #                 group_by(chrom) %>%
  #                 summarize(min_length = min(chrom_lengths), max_length = max(chrom_lengths)),
  #              aes(x = chrom, xend = chrom, y = min_length, yend = max_length),
  #              color = "black", linewidth = 10) +
  geom_tile(
    mapping = aes(x = chrom, y = start, colour = logFC), # Vis 1 (primary)
    linewidth = 1,
    alpha = 0.8
  ) +
  theme_minimal() +
  # no panel border or grid lines; chromosome labels rotated, without tick marks
  theme(
    panel.border = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    axis.text.x = element_text(angle = 90, hjust = 1),
    axis.ticks.x = element_blank()
  ) +
  labs(x = "Chromosome", y = "Chromosome Position") +
  scale_colour_gradient2(low = "blue", mid = "white", high = "red")
print(genome_base)




# Per-chromosome view that ignores chromosome size: every modeled feature is a jittered point at
# its logFC, coloured by hyper-/hypomethylated status. Note that this overwrites genome_base.
genome_base <- ggplot(data = combined_feature_expression_annotations) +
  geom_point(
    mapping = aes(x = chrom, y = logFC, colour = status),
    position = position_jitter(.3),
    size = 0.2,
    alpha = 0.5
  ) +
  theme_minimal() +
  # no panel border or grid lines; chromosome labels rotated
  theme(
    panel.border = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    axis.text.x = element_text(angle = 90, hjust = 1)
  ) +
  labs(x = "Chromosome", y = "logFC") #+
  # scale_colour_manual(values = c("hypermethylated" = "red", "hypomethylated" = "blue")) #+
  # scale_y_log10() # can't log scale as you have negative logFC
print(genome_base)




# extra
# Adds one segment per feature (start -> end on the y axis, coloured by logFC) on top of genome_base.
# NOTE(review): at this point genome_base is the logFC-vs-chromosome plot assigned just above
# (y = logFC, discrete colour = status). Adding segments with y = start and a continuous colour
# mixes genomic coordinates with logFC on the same axis and presumably conflicts with the
# discrete colour scale -- this layer looks intended for the chromosome-position version of
# genome_base; confirm and rebuild that plot first if so.
feature_expression_distribution <- genome_base + 
    geom_segment(data = combined_feature_expression_annotations, 
        aes(x=chrom, xend=chrom, y=start, yend=end, colour = logFC),
        linewidth = 2)
print(feature_expression_distribution)

- Problem with loop; if there is none of a particular annotation in a subset of annotations, then the entry won't be in the final annotation table.
- Need a way to include the annotation if it isn't present in the subset, with a zero.

### 03-10-2023 picard fragment length, short fragment enrichment in plasma H3K27me3 profiles

In [None]:
# use output from MEDIPIPE to calculate the abundance of short fragments
# Goal: plot the picard output in a single figure to get an idea of the insert
# sizes/fragment lengths and whether there is a shift to the left in cancer.
setwd("/cluster/projects/scottgroup/people/steven/human_plasma_cfChIP-Seq/liberate_plasma_cfchip/")

# recursively collect every picard insert-size metrics file below the working directory
k27me3_profiles <- list.files(path = ".", pattern = "insert_size_metrics.txt", recursive = TRUE)
if (length(k27me3_profiles) == 0) {
  # fail here rather than later inside a plot call with a confusing message
  stop("No insert_size_metrics.txt files found under ", getwd(), call. = FALSE)
}

all_rows <- data.frame(insert_size = seq(1, 1000)) # dummy frame to pad every profile to a fixed number of rows

# Read each profile (histogram section starts after 10 skipped lines), tag it
# with its file path, and convert raw counts to per-sample frequencies.
# Profiles are collected in a list and bound once, instead of growing a data
# frame with rbind() on every iteration.
# Row filling (below, commented out) is only for the stacked histogram; do not
# use it when extracting values for the violin plot, otherwise NA = error.
frag_length_all <- do.call(rbind, lapply(k27me3_profiles, function(profile_path) {
  profile <- read.table(profile_path, header = TRUE, skip = 10)
  profile$sample <- profile_path
  profile$total_sum <- sum(profile$All_Reads.fr_count)
  profile$frequency <- profile$All_Reads.fr_count / profile$total_sum
  # profile <- merge(all_rows, profile, all = TRUE) # pad to a fixed number of rows
  # profile <- profile %>%
  #   fill(sample, total_sum, .direction = c("updown")) # fill NAs in the repeated columns
  # profile$sample <- str_match(profile$sample, "liberate_(.*?)_K27me3")[[1]] # shorten the sample label
  profile
}))

# colour palette for the line plot: one colour per sample. The original fixed
# size of 9 is kept as a minimum so small (pilot) runs look unchanged, but the
# palette now grows with the cohort; scale_colour_manual() errors when it is
# given fewer colours than there are samples.
n_profiles <- length(unique(frag_length_all$sample))
colour_line <- met.brewer("Hokusai1", max(9, n_profiles))

# per-sample insert-size frequency curves, restricted to 100-200 bp
frag_length_all %>%
  ggplot(aes(x = insert_size, y = frequency, colour = sample)) +
  geom_line() +
  theme_minimal(base_size = 16) +
  labs(x = "Insert Size",
       y = "Frequency",
       colour = "H3K27me3 Profile") +
  scale_colour_manual(values = colour_line) +
  scale_x_continuous(limits = c(100,200))

# #plot_alt; stacked histogram
# frag_length_all %>% 
#   ggplot(aes(x = insert_size, y = sample, fill = frequency)) +
#   geom_tile() +
#   theme_minimal(base_size = 10) +
#   labs(x = "Insert Size",
#        y = "Sample",
#        fill = "Frequency") +
#   scale_fill_gradient(low = "black", high = "#E85e19", na.value = "black") +
#   scale_x_continuous(limits = c(0,600))

#interestingly; there are a few cancer samples with a surplus of <150bp fragments; quantify the fraction of <150bp fragments across the different samples. and bin by cancer status.Do this for the entire cohort.

# Insert-size window(s) whose summed frequency is reported per sample.
# Note: `range` shadows base::range() for the rest of the session; the name is
# kept because the surrounding notebook refers to it.
range <- as.numeric(c(126:135, 240:330)) #range 1
# range <- as.numeric(c(170:200, 240:330)) #range 2; determined from median fragment length across T1, T3, and non-lymphoma. Not ideal.
# range <- as.numeric(c(170:200)) #range 3; determined from median fragment length across T1, T3, and non-lymphoma. Mononucleosome alone, not ideal.
# range <- as.numeric(c(126:135)) #range 4; mononucleosome alone.
# range <- as.numeric(c(240:330)) #range 5; dinucleosome alone. Truly the most informative range.
# range <- as.numeric(c(250:350)) #subtype

# For every sample, sum the frequency of fragments whose insert size falls in
# `range`. The frequency column is addressed by name (not by position 5) so the
# result stays correct if the picard histogram carries extra count columns.
samples_in_order <- unique(frag_length_all$sample)
size_in_range <- frag_length_all$insert_size %in% range

# change the axis label of the downstream plot depending on the range selected
fraction_short_fragments <- data.frame(
  Value = vapply(
    samples_in_order,
    function(sample_id) {
      sum(frag_length_all$frequency[size_in_range & frag_length_all$sample == sample_id])
    },
    numeric(1),
    USE.NAMES = FALSE
  ),
  Sample = samples_in_order
)

# #healthy vs cancer condition
# fraction_short_fragments <- fraction_short_fragments %>%
#   mutate(cancer_status = case_when(
#     str_detect(Sample, "HUCON|_C") ~ "Healthy",
#     str_detect(Sample, "GCB|ABC|FL|NA") ~ "Cancer"
#     )) #conditional statement; if sample name has HUCON or _C, print "Healthy" in new column; if else, print "Cancer"

# Label every sample as Healthy / T1 / T3 from tokens in its file name.
# Rules are evaluated top to bottom; samples matching none of them get NA.
# Keep in mind: the T4 (longitudinal follow-up) and T5 samples are grouped
# with T3. A separate "T5 (relapse)" level was considered but is not used.
fraction_short_fragments <- mutate(
  fraction_short_fragments,
  cancer_status = case_when(
    str_detect(Sample, "HUCON|_C") ~ "Healthy",
    str_detect(Sample, "T1") ~ "T1 (baseline)",
    str_detect(Sample, "T3|T4|T5") ~ "T3 (post-treatment)"
  )
)

# subtypes
# Label each sample with its lymphoma subtype from tokens in the file name and
# keep baseline/healthy samples only.
# Healthy controls are detected with the same file-name tokens used for
# `cancer_status` ("HUCON" / "_C"); the previous "Non-lymphoma" pattern is a
# display label, not a file-name token, so controls were never labelled.
# NOTE: the filter below overwrites `fraction_short_fragments` without its
# T3/T4/T5 rows, so any later T3 comparison must be run before this step.
fraction_short_fragments <- fraction_short_fragments %>%
  mutate(subtype = case_when(
    str_detect(Sample, "HUCON|_C") ~ "Healthy",
    str_detect(Sample, "GCB|ABC|NA") ~ "DLBCL",
    str_detect(Sample, "FL") ~ "FL"
    )) %>%
  filter(!str_detect(Sample, "T3|T4|T5")) #Exclude T3, T4, and T5.

# Extract the bioinformatic name ("liberate_..._K27me3") from the Sample column
# so rows can be matched to the sample metadata for other visualizations.
# str_extract() returns NA for file names that do not contain the pattern;
# regmatches() silently dropped those elements, which made mutate() fail with
# a length mismatch as soon as one sample did not match.
fraction_short_fragments <- fraction_short_fragments %>%
  mutate(bioinformatic_name = str_extract(Sample, "liberate_(.*?)_K27me3"))

# additional; draw lines bw T1 and T3 samples. Set up grouping variable

# three-colour palette, one per status group
colour_fraction_short_fragments <- MetBrewer::met.brewer(name = "Archambault", n = 3)

# Violin + jittered points of the in-range fragment fraction per status group.
# The y-axis label must be edited by hand when `range` changes
# (earlier version: "Fraction of <150bp fragments").
ggplot(
  fraction_short_fragments,
  aes(x = cancer_status, y = Value, fill = cancer_status)
) +
  geom_violin() +
  geom_point(
    position = position_jitterdodge(jitter.width = 0.05),
    size = 2.5,
    alpha = 0.7
  ) +
  # geom_line(aes(group = cancer_status)) +
  theme_minimal(base_size = 20) +
  scale_fill_manual(values = colour_fraction_short_fragments) +
  labs(
    x = "Status",
    y = "Fraction of fragments between 126-135bp and 240-330bp"
  )

##### metrics #####

# Mean fraction of <150bp fragments per condition.
# NOTE(review): `fraction_under_150` is not created anywhere in this section,
# and the "Cancer" level only exists in the commented-out healthy-vs-cancer
# labelling -- these two lines look like leftovers from an earlier version of
# the analysis; confirm before relying on them.
mean_sub150_cancer <- mean(fraction_under_150$Value[fraction_under_150$cancer_status == "Cancer"])
mean_sub150_healthy <- mean(fraction_under_150$Value[fraction_under_150$cancer_status == "Healthy"])

# backup code
# Summed frequency (column 5 of frag_length_all, addressed by position) of
# fragments <= 150 bp for one hard-coded sample file.
sum(frag_length_all[which(frag_length_all$insert_size <= 150 & frag_length_all$sample == "liberate_0261_T1_GCB_K27me3_pool6_5_S5_insert_size_metrics.txt"),5]) # original note: sum of the frequency of fragments over a specified range; subdivide the calculation by sample.

# Sum of column 5 over the first 150 rows of the combined table (rows are
# taken by position, i.e. from whichever profile was bound first).
sum(frag_length_all[1:150,5])

# Split the per-sample fractions by status group, then compare group means.
# which() drops samples whose cancer_status is NA (file names that matched none
# of the labelling rules); plain `==` subsetting would instead insert all-NA
# rows and make mean() return NA.
healthy_fraction_short_fragments <- fraction_short_fragments[which(fraction_short_fragments$cancer_status == "Healthy"), ]
T1_fraction_short_fragments <- fraction_short_fragments[which(fraction_short_fragments$cancer_status == "T1 (baseline)"), ]
T3_fraction_short_fragments <- fraction_short_fragments[which(fraction_short_fragments$cancer_status == "T3 (post-treatment)"), ]

# group means
mean(T1_fraction_short_fragments$Value)
mean(T3_fraction_short_fragments$Value)
mean(healthy_fraction_short_fragments$Value)

# Welch two-sample t-tests (unequal variances); the p-values in the trailing
# comments are the ones recorded by the author on their data.
t.test(x = healthy_fraction_short_fragments$Value, y = T1_fraction_short_fragments$Value, var.equal = FALSE) #p = 0.0022 (significant)
t.test(x = T1_fraction_short_fragments$Value, y = T3_fraction_short_fragments$Value, var.equal = FALSE) #p = 0.0014 (significant)
t.test(x = healthy_fraction_short_fragments$Value, y = T3_fraction_short_fragments$Value, var.equal = FALSE) #p = 0.555 (not significant)



# 03-10-2023 CONTINUED; match fragment lengths to sample metadata. CONTINUE TOMORROW.
# Keep only the metadata rows whose sample_name is a column of cpm_norm
# (rows stay in sample_list order, not in cpm_norm column order).
sample_list_training <- sample_list[sample_list$sample_name %in% colnames(cpm_norm),]







# 26-10-2023; generate the average picard fragment-length profile for all
# cancer and non-lymphoma samples. See if there is a significant difference.
# Relies on `k27me3_profiles` (relative paths) and on the working directory
# still being the one they were listed from.
if (length(k27me3_profiles) == 0) {
  stop("k27me3_profiles is empty; list the insert_size_metrics.txt files first", call. = FALSE)
}

all_rows <- data.frame(insert_size = seq(1, 1000)) # dummy frame to pad every profile to a fixed number of rows

# Read each profile, tag it with its file path and convert raw counts to
# per-sample frequencies. Profiles are bound once at the end instead of
# growing a data frame with rbind() on every iteration.
# Row filling (commented out) is only for the stacked histogram; do not use it
# when extracting values for the violin plot, otherwise NA = error.
frag_length_all <- do.call(rbind, lapply(k27me3_profiles, function(profile_path) {
  profile <- read.table(profile_path, header = TRUE, skip = 10)
  profile$sample <- profile_path
  profile$total_sum <- sum(profile$All_Reads.fr_count)
  profile$frequency <- profile$All_Reads.fr_count / profile$total_sum
  # profile <- merge(all_rows, profile, all = TRUE) # pad to a fixed number of rows
  # profile <- profile %>%
  #   fill(sample, total_sum, .direction = c("updown")) # fill NAs in the repeated columns
  # profile$sample <- str_match(profile$sample, "liberate_(.*?)_K27me3")[[1]] # shorten the sample label
  profile
}))

# Attach a status label (Non-lymphoma / T1 / T3) to every histogram row, based
# on tokens in the sample file name. Rules are tried in order; rows matching
# none of them get NA.
# Keep in mind: the T4 (longitudinal follow-up) and T5 samples are grouped
# with T3. A separate "T5 (relapse)" level was considered but is not used.
frag_length_all <- mutate(
  frag_length_all,
  cancer_status = case_when(
    str_detect(sample, "HUCON|_C") ~ "Non-lymphoma",
    str_detect(sample, "T1") ~ "T1 (baseline)",
    str_detect(sample, "T3|T4|T5") ~ "T3 (post-treatment)"
  )
)

#subtypes
# frag_length_all <- frag_length_all %>%
#   mutate(subtype = case_when(
#     str_detect(sample, "Non-lymphoma") ~ "Healthy",
#     str_detect(sample, "GCB|ABC|NA") ~ "DLBCL",
#     str_detect(sample, "FL") ~ "FL",
#     )) %>% 
#   filter(!str_detect(sample, "T3|T4|T5")) #Exclude T3, T4, and T5.

# SIDE; for non-lymphoma vs lymphoma condition comparison only (ISLB poster)
# Set membership must use %in%: `== c(a, b)` recycles the two labels down the
# column and compares row 1 with a, row 2 with b, ... which silently keeps only
# about half of the intended rows (and adds all-NA rows for unlabelled samples).
frag_length_all <- frag_length_all[frag_length_all$cancer_status %in% c("Non-lymphoma", "T1 (baseline)"),]

# Median fragment frequency at every insert size, computed separately for each
# status group (the statistic is the median, despite older notes saying mean).
frag_length_grouped_median <- summarize(
  group_by(frag_length_all, cancer_status, insert_size),
  median_frequency = median(frequency)
)

# group by lymphoma subtype, see if subtype related differences exist in the fragment lengths
# frag_length_grouped_median_subtype <- frag_length_all %>%
# group_by(subtype, insert_size) %>%
# summarize(median_frequency = median(frequency))



# Median frequency curve per status group, limited to the 225-400 bp window.
# Reuses the palette `colour_fraction_short_fragments` created for the violin plot.
ggplot(
  frag_length_grouped_median,
  aes(x = insert_size, y = median_frequency, colour = cancer_status)
) +
  geom_line() +
  scale_x_continuous(limits = c(225, 400)) +
  scale_colour_manual(values = colour_fraction_short_fragments) +
  labs(
    x = "Fragment Size",
    y = "Fragment Frequency",
    colour = "Cancer Status"
  ) +
  theme_minimal(base_size = 20)

# stats for difference in mononucleosomal range




### Heatmap of features identified from modeling (10-09-2023)

In [None]:
# Use a heatmap, as before, to compare the selected features between groups and
# to check whether individual feature clusters differ, since the differential
# analysis showed bi-directional changes in feature expression.

# load dataset
# load() restores every object saved in the .RData file into the global
# environment, overwriting objects with the same names.
# NOTE(review): `predictors_rf_hyper` is not created in this section --
# presumably it comes from the loaded workspace or an earlier cell; confirm.
setwd("/cluster/projects/scottgroup/people/steven/human_plasma_cfChIP-Seq/liberate_plasma_cfchip/analysis/machine_learning_models/02_07_2023_RFE_RF_10kb_windows/differential_analysis_filter/")
load("10_09_2023_rf_10kb_with_hyperparameter_tuning_accuracyevaluation_withdifferentialfilter.RData")

# subset sample metadata to training samples (those that are columns of
# cpm_norm), then visualize expression over the predictors.
X_train_attributes <- sample_list[sample_list$sample_name %in% colnames(cpm_norm),]

# subset original training matrix to the features in chosen feature set
#X_train_filtered <- cpm_norm[(rownames(cpm_norm)) %in% predictors_rf,]
X_train_filtered <- cpm_norm[(rownames(cpm_norm)) %in% predictors_rf_hyper,] # for hypermethylated features

# The heatmap input is the filtered matrix as-is; no extra normalization is
# applied here (row scaling happens inside the pheatmap call).
heatmap_data <- X_train_filtered

# Column annotations for the heatmap: timepoint, subtype and Ann Arbor stage.
# Row names are taken from the metadata's own sample_name column, because
# pheatmap matches annotation rows to matrix columns by name. Assigning
# colnames(heatmap_data) positionally would mislabel samples whenever
# sample_list and cpm_norm are ordered differently.
# Add further metadata columns here as required (IPI, FLIPI and
# TUMOR_MASS_MAX_CM were considered).
# Stage and subtype are stored as character, with missing values written as
# the literal string "NA" so they appear as their own annotation level.
heatmap_metadata <- data.frame(
  timepoint = X_train_attributes$timepoint,
  subtype = as.character(X_train_attributes$DIAGNOSIS_CLASS_SUMMARIZE_HEALTHY.DIAGNOSIS_SPECIFIC_DLBCL),
  ANN_ARBOR_STAGE = as.character(X_train_attributes$ANN_ARBOR_STAGE),
  row.names = X_train_attributes$sample_name
)

###Make all NAs as characters, for metadata labeling
heatmap_metadata$subtype[is.na(heatmap_metadata$subtype)] <- "NA"
heatmap_metadata$ANN_ARBOR_STAGE[is.na(heatmap_metadata$ANN_ARBOR_STAGE)] <- "NA"

# Independent palettes for each piece of metadata.
# NOTE(review): the palette sizes (2 timepoints, 5 subtypes) are hard-coded and
# must equal the number of unique values in the matching metadata column,
# otherwise setNames() below produces missing/invalid names -- confirm on the
# current cohort.
palatte_timepoint <- met.brewer(name = "Homer1", n = 2, type = "discrete")
palatte_subtype <- met.brewer(name = "Homer2", n = 5, type = "discrete")
#palatte_ipi <- RColorBrewer::brewer.pal(n = 7, name = "Spectral")
#palatte_flipi <- met.brewer(name = "Homer1", n = 6, type = "discrete")
# palatte_stage is defined but not added to the colour list below, so the stage
# annotation falls back to pheatmap's default colours.
palatte_stage <- met.brewer(name = "Homer2", n = 5, type = "discrete")
#palatte_diameter <- met.brewer(name = "Hokusai3", n = 92, type = "continuous")

### Assign a colour palette to each metadata label, as a named list for the
### annotation_colors argument of pheatmap.
heatmap_metadata_colour <- list(timepoint = setNames(palatte_timepoint, unique(heatmap_metadata$timepoint)),
                                subtype = setNames(palatte_subtype, unique(heatmap_metadata$subtype)))

# Row-scaled heatmap of the selected features with Ward (D2) clustering and
# sample annotations on the columns. The colour ramp runs red (low) -> white
# -> blue (high). Set cluster_rows = FALSE to keep the input feature order.
h <- pheatmap(
  heatmap_data,
  color = colorRampPalette(c("red", "white", "blue"))(100),
  scale = "row",
  clustering_method = "ward.D2",
  annotation_col = heatmap_metadata,
  annotation_colors = heatmap_metadata_colour,
  show_rownames = FALSE
)
print(h)

### Volcano plot of modeled features, on top of differential features

In [None]:
# Plot 3 colours: black = all features, red = differential features (p < 0.05),
# blue = modeled features from RF+SMOTE (an optional green layer for the
# high-importance features is commented out further down).
# `res` is the differential-analysis result from earlier in the notebook and
# `model` the fitted caret model; both must already exist.
predictors_rf <- predictors(model)
de_features_subset <- res[res$table$PValue < 0.05,]
de_features_subset_rf <- res[(rownames(res)) %in% predictors_rf,]
# de_features_subset_rf <- de_features_subset_rf[de_features_subset_rf$table$logFC > 0,] # if you only want the hypermethylated fraction
de_features_rf <- rownames(de_features_subset_rf)
de_features_all <- rownames(res)
# de_features_subset_rf_highimportance <- res[(rownames(res)) %in% variable_importance_subset$feature,]
# de_features_rf_highimportance <- rownames(de_features_subset_rf_highimportance)


# Plot volcano; visualize the differentially expressed features above the
# significance threshold in red.
# table for all features
logFC <- res$table$logFC
p_value <- res$table$PValue
neg_log_p_value <- -log10(p_value)
df_feature_stats <- data.frame(logFC,neg_log_p_value,de_features_all) #compile data for volcano plot.

# table for all significant features
# Here the third argument is the subsetted result object itself rather than a
# vector of feature names; the plot later uses this table's row names as labels.
# NOTE(review): this relies on the result object converting to a data frame -- confirm.
logFC_signif <- de_features_subset$table$logFC
p_value_signif <- de_features_subset$table$PValue
neg_log_p_value_signif <- -log10(p_value_signif)
df_feature_stats_signif <- data.frame(logFC_signif,neg_log_p_value_signif,de_features_subset) #compile data for volcano plot.

# table for all modeled features (features of `res` that are model predictors)
logFC_signif_rf <- de_features_subset_rf$table$logFC
p_value_signif_rf <- de_features_subset_rf$table$PValue
neg_log_p_value_signif_rf <- -log10(p_value_signif_rf)
df_feature_stats_signif_rf <- data.frame(logFC_signif_rf,neg_log_p_value_signif_rf,de_features_subset_rf) #compile data for volcano plot.

# table for top 5% of modeled features, by feature importance using VarImp function (subset of modeled features)
# logFC_signif_rf_highimportance <- de_features_subset_rf_highimportance$table$logFC
# p_value_signif_rf_highimportance <- de_features_subset_rf_highimportance$table$PValue
# neg_log_p_value_signif_rf_highimportance <- -log10(p_value_signif_rf_highimportance)
# df_feature_stats_signif_rf_highimportance <- data.frame(logFC_signif_rf_highimportance,
                                                        # neg_log_p_value_signif_rf_highimportance,
                                                        # de_features_subset_rf_highimportance) #compile data for volcano plot.

# Volcano plot built from three point layers drawn in this order:
# all features (black), significant features (red), modeled features (blue).
# Every significant feature is labelled with its row name, and the dashed red
# line marks p = 0.05.
v <- ggplot() +
  geom_point(
    data = df_feature_stats,
    mapping = aes(x = logFC, y = neg_log_p_value),
    colour = "black", alpha = 0.3
  ) +
  geom_point(
    data = df_feature_stats_signif,
    mapping = aes(x = logFC_signif, y = neg_log_p_value_signif),
    colour = "red", alpha = 0.3
  ) +
  geom_point(
    data = df_feature_stats_signif_rf,
    mapping = aes(x = logFC_signif_rf, y = neg_log_p_value_signif_rf),
    colour = "blue", alpha = 0.3
  ) +
  geom_label(
    data = df_feature_stats_signif,
    mapping = aes(
      x = logFC_signif,
      y = neg_log_p_value_signif,
      label = rownames(df_feature_stats_signif)
    )
  ) +
  geom_hline(yintercept = -log10(0.05), linetype = "dashed", color = "red") +
  labs(x = "Log Fold Change", y = "-log10(p-value)") +
  ggtitle("Volcano Plot") +
  theme_minimal()
# Optional layers kept for reference:
#   geom_point(data = df_feature_stats_signif_rf_highimportance,
#              aes(x = logFC_signif_rf_highimportance, y = neg_log_p_value_signif_rf_highimportance), colour = "green", alpha = 0.3)
#   geom_vline(xintercept = c(-1, 1), linetype = "dashed", color = "blue")
#   geom_text(data = df_feature_stats_signif, aes(x = logFC_signif, y = neg_log_p_value_signif, label = rownames(df_feature_stats_signif)), vjust = -0.5, hjust = 0.5, color = "red", size = 3)

print(v)

### 28-09-2023 Linear regression, mean VAF vs median of hypermethylated features, compared to random features

In [None]:
# Scatter of mean VAF against the per-sample median over the selected features,
# with a fitted straight line and its confidence band.
# Requires the mean_VAF and median_X_train_filtered columns added to
# X_train_attributes in earlier chunks.
# Alternative y variables: mean_X_train_filtered, sum_X_train_filtered.
ggplot(X_train_attributes, aes(x = mean_VAF, y = median_X_train_filtered)) +
  geom_point(size = 5) +
  stat_smooth(method = "lm", formula = y ~ x, color = "red", se = TRUE) +
  labs(
    x = "Mean VAF",
    y = "Median of Hypermethylated Feature Expression"
  ) +
  theme_minimal(base_size = 20)

# regression statistics for the same relationship
summary(lm(formula = median_X_train_filtered ~ mean_VAF, data = X_train_attributes))
# summary(lm(mean_X_train_filtered ~ mean_VAF, X_train_attributes))
# summary(lm(sum_X_train_filtered ~ mean_VAF, X_train_attributes))


# Same regression, but against the Mean_Median column of
# X_train_attributes_subset (random-feature baseline prepared in earlier chunks).
p <- ggplot(X_train_attributes_subset, aes(x = mean_VAF, y = Mean_Median)) +
  geom_point(size = 5) +
  stat_smooth(method = "lm", formula = y ~ x, color = "red", se = TRUE) +
  labs(
    x = "Mean VAF",
    y = "Median Expression of 5328 Random Features, over 1000 Iterations"
  ) +
  theme_minimal(base_size = 20)
print(p)

# regression statistics: keep the full summary and display its R-squared
lm_k27_mean_VAF_randomsubset <- summary(lm(Mean_Median ~ mean_VAF, X_train_attributes_subset))
lm_k27_mean_VAF_randomsubset$r.squared



# 23-10-2023
# Repeat the mean-VAF regression for one lymphoma subtype at a time.
# Previously both subset lines were active, so the DLBCL subset was always
# overwritten by the FL one; the choice is now a single explicit switch.
# Excluding "DLBCL" analyses the FL cases (as the original code effectively
# did); set it to "FL" to analyse the DLBCL cases instead.
subtype_to_exclude <- "DLBCL"

# %in% with an explicit NA check avoids the all-NA rows that `!(x == value)`
# creates for samples with a missing diagnosis.
diagnosis_class <- X_train_attributes$DIAGNOSIS_CLASS_SUMMARIZE_HEALTHY
X_train_attributes_subtype <- X_train_attributes[
  !is.na(diagnosis_class) & !(diagnosis_class %in% subtype_to_exclude),
]

p <- X_train_attributes_subtype %>%
  ggplot(aes(x = mean_VAF, y = median_X_train_filtered)) +
  geom_point(size = 5) +
  stat_smooth(method = "lm", formula = y ~ x, color = "red", se = TRUE) +
  theme_minimal(base_size = 20) +
  labs(x = "Mean VAF",
       y = "Median of Hypermethylated Feature Expression"
      )
print(p)

# lm stats
summary(lm(median_X_train_filtered ~ mean_VAF, X_train_attributes_subtype))


## test RF model on test set

In [None]:
# Test-set preparation, in one step: CPM-normalise the test counts, keep only
# the features selected by the training-set differential analysis, and
# transpose to samples x features.
# NOTE: X_test is overwritten in place, so running this cell twice would
# normalise and transpose the data a second time.
X_test <- t(cpm(X_test, normalized.lib.sizes = TRUE, log = FALSE)[de_features, ])

# apply the trained model to the held-out samples
predictions_differential_rf <- predict(model, newdata = X_test)

# confusion matrix: true labels (rows) against predicted labels (columns)
confusion_matrix_differential_rf <- table(
  Reference = y_test,
  Prediction = predictions_differential_rf
)

# poor performance for healthy, great performance for lymphoma. This is just a limitation of the study, continue working with the features, regardless.
# More balanced dataset for the subtyping question, so maybe test performance will be better there?

# poor performance for healthy, great performance for lymphoma. This is just a limitation of the study, continue working with the features, regardless.
# More balanced dataset for the subtyping question, so maybe test performance will be better there?

# 11-09-2023 Survival analysis over modeled features from differential + RF

## Pre-process data, incorporate MTV and tumor burden from ichorCNA

In [None]:
# MTV (PET) data, generated by the nuclear medicine team at UHN
setwd("/cluster/projects/scottgroup/people/steven/human_plasma_cfChIP-Seq/liberate_plasma_cfchip/other_profiles/tmtv_data_lymphoma_cases")
tmtv_lymphoma <- readxl::read_xlsx("17-11-2023_PET_scan_data_lymphoma_cases.xlsx")

# ichorCNA output from shallow WGS, joined the same way as the MTV table
setwd("/cluster/projects/scottgroup/people/steven/human_plasma_cfChIP-Seq/liberate_plasma_cfchip/other_profiles/ichorCNA_data_shallow_WGS_lymphoma")
ichorCNA_lymphoma <- read.delim("Bratman.ichorCNA.txt", header = TRUE, sep = "\t")

# Left-join both tables onto the sample metadata by patient ID.
# The ichorCNA join is where duplicate rows appear (some shallow WGS profiles
# were sequenced twice by accident), so only the first row per PATIENT_ID is
# kept afterwards as the single representative entry.
X_train_attributes_all_data_types <- X_train_attributes %>%
  merge(tmtv_lymphoma, by.x = "PATIENT_ID", by.y = "LIBERATE ID", all.x = TRUE) %>%
  merge(ichorCNA_lymphoma, by.x = "PATIENT_ID", by.y = "PATIENT_ID", all.x = TRUE) %>%
  distinct(PATIENT_ID, .keep_all = TRUE)

# remove healthy cases
X_train_attributes_all_data_types <- X_train_attributes_all_data_types[
  !(X_train_attributes_all_data_types$DIAGNOSIS_CLASS_SUMMARIZE_HEALTHY == "HEALTHY"),
]

# convert survival times and event codes to numeric
X_train_attributes_all_data_types[c("PFS", "OS", "CODE_PFS", "CODE_OS")] <- lapply(
  X_train_attributes_all_data_types[c("PFS", "OS", "CODE_PFS", "CODE_OS")],
  as.numeric
)

## all training cases, survival by median K27me3

In [None]:
# survival analysis
# Kaplan-Meier comparison of cases above ("High") and below ("Low") the mean
# of the per-sample median K27me3 signal, using progression-free survival.

# cutoff: mean of the per-sample medians across non-healthy cases
# median_threshold <- median(X_train_attributes_all_data_types$median_X_train_filtered[!(X_train_attributes_all_data_types$DIAGNOSIS_CLASS_SUMMARIZE_HEALTHY == "HEALTHY")])
mean_threshold <- mean(X_train_attributes_all_data_types$median_X_train_filtered[!(X_train_attributes_all_data_types$DIAGNOSIS_CLASS_SUMMARIZE_HEALTHY == "HEALTHY")])
# mean_threshold <- median(X_train_attributes_update$cpm_norm_genes_median[!(X_train_attributes_update$DIAGNOSIS_CLASS_SUMMARIZE_HEALTHY == "HEALTHY")]) # for individual genes
# mean_threshold <- mean(X_train_attributes_update$cpm_norm_genes_median[!(X_train_attributes_update$DIAGNOSIS_CLASS_SUMMARIZE_HEALTHY == "HEALTHY")]) # for individual genes

library(survival)
library(survminer)

# alternative: determine the optimal cutoff using survminer surv_cutpoint
# cutpoint <- surv_cutpoint(data = X_train_attributes_lymphoma,
#                           time = "PFS",
#                           event = "CODE_PFS",
#                           variables = "median_X_train_filtered")
# cutpoint <- cutpoint$cutpoint$cutpoint # extract the numeric cutpoint from the result

# create survival object
# survival_object <- Surv(time = X_train_attributes_lymphoma$OS, event = X_train_attributes_lymphoma$CODE_OS) # for overall survival
survival_object <- Surv(time = X_train_attributes_all_data_types$PFS, event = X_train_attributes_all_data_types$CODE_PFS) # for progression free survival

# might have to remove NAs (aka healthy samples)
# high = above threshold, low = below threshold
# X_train_attributes_lymphoma$threshold <- ifelse(X_train_attributes_lymphoma$median_X_train_filtered > median_threshold, "High", "Low")
X_train_attributes_all_data_types$threshold <- ifelse(X_train_attributes_all_data_types$median_X_train_filtered > mean_threshold, "High", "Low")
# X_train_attributes_lymphoma$threshold <- ifelse(X_train_attributes_lymphoma$median_X_train_filtered > cutpoint, "High", "Low")

# Kaplan-Meier fit, stratified by the High/Low group
km_fit <- survfit(survival_object ~ threshold, data = X_train_attributes_all_data_types)

# plot Kaplan Meier. Healthy samples were checked for exclusion from this plot.
# The y-axis label now matches the PFS survival object above; switch both the
# Surv() call and this label together when analysing overall survival.
survplot <- ggsurvplot(km_fit,
                data = X_train_attributes_all_data_types,
                pval = TRUE,
                conf.int = TRUE,
                ggtheme = theme_minimal(),
                risk.table = TRUE,
                xlab = "Time (Years)",
                # ylab = "Overall Survival",
                ylab = "Progression Free Survival (PFS)"
                )
survplot



# Check that the assumptions of the Cox proportional hazards model are met for
# each of the continuous measurements from each patient.
# Fit a Cox model on four continuous covariates (K27me3 median, STLG, mean VAF,
# tumour fraction). The response is the `survival_object` created in the
# Kaplan-Meier step above, so it must be in row-for-row agreement with
# X_train_attributes_all_data_types.
cox_model <- coxph(survival_object ~ median_X_train_filtered + `STLG (SUV*mL/Kg)` + mean_VAF + Tumor.Fraction, data = X_train_attributes_all_data_types)

# Check proportional hazards assumption using Schoenfeld residuals
cox_residuals <- cox.zph(cox_model)
print(cox_residuals) # author's observation: individually, the assumptions are met, but combined, the assumption of residuals not correlated with time is violated.

# Plot Schoenfeld residuals
plot(cox_residuals) # author's observation: distribution of residuals over time not horizontal; assumption of the model violated

# Check for linearity assumption using Martingale residuals (not run)
# cox_martingale_result <- residuals(cox_model, type = "martingale")
# plot(cox_martingale_result ~ X_train_attributes_all_data_types$median_X_train_filtered, xlab = "Variable 1", ylab = "Martingale Residuals")

# Check for outliers or influential points (not run)
# infl <- influence(cox_model)
# summary(infl)



- Mean gives us better results for the survival analysis, with log2(CPM+1) normalized counts. But the groups are unbalanced in that case; do they need to be equal? Look into this.
- Distribution of the median values is left skewed. Maybe median is most correct? I think mean is also acceptable. It matches the result of what the optimal cut point is.
- Regardless; OS looks awful so we need another approach. Maybe the multivariate cox proportional hazard is the way to do it.
- Assumptions violated for the cox model (for univariate and multivariate); leave this on the backburner for now.

## repeat for independent lymphoma subtypes

In [None]:
# Kaplan-Meier by K27me3 level within a single lymphoma subtype.
# As written this analyses DLBCL; the commented alternatives switch to FL.

# rebuild the lymphoma-only table from the training metadata (in case code fails)
X_train_attributes_lymphoma <- X_train_attributes[
  !(X_train_attributes$DIAGNOSIS_CLASS_SUMMARIZE_HEALTHY == "HEALTHY"),
]

# convert survival times and event codes to numeric
X_train_attributes_lymphoma[c("PFS", "OS", "CODE_PFS", "CODE_OS")] <- lapply(
  X_train_attributes_lymphoma[c("PFS", "OS", "CODE_PFS", "CODE_OS")],
  as.numeric
)

# cutoff: median K27me3 value over the cases that are neither healthy nor FL
median_threshold <- with(
  X_train_attributes_lymphoma,
  median(median_X_train_filtered[!(DIAGNOSIS_CLASS_SUMMARIZE_HEALTHY %in% c("HEALTHY","FL"))])
)
# median_threshold <- median(X_train_attributes_lymphoma$median_X_train_filtered[!(X_train_attributes_lymphoma$DIAGNOSIS_CLASS_SUMMARIZE_HEALTHY %in% c("HEALTHY","DLBCL"))])

# keep the chosen subtype only
X_train_attributes_lymphoma <- X_train_attributes_lymphoma[
  X_train_attributes_lymphoma$DIAGNOSIS_CLASS_SUMMARIZE_HEALTHY == "DLBCL",
]
# X_train_attributes_lymphoma <- X_train_attributes_lymphoma[X_train_attributes_lymphoma$DIAGNOSIS_CLASS_SUMMARIZE_HEALTHY == "FL",]

# survival object: progression-free survival (use OS / CODE_OS for overall survival)
# survival_object <- Surv(time = X_train_attributes_lymphoma$OS, event = X_train_attributes_lymphoma$CODE_OS)
survival_object <- with(X_train_attributes_lymphoma, Surv(time = PFS, event = CODE_PFS))

# might have to remove NAs (aka healthy samples)
# "High" = above the cutoff, "Low" = at or below it
X_train_attributes_lymphoma$threshold <- ifelse(
  X_train_attributes_lymphoma$median_X_train_filtered > median_threshold,
  "High",
  "Low"
)

km_fit <- survfit(survival_object ~ threshold, data = X_train_attributes_lymphoma)

# plot Kaplan Meier. Healthy samples were checked for exclusion from this plot.
ggsurvplot(
  km_fit,
  data = X_train_attributes_lymphoma,
  risk.table = TRUE,
  conf.int = TRUE,
  pval = TRUE,
  ggtheme = theme_minimal(),
  xlab = "Time (Years)",
  # ylab = "Overall Survival",
  ylab = "Progression Free Survival (PFS)"
)


- Consider adding the test set to this analysis. Needs more samples.
- Think about how to better stratify the cases using CAPP-Seq and MTV. Benchmark against these.
- How do I combine the independent clinical variables into a single survival model?

## Benchmark; survival prediction from median of mean VAF and MTV

In [None]:
# MTV
# Benchmark: Kaplan-Meier by metabolic tumour burden (STLG) within one subtype.
# The subtype is selected once and used for both the subset and the cutoff.
# Previously the cutoff was the median over the non-DLBCL cases while the
# table was subset to DLBCL, so the threshold did not belong to the group it
# was applied to.
# NOTE(review): this expects X_train_attributes_lymphoma to carry the
# `STLG (SUV*mL/Kg)` column from the MTV merge -- confirm.
mtv_subtype <- "DLBCL" # set to "FL" to analyse follicular lymphoma instead

# subset to the chosen subtype only
X_train_attributes_lymphoma <- X_train_attributes_lymphoma[X_train_attributes_lymphoma$DIAGNOSIS_CLASS_SUMMARIZE_HEALTHY %in% mtv_subtype,]

# cutoff: median STLG within the chosen subtype; na.rm because some cases have no MTV value
median_threshold <- median(X_train_attributes_lymphoma$`STLG (SUV*mL/Kg)`, na.rm = TRUE)

# create survival object
# survival_object <- Surv(time = X_train_attributes_lymphoma$OS, event = X_train_attributes_lymphoma$CODE_OS) # for overall survival
survival_object <- Surv(time = X_train_attributes_lymphoma$PFS, event = X_train_attributes_lymphoma$CODE_PFS) # for progression free survival

# high = above threshold, low = at or below threshold; cases without an MTV
# value get NA here and are dropped by survfit
X_train_attributes_lymphoma$threshold <- ifelse(X_train_attributes_lymphoma$`STLG (SUV*mL/Kg)` > median_threshold, "High", "Low")

# Kaplan-Meier fit, stratified by the High/Low group
km_fit <- survfit(survival_object ~ threshold, data = X_train_attributes_lymphoma)

# plot Kaplan Meier. Healthy samples were checked for exclusion from this plot.
ggsurvplot(km_fit,
                data = X_train_attributes_lymphoma,
                pval = TRUE,
                conf.int = TRUE,
                ggtheme = theme_minimal(),
                risk.table = TRUE,
                xlab = "Time (Years)",
                #ylab = "Overall Survival",
                ylab = "Progression Free Survival (PFS)"
                )

- Issue; there are NA MTV values in the FL cases; exclude these from the grouped analyses.

## Multivariate Cox Proportional Hazards Analysis; K27me3 + MTV

In [None]:
# RUN FIRST CHUNK IN THIS SECTION BEFORE STARTING
#
# Univariable and multivariable Cox proportional hazards models (via finalfit) relating
# survival to the plasma / imaging measurements held in X_train_attributes_all_data_types.

# remove FL cases from X_train_attributes_all_data_types
# X_train_attributes_all_data_types <- X_train_attributes_all_data_types[!(X_train_attributes_all_data_types$DIAGNOSIS_CLASS_SUMMARIZE_HEALTHY == "FL"),]

# remove DLBCL cases from X_train_attributes_all_data_types
# X_train_attributes_all_data_types <- X_train_attributes_all_data_types[!(X_train_attributes_all_data_types$DIAGNOSIS_CLASS_SUMMARIZE_HEALTHY == "DLBCL"),]


# for PFS
# cox_model <- coxph(Surv(time = X_train_attributes_all_data_types$PFS, 
                        # event = X_train_attributes_all_data_types$CODE_PFS) ~ X_train_attributes_all_data_types$median_X_train_filtered + X_train_attributes_all_data_types$`STLG (SUV*mL/Kg)` + X_train_attributes_all_data_types$mean_VAF, 
                   # data = X_train_attributes_all_data_types)

# for OS (this template previously pointed at the PFS columns; corrected to OS / CODE_OS)
# cox_model <- coxph(Surv(time = X_train_attributes_all_data_types$OS, 
#                         event = X_train_attributes_all_data_types$CODE_OS) ~ X_train_attributes_all_data_types$median_X_train_filtered + X_train_attributes_all_data_types$`STLG (SUV*mL/Kg)` + X_train_attributes_all_data_types$mean_VAF, 
#                         data = X_train_attributes_all_data_types)

# # display this output as a forest plot; incorporate all cases in the cohort near publication, use only training samples for now.
# summary(cox_model)
# conf_int <- confint(cox_model)


# using the finalfit function; good way to look at all the univariate HRs.
library(finalfit)

# split K27, CAPP-Seq, MTV, and WGS measurements by median, and compare the two sets of groups; high and low.


# Dependent variables are passed to finalfit as strings; a Surv(...) dependent selects the Cox model.
dependent_os <- "Surv(time = OS, event = CODE_OS)"

dependent_pfs <- "Surv(time = PFS, event = CODE_PFS)"

# Explanatory variables; non-syntactic column names need backticks inside the string.
explanatory_variables <- c("median_X_train_filtered", 
                           # "`STLG (SUV*mL/Kg)`",
                           "`MTV (mL)`",
                           "mean_VAF", 
                           "Tumor.Fraction"#, 
                           # "ANN_ARBOR_STAGE",
                           # "IPI_grouped"#,
                           # "ECOG",
                           # "NB_EN_SITES",
                           # "SEX"
                          )

# omit NA values from certain variables, as they may be causing trouble
# X_train_attributes_all_data_types$mean_VAF <- na.omit(X_train_attributes_all_data_types$mean_VAF)

# coxph.control() # might be worth adjusting if you encounter an iteration issue, or no model convergence.

# generate model: table of univariable and multivariable hazard ratios
X_train_attributes_all_data_types %>%
  # finalfit(dependent_os, explanatory_variables, metrics=TRUE)
  finalfit(dependent_pfs, explanatory_variables, metrics=TRUE) #%>%
  # setNames(1:5, c("Variable", "Level", "", "HR (univariable)", "HR (multivariable)"))

# Forest plot of the hazard ratios.
# or_plot() draws odds ratios from a logistic regression and cannot take a Surv() dependent,
# which is why it did not work here; hr_plot() is the finalfit equivalent for Cox models.
# The same endpoint (PFS) as the table above is plotted so the two outputs agree.
X_train_attributes_all_data_types %>%
  # hr_plot(dependent_os, explanatory_variables)
  hr_plot(dependent_pfs, explanatory_variables)



- Continue with this once you've installed the forestplot package into your conda env.
- Also, install finalfit package in the meantime.
- For the git repository, make the coxph model generation a function to avoid repeat code.
- ISSUES; almost nothing is significant, except for ECOG 3; sex (M) is trending towards significance, strangely.
- ECOG 3 is still the only significant independent variable when we subset to DLBCL and FL cases only.
- Understand this analysis a bit more before showing to Scott or Robert.
- Also, HRs for median K27 expression is Inf, and confidence interval is from 0-Inf; that's strange.
- Recommendation from Bing AI bot; try a less complex model to start. OUTCOME IS GETTING BETTER.
- Figure out what to do with the missing values!
- 16-01-2024; Scott made a good point, OS may not be the best metric, as ECOG and IPI should be expected to correlate with these, as age should too.
- PFS may be better, and correlates better with our blood and MTV measurements.

## 06_12_2023 integration of K27 and MTV for prediction

In [None]:
# take the median of both K27 and MTV, to create 4 groups; low k27/low MTV, low K27/high MTV, high K27/low MTV, high K27/high MTV.
# A Kaplan-Meier curve is then drawn for each of the four groups.

# remove cases NA for MTV
X_train_attributes_lymphoma <- X_train_attributes_lymphoma[complete.cases(X_train_attributes_lymphoma$`STLG (SUV*mL/Kg)`),]

# IF DESIRED; subset X_train_attributes to DLBCL and FL samples only
# X_train_attributes_lymphoma <- X_train_attributes_lymphoma[X_train_attributes_lymphoma$DIAGNOSIS_CLASS_SUMMARIZE_HEALTHY == "DLBCL",]
# X_train_attributes_lymphoma <- X_train_attributes_lymphoma[X_train_attributes_lymphoma$DIAGNOSIS_CLASS_SUMMARIZE_HEALTHY == "FL",]

# calculate median cutoff for median lymphoma thresholds
# (na.rm = TRUE so one missing K27 summary value cannot turn the cutoff, and every group label, into NA)
median_threshold_k27 <- median(X_train_attributes_lymphoma$median_X_train_filtered, na.rm = TRUE)
median_threshold_mtv <- median(X_train_attributes_lymphoma$`STLG (SUV*mL/Kg)`, na.rm = TRUE)

library(survival)
library(survminer)

# create survival object (built after the NA filter above so its rows match the data frame)
survival_object <- Surv(time = X_train_attributes_lymphoma$OS, event = X_train_attributes_lymphoma$CODE_OS) # for overall survival
# survival_object <- Surv(time = X_train_attributes_lymphoma$PFS, event = X_train_attributes_lymphoma$CODE_PFS) # for progression free survival

# create the 4 groups listed at the top; "low" means at or below the median, "high" means above it.
# Columns are referenced by bare name so the comparison always uses the piped data.
X_train_attributes_lymphoma <- X_train_attributes_lymphoma %>%
  mutate(threshold = case_when(
    median_X_train_filtered <= median_threshold_k27 & `STLG (SUV*mL/Kg)` <= median_threshold_mtv ~ "H3K27me3_low_STLG_low",
    median_X_train_filtered <= median_threshold_k27 & `STLG (SUV*mL/Kg)` > median_threshold_mtv ~ "H3K27me3_low_STLG_high",
    median_X_train_filtered > median_threshold_k27 & `STLG (SUV*mL/Kg)` <= median_threshold_mtv ~ "H3K27me3_high_STLG_low",
    median_X_train_filtered > median_threshold_k27 & `STLG (SUV*mL/Kg)` > median_threshold_mtv ~ "H3K27me3_high_STLG_high"
  ))

# Kaplan-Meier fit, one curve per group
km_fit <- survfit(survival_object ~ threshold, data = X_train_attributes_lymphoma)

# plot Kaplan Meier. Healthy samples were checked for exclusion from this plot.
# The y-axis label must match the survival object above: it is built from OS / CODE_OS,
# so the axis is labelled as overall survival (it was previously labelled PFS).
ggsurvplot(km_fit, 
                data = X_train_attributes_lymphoma, 
                pval = TRUE,
                conf.int = TRUE,
                ggtheme = theme_minimal(),
                risk.table = TRUE,
                xlab = "Time (Years)",
                ylab = "Overall Survival"
                #ylab = "Progression Free Survival (PFS)"
                )

- Didn’t work as expected, considering some of the low low cases are passing / progressing before the mixed cases. Fortunately, high high definitely predicts the worst prognosis. Think about this some more.

## 14-12-2023 Swimmer plot, survival

In [None]:
# subset to survival data (progression free survival time and event indicator)
survival_data <- X_train_attributes_lymphoma[,c("PFS", "CODE_PFS")]
colnames(survival_data) <- c("time", "status")

# ggsurvplot() needs a fitted survfit object, not a bare Surv object, so fit the
# cohort-wide curve from the columns selected above. This also keeps the plot on PFS
# regardless of which endpoint a previously created `survival_object` was built from.
pfs_fit <- survfit(Surv(time, status) ~ 1, data = survival_data)

# Generate the survival plot and keep it, so the print() below has something to show.
# NOTE(review): this draws a single Kaplan-Meier curve for the whole cohort, not a
# per-patient swimmer plot. pval is omitted because a one-group fit has nothing to compare.
swimmer_plot <- survminer::ggsurvplot(
  pfs_fit,
  data = survival_data,  # contains both time and status columns
  risk.table = TRUE,
  conf.int = TRUE
)

# Print the plot
print(swimmer_plot)

# 11-09-2023 Repeat ML framework (Differential + RF) without SMOTE, see if results are better than previous

In [None]:
# Random forest (differential filter + RF) trained WITHOUT SMOTE: restore the saved run and inspect it.
rf_results_dir <- "/cluster/projects/scottgroup/people/steven/human_plasma_cfChIP-Seq/liberate_plasma_cfchip/analysis/machine_learning_models/02_07_2023_RFE_RF_10kb_windows/differential_analysis_filter/"
setwd(rf_results_dir)

# load() restores the objects saved in the .RData file into the workspace
# (presumably including `model`, which is used below; confirm against the saving script).
load(file = "11_09_2023_rf_10kb_with_hyperparameter_tuning_accuracyevaluation_withdifferentialfilter_NO_SMOTE.RData")

# model summary, then the features the model uses
print(model)
predictors_rf <- predictors(model)

- While SMOTE may reduce generalizability of the model for the minority set, performance is far better with SMOTE so stick with it for now.
- Apply the current strategy for subtyping question.

# 11-09-2023 Repeat ML framework (Differential + RF) for subtyping question, see if results are better than previous

In [None]:
# Random forest (differential filter + RF) for the subtyping question: restore the saved run and inspect it.
rf_results_dir <- "/cluster/projects/scottgroup/people/steven/human_plasma_cfChIP-Seq/liberate_plasma_cfchip/analysis/machine_learning_models/02_07_2023_RFE_RF_10kb_windows/differential_analysis_filter/"
setwd(rf_results_dir)

# load() restores the objects saved in the .RData file into the workspace
# (presumably including `model`, overwriting any model loaded earlier; confirm against the saving script).
load(file = "13_09_2023_subtyping_rf_10kb_with_hyperparameter_tuning_accuracyevaluation_withdifferentialfilter.RData")

# model summary, then the features the model uses
print(model)
predictors_rf_subtype <- predictors(model)

- Okay, not bad, not great. Probably requires the one vs all approach Althaf was discussing. I'll leave this to the future directions for now.

# 07-11-2023 Figure 1 as proposed - Comparisons to other sequencing types from plasma and PBMC

## Difference in fragment length distributions between data types

In [None]:
# goal here is to, using Picard fragment length frequencies, to figure out whether there is a significant difference in the fragment length distributions between
# plasma data types (plasma K27me3, cfMeDIP-Seq, CAPP-Seq (eventually), and WGS (DELFI)).
# only healthy profiles are used for this analysis, to reduce variability.

# list the full paths of all fragment length profiles, one list per data type.
k27me3_profiles <- list.files(path = "/cluster/projects/scottgroup/people/steven/human_plasma_cfChIP-Seq/liberate_plasma_cfchip/.", 
                              pattern = "insert_size_metrics.txt", 
                              recursive = TRUE,
                              full.names = TRUE) # list the full paths of all plasma H3K27me3 profiles
k27me3_profiles <- k27me3_profiles[str_detect(k27me3_profiles, "HUCON")] # subset these to only non-lymphoma profiles; HUCON.

wgs_profiles <- list.files(path = "/cluster/projects/scottgroup/people/steven/reference_sets/delfi_wgs/picard_fragment_profile/.", 
                           pattern = "IS_metrics.txt",
                           full.names = TRUE)

medip_profiles <- list.files(path = "/cluster/projects/scottgroup/people/steven/reference_sets/justin_hhp_medip/alignment/samtools_dedup/picard_fragment_profile/.", 
                             pattern = "IS_metrics.txt",
                             full.names = TRUE)

k4me3_profiles <- list.files(path = "/cluster/projects/scottgroup/people/steven/human_plasma_cfChIP-Seq/k4me3/alignment/filter_bam/filter_mapq/fragment_length/", 
                             pattern = "IS_metrics.txt",
                             full.names = TRUE)

# combine lists into a single list and continue with calculating fragment frequencies
all_profiles <- c(k27me3_profiles,wgs_profiles,medip_profiles,k4me3_profiles)

# fail early with a clear message instead of a confusing "column not found" error further down
if (length(all_profiles) == 0) {
  stop("No fragment length profiles found; check the input directories.", call. = FALSE)
}

all_rows <- data.frame(insert_size = seq(1, 1000)) #to fill all rows to a fixed number (not used in this cell)

# Read one Picard insert-size metrics file and convert the read counts to frequencies.
# `path` is a full path, so no setwd() is needed and the working directory is never
# left pointing somewhere else if a file fails to read.
# skip = 10 jumps to the table header row (presumably the Picard histogram section; confirm for new file types).
read_fragment_profile <- function(path) {
  profile <- read.table(path, header = TRUE, skip = 10)
  profile$sample <- path                                    # full path doubles as the sample label
  profile$total_sum <- sum(profile$All_Reads.fr_count)      # total fragments in this profile
  profile$frequency <- profile$All_Reads.fr_count / profile$total_sum
  profile
}

# import files and extract fragment length metrics in the form of fragment frequencies.
# All files are read first and bound once, rather than growing a data frame inside a loop.
frag_length_all <- do.call(rbind, lapply(all_profiles, read_fragment_profile))


# mutate data type (plasma H3K27me3, cfMeDIP-Seq, WGS) to each respective row.
# The type is inferred from the file path; paths matching none of the patterns get NA.
# NOTE(review): the sample counts in the labels are hard-coded; update them if the input files change.
frag_length_all <- frag_length_all %>%
  mutate(data_type = case_when(
    str_detect(sample, "K27me3") ~ "healthy plasma H3K27me3, n = 11",
    str_detect(sample, "h3k4me3") ~ "healthy plasma H3K4me3, n = 4",
    str_detect(sample, "WGS") ~ "DELFI healthy WGS, n = 6",
    str_detect(sample, "medip") ~ "healthy cfMeDIP-Seq, n = 4",
    ))

# group by data type, and calculate the median frequency per fragment length across each group.
# .groups = "drop" returns an ungrouped result (and silences the regrouping message).
frag_length_grouped_median <- frag_length_all %>%
  group_by(data_type, insert_size) %>%
  summarize(median_frequency = median(frequency), .groups = "drop")

# plot
frag_length_grouped_median %>%
  ggplot(aes(x = insert_size, y = median_frequency, colour = data_type)) +
  geom_line() +
  theme_minimal(base_size = 16) +
  labs(x = "Fragment Size",
       y = "Fragment Frequency",
       colour = "Data Type") +
  scale_x_continuous(limits = c(100,250)) # definitely more dinucleosome enrichment


- Need to fix the for loop, to call the different directories that the files are contained within.

## Comparison between healthy plasma H3K27me3 profile (HUCON) and PBMC H3K27me3 profile

In [None]:
# Compare a healthy plasma H3K27me3 profile (HUCON) against a PBMC H3K27me3 profile,
# bin by bin, after appending the PBMC counts to the plasma feature matrix.

# import counts from one of the PBMCs and healthy profiles (HUCON). Subset to bins after filtering.
setwd("/cluster/projects/scottgroup/people/steven/reference_sets/pbmc_k27me3_Yang_et_al_2022/counts_10kb/remove_headers")
import_counts <- list.files(path = ".", pattern = "counts_10kb.txt$") #list counts data, 10kb windows

# NOTE(review): the column name used in the plot below ("<file name>.X0") comes from how
# mapply() names and simplifies its result. It presumably relies on every file holding a single
# column read as "X0"; confirm before swapping mapply() for lapply() or changing the input files.
pbmc_counts <- mapply(read.delim, import_counts) #loop over files in import_counts, and read them into memory
pbmc_counts_matrix <- do.call(cbind, pbmc_counts) #bind individual datasets across columns
rownames(pbmc_counts_matrix) <- 1:nrow(pbmc_counts_matrix) # index rows

# subset to bins from above analyses
# (de_features_all is defined elsewhere in the notebook; presumably the retained bin IDs, matched here against the row index)
pbmc_counts_matrix_sub <- pbmc_counts_matrix[rownames(pbmc_counts_matrix) %in% de_features_all,]

# compare global counts to one another in 10Kb bins
# cbind counts from PBMCs to original matrix. Then, CPM normalize and scale as prior.
# CAUTION: this overwrites all_filtered_features, so re-running the cell appends the PBMC columns again.
all_filtered_features <- cbind(all_filtered_features, pbmc_counts_matrix_sub)

cpm_norm_all <- cpm(all_filtered_features, normalized.lib.sizes = TRUE, log = FALSE)
cpm_norm_all <- as.data.frame(cpm_norm_all)

# scale() centres and scales each column (sample) by default
cpm_norm_all_scaled <- scale(cpm_norm_all) # try without scaling the values first
cpm_norm_all_scaled <- as.data.frame(cpm_norm_all_scaled)


# Scatter plot of plasma (x) against PBMC (y) with a fitted linear trend.
# NOTE(review): the values were centred by scale() above, so some are <= 0 and cannot be
# shown on the log10 axes used below; confirm whether unscaled CPM was intended here.
cpm_norm_all_scaled %>%
ggplot(aes(x = liberate_HUCON_45_K27me3, y = SRR21893770_pbmc_k27_control5_dedup_counts_10kb.txt.X0)) +
geom_point(alpha = 0.1, colour = "#09456b") +
stat_smooth(method = "lm", formula = y ~ x, color = "red", se = TRUE) + 
theme_minimal(base_size = 20) +
labs(x = "Healthy Donor Plasma H3K27me3 #1",
    y = "PBMC H3K27me3") +
scale_x_log10() +
scale_y_log10()

# linear correlation (plasma profile regressed on the PBMC profile, using the scaled values)
summary(lm(liberate_HUCON_45_K27me3 ~ SRR21893770_pbmc_k27_control5_dedup_counts_10kb.txt.X0, data = cpm_norm_all_scaled)) # looks good, visualize it



- The linear correlation with PBMCs is not great, as they are much higher in expression. Consider using another leukocyte set, for instance the one from Sadeh et al.

## Comparison between healthy plasma profiles

In [None]:
# Agreement between healthy plasma profiles, using the scaled CPM values.

# --- pairwise comparison of two HUCON profiles ---
df_cpm_norm_scaled <- as.data.frame(cpm_norm_scaled)

# linear fit between the two profiles; looks good, visualized below
summary(lm(liberate_HUCON_42_K27me3 ~ liberate_HUCON_45_K27me3, data = df_cpm_norm_scaled))

# scatter plot of the same pair with the fitted trend. Other pairs tried previously:
#   aes(x = liberate_HUCON_39_K27me3, y = liberate_HUCON_45_K27me3)
#   aes(x = liberate_HUCON_39_K27me3, y = liberate_HUCON_42_K27me3)
ggplot(df_cpm_norm_scaled,
       aes(x = liberate_HUCON_45_K27me3, y = liberate_HUCON_42_K27me3)) +
  geom_point(alpha = 0.1, colour = "#09456b") +
  stat_smooth(method = "lm", formula = y ~ x, color = "red", se = TRUE) +
  theme_minimal(base_size = 20) +
  labs(x = "Healthy Donor Plasma H3K27me3 #1",
       y = "Healthy Donor Plasma H3K27me3 #2") +
  scale_x_log10() +
  scale_y_log10()


# --- spearman correlation across all healthy profiles in the study, including HUCON and LIBERATE controls ---
# keep only the healthy sample columns (names containing "HUCON" or "_C_"), as a matrix for cor()
cpm_norm_scaled <- as.data.frame(cpm_norm_scaled)
cpm_norm_scaled_healthy <- as.matrix(select(cpm_norm_scaled, matches("HUCON|_C_")))

# sample-by-sample Spearman correlations, reshaped to long format (Var1, Var2, Freq) for plotting
cor_matrix <- as.data.frame(as.table(cor(cpm_norm_scaled_healthy, method = "spearman")))

# heatmap of the correlation matrix
ggplot(cor_matrix, aes(x = Var1, y = Var2, fill = Freq)) +
  geom_tile() +
  scale_fill_gradient() +
  theme_minimal(base_size = 20) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(x = "Sample",
       y = "Sample",
       fill = "Correlation")



## 09-01-2024 heatmap of fragment length profiles for all healthy and lymphoma cases

In [None]:
# carry code from R
# Fragment length profiles for every H3K27me3 library under the cohort output folder,
# shown as a line plot and as a per-sample heatmap ("stacked histogram").

# Earlier single-run locations, kept for reference only. They were executed and then
# immediately overridden by the cohort-wide listing and setwd() below, so they had no effect.
# setwd("~/Library/CloudStorage/OneDrive-UniversityofToronto/1 MBP/1 Bratman Lab/1 Experiments/11 Sequencing/19-08-2022_k4_k27me3_troubleshooting/bam/k27me3/fragment_length")
# setwd("~/Library/CloudStorage/OneDrive-UniversityofToronto/1 MBP/1 Bratman Lab/1 Experiments/11 Sequencing/DLBCL_Cohort/medipipe_output/221124-8/fragment_length")
# k27me3_profiles <- list.files(path = ".", pattern = ".txt")

#loop over all subdirectories and call all fragment length profiles (paths are relative to the listed folder).
k27me3_profiles <- list.files(path = "~/OneDrive - University of Toronto/1 MBP/1 Bratman Lab/1 Experiments/11 Sequencing/DLBCL_Cohort/medipipe_output/", pattern="insert_size_metrics.txt", recursive=TRUE)

# The relative paths above are resolved against this working directory.
# NOTE(review): this is a differently spelled path from the one listed above; presumably both
# point at the same OneDrive folder. Confirm, otherwise the reads below will fail.
setwd("~/Library/CloudStorage/OneDrive-UniversityofToronto/1 MBP/1 Bratman Lab/1 Experiments/11 Sequencing/DLBCL_Cohort/medipipe_output")

# fail early with a clear message instead of a confusing plotting error further down
if (length(k27me3_profiles) == 0) {
  stop("No insert_size_metrics.txt files found; check the OneDrive path.", call. = FALSE)
}

all_rows <- data.frame(insert_size = seq(1, 1000)) #to fill all rows to a fixed number

# Read one insert-size metrics file and convert the read counts to frequencies.
#for this step, if extracting fragment length characteristics for violin plot, exclude row filling; only use for stacked histogram! Otherwise NA = error.
read_k27me3_profile <- function(path) {
  temp <- read.table(path, header = TRUE, skip = 10)
  temp$sample <- path                                 # relative path doubles as the sample label
  temp$total_sum <- sum(temp$All_Reads.fr_count)      # total fragments in this profile
  temp$frequency <- temp$All_Reads.fr_count / temp$total_sum
  #temp <- merge(all_rows, temp, all = TRUE) #merge dummy frame and temp to make temp a fixed number of rows
  #temp <- temp %>% 
  #  fill(sample, total_sum, .direction = c("updown")) #fill NAs in repeated character/numerical rows with values above and below
  #temp$sample <- str_match(temp$sample, "liberate_(.*?)_K27me3")[[1]] #make "sample" column characters a substring of the current characters
  temp
}

#calculate frequency of fragment length for each of the files.
# All files are read first and bound once, rather than growing a data frame inside a loop.
frag_length_all <- do.call(rbind, lapply(k27me3_profiles, read_k27me3_profile))

#colour palette for line plot: one colour per sample. A fixed 9-colour palette makes
#scale_colour_manual() fail as soon as more than 9 profiles are found.
colour_line <- met.brewer("Hokusai1", length(unique(frag_length_all$sample)))
#plot_line
frag_length_all %>% 
  ggplot(aes(x = insert_size, y = frequency, colour = sample)) +
  geom_line() +
  theme_minimal(base_size = 16) +
  labs(x = "Insert Size",
       y = "Frequency",
       colour = "H3K27me3 Profile") +
  scale_colour_manual(values = colour_line) +
  scale_x_continuous(limits = c(100,200))

#plot_alt; stacked histogram (one row per sample, colour = frequency; missing values drawn in black)
frag_length_all %>% 
  ggplot(aes(x = insert_size, y = sample, fill = frequency)) +
  geom_tile() +
  theme_minimal(base_size = 10) +
  labs(x = "Insert Size",
       y = "Sample",
       fill = "Frequency") +
  scale_fill_gradient(low = "black", high = "#E85e19", na.value = "black") +
  scale_x_continuous(limits = c(0,600))

- Fill missing values with 0s.

# 14-11-2023 NMF on lymphoma features: Identification of subgroups

In [None]:
library(NMF)

# NMF on the lymphoma samples only, to look for subgroups among the cases.

# subset feature matrix to just lymphoma samples (drop healthy columns: names containing "HUCON" or "_C_")
X_train_filtered <- as.data.frame(X_train_filtered)
X_train_filtered_lymphoma <- X_train_filtered %>% select(-matches("HUCON|_C_"))
X_train_filtered_lymphoma <- as.matrix(X_train_filtered_lymphoma)

# Rows are features, and columns are samples

# Create an NMF model with 2 components (subgroups)
# NOTE(review): NMF starts from a random initialisation; consider fixing a seed if the
# grouping needs to be reproducible between runs.
nmf_model <- nmf(X_train_filtered_lymphoma, rank = 2, method = "brunet")

# Extract the basis matrix (features x components: features associated with each subgroup)
basis_matrix <- basis(nmf_model)

# Extract the coefficient matrix (components x samples: sample memberships for each subgroup)
coef_matrix <- coef(nmf_model)

# Visualize the coefficient matrix to assign samples to subgroups
pheatmap(coef_matrix, cluster_cols = TRUE, cluster_rows = FALSE, 
         color = heat.colors(10), display_numbers = FALSE,
         main = "NMF Coefficient Matrix (Group Membership)",
         cellwidth = 15, cellheight = 15,
         angle_col = 45)  # This rotates the column labels by 45 degrees

# Assign each sample to the component with its largest coefficient.
# max.col() works row-wise, and samples are the COLUMNS of coef_matrix, so the matrix is
# transposed first; without this the result has one entry per component, not per sample.
# ties.method = "first" keeps the assignment deterministic.
cluster_assignments <- max.col(t(coef_matrix), ties.method = "first")
names(cluster_assignments) <- colnames(coef_matrix)

# Add the cluster assignments to the original data. Clusters are a per-sample label, so the
# data is transposed to one row per sample (features as columns) before the column is attached.
lymphoma_data_with_clusters <- cbind(as.data.frame(t(X_train_filtered_lymphoma)), Cluster = cluster_assignments)

# View the result: group sizes, then the assignment of the first few samples
table(cluster_assignments)
head(lymphoma_data_with_clusters[, "Cluster", drop = FALSE])

In [None]:
- This seems promising, explore this further.
- Try NMF on both hypermethylated and hypomethylated feature sets.

# 17-11-2023 Correlation of modeled features with TMTV and ichorCNA from paired patients

## import MTV and ichorCNA data, and append to pre-existing data

In [None]:
# Build X_train_attributes_all_data_types: the sample attributes enriched with the PET
# (MTV) measurements and the ichorCNA estimates, matched on patient ID.

# import MTV data; data generated by the nuclear medicine team at UHN
setwd("/cluster/projects/scottgroup/people/steven/human_plasma_cfChIP-Seq/liberate_plasma_cfchip/other_profiles/tmtv_data_lymphoma_cases")
tmtv_lymphoma <- readxl::read_xlsx("17-11-2023_PET_scan_data_lymphoma_cases.xlsx")

# import ichorCNA output and match to X_train_attributes in the same way you incorporated TMTV
setwd("/cluster/projects/scottgroup/people/steven/human_plasma_cfChIP-Seq/liberate_plasma_cfchip/other_profiles/ichorCNA_data_shallow_WGS_lymphoma")
ichorCNA_lymphoma <- read.delim("Bratman.ichorCNA.txt", header = TRUE, sep = "\t")

# ichorCNA holds duplicate entries for some patients (accidental duplicate sequencing of some
# shallow WGS profiles). Keep the first entry per patient BEFORE merging, so the merge cannot
# multiply rows. De-duplicating the merged table by PATIENT_ID instead would also silently drop
# legitimate rows whenever a patient appears more than once in X_train_attributes.
ichorCNA_lymphoma_unique <- ichorCNA_lymphoma %>%
  distinct(PATIENT_ID, .keep_all = TRUE)

# merge MTV data frame with X_train_attributes (left join: every attribute row is kept)
X_train_attributes_all_data_types <- merge(X_train_attributes, tmtv_lymphoma, by.x = "PATIENT_ID", by.y = "LIBERATE ID", all.x = TRUE)

# merge the de-duplicated ichorCNA data frame (left join again)
X_train_attributes_all_data_types <- merge(X_train_attributes_all_data_types, ichorCNA_lymphoma_unique, by = "PATIENT_ID", all.x = TRUE)

# sanity check: a left join against unique keys must not change the number of rows
if (nrow(X_train_attributes_all_data_types) != nrow(X_train_attributes)) {
  warning("Merged attributes have ", nrow(X_train_attributes_all_data_types),
          " rows but X_train_attributes has ", nrow(X_train_attributes),
          "; check the MTV table for duplicate patient IDs.", call. = FALSE)
}

- Fix acquisition of duplicate rows here.
- SOLUTION; THERE ARE DUPLICATE MEASUREMENTS FROM ichorCNA.
- Work with "X_train_attributes_all_data_types" for all univariate and multivariate survival analyses.

## correlation with MTV data

In [None]:

# Relationship between the plasma measurements and the PET-derived tumour volume metrics.

# PET data generated by the nuclear medicine team at UHN
setwd("/cluster/projects/scottgroup/people/steven/human_plasma_cfChIP-Seq/liberate_plasma_cfchip/other_profiles/tmtv_data_lymphoma_cases")
tmtv_lymphoma <- readxl::read_xlsx("17-11-2023_PET_scan_data_lymphoma_cases.xlsx")

# attach the PET data to the sample attributes (left join on patient ID).
# Note: the plot and models below read X_train_attributes_all_data_types, not this table.
X_train_attributes_tmtv <- merge(
  x = X_train_attributes,
  y = tmtv_lymphoma,
  by.x = "PATIENT_ID",
  by.y = "LIBERATE ID",
  all.x = TRUE
)

# scatter plot: median k27me3 signal against STLG, with a fitted linear trend.
# Alternatives used previously:
#   y aesthetic: mean_VAF, with y label "Mean VAF from CAPP-Seq" (correlation with CAPP-Seq)
#   x labels:    "MTV (mL)", "TLG (SUV*mL)", "SMTV (mL/Kg)"
#   axes:        + scale_x_log10() + scale_y_log10()
p <- ggplot(
  X_train_attributes_all_data_types,
  aes(x = `STLG (SUV*mL/Kg)`, y = median_X_train_filtered)
) +
  geom_point(size = 5) +
  stat_smooth(method = "lm", formula = y ~ x, color = "red", se = TRUE) +
  theme_minimal(base_size = 20) +
  labs(
    x = "STLG (SUV*mL/Kg)",
    y = "Median of plasma H3K27me3 Hypermethylated Features"
  )
print(p)

# linear model statistics: median k27me3 signal against each PET metric ...
summary(lm(median_X_train_filtered ~ `MTV (mL)`, data = X_train_attributes_all_data_types))
summary(lm(median_X_train_filtered ~ `TLG (SUV*mL)`, data = X_train_attributes_all_data_types))
summary(lm(median_X_train_filtered ~ `SMTV (mL/Kg)`, data = X_train_attributes_all_data_types))
summary(lm(median_X_train_filtered ~ `STLG (SUV*mL/Kg)`, data = X_train_attributes_all_data_types))
# ... and CAPP-Seq mean VAF against STLG
summary(lm(mean_VAF ~ `STLG (SUV*mL/Kg)`, data = X_train_attributes_all_data_types))



Standardized uptake value, SUV, (also referred to as the dose uptake ratio, DUR) is a widely used, robust PET quantifier, calculated simply as a ratio of tissue radioactivity concentration (for example in units [kBq/mL]) at time T, CPET(T), and administered dose (for example in units [MBq]) at the time of injection divided by body weight (usually in units [kg]). (http://www.turkupetcentre.net/petanalysis/model_suv.html)

TLG is the Total Lesion Glycolysis. Its unit is SUV*mL;
• sTLG, the Standardized Total Lesion Glycolysis, corresponds to TLG/PatientWeight.
It is measured in SUV*mL/kg;
• MTV is the protocol main value, the total Metabolic Tumor Volume. Its unit is mL (or cm³);
• sMTV is Standardized Metabolic Tumor Volume. More specifically, it is measured as MTV/PatientWeight and is in mL/kg;
• Dmax is the distance between the two lesions that are the furthest apart (with
ROIs center of mass as origin). It is one way to measure tumor dissemination;
• wDmax is the distance between the two lesions that are the furthest apart
calculated from a weighted center of mass;
• DmaxVox is the distance between the two lesions that are the furthest apart
(with the outermost voxel).
(https://www.lifexsoft.org/images/phocagallery/documentation/ProtocolMTV/UserGuide/MTVUserGuide.pdf)

## 08-12-2023 Correlation of modeled features with tumor burden from ichorCNA from shallow WGS for matched cases

In [None]:
# Relationship between the plasma measurements and the ichorCNA tumor fraction.

# ADDITIONAL STEP; remove cases where tumor burden from ichorCNA is <5%.
# The comparison must KEEP values >= 0.05; the previous `< 0.05` did the opposite and kept
# only the cases that were meant to be removed. filter() also drops cases with a missing
# tumor fraction, which base `[` subsetting would have turned into all-NA rows.
X_train_attributes_all_data_types_filtered <- X_train_attributes_all_data_types %>%
  filter(Tumor.Fraction >= 0.05)

# visualize correlation between median k27me3 expression and tumor burden from ichorCNA
X_train_attributes_all_data_types %>% 
# X_train_attributes_all_data_types_filtered %>% 
  ggplot(aes(x = `Tumor.Fraction`, y = median_X_train_filtered)) +
  # ggplot(aes(x = `Tumor.Fraction`, y = mean_VAF)) + # for correlation with CAPP-Seq
  geom_point(size = 5) + 
  stat_smooth(method = "lm", formula = y ~ x, color = "red", se = TRUE) + 
  theme_minimal(base_size = 20) +
  labs(x = "ichorCNA Tumor Fraction",
       y = "Median of plasma H3K27me3 Hypermethylated Features"
       # y = "Mean VAF from CAPP-Seq"
      ) #+
  # scale_x_log10() +
  # scale_y_log10()

# lm stats (swap in the *_filtered lines to restrict to cases with tumor fraction >= 5%)
summary(lm(median_X_train_filtered ~ `Tumor.Fraction`, data = X_train_attributes_all_data_types))
# summary(lm(median_X_train_filtered ~ `Tumor.Fraction`, data = X_train_attributes_all_data_types_filtered))
summary(lm(mean_VAF ~ `Tumor.Fraction`, data = X_train_attributes_all_data_types))
# summary(lm(mean_VAF ~ `Tumor.Fraction`, data = X_train_attributes_all_data_types_filtered))


# 27-11-2023 UMAP of all filtered features

In [None]:
# apply UMAP to all filtered features in 10Kb bins
#Visualize training data as UMAP; see if data clustering is different than PCA, as non-linear combinations of variables are a consideration.
#Or consider using all data for this.
library(umap)
library(Rtsne)

# Compute the UMAP embedding.
# umap() embeds the ROWS of its input. The matrix is transposed so that each point is a
# sample; passing it untransposed embeds the features instead.
# NOTE(review): assumes cpm_norm has bins in rows and samples in columns, like the other CPM tables in this notebook; confirm.
umap_all_features <- umap(t(cpm_norm))

# Create a data frame with the UMAP coordinates (one row per sample)
umap_data <- as.data.frame(umap_all_features$layout)
colnames(umap_data) <- c("UMAP1", "UMAP2")

#cbind metadata to the UMAP embeddings.
# cbind() matches purely by position, so the row counts must agree.
# NOTE(review): also confirm the rows of X_train_attributes are in the same sample order as the columns of cpm_norm.
if (nrow(umap_data) != nrow(X_train_attributes)) {
  stop("UMAP embedding has ", nrow(umap_data), " samples but X_train_attributes has ",
       nrow(X_train_attributes), " rows; subset cpm_norm to the training samples first.",
       call. = FALSE)
}
umap_data <- cbind(umap_data,X_train_attributes)

# Plot the UMAP
umap_data %>% 
  ggplot(aes(x = UMAP1, y = UMAP2)) +
  # ggplot(aes(x = UMAP1, y = UMAP2, colour = timepoint)) +
  # ggplot(aes(x = UMAP1, y = UMAP2, colour = ANN_ARBOR_STAGE)) +
  # ggplot(aes(x = UMAP1, y = UMAP2, colour = DIAGNOSIS_CLASS_SUMMARIZE_HEALTHY.DIAGNOSIS_SPECIFIC_DLBCL)) +
  geom_point() +
  theme_minimal(base_size=20)


# Plot the tSNE alone
# NOTE(review): tsne_all_filtered_features_counts is not created in this cell; it must already
# exist in the workspace (presumably the result of an earlier Rtsne() call).
tsne_df <- data.frame(tsne_all_filtered_features_counts$Y)
tsne_df <- cbind(tsne_df,all_filtered_features$samples)

p <- tsne_df %>% 
  #ggplot(aes(x = X1, y = X2, colour = timepoint)) +
  ggplot(aes(x = X1, y = X2, colour = ANN_ARBOR_STAGE)) +
  #ggplot(aes(x = X1, y = X2, colour = DIAGNOSIS_CLASS_SUMMARIZE_HEALTHY.DIAGNOSIS_SPECIFIC_DLBCL)) +
  geom_point(size = 5) +
  theme_minimal(base_size=15)
# the plot was assigned but never displayed
print(p)

- Re-think this analysis; in its current state, it's visualizing all the features over UMAP 1 and 2

# 28-11-2023 hypermethylated vs hypomethylated feature analyses

## PCA - hyper vs hypo

In [None]:
# PCA of the significant hypermethylated / hypomethylated features, as a parallel to the
# NMF comparison (which revealed differences).
# Samples go in rows: X_train_filtered is transposed. Exchange the feature set here to match
# whichever set was used for the boxplot visualizations above.
pca_data <- t(X_train_filtered)

# PCA without rescaling the variables
pca_res <- prcomp(pca_data, scale. = FALSE)

# percentage of total variance carried by each principal component, for the axis titles
pc_variance_pct <- round(pca_res$sdev^2 * 100 / sum(pca_res$sdev^2), 1)

# First two principal components, coloured by training label.
# Alternative colourings used previously (swap into geom_point):
#   aes(color = list$samples$DIAGNOSIS_CLASS_SUMMARIZE_HEALTHY)
#   aes(color = list$samples$DIAGNOSIS_CLASS_SUMMARIZE_HEALTHY.DIAGNOSIS_SPECIFIC_DLBCL)
#   aes(color = list$samples$timepoint)
#   aes(color = list$samples$batch)
# Other optional layers:
#   geom_label(label = colnames(X_train_filtered))   # label points to find which two samples were the outliers
#   scale_x_continuous(limits = c(-20,20))            # zoom into the cluster of healthy and cancer samples
#   scale_y_continuous(limits = c(-35,35))            # zoom into the cluster of healthy and cancer samples
ggplot(as.data.frame(pca_res$x), aes(x = PC1, y = PC2)) +
  geom_point(aes(color = y_train), size = 5) +
  theme_minimal(base_size = 20) +
  labs(colour = "Group") +
  xlab(paste0("PC1 (", pc_variance_pct[1], "%)")) +
  ylab(paste0("PC2 (", pc_variance_pct[2], "%)")) +
  theme(legend.position = "right")



- Interestingly, there is no separation between lymphoma and non-lymphoma cases when either the hypermethylated or the hypomethylated features are used. However, the median expression over the features is informative for the groups.
- Two non-lymphoma cases which separate in the healthy cases are interesting.
- Figure out where these features are located. Also, look at the modeled features for more info.
- Side note; HUCON 35 and 36 are major outliers in the modeling and may be driving poor performance in the test set.

# 29-11-2023 Gene-specific plasma H3K27me3 expression

In [None]:
# Gene-level view: pick prognostic gene(s) in lymphoma and check whether plasma
# H3K27me3 over them shows prominent differences across lymphoma cases.

# 10kb bin reference (read.delim already defaults to header = TRUE, sep = "\t")
setwd("/cluster/projects/scottgroup/people/steven/reference_sets/other_references")
annotated_ref_no_X_Y_M_blacklist <- read.delim(
  "10kb_bin_genome_wide_annotated_reference_with_regulatory_features_no_X_Y_M_ENCODEBlacklist.txt"
)

# gene annotation table
setwd("/cluster/projects/scottgroup/people/steven/reference_sets/UCSC_reference")
gene_reference <- read.delim("UCSC_allGENCODE_V44_gene_annotations")

# aside: NFE2 binding sites. The file is read without a header row, so the
# three columns are named here.
setwd("/cluster/projects/scottgroup/people/steven/reference_sets/tfbs_reference/")
nfe2_reference <- read.delim("all_NFE2_tfbs.tsv", header = FALSE)
names(nfe2_reference) <- c("chrom", "start", "end")

# keep autosomes only (chr1-22)
gene_reference <- gene_reference[gene_reference$chrom %in% paste0("chr", 1:22), ]

# genes of interest
# NOTE; edit the line below to choose the gene(s) to analyse
# gene_reference_subset <- gene_reference[gene_reference$name2 %in% paste0("RNA5S",seq(1:17)),] # subset to gene of interest
gene_reference_subset <- gene_reference[gene_reference$name2 %in% "STAT3", ] # BCL2 and BCL6 are interesting; outliers

# OPTIONAL; subset to a published gene panel instead of individual genes
setwd("/cluster/projects/scottgroup/people/steven/reference_sets/other_references/gene_lists")
# gene_panel <- read.delim("Nanostring_Lymph2Cx_Scott_2014_genes_ALL.txt", header = FALSE, sep = "\t") # for all
# gene_panel <- read.delim("epic_seq_Alizadeh_genes_ALL.txt", header = FALSE, sep = "\t") # for all
# gene_panel <- read.delim("Schmitz_NEJM_2018_fig1_genes_ALL.txt", header = FALSE, sep = "\t") # for all
# gene_panel <- read.delim("Schmitz_NEJM_2018_fig1_genes_GCB_ONLY.txt", header = FALSE, sep = "\t") # for GCB
# gene_panel <- read.delim("Schmitz_NEJM_2018_fig1_genes_nonGCB_ONLY.txt", header = FALSE, sep = "\t") # for non-GCB
# gene_reference_subset <- gene_reference[gene_reference$name2 %in% gene_panel$V1,] # for the panels, double check there aren't extra genes

# Map the selected gene(s) onto the 10kb bin reference and summarise plasma
# H3K27me3 over the overlapping bins as one median value per sample.

# convert gene reference subset and 10kb reference to GRanges objects and check the overlap. Keep bin ID for overlapping ranges
# NOTE(review): columns 3, 5 and 6 of the gene reference are taken by position as
# the range definition (presumably chromosome / start / end) -- confirm against the table header.
gr_gene_reference_subset <- GRanges(gene_reference_subset[, c(3, 5, 6)])
# gr_gene_reference_subset <- GRanges(nfe2_reference[,c(1:3)]) # for TFBS reference
gr_annotated_ref_no_X_Y_M_blacklist <- GRanges(annotated_ref_no_X_Y_M_blacklist)

# find overlaps in ranges (query = genes, subject = 10kb bins)
overlap_regions <- findOverlaps(gr_gene_reference_subset, gr_annotated_ref_no_X_Y_M_blacklist)

# subset to the overlapping bins and save their bin IDs for subsetting the original data matrix (plasma H3K27me3)
gr_annotated_ref_no_X_Y_M_blacklist_filtered_to_gene_subset <- gr_annotated_ref_no_X_Y_M_blacklist[subjectHits(overlap_regions)]
gene_bins <- unique(gr_annotated_ref_no_X_Y_M_blacklist_filtered_to_gene_subset$bin_id)

# subset normalized matrix to bin IDs.
# drop = FALSE keeps a matrix even when the gene falls in a single 10kb bin;
# without it the result collapses to a vector and apply() below fails.
cpm_norm_genes <- cpm_norm_log2[rownames(cpm_norm_log2) %in% gene_bins, , drop = FALSE]

# fail loudly instead of silently producing an all-NA median column
if (nrow(cpm_norm_genes) == 0) {
  stop(
    "No 10kb bins in cpm_norm_log2 overlap the selected gene(s); check gene_reference_subset.",
    call. = FALSE
  )
}

# take median across bins per sample. Works for any number of bins >= 1
# (for a single bin the median is simply that bin's value).
cpm_norm_genes_median <- as.data.frame(apply(cpm_norm_genes, 2, median))
colnames(cpm_norm_genes_median) <- c("cpm_norm_genes_median")

# append cpm_norm_genes_median to sample metadata.
# NOTE(review): cbind() binds by position -- this assumes the rows of
# X_train_attributes are in the same sample order as the columns of cpm_norm_log2.
X_train_attributes_update <- cbind(X_train_attributes, cpm_norm_genes_median)

# The separate single-bin code path (binding cpm_norm_genes directly) is no
# longer needed: cpm_norm_genes_median now covers the one-bin case as well.

# remove FL cases for SCHMIDT et al
# X_train_attributes_update <- X_train_attributes_update[!(X_train_attributes_update$DIAGNOSIS_CLASS_SUMMARIZE_HEALTHY == "FL"),]

# Attach the other per-patient data types (PET scan / MTV table and ichorCNA
# output) to the sample metadata so they can be plotted and regressed against
# the gene-level H3K27me3 median.

# PET scan data for the lymphoma cases
setwd("/cluster/projects/scottgroup/people/steven/human_plasma_cfChIP-Seq/liberate_plasma_cfchip/other_profiles/tmtv_data_lymphoma_cases")
tmtv_lymphoma <- readxl::read_xlsx("17-11-2023_PET_scan_data_lymphoma_cases.xlsx")

# ichorCNA output (tab-delimited with a header row, i.e. the read.delim defaults)
setwd("/cluster/projects/scottgroup/people/steven/human_plasma_cfChIP-Seq/liberate_plasma_cfchip/other_profiles/ichorCNA_data_shallow_WGS_lymphoma")
ichorCNA_lymphoma <- read.delim("Bratman.ichorCNA.txt")

# left-join the MTV table; the patient key is named differently in the two tables
X_train_attributes_update <- merge(
  x = X_train_attributes_update,
  y = tmtv_lymphoma,
  by.x = "PATIENT_ID",
  by.y = "LIBERATE ID",
  all.x = TRUE
)

# left-join the ichorCNA table on the shared key; this is the step that
# introduces duplicate rows
X_train_attributes_update <- merge(
  x = X_train_attributes_update,
  y = ichorCNA_lymphoma,
  by = "PATIENT_ID",
  all.x = TRUE
)

# some shallow WGS profiles were accidentally sequenced twice, giving duplicate
# ichorCNA entries; keep the first row per patient as the single representative
X_train_attributes_update <- distinct(X_train_attributes_update, PATIENT_ID, .keep_all = TRUE)

# visualize across healthy and cancer conditions:
# box plot of the per-sample gene-level H3K27me3 median by stage group, with the
# individual samples overlaid and coloured by mean VAF.
X_train_attributes_update %>%
  # ggplot(aes(x = `DIAGNOSIS_CLASS_SUMMARIZE_HEALTHY+DIAGNOSIS_SPECIFIC_DLBCL`, y = cpm_norm_genes_median)) +#change to sum or median, depending on what you're visualizing
  ggplot(aes(x = Stage_Group, y = cpm_norm_genes_median)) + # when gene spans >1 bin
  # ggplot(aes(x = Stage_Group, y = cpm_norm_genes)) + # when gene spans one bin
  # geom_boxplot(aes(fill = `DIAGNOSIS_CLASS_SUMMARIZE_HEALTHY+DIAGNOSIS_SPECIFIC_DLBCL`), alpha = 0.3) +
  geom_boxplot(aes(fill = Stage_Group), alpha = 0.3) +
  geom_point(aes(colour = mean_VAF), position = position_jitterdodge(0.2), size = 7, alpha = 0.6) +
  theme_minimal(base_size = 18) +
  #scale_color_gradient(low = "#185a9d", high = "#43cea2") +
  scale_color_gradient(low = "blue", high = "red") +
  labs(x = "Stage", # matches the Stage_Group x aesthetic
       #x = "Subtype", # use when plotting by diagnosis / DLBCL subtype
       #x = "IPI",
       # y = "Median of Hypermethylated Feature Expression",
       # y = "Median of ALL Hypermethylated Feature Expression",
       y = "Median of H3K27me3 expression over gene",
       colour = "mean VAF",
       fill = "Stage"
       #colour = "IPI"
       ) + # this `+` was missing, which left ggtitle() below detached from the plot
  #ggtitle("Median Hypermethylated Counts of Differential Features")
  #ggtitle("Median of Counts over Features")
  #ggtitle("Sum of Counts over Modeled Features")
  ggtitle("Median of Counts over Genes") # change gene name if required

# linear regression with CAPP-Seq:
# scatter plot of the gene-level H3K27me3 median against mean VAF with a fitted
# straight line, followed by the matching lm() summary.
X_train_attributes_update %>%
  ggplot(aes(x = mean_VAF, y = cpm_norm_genes_median)) +
  # ggplot(aes(x = mean_VAF, y = cpm_norm_genes)) +
  # ggplot(aes(x = `STLG (SUV*mL/Kg)`, y = cpm_norm_genes_median)) +
  # ggplot(aes(x = `Tumor.Fraction`, y = cpm_norm_genes_median)) +
  geom_point(size = 5) +
  stat_smooth(method = "lm", formula = y ~ x, color = "red", se = TRUE) +
  theme_minimal(base_size = 20) +
  labs(x = "Mean VAF",
       # y label matches the gene-level box plot of the same variable; the
       # previous text referred to hypermethylated features
       y = "Median of H3K27me3 expression over gene"
      )

# same model as the smoothed line above; `data` is passed by name
summary(lm(cpm_norm_genes_median ~ mean_VAF, data = X_train_attributes_update))
# summary(lm(cpm_norm_genes ~ mean_VAF, X_train_attributes_update)) # for 1 bin gene
# summary(lm(cpm_norm_genes_median ~ `STLG (SUV*mL/Kg)`, data = X_train_attributes_update_tmtv))
# summary(lm(cpm_norm_genes_median ~ `Tumor.Fraction`, data = X_train_attributes_update_tmtv))




##### SURVIVAL #####
# Kaplan-Meier comparison of lymphoma cases split into "High" / "Low" plasma
# H3K27me3 over the selected gene(s), using the cohort median as the cut-off.
# NOTE(review): this overwrites X_train_attributes_update with the lymphoma-only
# subset, so anything run afterwards that still expects the non-lymphoma
# controls in this data frame needs the earlier cells re-run first.

# remove healthy cases from X_train_attributes. Rows with a missing diagnosis are
# excluded explicitly; with a bare `==` they would come back as all-NA rows and
# turn the median threshold below into NA.
X_train_attributes_update <- X_train_attributes_update[
  !is.na(X_train_attributes_update$DIAGNOSIS_CLASS_SUMMARIZE_HEALTHY) &
    X_train_attributes_update$DIAGNOSIS_CLASS_SUMMARIZE_HEALTHY != "HEALTHY",
]

# convert PFS and OS to numeric
X_train_attributes_update$PFS <- as.numeric(X_train_attributes_update$PFS)
X_train_attributes_update$OS <- as.numeric(X_train_attributes_update$OS)
X_train_attributes_update$CODE_PFS <- as.numeric(X_train_attributes_update$CODE_PFS)
X_train_attributes_update$CODE_OS <- as.numeric(X_train_attributes_update$CODE_OS)

# survival based off thresholds of gene K27 expression.
# The threshold is the MEDIAN over the remaining (lymphoma) samples; healthy
# samples were already removed above, so no further filtering is needed here.
median_threshold <- median(X_train_attributes_update$cpm_norm_genes_median) # for individual genes
mean_threshold <- median_threshold # legacy name kept for existing code; the value is a median, not a mean

library(survival)
library(survminer)

# determine optimal cutoff using survminer surv_cutpoint
# cutpoint <- surv_cutpoint(data = X_train_attributes_lymphoma,
#                           time = "PFS",
#                           event = "CODE_PFS",
#                           variables = "median_X_train_filtered")

# cutpoint <- cutpoint$cutpoint$cutpoint # extract the numeric cut-off from the surv_cutpoint result

# create survival object
survival_object <- Surv(time = X_train_attributes_update$OS, event = X_train_attributes_update$CODE_OS) # for overall survival
# survival_object <- Surv(time = X_train_attributes_lymphoma$PFS, event = X_train_attributes_lymphoma$CODE_PFS) # for progression free survival

# high = above threshold, low = at or below threshold
X_train_attributes_update$threshold <- ifelse(X_train_attributes_update$cpm_norm_genes_median > median_threshold, "High", "Low")

# fit Kaplan-Meier curves for the two threshold groups
km_fit <- survfit(survival_object ~ threshold, data = X_train_attributes_update)

# plot Kaplan Meier. Healthy samples were checked for exclusion from this plot.
# The y label must match the endpoint used in survival_object (OS here); switch
# both together when plotting PFS.
# NOTE(review): the x label assumes OS/PFS are recorded in years -- confirm.
survplot <- ggsurvplot(km_fit,
                data = X_train_attributes_update,
                pval = TRUE,
                conf.int = TRUE,
                ggtheme = theme_minimal(),
                risk.table = TRUE,
                xlab = "Time (Years)",
                ylab = "Overall Survival"
                #ylab = "Progression Free Survival (PFS)"
                )
survplot


- first, check 5S ribosomal RNAs as a positive control region bw cancer and healthy
- RNA5S1-17; chr1 228610268 - 228646158; from Ensembl
- Similar median across the groups, but so variable within groups.
- Interesting finding though, when we map the mean VAF onto the data points, high burden samples have the lowest expression of bins converging with RNA5S. Is this because K27me3 is generally higher over other regions so less reads in this region?
- See if this is the case for other genes which we'd expect to be different in the cancer cases
- Consider looking at entire Schmitz et al gene panel to see if they segregate the subtypes.
- Also consider taking the sum, but this will be influenced by gene size.
- You need to split the Schmitz list into GCB vs non-GCB enriched. Then test.

- 15-12-2023; look into TFBS for NFE2, Sasha recommended the database the Griffin paper used. She downloaded the dataset to the cluster, I can subset to all sites targeted by NFE2. These sites have chromosomal locations, so I can subset to these regions in 10kb K27me3 bins and look at their joint expression.
- Follow up on current NFE2 targets; lots of sites, but will likely be compiled into overlapping 10kb bins. UPDATE; 149235 BINS IN TOTAL WITH NFE2 TFBS; this is too many.
- Stick with the discovery platform discussed at the lymphoma meeting.

## Stats - Welch's t-test between groups for different genes

In [None]:
# Welch's (unequal-variance) two-sample t-tests on the per-sample gene-level
# median, first between stage groups and then between diagnosis subtypes.
# NOTE(review): the survival section removes the HEALTHY rows from
# X_train_attributes_update; if that has already run, confirm the non-lymphoma
# group is still present before reading the stage comparisons.

# rows of the metadata whose `column` equals `value`
rows_where <- function(column, value) {
  X_train_attributes_update[X_train_attributes_update[[column]] == value, ]
}

# --- stage group ---
stats_healthy <- rows_where("Stage_Group", "non-lymphoma")
stats_early <- rows_where("Stage_Group", "Early Stage")
stats_late <- rows_where("Stage_Group", "Late Stage")

t.test(x = stats_healthy$cpm_norm_genes_median, y = stats_early$cpm_norm_genes_median, var.equal = FALSE)
t.test(x = stats_healthy$cpm_norm_genes_median, y = stats_late$cpm_norm_genes_median, var.equal = FALSE)
t.test(x = stats_early$cpm_norm_genes_median, y = stats_late$cpm_norm_genes_median, var.equal = FALSE)

# --- GCB vs non-GCB (and FL vs non-GCB) ---
stats_gcb <- rows_where("DIAGNOSIS_CLASS_SUMMARIZE_HEALTHY+DIAGNOSIS_SPECIFIC_DLBCL", "GCB")
stats_non_gcb <- rows_where("DIAGNOSIS_CLASS_SUMMARIZE_HEALTHY+DIAGNOSIS_SPECIFIC_DLBCL", "non-GCB")
stats_fl <- rows_where("DIAGNOSIS_CLASS_SUMMARIZE_HEALTHY+DIAGNOSIS_SPECIFIC_DLBCL", "FL")

t.test(x = stats_gcb$cpm_norm_genes_median, y = stats_non_gcb$cpm_norm_genes_median, var.equal = FALSE)
t.test(x = stats_fl$cpm_norm_genes_median, y = stats_non_gcb$cpm_norm_genes_median, var.equal = FALSE)



RNA5S genes: p = 0.2469
BCL6: p = 0.1236
BCL2: p = 0.0791

GCB VS NON-GCB:
NOTCH2: P = 0.07998
STAT6: P = 0.05235 ; consider combining a couple of these genes? Or all genes from the Schmitz NEJM paper.
EZH2: P = 0.3845; even though the effect size looks larger, maybe median isn't appropriate here?

## DO NOT RUN; code to generate 10kb reference with gene names

In [None]:
# libraries
library(annotatr)

# Read the raw bin reference (10kb by default, 100kb as the commented
# alternative) and turn its first three columns into a GRanges object.
setwd("~/OneDrive - University of Toronto/1 MBP/1 Bratman Lab/1 Experiments/11 Sequencing/Reference/other_reference")
ref_10kb <- read.delim("10kb_bin_genome_wide_genomic_ranges_reference_raw.txt")
#ref_100kb <- read.delim("100kb_bin_genome_wide_genomic_ranges_reference_raw.txt", header = TRUE, sep = "\t")

# regions to annotate with annotatR: the first three columns of the reference,
# renamed to the seqnames / start / end names used for the GRanges conversion
bin_annotations_regions <- ref_10kb[, 1:3] # change, depending on which reference you use
#bin_annotations_regions <- ref_100kb[,1:3]
names(bin_annotations_regions) <- c("seqnames", "start", "end")
bin_annotations_regions_gr <- GRanges(bin_annotations_regions)

# Define the annotatr annotation codes and build four hg38 annotation sets:
# CpG features, detailed genic features, basic genes, and FANTOM enhancers.
annotations_hg38_basicgenes <- "hg38_basicgenes"
annotations_hg38_genic <- paste0(
  "hg38_genes_",
  c("1to5kb",
    "promoters",
    "5UTRs",
    "exons",
    "firstexons",
    "introns",
    "intronexonboundaries",
    "exonintronboundaries",
    "3UTRs",
    "intergenic")
)
annotations_hg38_enhancers <- "hg38_enhancers_fantom"

# CpG features
annotation_build_hg38_cpg <- build_annotations(genome = "hg38", annotations = "hg38_cpgs")
# detailed genic features
annotation_build_hg38_genic <- build_annotations(genome = "hg38", annotations = annotations_hg38_genic)
# basic genes preset
annotation_build_hg38_basicgenes <- build_annotations(genome = "hg38", annotations = annotations_hg38_basicgenes)
# enhancers (FANTOM5 w liftover)
annotation_build_hg38_enhancers <- build_annotations(genome = "hg38", annotations = annotations_hg38_enhancers)


# Annotate the bins with each annotation set, then collapse the gene symbols
# found in each bin into one comma-separated string per bin.
# (Object names say "300bp", but the regions annotated here are the bins loaded above.)

# annotate the bin regions against one built annotation set
annotate_bins <- function(annotation_set) {
  annotate_regions(
    regions = bin_annotations_regions_gr,
    annotations = annotation_set,
    quiet = FALSE
  )
}

# CpG, genic and enhancer annotations; unique() removes duplicate annotations
annotate_300bp_bins_hg38_cpg <- annotate_bins(annotation_build_hg38_cpg)
unique_annotate_300bp_bins_hg38_cpg <- unique(annotate_300bp_bins_hg38_cpg)

annotate_300bp_bins_hg38_genic <- annotate_bins(annotation_build_hg38_genic)
unique_annotate_300bp_bins_hg38_genic <- unique(annotate_300bp_bins_hg38_genic)

annotate_300bp_bins_hg38_enhancers <- annotate_bins(annotation_build_hg38_enhancers)
unique_annotate_300bp_bins_hg38_enhancers <- unique(annotate_300bp_bins_hg38_enhancers)

# gene annotations need extra handling, because one bin can carry several genes
annotate_300bp_bins_hg38_basicgenes <- annotate_bins(annotation_build_hg38_basicgenes)

# convert to data frame
annotate_300bp_bins_hg38_basicgenes <- as.data.frame(annotate_300bp_bins_hg38_basicgenes) # the 10kb reference now has bin IDs from start

# drop rows whose annot.symbol is missing (NA)
has_symbol <- !is.na(annotate_300bp_bins_hg38_basicgenes$annot.symbol)
annotate_300bp_bins_hg38_basicgenes <- annotate_300bp_bins_hg38_basicgenes[has_symbol, ]

# one row per bin_id: the distinct gene symbols in that bin, joined with ", "
gene_names_per_bin <- annotate_300bp_bins_hg38_basicgenes %>%
  group_by(bin_id) %>%
  summarize(gene_names = paste(unique(annot.symbol), collapse = ", ")) %>%
  as.data.frame()


# Assemble the final annotated bin reference: the original reference columns
# plus CpG / genic / enhancer annotation types and the gene names per bin.

# Convert the de-duplicated annotation GRanges objects to data frames.
df_unique_annotate_300bp_bins_hg38_cpg <- as.data.frame(unique_annotate_300bp_bins_hg38_cpg)
df_unique_annotate_300bp_bins_hg38_genic <- as.data.frame(unique_annotate_300bp_bins_hg38_genic)
df_unique_annotate_300bp_bins_hg38_enhancers <- as.data.frame(unique_annotate_300bp_bins_hg38_enhancers)

# Full outer joins of the bin coordinates with each annotation table.
# merge() sorts its result on the key columns, so these tables are NOT
# guaranteed to be in the same row order as ref_10kb; they are kept for
# inspection but are not bound to the reference by position below.
df_merged_cpg <- merge(x = bin_annotations_regions, y = df_unique_annotate_300bp_bins_hg38_cpg, by = c("seqnames", "start", "end"), all = TRUE)
df_merged_genic <- merge(x = bin_annotations_regions, y = df_unique_annotate_300bp_bins_hg38_genic, by = c("seqnames", "start", "end"), all = TRUE)
df_merged_enhancers <- merge(x = bin_annotations_regions, y = df_unique_annotate_300bp_bins_hg38_enhancers,
                             by = c("seqnames", "start", "end"), all = TRUE)

# Look up each bin's annotation by coordinate key rather than by row position,
# so the result is correct whatever order the reference is in.
region_key <- function(df) {
  sprintf("%s:%d:%d", as.character(df$seqnames), as.integer(df$start), as.integer(df$end))
}
ref_bin_keys <- region_key(bin_annotations_regions) # same row order as ref_10kb

# annot.type for every reference bin (NA where the bin has no annotation)
lookup_annot_type <- function(annot_df) {
  as.character(annot_df$annot.type)[match(ref_bin_keys, region_key(annot_df))]
}

#Bind new annotated features to original data frame of annotated bins.
bin_annotations_final <- cbind(
  ref_10kb, # swap for ref_100kb when annotating the 100kb reference
  cpg_annotations = lookup_annot_type(df_unique_annotate_300bp_bins_hg38_cpg),
  genic_annotations = lookup_annot_type(df_unique_annotate_300bp_bins_hg38_genic),
  enhancer_annotations = lookup_annot_type(df_unique_annotate_300bp_bins_hg38_enhancers)
)

# merge bin_annotations_final and gene_names_per_bin, as there are many bins missing from gene_names_per_bin, since not all bins have genes.
# merge() returns the key column (bin_id) FIRST, followed by the remaining
# columns; remember the intended column order and restore it afterwards,
# otherwise the positional renaming below shifts every column name.
final_column_order <- c(colnames(bin_annotations_final), "gene_names")
bin_annotations_final <- merge(bin_annotations_final, gene_names_per_bin, by = "bin_id", all.x = TRUE)
bin_annotations_final <- bin_annotations_final[, final_column_order]

bin_annotations_final <- data.frame(lapply(bin_annotations_final, function(x) ifelse(is.na(x), "other", x))) #make all NAs in the data frame "other".

# Positional rename; assumes the reference has the six columns
# chrom / start / end / width / strand / bin_id in this order.
bin_annotations_final <- setNames(bin_annotations_final,
                                  c("chrom","start","end","width","strand",
                                    "bin_id","cpg_annotations","genic_annotations","enhancer_annotations", "gene_name"))

# Write the annotated bin reference (with gene names) and the vector of unique
# gene symbols to the reference directory.
setwd("~/Library/CloudStorage/OneDrive-UniversityofToronto/1 MBP/1 Bratman Lab/1 Experiments/11 Sequencing/Reference/other_reference")

# earlier export without gene names, kept for reference
#write.table(bin_annotations_final,
#            file = "10kb_bin_genome_wide_annotated_reference_with_regulatory_features.txt",
#            sep = "\t",
#            row.names = FALSE,
#            quote = FALSE)

# tab-separated, unquoted, no row names
write.table(
  x = bin_annotations_final,
  file = "10kb_bin_genome_wide_annotated_reference_with_regulatory_features_and_gene_names.txt",
  quote = FALSE,
  sep = "\t",
  row.names = FALSE
)


## ADDITIONAL CODE; gene reference vector in which every gene symbol appears once
unique_genes <- unique(annotate_300bp_bins_hg38_basicgenes$annot.symbol)

# save unique_genes as a reference set, in the same directory as above
setwd("~/Library/CloudStorage/OneDrive-UniversityofToronto/1 MBP/1 Bratman Lab/1 Experiments/11 Sequencing/Reference/other_reference")
write(unique_genes, file = "unique_gene_names_annotatR.txt")

- Accidentally shifted columns of the reference to the right; fix before proceeding.

## Check expression of plasma K27me3 over all genes, sort by strongest correlation with mean VAF

In [None]:
# Goal: identify a minimal set of genes whose plasma H3K27me3 expression is
# significantly correlated with tumor burden. These targets may be prognostic.

# 10kb reference with gene names per bin. The reference was created with
# annotatR (see the chunk that builds the annotation reference); building it
# needs internet access, so that code is not repeated here.
# Note; ENCODE Blacklist regions were not removed from the bins, since they will be matched to genes for the analysis.
setwd("/cluster/projects/scottgroup/people/steven/reference_sets/other_references")
annotated_ref_w_gene_names <- read.delim(
  "10kb_bin_genome_wide_annotated_reference_with_regulatory_features_and_gene_names.txt"
)

# unique gene list: each gene appears once. The file is read without a header
# row (from the same working directory) and its column is named gene_name.
unique_genes <- read.delim("unique_gene_names_annotatR.txt", header = FALSE)
names(unique_genes) <- "gene_name"

# For each gene in unique_genes, take the 10kb bins annotated with that gene and
# compute the median normalized plasma K27me3 expression over those bins for
# every sample. Result: median_gene_k27_expression, one row per gene (row name =
# gene symbol) and one column per sample.
#
# Changes from the earlier loop, which errored part-way through:
#   * genes are matched as exact entries of the comma-separated gene_name field,
#     not as regex substrings (a pattern such as "BCL2" also matched "BCL2L1");
#   * drop = FALSE keeps a matrix when a gene has a single bin, so apply() works;
#   * genes with no bin in cpm_norm_log2 are skipped and counted, not errors;
#   * results are collected in a list and bound once instead of rbind() per gene.

# split every bin's gene_name field into its individual gene symbols (done once)
gene_tokens_per_bin <- strsplit(as.character(annotated_ref_w_gene_names$gene_name), ", ", fixed = TRUE)
bin_gene_pairs <- data.frame(
  bin_id = rep(as.character(annotated_ref_w_gene_names$bin_id), lengths(gene_tokens_per_bin)),
  gene_name = unlist(gene_tokens_per_bin, use.names = FALSE),
  stringsAsFactors = FALSE
)

# genes to summarise (entries are split on ", " in case a line lists several genes)
genes_of_interest <- unique(unlist(
  strsplit(as.character(unique_genes$gene_name), ", ", fixed = TRUE),
  use.names = FALSE
))
genes_of_interest <- genes_of_interest[!is.na(genes_of_interest)]

# keep only (bin, gene) pairs for requested genes whose bin is a row of the matrix
keep_pair <- bin_gene_pairs$gene_name %in% genes_of_interest &
  bin_gene_pairs$bin_id %in% rownames(cpm_norm_log2)
bins_by_gene <- split(bin_gene_pairs$bin_id[keep_pair], bin_gene_pairs$gene_name[keep_pair])

message(
  length(genes_of_interest) - length(bins_by_gene),
  " gene(s) have no bin in cpm_norm_log2 and were skipped"
)

# per-sample median over each gene's bins
median_per_gene <- lapply(bins_by_gene, function(bins) {
  apply(cpm_norm_log2[unique(bins), , drop = FALSE], MARGIN = 2, FUN = median)
})

# genes x samples; an empty data frame if no gene could be matched
median_gene_k27_expression <- as.data.frame(do.call(rbind, median_per_gene))

# for each row of the new data frame, find the correlation with mean VAF per patient, and append the R2 and p-val to the data frame

# sort data frame by desc R2 (also, try descending p-val)

# select top 5% of correlations and explore further







# CHATGPT SOLUTION (template -- not runnable as it stands)
# NOTE(review): `normalized_matrix` and `your_mean_VAF_vector` are placeholders
# that are not defined in the visible part of this notebook. Replace them with
# the normalized bin x sample matrix and a mean-VAF vector in the same sample
# order as the matrix columns before running.
#
# For every gene, correlate each of its bins with mean VAF and collect R2 and
# the p-value, then rank by R2 and keep the top 5%.

# Initialize an empty data frame to store the results
correlation_results <- data.frame(gene_name = character(0), R2 = numeric(0), p_value = numeric(0))

# Index the reference once: gene symbol -> reference rows annotated with that
# exact symbol, restricted to bins that are rows of the matrix. Matching exact
# comma-separated entries avoids regex substring hits ("BCL2" vs "BCL2L1").
ref_gene_tokens <- strsplit(as.character(annotated_ref_w_gene_names$gene_name), ", ", fixed = TRUE)
ref_row_of_token <- rep(seq_along(ref_gene_tokens), lengths(ref_gene_tokens))
ref_token <- unlist(ref_gene_tokens, use.names = FALSE)
ref_bin_in_matrix <- as.character(annotated_ref_w_gene_names$bin_id) %in% rownames(normalized_matrix)
keep_token <- ref_bin_in_matrix[ref_row_of_token]
ref_rows_by_gene <- split(ref_row_of_token[keep_token], ref_token[keep_token])

# one result table per gene, bound together once after the loop
per_gene_results <- vector("list", nrow(unique_genes))

# Iterate over unique genes (seq_len() is safe when unique_genes has no rows)
for (i in seq_len(nrow(unique_genes))) {
    # Extract gene names for the current row
    current_genes <- strsplit(as.character(unique_genes$gene_name[i]), ", ", fixed = TRUE)[[1]]

    # Subset annotated reference to bins with gene
    ref_rows <- unique(unlist(ref_rows_by_gene[current_genes], use.names = FALSE))
    temp <- annotated_ref_w_gene_names[ref_rows, ]

    if (nrow(temp) == 0) {
        next
    }

    # For each bin of 'temp', correlate with mean VAF per patient.
    # Rows are looked up by bin ID as a row NAME (as.character), never by position.
    bin_stats <- vapply(as.character(temp$bin_id), function(bin) {
        cor_result <- cor.test(normalized_matrix[bin, ], your_mean_VAF_vector)
        c(unname(cor_result$estimate)^2, cor_result$p.value)
    }, numeric(2))

    per_gene_results[[i]] <- data.frame(
        gene_name = temp$gene_name,
        R2 = bin_stats[1, ],
        p_value = bin_stats[2, ],
        row.names = NULL
    )
}

per_gene_results <- Filter(Negate(is.null), per_gene_results)
if (length(per_gene_results) > 0) {
    correlation_results <- do.call(rbind, per_gene_results)
}

# Sort data frame by descending R2
correlation_results <- correlation_results[order(-correlation_results$R2),]

# Select top 5% of correlations (row count rounded down)
top_5_percent <- head(correlation_results, floor(nrow(correlation_results) * 0.05))

# Explore further with 'top_5_percent'

- Continue this; use comments for guidance.
- Biases may be introduced by this approach, for instance, in gene clusters, there will be a lot of overlapping genes in bins. Measurements of gene K27 expression may be homogeneous over these clusters, so that genes in close proximity may receive the same measurement, therefore same correlation with mean VAF.
- An additional filter may be required; 1) for genes in close proximity, figure out which is most representative of the expression? Or 2) select the larger gene in the bin, as this would be more likely represented by K27 expression, which is broadly distributed?

- 03-01-2024; Side note; blacklist regions were included in the gene names reference. And apparently, some genes are in blacklist regions; consider filtering out blacklist regions from the reference, or at the beginning of this analysis (once the code is sorted).
- Also, need to remove healthy samples from the normalized matrix, cpm_norm_log2.

# 07-12-2023 Incorporation of test data for validation

## Bar plots over stage and IPI, with paired random feature selection

In [None]:
# Prepare the held-out test set: log2-CPM values restricted to the Random Forest
# hypermethylated predictors, plus clinical metadata aligned to the same samples.

# start by subsetting the test matrix to the hypermethylated features of interest from Random Forest
cpm_X_test <- cpm(X_test, normalized.lib.sizes = TRUE, log = FALSE)

# log normalize and subset features
cpm_X_test_log2 <- log2(cpm_X_test + 1)
X_test_filtered <- cpm_X_test_log2[(rownames(cpm_X_test_log2)) %in% predictors_rf_hyper,]

# subset sample metadata to the test samples.
test_sample_metadata <- sample_list[sample_list$sample_name %in% colnames(X_test_filtered),]

# re-import clinical metadata for test set
setwd("/cluster/home/sdemichi/steven_scottgroup.txt/human_plasma_cfChIP-Seq/liberate_plasma_cfchip/temp/")
clinical_metadata <- as.data.frame(readxl::read_xlsx("25_07_2023_cohort_metadata_labeling_UPDATED_WITH_CLINICAL_ANNOTATIONS_plus_library_size_cfdna_concn.xlsx"))

# replace spaces in "Donor ID (rep)" so that you can match directly to mean VAF burden$Sample_id
clinical_metadata$`Donor ID (rep)` <- gsub(" ", "_", clinical_metadata$`Donor ID (rep)`)

# match mean VAF information to clinical metadata
clinical_metadata$mean_VAF <- mean_VAF_burden$mean_VAF[match(clinical_metadata$`Donor ID (rep)`, mean_VAF_burden$Sample_id)]

# keep only clinical metadata rows for patients in the test set
clinical_metadata_test <- clinical_metadata[clinical_metadata$PATIENT_ID %in% test_sample_metadata$PATIENT_ID,]

# remove all T3 and T5 samples from the metadata
clinical_metadata_test <- subset(clinical_metadata_test, !grepl("T3|T5", clinical_metadata_test$`Donor ID (rep)`))

# add new column to clinical_metadata_test which matches the colnames of X_test_filtered
clinical_metadata_test <- clinical_metadata_test %>%
  mutate(Bioinformatic_name_update = paste0("liberate_",`Bioinformatic Name`,"_K27me3"))

# sort clinical_metadata_test into the column order of X_test_filtered.
# match() returns NA for a sample with no metadata row, which would silently
# become an all-NA metadata row; report those samples instead of hiding them.
metadata_row_for_sample <- match(colnames(X_test_filtered), clinical_metadata_test$Bioinformatic_name_update)
if (anyNA(metadata_row_for_sample)) {
  warning(
    "No clinical metadata row for test sample(s): ",
    paste(colnames(X_test_filtered)[is.na(metadata_row_for_sample)], collapse = ", "),
    call. = FALSE
  )
}
clinical_metadata_test <- clinical_metadata_test[metadata_row_for_sample, ]

# test-set sample attributes, one row per column of X_test_filtered
X_test_attributes <- clinical_metadata_test

# save new matrix as a DGEobject, since this links the sample attributes and the counts.
# Note: the values passed as `counts` are the log2(CPM + 1) values computed above.
X_test_filtered_features_plus_attributes <- DGEList(counts = X_test_filtered, samples = X_test_attributes, remove.zeros = TRUE)






# Add the per-sample median over the selected features to the test metadata and
# derive the grouping variables used for plotting (stage group, grouped IPI,
# binned tumor diameter, FL subset).

# calculate the median of each column (i.e. over the selected differential features, for each sample)
median_X_test_filtered <- apply(X_test_filtered_features_plus_attributes$counts,
                   MARGIN = 2,
                   FUN = median)

# bind the newly calculated medians to the pre-existing metadata
X_test_attributes <- as.data.frame(cbind(X_test_attributes, median_X_test_filtered)) #Subset this data frame for plotting.

##### METADATA-SPECIFIC CHANGES BEFORE PLOTTING #####
X_test_attributes$ANN_ARBOR_STAGE <- ifelse(X_test_attributes$ANN_ARBOR_STAGE == "NA", "non-lymphoma", X_test_attributes$ANN_ARBOR_STAGE) # convert "NA" strings to "non-lymphoma"
X_test_attributes$IPI <- as.character(X_test_attributes$IPI) #convert IPI from int to char

X_test_attributes$mean_VAF <- as.numeric(X_test_attributes$mean_VAF) # make sure mean VAF is numeric


# For tumor diameter, create bins (<5cm, 5-10cm, 10-15cm, >15cm), then assign samples to those bins based off of tumor diameter; separate column.
# The top bin is open-ended (Inf): with a fixed upper break of 30, any tumor
# larger than 30 cm was set to NA instead of falling in ">15".
X_test_attributes$TUMOR_MASS_MAX_CM <- as.numeric(X_test_attributes$TUMOR_MASS_MAX_CM)
X_test_attributes <- X_test_attributes %>%
  mutate(tumor_diameter_binned = cut(TUMOR_MASS_MAX_CM, breaks = c(0, 5, 10, 15, Inf), labels = c("<5", "5-10", "10-15", ">15")))

# combine Stage I/II and III/IV samples into early and late stage, respectively, for visualization
X_test_attributes <- X_test_attributes %>%
  mutate(Stage_Group = case_when(
    ANN_ARBOR_STAGE %in% c(1, 2) ~ "Early Stage",
    ANN_ARBOR_STAGE %in% c(3, 4) ~ "Late Stage",
    TRUE ~ as.character(ANN_ARBOR_STAGE)  # Keep other values as is
  ))

# set factor level for Stage_Group
X_test_attributes$Stage_Group <- factor(X_test_attributes$Stage_Group, levels = c("non-lymphoma", "Early Stage", "Late Stage"))

# 24-10-2023; create new variable where IPI is grouped 0+1 and 4+5, as this is clinically relevant
X_test_attributes <- X_test_attributes %>%
  mutate(IPI_grouped = case_when(
    IPI %in% c(0, 1) ~ "0+1",
    IPI %in% c(4, 5) ~ "4+5",
    TRUE ~ as.character(IPI)  # Keep other values as is
  ))

# 24-10-2023; subset X_test_attributes to FL cases, and visualize FLIPI.
# %in% (rather than ==) so rows with a missing diagnosis are left out instead of
# coming back as all-NA rows.
X_test_attributes_FL <- X_test_attributes[X_test_attributes$DIAGNOSIS_CLASS_SUMMARIZE_HEALTHY %in% "FL", ]




##### PLOT #####

# Box plot of the per-sample median counts grouped by IPI, with every sample
# overlaid as a jittered point coloured by mean VAF.
# TODO (from original notes): the points are not striking here; consider the
# heat map for this visualization instead.
# To plot by stage instead, replace IPI_grouped with Stage_Group in both aes()
# calls and update the x / fill labels below.
p <- X_test_attributes %>%
  ggplot(aes(x = IPI_grouped, y = median_X_test_filtered)) +
  geom_boxplot(aes(fill = IPI_grouped), alpha = 0.3) +
  geom_point(aes(colour = mean_VAF), position = position_jitterdodge(0.2), size = 7, alpha = 0.6) +
  theme_minimal(base_size = 18) +
  scale_color_gradient(low = "blue", high = "red") +
  # labels name the variables that are actually mapped (IPI on x/fill, mean VAF on colour)
  labs(x = "IPI (grouped)",
       y = "Median of MODELED Hypermethylated Feature Expression",
       fill = "IPI (grouped)",
       colour = "Mean VAF") +
  # the `+` before ggtitle() is required; without it the title is never added to p
  ggtitle("Median of Counts over Modeled Features")
print(p)

- Trends in test set are generally holding up. Less of a difference between early stage and healthy, but that's okay, it's a very small test set.

## 13-12-2023 Survival incorporating test set

# 08-12-2023 Combination of variables into a single analysis (K27me3, TMTV, mean VAF, and ichorCNA tumor burden)

In [None]:
# subset matrix to only lymphoma cases
compared_burden_variables <- X_train_attributes_all_data_types[!(X_train_attributes_all_data_types$DIAGNOSIS_CLASS_SUMMARIZE_HEALTHY == "HEALTHY"),]

compared_burden_variables <- compared_burden_variables[,c("PATIENT_ID",
                                                          "median_X_train_filtered",# K27me3 modeled feature expression
                                                          "mean_VAF",# capp-seq
                                                          "STLG (SUV*mL/Kg)",# MTV
                                                          "Tumor.Fraction")]# ichorCNA

# for this analysis, make NA values 0 for comparisons
compared_burden_variables[is.na(compared_burden_variables)] <- 0

# centre and scale each of the four measurement columns so they share one colour scale
scaled_compared_burden_variables <- as.data.frame(scale(compared_burden_variables[,2:5]))

# re-append the patient IDs from the original matrix
scaled_compared_burden_variables <- cbind(scaled_compared_burden_variables, PATIENT_ID = compared_burden_variables$PATIENT_ID)

# order the data frame by decreasing median_X_train_filtered
# test <- order(scaled_compared_burden_variables$median_X_train_filtered, decreasing = TRUE)

# convert from wide to long format for visualization (one row per patient x measurement)
scaled_compared_burden_variables <- pivot_longer(scaled_compared_burden_variables,
                                                 cols = -PATIENT_ID,
                                                 names_to = "Measurement",
                                                 values_to = "Value")

# visualize scaled data as a tile heatmap (patients on x, measurements on y)
scaled_compared_burden_variables %>%
  ggplot(aes(x = PATIENT_ID, y = Measurement, fill = Value)) +
  geom_tile() +
  theme_minimal(base_size = 18) +
  # the tiles are mapped to `fill`, so the gradient must be a fill scale;
  # a colour scale would leave the default fill palette in place
  scale_fill_gradient(low = "blue", high = "red") #+
  # labs(x = "Stage_Group",
  #      #x = "IPI",
  #      # y = "Median of Hypermethylated Feature Expression",
  #      # y = "Median of ALL Hypermethylated Feature Expression",
  #       y = "Median of MODELED Hypermethylated Feature Expression",
  #       colour = "Stage_Group", 
  #      #colour = "IPI"
  #      )
  # #ggtitle("Median Hypermethylated Counts of Differential Features")
  # #ggtitle("Median of Counts over Features")
  # #ggtitle("Sum of Counts over Modeled Features")
  # ggtitle("Median of Counts over Modeled Features")

- Figure out what's going on here: why are there cases with high tumor burden by ichorCNA or TMTV that do not have detectable mutations by CAPP-Seq?
- Continue working on this grouped comparison. Scaling was performed, need to re-name the variable with patient IDs, convert to long format, and visualize.
- issue; _C_ cases are present; need to remove HUCON and _C_ from patient ID column.
- I just realized, there are duplicate rows in this data frame with different measurements for ichorCNA; follow up. Probably the reason for more rows present.
- Duplicate rows removed; keep working on this. Maybe sort the heatmap or go for some other visualization? Also, some sort of concordance index between measurements? Or statistical test?

## Grid of linear correlations across data types; MTV, K27me3, CAPP-Seq, and ichorCNA

In [None]:
# Rebuild the burden table exactly as in the previous chunk: drop the HEALTHY
# rows and keep the patient ID plus the four burden measurements
# (K27me3 modeled feature expression, CAPP-Seq mean VAF, MTV as STLG, ichorCNA tumor fraction).
# "MTV (mL)" was considered instead of STLG but was less correlated with the
# other burden measurements.
compared_burden_variables <- X_train_attributes_all_data_types[
  X_train_attributes_all_data_types$DIAGNOSIS_CLASS_SUMMARIZE_HEALTHY != "HEALTHY",
  c("PATIENT_ID", "median_X_train_filtered", "mean_VAF", "STLG (SUV*mL/Kg)", "Tumor.Fraction")
]

# missing measurements are treated as 0 for these comparisons
compared_burden_variables <- replace(compared_burden_variables, is.na(compared_burden_variables), 0)

# OPTIONAL FILTERS (uncomment one if low tumor burden samples should be removed)

# ichorCNA tumor fraction below 3%
# compared_burden_variables <- compared_burden_variables[!(compared_burden_variables$Tumor.Fraction < 0.03),]

# mean VAF below 3%
# compared_burden_variables <- compared_burden_variables[!(compared_burden_variables$mean_VAF < 0.03),]

# more stringent: mean VAF below 3% AND ichorCNA tumor fraction below 5%
# compared_burden_variables <- compared_burden_variables[!(compared_burden_variables$mean_VAF < 0.03 & compared_burden_variables$Tumor.Fraction < 0.05),]

# combine the 4 measurements into a single grid of correlations.
# Custom upper-panel function for pairs(): draws the scatter of x against y and
# prints the Pearson correlation and its cor.test() p-value near the top of the panel.
#   x, y: numeric vectors supplied by pairs() for one panel.
# Called for its side effect of drawing on the current graphics device.
upper.panel <- function(x, y) {
  points(x, y, pch = 10)
  r <- round(cor(x, y), digits = 2)
  p <- round(cor.test(x, y)$p.value, digits = 6)
  txt <- paste0("R = ", r, "\n", "p = ", p)
  # Switch to a 0-1 coordinate system so the text position is independent of
  # the data range. par() returns the previous setting as a named list, which
  # is what on.exit() needs to restore it; passing the bare numeric vector to
  # par() does not restore anything and raises a warning.
  old_par <- par(usr = c(0, 1, 0, 1))
  on.exit(par(old_par), add = TRUE)
  text(0.5, 0.9, txt, cex = 1.5)
}
# scatter-plot matrix of the four burden measurements (columns 2 to 5, i.e.
# everything except PATIENT_ID); only the upper triangle is drawn
pairs(
  compared_burden_variables[, c("median_X_train_filtered", "mean_VAF", "STLG (SUV*mL/Kg)", "Tumor.Fraction")],
  upper.panel = upper.panel,
  lower.panel = NULL
)

In [None]:
- Code from http://www.sthda.com/english/wiki/scatter-plot-matrices-r-base-graphs 

# 15-01-2024 heatmap of clinical metadata, entire cohort (Figure 1)

In [None]:
# heatmap plotting library
library(pheatmap)

# read the cohort's clinical metadata (Excel sheet) as a plain data frame
setwd("/cluster/home/sdemichi/steven_scottgroup.txt/human_plasma_cfChIP-Seq/liberate_plasma_cfchip/temp/")
clinical_metadata <- readxl::read_xlsx("25_07_2023_cohort_metadata_labeling_UPDATED_WITH_CLINICAL_ANNOTATIONS_plus_library_size_cfdna_concn.xlsx") %>%
  as.data.frame()

# read the CAPP-Seq mean VAF table (dcs.unique)
setwd("/cluster/projects/scottgroup/people/steven/human_plasma_cfChIP-Seq/liberate_plasma_cfchip/other_profiles/cappseq_data_lymphoma/updated_batch_2_all_cases/")
mean_VAF_burden <- read.delim("Bratman.dcs.uniq.burden.txt", sep = "\t", header = TRUE)

# swap spaces for underscores in "Donor ID (rep)" so it can be matched against mean_VAF_burden$Sample_id
clinical_metadata$`Donor ID (rep)` <- gsub(" ", "_", clinical_metadata$`Donor ID (rep)`)

# look up each donor's mean VAF (NA where the donor has no entry in the VAF table)
vaf_row <- match(clinical_metadata$`Donor ID (rep)`, mean_VAF_burden$Sample_id)
clinical_metadata$mean_VAF <- mean_VAF_burden$mean_VAF[vaf_row]

# this visualization is lymphoma-only: drop healthy cases ...
clinical_metadata_lymphoma <- clinical_metadata[clinical_metadata$DIAGNOSIS_CLASS_SUMMARIZE_HEALTHY != "HEALTHY", ]

# ... and drop PBMC data
clinical_metadata_lymphoma <- clinical_metadata_lymphoma[clinical_metadata_lymphoma$DIAGNOSIS_CLASS_SUMMARIZE_HEALTHY != "PBMC", ]

# flag rows whose first four characters of `Bioinformatic Name` occur more than
# once, i.e. baseline samples that also have follow-up plasma
clinical_metadata_lymphoma <- clinical_metadata_lymphoma %>%
  mutate(follow_up_plasma = duplicated(substr(`Bioinformatic Name`, 1, 4)) |
           duplicated(substr(`Bioinformatic Name`, 1, 4), fromLast = TRUE))

# drop the T3 and T5 samples themselves (done after flagging, so the flag above is kept)
is_t3_or_t5 <- str_detect(clinical_metadata_lymphoma$`Bioinformatic Name`, "T3|T5")
clinical_metadata_lymphoma <- clinical_metadata_lymphoma[!is_t3_or_t5, ]

# incorporate ichorCNA and MTV
# MTV data; generated by the nuclear medicine team at UHN
setwd("/cluster/projects/scottgroup/people/steven/human_plasma_cfChIP-Seq/liberate_plasma_cfchip/other_profiles/tmtv_data_lymphoma_cases")
tmtv_lymphoma <- readxl::read_xlsx("17-11-2023_PET_scan_data_lymphoma_cases.xlsx")

# ichorCNA output from shallow WGS
setwd("/cluster/projects/scottgroup/people/steven/human_plasma_cfChIP-Seq/liberate_plasma_cfchip/other_profiles/ichorCNA_data_shallow_WGS_lymphoma")
ichorCNA_lymphoma <- read.delim("Bratman.ichorCNA.txt", sep = "\t", header = TRUE)

# left-join the MTV table onto the lymphoma metadata by patient ID
clinical_metadata_lymphoma <- merge(
  x = clinical_metadata_lymphoma,
  y = tmtv_lymphoma,
  by.x = "PATIENT_ID",
  by.y = "LIBERATE ID",
  all.x = TRUE
)

# left-join the ichorCNA table by patient ID; this merge is where duplicate rows appear
clinical_metadata_lymphoma <- merge(
  x = clinical_metadata_lymphoma,
  y = ichorCNA_lymphoma,
  by = "PATIENT_ID",
  all.x = TRUE
)

# keep one row per patient: some shallow WGS profiles were accidentally sequenced
# twice, so the first entry is kept as the representative
clinical_metadata_lymphoma <- distinct(clinical_metadata_lymphoma, PATIENT_ID, .keep_all = TRUE)

# add one logical column per data type indicating whether that data is present
clinical_metadata_lymphoma <- clinical_metadata_lymphoma %>%
  mutate(
    CAPP_Seq = !is.na(mean_VAF),                            # CAPP-Seq
    shallow_WGS = !is.na(`Tumor.Fraction`),                 # shallow WGS
    metabolic_tumor_volume = !is.na(`STLG (SUV*mL/Kg)`)     # MTV
  )

# keep only the columns needed for the heatmaps; columns are picked by position,
# so these indices depend on the exact column order of clinical_metadata_lymphoma
heatmap_metadata <- clinical_metadata_lymphoma[, c(9, 17, 23:25, 27:31, 34:35, 47, 74:76)]

# coerce the columns that hold numeric values to numeric
heatmap_metadata[c("AGE_CONSENT", "TUMOR_MASS_MAX_CM", "LDH", "NB_EN_SITES")] <-
  lapply(
    heatmap_metadata[c("AGE_CONSENT", "TUMOR_MASS_MAX_CM", "LDH", "NB_EN_SITES")],
    as.numeric
  )

# to see indexed column names
# print(paste0("Column names and indices:\n", paste0(seq_along(colnames(clinical_metadata_lymphoma)), ". ", colnames(clinical_metadata_lymphoma), collapse = "|")))

# generate heatmap, once data is compiled
# Set the row names to donor IDs
# rownames(heatmap_metadata) <- heatmap_metadata$`Donor ID (rep)`



# CONTINUE FROM HERE

# # Generate the heatmap
# heatmap(heatmap_metadata#, 
#          # cluster_rows=FALSE, 
#          # cluster_cols=FALSE#, 
#          # color=colorRampPalette(c("white", "blue"))(100), 
#          # fontsize_row=5, fontsize_col=8, 
#          # labels_col=colnames(clinical_metadata), 
#          # labels_row=rownames(clinical_metadata), 
#          # main="Clinical Metadata Heatmap"
#         )

# Tile plot of all categorical metadata plus data-availability flags: one
# column per donor, one row per variable, legends hidden.
# AGE_CONSENT, TUMOR_MASS_MAX_CM, ECOG, LDH and NB_EN_SITES are intentionally
# not drawn in this plot.
ggplot(heatmap_metadata, aes(x = `Donor ID (rep)`)) +
  geom_tile(aes(y = "DIAGNOSIS_CLASS_SUMMARIZE_HEALTHY", fill = DIAGNOSIS_CLASS_SUMMARIZE_HEALTHY), width = 0.9, height = 0.9, show.legend = FALSE) +
  geom_tile(aes(y = "TREATMENT", fill = TREATMENT), width = 0.9, height = 0.9, show.legend = FALSE) +
  geom_tile(aes(y = "SEX", fill = SEX), width = 0.9, height = 0.9, show.legend = FALSE) +
  geom_tile(aes(y = "ANN_ARBOR_STAGE", fill = ANN_ARBOR_STAGE), width = 0.9, height = 0.9, show.legend = FALSE) +
  geom_tile(aes(y = "IPI", fill = IPI), width = 0.9, height = 0.9, show.legend = FALSE) +
  geom_tile(aes(y = "FLIPI", fill = FLIPI), width = 0.9, height = 0.9, show.legend = FALSE) +
  geom_tile(aes(y = "follow_up_plasma", fill = follow_up_plasma), width = 0.9, height = 0.9, show.legend = FALSE) +
  geom_tile(aes(y = "CAPP_Seq", fill = CAPP_Seq), width = 0.9, height = 0.9, show.legend = FALSE) +
  geom_tile(aes(y = "shallow_WGS", fill = shallow_WGS), width = 0.9, height = 0.9, show.legend = FALSE) +
  geom_tile(aes(y = "metabolic_tumor_volume", fill = metabolic_tumor_volume), width = 0.9, height = 0.9, show.legend = FALSE) +
  theme_minimal(base_size = 16) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  coord_equal()





# Tile plot of the clinical metadata only: one column per donor, one row per
# variable, legends hidden.
# AGE_CONSENT, TUMOR_MASS_MAX_CM, ECOG, LDH and NB_EN_SITES are intentionally
# not drawn in this plot.
ggplot(heatmap_metadata, aes(x = `Donor ID (rep)`)) +
  geom_tile(aes(y = "DIAGNOSIS_CLASS_SUMMARIZE_HEALTHY", fill = DIAGNOSIS_CLASS_SUMMARIZE_HEALTHY), width = 0.9, height = 0.9, show.legend = FALSE) +
  geom_tile(aes(y = "TREATMENT", fill = TREATMENT), width = 0.9, height = 0.9, show.legend = FALSE) +
  geom_tile(aes(y = "SEX", fill = SEX), width = 0.9, height = 0.9, show.legend = FALSE) +
  geom_tile(aes(y = "ANN_ARBOR_STAGE", fill = ANN_ARBOR_STAGE), width = 0.9, height = 0.9, show.legend = FALSE) +
  geom_tile(aes(y = "IPI", fill = IPI), width = 0.9, height = 0.9, show.legend = FALSE) +
  geom_tile(aes(y = "FLIPI", fill = FLIPI), width = 0.9, height = 0.9, show.legend = FALSE) +
  geom_tile(aes(y = "follow_up_plasma", fill = follow_up_plasma), width = 0.9, height = 0.9, show.legend = FALSE) +
  theme_minimal(base_size = 16) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  coord_equal()

# Tile plot of data availability per donor (CAPP-Seq, shallow WGS, metabolic
# tumor volume); the fill legend is shown for this plot.
ggplot(heatmap_metadata, aes(x = `Donor ID (rep)`)) +
  geom_tile(aes(y = "CAPP_Seq", fill = CAPP_Seq), height = 0.9, width = 0.9) +
  geom_tile(aes(y = "shallow_WGS", fill = shallow_WGS), height = 0.9, width = 0.9) +
  geom_tile(aes(y = "metabolic_tumor_volume", fill = metabolic_tumor_volume), height = 0.9, width = 0.9) +
  theme_minimal(base_size = 16) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  coord_equal()

# Tile plot of the numeric metadata (age, max tumor mass, LDH, number of
# extranodal sites).
# A ggplot has a single fill scale, so separate gradients per variable cannot
# be stacked, and sharing one gradient across raw values on very different
# ranges makes the colours uninformative. Each variable is therefore min-max
# rescaled to 0-1 on its own before plotting, so a single gradient shows the
# relative level within every row.
# (ECOG is left out, as in the earlier version of this plot.)
heatmap_metadata[, c("Donor ID (rep)", "AGE_CONSENT", "TUMOR_MASS_MAX_CM", "LDH", "NB_EN_SITES")] %>%
  pivot_longer(cols = -`Donor ID (rep)`, names_to = "variable", values_to = "value") %>%
  group_by(variable) %>%
  mutate(scaled_value = (value - min(value, na.rm = TRUE)) /
           (max(value, na.rm = TRUE) - min(value, na.rm = TRUE))) %>%
  ungroup() %>%
  ggplot(aes(x = `Donor ID (rep)`, y = variable, fill = scaled_value)) +
  geom_tile(width = 0.9, height = 0.9) +
  theme_minimal(base_size = 16) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  scale_fill_gradient() +
  labs(fill = "Scaled value\n(0-1 within variable)") +
  coord_equal()







# Convert the data frame from wide to long format (THIS IS TECHNICALLY THE WAY IT SHOULD BE DONE)
# - `Donor ID (rep)` is kept out of the pivot because the plot needs it as the x axis.
# - The remaining columns have mixed types (character, numeric, logical), which
#   cannot share one value column, so every value is converted to character.
heatmap_metadata_long <- pivot_longer(heatmap_metadata,
                                      cols = -`Donor ID (rep)`,
                                      names_to = "variable",
                                      values_to = "value",
                                      values_transform = list(value = as.character))

# Create a simplified heatmap: one tile per donor x variable, filled by value.
# The values are not colour names, so the default discrete fill scale is used
# rather than scale_fill_identity(); the legend is hidden because it would
# list every distinct value of every variable.
ggplot(heatmap_metadata_long, aes(x = `Donor ID (rep)`, y = variable, fill = value)) +
  geom_tile(show.legend = FALSE) +
  theme_minimal(base_size = 18)