## Multi-omics Clustering

In [34]:
suppressPackageStartupMessages(library(tidyr))
suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(NMF))

### Data Preparation

In [35]:
# Directory prefix that the table-loading cells prepend to each input file name.
PATH_DATA <- "../data_linkedomics/"

# The sample list has no header row and a single column, named `ID` here.
# `ID2` is the same ID with every "-" replaced by ".", which is the form used
# to select sample columns from the tables loaded in the following cells.
valid_samples <- read.csv("sample105.list", header = FALSE, col.names = "ID")
valid_samples$ID2 <- chartr("-", ".", valid_samples$ID)
head(valid_samples, 3)

Unnamed: 0_level_0,ID,ID2
Unnamed: 0_level_1,<chr>,<chr>
1,C3L-02613,C3L.02613
2,C3L-04072,C3L.04072
3,C3N-04282,C3N.04282


In [103]:
# Copy-number table: first column becomes the row names, remaining columns are
# samples.
cnv_file <- paste0(PATH_DATA, "SCNA_log2_gene_level.cct")
cnv_raw <- read.table(cnv_file, header = TRUE, row.names = 1, sep = "\t")

# Restrict to the valid samples (in list order) and keep only the rows that
# have no missing value in any of them.
cnv_complete <- cnv_raw %>%
    select(all_of(valid_samples$ID2)) %>%
    filter(complete.cases(.))

# Keep rows where at least one sample is above 0.4 or below -0.4.
has_event <- rowSums(abs(cnv_complete) > 0.4) > 0
cnv_altered <- cnv_complete[has_event, , drop = FALSE]

# Centre every row on its own median across samples.
gene_median <- apply(cnv_altered, 1, median, na.rm = TRUE)
cnv <- cnv_altered - gene_median
tail(cnv, 3)
dim(cnv)

Unnamed: 0_level_0,C3L.02613,C3L.04072,C3N.04282,C3L.03639,C3L.01328,C3L.00277,C3L.02604,C3L.01971,C3N.01167,C3L.01031,⋯,C3N.00436,C3N.02944,C3L.03356,C3N.02589,C3L.03388,C3N.01388,C3N.00198,C3N.01380,C3N.01715,C3N.03426
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
DAZ4,-0.12735,-0.59551,-0.92239,-0.72322,-0.02908,0.07958,0.82266,0.23247,-0.45531,0.28581,⋯,-0.36377,0.30116,-0.06191,-0.05711,-0.38335,-0.12518,-0.10954,0.11612,0.40891,-0.47479
BPY2C,-0.12735,-0.59551,-0.92239,-0.72322,-0.02908,0.07958,0.82266,0.23247,-0.45531,0.28581,⋯,-0.36377,0.30116,-0.06191,-0.05711,-0.38335,-0.12518,-0.10954,0.11612,0.40891,-0.47479
CDY1,-0.12735,-0.59551,-0.92239,-0.72322,-0.02908,0.07958,0.82266,0.23247,-0.45531,0.28581,⋯,-0.36377,0.30116,-0.06191,-0.05711,-0.38335,-0.12518,-0.10954,0.11612,0.40891,-0.47479


In [104]:
# Proteomics abundance table: first column becomes the row names, remaining
# columns are samples.
prot <- read.table(
    paste0(PATH_DATA, "proteomics_gene_level_MD_abundance_tumor.cct"),
    header = TRUE, row.names = 1, sep = "\t", na.strings = "NA"
)
prot <- prot %>%
    # Keep only the valid samples. all_of() returns the columns in the order
    # of valid_samples$ID2 and errors if an ID is missing, so no separate
    # re-ordering step is required.
    select(all_of(valid_samples$ID2)) %>%
    # Drop every row that contains at least one NA
    filter(rowSums(is.na(.)) == 0) %>%
    # Median normalisation: divide each row by its own median
    mutate(row_median = apply(., 1, median, na.rm = TRUE)) %>%
    mutate(across(-row_median, ~ . / row_median)) %>%
    select(-row_median) %>%
    # log2-transform the ratios
    # NOTE(review): log2() gives NaN/-Inf for non-positive ratios; the psty
    # cell clamps with pmax(., 1e-10) first - confirm no guard is needed here.
    mutate(across(everything(), ~ log2(.)))

head(prot, 3)
dim(prot)

Unnamed: 0_level_0,C3L.02613,C3L.04072,C3N.04282,C3L.03639,C3L.01328,C3L.00277,C3L.02604,C3L.01971,C3N.01167,C3L.01031,⋯,C3N.00436,C3N.02944,C3L.03356,C3N.02589,C3L.03388,C3N.01388,C3N.00198,C3N.01380,C3N.01715,C3N.03426
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
A1BG,-0.02990673,-0.05704551,0.01628874,0.01603432,-0.007786107,0.08029986,-0.002396511,-0.02007117,0.002849704,-0.012854305,⋯,-0.047168575,0.002205547,0.008227415,0.023093964,-0.01740784,0.007965865,0.006648201,0.03795399,0.0008283693,0.01275042
A1CF,-0.1076881,-0.06315414,-0.02110286,-0.03337825,-0.027146384,0.01286132,-0.043795955,0.04575079,0.065486572,-0.003501735,⋯,-0.007676754,0.034893213,0.001121374,0.014860119,-0.01008405,-0.049558683,0.050916236,-0.03390004,-0.0403040175,0.02859428
A2M,-0.04459164,-0.07561623,-0.01808992,-0.03178421,-0.005866177,0.02693054,0.006563652,0.03762665,0.006559212,-0.015757502,⋯,-0.016670852,0.0,0.052783243,0.004635516,-0.02938669,0.028271871,0.038905775,0.03043543,-0.0143345714,0.03923146


In [105]:
# mRNA expression table: first column becomes the row names, remaining columns
# are samples.
rna <- read.table(
    paste0(PATH_DATA, "mRNA_RSEM_UQ_log2_Tumor.cct"),
    header = TRUE, row.names = 1, sep = "\t", na.strings = "NA"
)
rna <- rna %>%
    # Keep only the valid samples. all_of() returns the columns in the order
    # of valid_samples$ID2 and errors if an ID is missing, so no separate
    # re-ordering step is required.
    select(all_of(valid_samples$ID2)) %>%
    # Drop rows with any NA, then rows with any value exactly equal to zero
    filter(rowSums(is.na(.)) == 0) %>%
    filter(rowSums(. == 0) == 0) %>%
    # Centre each row by subtracting its own median
    mutate(row_median = apply(., 1, median, na.rm = TRUE)) %>%
    mutate(across(-row_median, ~ . - row_median)) %>%
    select(-row_median)

dim(rna)
head(rna, 3)

Unnamed: 0_level_0,C3L.02613,C3L.04072,C3N.04282,C3L.03639,C3L.01328,C3L.00277,C3L.02604,C3L.01971,C3N.01167,C3L.01031,⋯,C3N.00436,C3N.02944,C3L.03356,C3N.02589,C3L.03388,C3N.01388,C3N.00198,C3N.01380,C3N.01715,C3N.03426
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
A1BG,-0.6208639,-1.5641153,-0.03598678,-1.499229,-1.030409,-0.3632095,-0.7079235,0.4973275,0.1398404,0.4582964,⋯,0.5012384,0.196264,0.45922795,0.05454697,-0.3385884,-0.06842248,0.5971399,0.72255331,0.04504042,-1.018051
A1BG-AS1,-0.8663854,-0.8302223,0.1110559,-1.422656,-1.161717,-1.8484433,-0.5696947,0.3464761,-0.2569541,0.4220428,⋯,0.3638638,0.3672824,0.31895969,-0.71812584,-0.217941,-0.09720916,0.3269861,0.70061799,0.3022495,-1.071129
A1CF,-2.8039993,-3.0594026,-2.42468317,-1.982175,-1.343388,-2.0692887,-1.9460596,1.0109679,2.3566886,-0.2918026,⋯,0.2894103,0.9205599,0.06451494,0.61939774,-1.5979746,-2.31924199,1.5137978,-0.01529477,-2.96096702,1.411194


In [106]:
# N-glycoproteomics table: first column becomes the row names, remaining
# columns are samples. The site-level file is used; the peptide-level
# alternative is kept below for reference.
# glyco <- read.table(paste0(PATH_DATA, "N-glycoproteomics_peptide_level_ratio_tumor.cct"), header = TRUE, row.names = 1, sep = "\t", na.strings = "NA")
glyco <- read.table(
    paste0(PATH_DATA, "N-glycoproteomics_Site_level_ratio_tumor.cct"),
    header = TRUE, row.names = 1, sep = "\t", na.strings = "NA"
)
glyco <- glyco %>%
    # Keep only the valid samples. all_of() returns the columns in the order
    # of valid_samples$ID2 and errors if an ID is missing, so no separate
    # re-ordering step is required.
    select(all_of(valid_samples$ID2)) %>%
    # Drop every row that contains at least one NA
    filter(rowSums(is.na(.)) == 0)
# Row-median centring is currently disabled for this table; to re-enable it,
# append the following steps to the pipeline above:
#     mutate(row_median = apply(., 1, median, na.rm = TRUE)) %>%
#     mutate(across(-row_median, ~ . - row_median)) %>%
#     select(-row_median)

dim(glyco)
head(glyco, 3)

Unnamed: 0_level_0,C3L.02613,C3L.04072,C3N.04282,C3L.03639,C3L.01328,C3L.00277,C3L.02604,C3L.01971,C3N.01167,C3L.01031,⋯,C3N.00436,C3N.02944,C3L.03356,C3N.02589,C3L.03388,C3N.01388,C3N.00198,C3N.01380,C3N.01715,C3N.03426
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
n[TMT11plex]RHEEGHMLNC[Carbamidomethyl]TC[Carbamidomethyl]FGQGR-N3H4F0S0G0,0.2711286,0.3086638,1.1849827,-0.006491598,1.9091718,1.278178,1.0885962,-0.7653889,-0.9867006,-0.5941954,⋯,0.5728847,0.9236759,-0.555157,-0.24003358,-0.2509592,0.05717399,-0.16628917,-0.5030644,0.7816165,1.034915863
n[TMT11plex]LLQVVYLHSNNITK[TMT11plex]-N5H6F1S2G0,-0.1230223,-0.4341817,0.24471,0.079600075,-0.9919879,1.089329,-0.1866849,0.3223009,0.4387361,0.2199116,⋯,-0.94928102,-1.3285715,1.3734742,0.7323404,-1.2241213,-0.83878453,-0.08174441,0.7487903,0.1999972,-0.779378985
n[TMT11plex]NYTADYDK[TMT11plex]-N2H8F0S0G0,0.5089437,0.5752988,-0.2466641,-0.21386651,0.1689218,-1.305813,-0.3507612,0.3861138,-0.9076605,-0.2210551,⋯,0.08706562,-0.9358628,-0.7485728,-0.04754478,0.6560518,-0.76525368,-1.1101838,-0.9045606,-0.2454482,0.006003751


In [107]:
# Phosphoproteomics table: first column becomes the row names, remaining
# columns are samples. The site-level file is used; the gene-level and
# multi-site alternatives are kept below for reference.
# psty <- read.table(paste0(PATH_DATA, "phosphoproteomics_gene_level_MD_abundance_tumor.cct"), header = TRUE, row.names = 1, sep = "\t", na.strings = "NA")
# psty <- read.table(paste0(PATH_DATA, "phosphoproteomics_MultiSite_level_MD_abundance_tumor.cct"), header = TRUE, row.names = 1, sep = "\t", na.strings = "NA")
psty <- read.table(
    paste0(PATH_DATA, "phosphoproteomics_site_level_MD_abundance_tumor.cct"),
    header = TRUE, row.names = 1, sep = "\t", na.strings = "NA"
)
psty <- psty %>%
    # Keep only the valid samples. all_of() returns the columns in the order
    # of valid_samples$ID2 and errors if an ID is missing, so no separate
    # re-ordering step is required.
    select(all_of(valid_samples$ID2)) %>%
    # Drop every row that contains at least one NA
    filter(rowSums(is.na(.)) == 0) %>%
    # (disabled) additionally drop rows containing very small values:
    # filter(rowSums(. < 1e-10) == 0) %>%
    # Median normalisation: divide each row by its own median
    mutate(row_median = apply(., 1, median, na.rm = TRUE)) %>%
    mutate(across(-row_median, ~ . / row_median)) %>%
    select(-row_median) %>%
    # log2-transform the ratios; ratios below 1e-10 (including zero and
    # negative ones) are clamped to 1e-10 first so the result stays finite
    mutate(across(everything(), ~ log2(pmax(., 1e-10))))
    # Unclamped alternative: mutate(across(everything(), ~ log2(.)))
dim(psty)
head(psty, 3)

Unnamed: 0_level_0,C3L.02613,C3L.04072,C3N.04282,C3L.03639,C3L.01328,C3L.00277,C3L.02604,C3L.01971,C3N.01167,C3L.01031,⋯,C3N.00436,C3N.02944,C3L.03356,C3N.02589,C3L.03388,C3N.01388,C3N.00198,C3N.01380,C3N.01715,C3N.03426
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
NP_000028.3_S781,-0.0032892523,-0.09882047,-0.001863032,0.06418698,-0.016158518,-0.08553476,0.061424745,0.0920351,-0.002218144,-0.02439635,⋯,0.051337695,0.07273598,0.062203354,-0.06401302,-0.019261737,0.28241507,0.08484637,0.047996792,0.15066791,0.284944808
NP_000028.3_S834,-0.0800175628,0.0,-0.034027167,0.03720242,-0.019537779,-0.03624982,0.035035816,0.09724809,-0.008214741,-0.0238946,⋯,0.010157085,0.03829329,0.03177595,-0.03711336,-0.006308129,0.20050218,0.0342917,0.030525641,0.14200041,0.192726994
NP_000029.2_S2260,0.0005534952,0.02156564,-0.046694025,0.03888092,-0.008756757,-0.0336017,-0.005744726,-0.04557517,-0.049288602,-0.0334126,⋯,-0.002327981,0.02801253,-0.002616697,-0.0131348,0.012638146,0.01510528,-0.01505923,-0.004890773,0.01898168,-0.002611995


In [108]:
# Stack the five omics tables row-wise. `data_type` (first column) records
# which table each feature row came from.
omics_layers <- list(cnv = cnv, prot = prot, rna = rna, glyco = glyco, psty = psty)
data <- bind_rows(omics_layers, .id = "data_type")
dim(data)

# Number of feature rows per data type in a stacked table.
count_features <- function(tbl) {
    tbl %>%
        group_by(data_type) %>%
        summarise(feature_count = n()) %>%
        ungroup()
}
feature_counts <- count_features(data)
print(feature_counts)

# Keep only features whose across-sample standard deviation is above the 5th
# percentile of all feature SDs. The cutoff is computed over the pooled rows
# of all data types, not per data type.
# NOTE(review): in the recorded output only prot and psty lose rows at this
# step - confirm a pooled cutoff (rather than one per data type) is intended.
row_sd <- apply(data[, -1], 1, sd, na.rm = TRUE)
cutoff <- quantile(row_sd, probs = 0.05, na.rm = TRUE)
variable_rows <- data[row_sd > cutoff, ]

# Z-score every sample column over the retained features, then order the
# sample columns alphabetically behind `data_type`.
zscored <- variable_rows %>%
    mutate(across(-data_type, function(v) (v - mean(v)) / sd(v)))
sample_cols <- sort(names(zscored)[-1])
data <- zscored %>%
    select(data_type, all_of(sample_cols))

dim(data)
tail(data, 10)
# Feature counts per data type after filtering
feature_counts <- count_features(data)
print(feature_counts)

[90m# A tibble: 5 × 2[39m
  data_type feature_count
  [3m[90m<chr>[39m[23m             [3m[90m<int>[39m[23m
[90m1[39m cnv                [4m9[24m361
[90m2[39m glyco              [4m1[24m454
[90m3[39m prot               [4m5[24m773
[90m4[39m psty               [4m2[24m086
[90m5[39m rna               [4m1[24m[4m8[24m197


Unnamed: 0_level_0,data_type,C3L.00017,C3L.00102,C3L.00277,C3L.00589,C3L.00598,C3L.00599,C3L.00622,C3L.00625,C3L.00819,⋯,C3N.03754,C3N.03780,C3N.03839,C3N.03840,C3N.03853,C3N.03884,C3N.04119,C3N.04126,C3N.04282,C3N.04283
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
NP_958845.1_S79,psty,0.07835299,0.030296763,-0.024128358,-0.013725453,-0.0545177,-0.125929296,-0.0819336,-0.06304079,-0.003033698,⋯,0.004937903,-0.005692795,0.17909972,-0.02909559,0.046551116,0.090379343,-0.0453307657,-0.0080918,0.07527426,0.024059175
NP_976324.2_S575,psty,0.081008533,0.022066683,0.053506993,-0.03703133,0.08028063,0.120926612,-0.11333387,0.02940277,0.05115861,⋯,-0.005830122,-0.046650176,0.09756209,-0.06018493,0.010150112,0.112353322,-0.0455958811,-0.09018357,0.08183862,0.047538875
NP_976324.2_S710,psty,0.135903912,0.010110379,0.088455511,-0.033578771,0.05505784,0.044012939,-0.06795209,0.0610896,0.008434746,⋯,-0.026915767,-0.009312529,0.09711188,-0.0483721,0.055783863,0.113929773,-0.0075636541,-0.0690753,0.08104682,0.06712666
NP_982272.2_S904,psty,0.030969769,0.014843403,-0.079691544,-0.038611549,-0.07649163,-0.108885292,-0.06672173,-0.15907173,-0.021856961,⋯,0.003398384,-0.079841477,0.08170891,0.01354087,0.056240875,0.136384632,-0.036897137,-0.11831072,0.10867523,0.015418696
NP_987100.1_T227,psty,0.135851208,-0.347830249,0.171077995,0.105514781,-0.03543496,0.158294357,-0.21393113,-0.09617729,0.005587596,⋯,-0.120777975,0.057593027,0.02659467,0.0713554,0.144694208,0.038803392,-0.1737745401,0.12851071,0.16913776,0.004652569
NP_996826.2_S1414,psty,0.064662288,0.084642505,0.122891015,0.038899221,-0.14483404,-0.04520605,-0.01097168,0.15381136,0.088504672,⋯,-0.018611993,-0.06340537,0.0578292,0.02279495,-0.009731266,0.073827242,0.0007590622,0.05293134,0.08457984,0.04462709
NP_997245.2_S881,psty,0.127971662,-0.029880918,0.001264008,0.114308751,-0.06441532,0.002967254,0.05050564,-0.01894824,-0.072325141,⋯,-0.145721567,0.038175015,0.09025037,0.05256681,0.013140905,0.070513355,0.0422291717,0.05330586,0.06111702,-0.011166879
NP_997641.1_S2294,psty,0.032289854,0.050368631,0.227503137,-0.008719183,0.02032066,-0.077230736,-0.11548179,0.17128371,0.041987672,⋯,-0.102981811,-0.041832498,-0.04315974,0.04921527,0.080390935,0.005625389,0.0195362886,0.01804733,0.14274512,0.132656246
NP_998754.1_S1154,psty,0.007732008,-0.009380744,0.050882228,-0.047110437,0.07504556,-0.024725457,-0.06191116,-0.15275313,0.023693184,⋯,0.018673914,-0.002925644,0.1207694,0.06034005,0.102346225,0.01724127,-0.0139533199,-0.07045692,0.12777281,0.132405661
NP_998754.1_S894,psty,0.001584361,0.029674207,-0.008941243,-0.00529095,-0.07812377,-0.051012398,-0.09106203,-0.0986495,0.017361695,⋯,0.00494826,-0.02479522,0.16774408,0.04027845,0.07255355,0.061241185,-0.0128371832,-0.00276621,0.11619894,0.070870359


[90m# A tibble: 5 × 2[39m
  data_type feature_count
  [3m[90m<chr>[39m[23m             [3m[90m<int>[39m[23m
[90m1[39m cnv                [4m9[24m361
[90m2[39m glyco              [4m1[24m454
[90m3[39m prot               [4m3[24m971
[90m4[39m psty               [4m2[24m044
[90m5[39m rna               [4m1[24m[4m8[24m197


In [109]:
# Separate the label column from the numeric sample columns.
data_type <- data$data_type
data_num <- data %>%
    select(-data_type)

# Split the signed values into two non-negative tables of the same shape:
# one keeps the positive entries (negatives set to 0), the other keeps the
# magnitudes of the negative entries (positives set to 0).
positive_only_matrix <- replace(data_num, data_num < 0, 0)
negative_only_matrix <- abs(replace(data_num, data_num > 0, 0))

# Stack both parts row-wise and discard rows that sum to zero.
stacked_parts <- rbind(positive_only_matrix, negative_only_matrix)
nonzero_rows <- rowSums(stacked_parts) != 0
combined_matrix <- stacked_parts[nonzero_rows, ]

dim(data)
dim(combined_matrix)

In [110]:
# Preview of per-sample memberships from an existing `nmf_clu` fit.
# Transposing H gives one row per sample with the two loadings in V1 and V2.
loadings <- as.data.frame(t(nmf_clu@fit@H))
loading_total <- loadings$V1 + loadings$V2

# Normalise the two loadings so they sum to 1 per sample, and assign each
# sample to the component with the larger share (ties go to cluster "1").
# `sum` keeps the un-normalised total for inspection.
share_1 <- loadings$V1 / loading_total
share_2 <- loadings$V2 / loading_total
membership <- data.frame(
    V1 = share_1,
    V2 = share_2,
    cluster = ifelse(share_1 >= share_2, "1", "2"),
    sum = loading_total,
    row.names = rownames(loadings),
    stringsAsFactors = FALSE
)
membership

Unnamed: 0_level_0,V1,V2,cluster,sum
Unnamed: 0_level_1,<dbl>,<dbl>,<chr>,<dbl>
C3L.00017,2.411827e-01,0.75881733,2,0.005572523
C3L.00102,4.532528e-01,0.54674725,2,0.004766979
C3L.00277,6.836035e-01,0.31639655,1,0.005596046
C3L.00589,1.826408e-01,0.81735923,2,0.006071938
C3L.00598,9.176941e-01,0.08230593,1,0.005329709
C3L.00599,6.223116e-02,0.93776884,2,0.004646528
C3L.00622,2.020734e-01,0.79792661,2,0.004793819
C3L.00625,3.806831e-01,0.61931692,2,0.005874077
C3L.00819,7.843885e-01,0.21561148,1,0.005772333
C3L.00928,5.979592e-01,0.40204083,1,0.005297152


In [113]:
# Rank-2 NMF of the non-negative stacked matrix. The starting point is random,
# so a fixed numeric seed is supplied to make the factorisation - and hence
# the cluster labels written to membership.tsv - reproducible between runs.
# NOTE(review): the recorded log shows the run stopping at the 500-iteration
# cap; confirm maxIter = 500 is sufficient.
nmf_clu <- nmf(
    as.matrix(combined_matrix), 2,
    seed = 123456,
    .options = list(verbose = TRUE), maxIter = 500
)

# coef() returns the coefficient matrix H; transposed, it has one row per
# sample with the two component loadings in V1 and V2.
membership <- coef(nmf_clu) %>%
    t() %>%
    as.data.frame() %>%
    # Normalise the two loadings so they sum to 1 for every sample
    mutate(sum = V1 + V2) %>%
    mutate(V1 = V1 / sum, V2 = V2 / sum) %>%
    select(-sum) %>%
    # Assign each sample to the component with the larger share
    # (a tie goes to cluster "2")
    mutate(cluster = ifelse(V1 > V2, "1", "2")) %>%
    # Restore the original sample IDs by replacing "." with "-"
    mutate(sample = rownames(.)) %>%
    mutate(sample = gsub("\\.", "-", sample)) %>%
    select(sample, cluster, V1, V2)

# Count the number of samples in each cluster
membership_summary <- membership %>%
    group_by(cluster) %>%
    summarise(count = n())
print(membership_summary)

# Sample IDs now live in the `sample` column, so drop the row names and write
# the table without them.
rownames(membership) <- NULL
write.table(membership, "membership.tsv", sep = "\t", quote = FALSE, row.names = FALSE)

NMF algorithm: 'brunet'

NMF seeding method: random



Iterations: 500/500 
DONE (stopped at 500/500 iterations)
[90m# A tibble: 2 × 2[39m
  cluster count
  [3m[90m<chr>[39m[23m   [3m[90m<int>[39m[23m
[90m1[39m 1          52
[90m2[39m 2          53
