## Multi-omics Clustering

In [2]:
suppressPackageStartupMessages(library(tidyr))
suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(NMF))

### Data Preparation

In [3]:
PATH_DATA <- "../data_linkedomics/"

valid_samples <- read.csv("sample105.list", header = FALSE, col.names = "ID")
valid_samples$ID2 <- gsub("-", ".", valid_samples$ID) 
head(valid_samples,3)

Unnamed: 0_level_0,ID,ID2
Unnamed: 0_level_1,<chr>,<chr>
1,C3L-02613,C3L.02613
2,C3L-04072,C3L.04072
3,C3N-04282,C3N.04282


In [4]:
cnv <- read.table(paste0(PATH_DATA, "SCNA_log2_gene_level.cct"), header = TRUE, row.names = 1, sep = "\t", 
    # na.strings = "NA" 
    )
cnv <- cnv %>%
    # Select only the valid sample columns in order
    select(all_of(valid_samples$ID2)) %>%
    # Remove rows where any value is "NA"
    filter(rowSums(is.na(.)) == 0) %>%
    # Subtract median from each row
    mutate(row_median = apply(., 1, median, na.rm = TRUE)) %>%
    mutate(across(-row_median, ~ . - row_median)) %>%
    select(-row_median)
tail(cnv, 3)
dim(cnv)

Unnamed: 0_level_0,C3L.02613,C3L.04072,C3N.04282,C3L.03639,C3L.01328,C3L.00277,C3L.02604,C3L.01971,C3N.01167,C3L.01031,⋯,C3N.00436,C3N.02944,C3L.03356,C3N.02589,C3L.03388,C3N.01388,C3N.00198,C3N.01380,C3N.01715,C3N.03426
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
DAZ4,-0.12735,-0.59551,-0.92239,-0.72322,-0.02908,0.07958,0.82266,0.23247,-0.45531,0.28581,⋯,-0.36377,0.30116,-0.06191,-0.05711,-0.38335,-0.12518,-0.10954,0.11612,0.40891,-0.47479
BPY2C,-0.12735,-0.59551,-0.92239,-0.72322,-0.02908,0.07958,0.82266,0.23247,-0.45531,0.28581,⋯,-0.36377,0.30116,-0.06191,-0.05711,-0.38335,-0.12518,-0.10954,0.11612,0.40891,-0.47479
CDY1,-0.12735,-0.59551,-0.92239,-0.72322,-0.02908,0.07958,0.82266,0.23247,-0.45531,0.28581,⋯,-0.36377,0.30116,-0.06191,-0.05711,-0.38335,-0.12518,-0.10954,0.11612,0.40891,-0.47479


In [5]:
prot <- read.table(paste0(PATH_DATA, "proteomics_gene_level_MD_abundance_tumor.cct"), header = TRUE, row.names = 1, sep = "\t", na.strings = "NA")
prot <- prot %>%
    # Order columns as in valid_samples
    select(valid_samples$ID2) %>%
    # sort the columns by the order in valid_samples
    select(order(match(names(.), valid_samples$ID2))) %>%
    # Remove rows with all NAs
    filter(rowSums(is.na(.)) == 0) %>%
    # median normalization
    mutate(row_median = apply(., 1, median, na.rm = TRUE)) %>%
    mutate(across(-row_median, ~ . / row_median)) %>%
    select(-row_median) %>%
    # Do log2 transformation
    mutate(across(everything(), ~ log2(.)))

head(prot, 3)
dim(prot)

Unnamed: 0_level_0,C3L.02613,C3L.04072,C3N.04282,C3L.03639,C3L.01328,C3L.00277,C3L.02604,C3L.01971,C3N.01167,C3L.01031,⋯,C3N.00436,C3N.02944,C3L.03356,C3N.02589,C3L.03388,C3N.01388,C3N.00198,C3N.01380,C3N.01715,C3N.03426
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
A1BG,-0.02990673,-0.05704551,0.01628874,0.01603432,-0.007786107,0.08029986,-0.002396511,-0.02007117,0.002849704,-0.012854305,⋯,-0.047168575,0.002205547,0.008227415,0.023093964,-0.01740784,0.007965865,0.006648201,0.03795399,0.0008283693,0.01275042
A1CF,-0.1076881,-0.06315414,-0.02110286,-0.03337825,-0.027146384,0.01286132,-0.043795955,0.04575079,0.065486572,-0.003501735,⋯,-0.007676754,0.034893213,0.001121374,0.014860119,-0.01008405,-0.049558683,0.050916236,-0.03390004,-0.0403040175,0.02859428
A2M,-0.04459164,-0.07561623,-0.01808992,-0.03178421,-0.005866177,0.02693054,0.006563652,0.03762665,0.006559212,-0.015757502,⋯,-0.016670852,0.0,0.052783243,0.004635516,-0.02938669,0.028271871,0.038905775,0.03043543,-0.0143345714,0.03923146


In [6]:
rna <- read.table(paste0(PATH_DATA, "mRNA_RSEM_UQ_log2_Tumor.cct"), header = TRUE, row.names = 1, sep = "\t", na.strings = "NA")
rna <- rna %>%
    # Order columns as in valid_samples
    select(valid_samples$ID2) %>%
    # sort the columns by the order in valid_samples
    select(order(match(names(.), valid_samples$ID2))) %>%
    # Remove rows with any NAs or any zero values
    filter(rowSums(is.na(.)) == 0) %>%
    filter(rowSums(. == 0) == 0) %>%
    # Subtract the median of each row 
    mutate(row_median = apply(., 1, median, na.rm = TRUE)) %>%
    mutate(across(-row_median, ~ . - row_median)) %>%
    select(-row_median)

dim(rna)
head(rna, 3)

Unnamed: 0_level_0,C3L.02613,C3L.04072,C3N.04282,C3L.03639,C3L.01328,C3L.00277,C3L.02604,C3L.01971,C3N.01167,C3L.01031,⋯,C3N.00436,C3N.02944,C3L.03356,C3N.02589,C3L.03388,C3N.01388,C3N.00198,C3N.01380,C3N.01715,C3N.03426
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
A1BG,-0.6208639,-1.5641153,-0.03598678,-1.499229,-1.030409,-0.3632095,-0.7079235,0.4973275,0.1398404,0.4582964,⋯,0.5012384,0.196264,0.45922795,0.05454697,-0.3385884,-0.06842248,0.5971399,0.72255331,0.04504042,-1.018051
A1BG-AS1,-0.8663854,-0.8302223,0.1110559,-1.422656,-1.161717,-1.8484433,-0.5696947,0.3464761,-0.2569541,0.4220428,⋯,0.3638638,0.3672824,0.31895969,-0.71812584,-0.217941,-0.09720916,0.3269861,0.70061799,0.3022495,-1.071129
A1CF,-2.8039993,-3.0594026,-2.42468317,-1.982175,-1.343388,-2.0692887,-1.9460596,1.0109679,2.3566886,-0.2918026,⋯,0.2894103,0.9205599,0.06451494,0.61939774,-1.5979746,-2.31924199,1.5137978,-0.01529477,-2.96096702,1.411194


In [7]:
# glyco <- read.table(paste0(PATH_DATA, "N-glycoproteomics_peptide_level_ratio_tumor.cct"), header = TRUE, row.names = 1, sep = "\t", na.strings = "NA")
glyco <- read.table(paste0(PATH_DATA, "N-glycoproteomics_Site_level_ratio_tumor.cct"), header = TRUE, row.names = 1, sep = "\t", na.strings = "NA")
glyco <- glyco %>%
    # Order columns as in valid_samples
    select(valid_samples$ID2) %>%
    # sort the columns by the order in valid_samples
    select(order(match(names(.), valid_samples$ID2))) %>%
    # Remove rows with all NAs
    filter(rowSums(is.na(.)) == 0) #%>%
    # Subtract the median of each row 
    # mutate(across(everything(), ~ . - median(., na.rm = TRUE)))

dim(glyco)
head(glyco, 3)

Unnamed: 0_level_0,C3L.02613,C3L.04072,C3N.04282,C3L.03639,C3L.01328,C3L.00277,C3L.02604,C3L.01971,C3N.01167,C3L.01031,⋯,C3N.00436,C3N.02944,C3L.03356,C3N.02589,C3L.03388,C3N.01388,C3N.00198,C3N.01380,C3N.01715,C3N.03426
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
n[TMT11plex]RHEEGHMLNC[Carbamidomethyl]TC[Carbamidomethyl]FGQGR-N3H4F0S0G0,0.2711286,0.3086638,1.1849827,-0.006491598,1.9091718,1.278178,1.0885962,-0.7653889,-0.9867006,-0.5941954,⋯,0.5728847,0.9236759,-0.555157,-0.24003358,-0.2509592,0.05717399,-0.16628917,-0.5030644,0.7816165,1.034915863
n[TMT11plex]LLQVVYLHSNNITK[TMT11plex]-N5H6F1S2G0,-0.1230223,-0.4341817,0.24471,0.079600075,-0.9919879,1.089329,-0.1866849,0.3223009,0.4387361,0.2199116,⋯,-0.94928102,-1.3285715,1.3734742,0.7323404,-1.2241213,-0.83878453,-0.08174441,0.7487903,0.1999972,-0.779378985
n[TMT11plex]NYTADYDK[TMT11plex]-N2H8F0S0G0,0.5089437,0.5752988,-0.2466641,-0.21386651,0.1689218,-1.305813,-0.3507612,0.3861138,-0.9076605,-0.2210551,⋯,0.08706562,-0.9358628,-0.7485728,-0.04754478,0.6560518,-0.76525368,-1.1101838,-0.9045606,-0.2454482,0.006003751


In [8]:
# psty <- read.table(paste0(PATH_DATA, "phosphoproteomics_gene_level_MD_abundance_tumor.cct"), header = TRUE, row.names = 1, sep = "\t", na.strings = "NA")
psty <- read.table(paste0(PATH_DATA, "phosphoproteomics_site_level_MD_abundance_tumor.cct"), header = TRUE, row.names = 1, sep = "\t", na.strings = "NA")
# psty <- read.table(paste0(PATH_DATA, "phosphoproteomics_MultiSite_level_MD_abundance_tumor.cct"), header = TRUE, row.names = 1, sep = "\t", na.strings = "NA")
psty <- psty %>%
    # Only select columns that are in valid_samples
    select(valid_samples$ID2) %>%
    # sort the columns by the order in valid_samples
    select(order(match(names(.), valid_samples$ID2))) %>%
    # Remove rows with all NAs or too small values
    filter(rowSums(is.na(.)) == 0) %>%
    filter(rowSums(. < 1e-10) == 0) %>%
    # Subtract the median of each row 
    mutate(row_median = apply(., 1, median, na.rm = TRUE)) %>%
    mutate(across(-row_median, ~ . / row_median)) %>%
    select(-row_median) %>%
    # Do log2 transformation
    # mutate(across(everything(), ~ log2(pmax(., 1e-10))))
    mutate(across(everything(), ~ log2(.)))
dim(psty)
head(psty, 3)

Unnamed: 0_level_0,C3L.02613,C3L.04072,C3N.04282,C3L.03639,C3L.01328,C3L.00277,C3L.02604,C3L.01971,C3N.01167,C3L.01031,⋯,C3N.00436,C3N.02944,C3L.03356,C3N.02589,C3L.03388,C3N.01388,C3N.00198,C3N.01380,C3N.01715,C3N.03426
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
NP_000028.3_S781,-0.003289252,-0.098820469,-0.001863032,0.06418698,-0.01615852,-0.08553476,0.06142474,0.0920351,-0.002218144,-0.02439635,⋯,0.0513377,0.07273598,0.062203354,-0.064013021,-0.019261737,0.282415071,0.08484637,0.04799679,0.15066791,0.2849448
NP_000028.3_S834,-0.080017563,0.0,-0.034027167,0.03720242,-0.01953778,-0.03624982,0.03503582,0.09724809,-0.008214741,-0.0238946,⋯,0.01015709,0.03829329,0.03177595,-0.03711336,-0.006308129,0.200502184,0.0342917,0.03052564,0.14200041,0.192727
NP_000072.2_S2105,-0.020938229,-0.009456211,-0.024632151,0.01100132,-0.06223903,-0.02472232,-0.02370921,0.02074372,0.003174092,0.03309474,⋯,-0.14639489,-0.01789628,0.007726258,0.004582555,-0.002117515,-0.009992859,0.01682114,0.0,-0.01286737,-0.1006145


In [9]:
# Concatenate all data frames
data <- bind_rows(
    cnv = cnv,
    prot = prot,
    rna = rna,
    glyco = glyco,
    psty = psty,
    .id = "data_type"
)
dim(data)
row_sd <- apply(data[ , -1], 1, sd, na.rm = TRUE)
cutoff <- quantile(row_sd, probs = 0.05, na.rm = TRUE)
data <- data[row_sd > cutoff, ] %>%
    # Column-wise z-score normalization
    mutate(across(-data_type, ~ (.-mean(.)) / sd(.))) %>%
    # Sort columns by name
    select(data_type, sort(names(.)[-1]))

dim(data)
tail(data,10)
# Count the number of features per data type
feature_counts <- data %>%
    group_by(data_type) %>%
    summarise(feature_count = n()) %>%
    ungroup()
print(feature_counts)

Unnamed: 0_level_0,data_type,C3L.00017,C3L.00102,C3L.00277,C3L.00589,C3L.00598,C3L.00599,C3L.00622,C3L.00625,C3L.00819,⋯,C3N.03754,C3N.03780,C3N.03839,C3N.03840,C3N.03853,C3N.03884,C3N.04119,C3N.04126,C3N.04282,C3N.04283
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
NP_958845.1_S79,psty,0.079446002,0.03630348,-0.026318794,-0.013611849,-0.05005705,-0.172905292,-0.088639119,-0.050904401,-0.007752476,⋯,0.021672913,0.004738933,0.1707685412,-0.10959935,0.050341969,0.08423931,-0.059991279,-0.0007296711,0.06338611,0.010021778
NP_976324.2_S575,psty,0.082564705,0.02576112,0.06852189,-0.039887376,0.10901623,0.170325698,-0.131091317,0.053230996,0.053485233,⋯,0.008844694,-0.0448768665,0.0792314601,-0.18174744,0.009397321,0.11084302,-0.060311673,-0.0932079326,0.07073944,0.0365422279
NP_976324.2_S710,psty,0.147034528,0.01044562,0.111215602,-0.035994889,0.07925124,0.063384138,-0.069736553,0.088925435,0.005206951,⋯,-0.016275165,0.0003539852,0.0787260371,-0.15433375,0.060727164,0.11275161,-0.014349525,-0.0694289794,0.06985248,0.0586667396
NP_982272.2_S904,psty,0.023798552,0.01650842,-0.094195741,-0.041668947,-0.07598812,-0.149207133,-0.068073138,-0.159080895,-0.029022903,⋯,0.019838845,-0.0850848303,0.0614341098,-0.01065424,0.061241222,0.13993751,-0.049799194,-0.1248938067,0.10080156,0.0002623087
NP_987100.1_T227,psty,0.146972631,-0.44806034,0.212148654,0.120822055,-0.02753782,0.222282194,-0.267095711,-0.088231852,0.00198965,⋯,-0.128095569,0.0814034202,-0.0004391136,0.12351415,0.16073547,0.02179675,-0.21521626,0.1531561797,0.1685311,-0.011898091
NP_996826.2_S1414,psty,0.063367469,0.10591795,0.153282608,0.045718296,-0.15663787,-0.060666862,0.007299217,0.193374219,0.09568656,⋯,-0.00638267,-0.0651741198,0.0346258925,0.01082142,-0.012965689,0.06419983,-0.004291478,0.0680142954,0.07381013,0.033253351
NP_997245.2_S881,psty,0.137718794,-0.04078144,0.004700958,0.130736559,-0.06173706,0.006313795,0.090414648,-0.001235223,-0.086052319,⋯,-0.157811497,0.0578804281,0.0710230697,0.07991204,0.012761431,0.06018773,0.045825368,0.068436194,0.0475273,-0.0297662448
NP_997641.1_S2294,psty,0.025348876,0.06201465,0.28107858,-0.007967676,0.03825847,-0.105194314,-0.133995231,0.213056396,0.043122006,⋯,-0.106894553,-0.0390407275,-0.0787479171,0.07213424,0.088405738,-0.01837157,0.018400899,0.0287166604,0.13896632,0.1326827699
NP_998754.1_S1154,psty,-0.003492166,-0.01452163,0.065315431,-0.051250769,0.10283842,-0.032190435,-0.061569401,-0.151963153,0.022449098,⋯,0.038036967,0.008091062,0.1052848222,0.09795117,0.11310153,-0.00430832,-0.02207147,-0.070985402,0.12219448,0.1323997326
NP_998754.1_S894,psty,-0.01071204,0.03550601,-0.007765952,-0.004102615,-0.07791418,-0.068740067,-0.10098047,-0.091016742,0.015294469,⋯,0.021685251,-0.0184017571,0.1580202838,0.05139484,0.079590076,0.048962,-0.020722613,0.005269728,0.10922955,0.0628952689


[90m# A tibble: 5 × 2[39m
  data_type feature_count
  [3m[90m<chr>[39m[23m             [3m[90m<int>[39m[23m
[90m1[39m cnv               [4m1[24m[4m9[24m855
[90m2[39m glyco              [4m1[24m454
[90m3[39m prot               [4m3[24m490
[90m4[39m psty               [4m1[24m954
[90m5[39m rna               [4m1[24m[4m8[24m197


In [10]:
data_type <- data$data_type
data_num <- data[, -1]

positive_only_matrix <- data_num
positive_only_matrix[positive_only_matrix < 0] <- 0
# Abs values of negative values
negative_only_matrix <- data_num
negative_only_matrix[negative_only_matrix > 0] <- 0
negative_only_matrix <- abs(negative_only_matrix)

combined_matrix <- rbind(positive_only_matrix, negative_only_matrix)

combined_matrix <- combined_matrix[rowSums(combined_matrix) != 0, ]

dim(data)
dim(combined_matrix)

In [None]:
membership <- nmf_clu@fit@H %>%
    t() %>%
    as.data.frame() %>%
    # Set the cluster column, V1 > V2
    mutate(cluster = ifelse(V1 >= V2, "1", "2")) %>%
    # Divide by the sum of the two 
    mutate(sum = V1 + V2) %>%
    mutate(V1 = V1 / sum, V2 = V2 / sum) %>%
    mutate(cluster = ifelse(V1 >= V2, "1", "2"))
membership

Unnamed: 0_level_0,V1,V2,cluster,newsum
Unnamed: 0_level_1,<dbl>,<dbl>,<chr>,<dbl>
C3L.00017,2.321938e-01,0.76780622,2,1
C3L.00102,4.576865e-01,0.54231351,2,1
C3L.00277,6.720580e-01,0.32794204,1,1
C3L.00589,1.816791e-01,0.81832090,2,1
C3L.00598,9.131920e-01,0.08680796,1,1
C3L.00599,5.268393e-02,0.94731607,2,1
C3L.00622,2.099518e-01,0.79004823,2,1
C3L.00625,3.786299e-01,0.62137005,2,1
C3L.00819,7.698312e-01,0.23016879,1,1
C3L.00928,6.001700e-01,0.39983004,1,1


In [None]:
nmf_clu <- nmf(as.matrix(combined_matrix), 2, .options=list(verbose = TRUE), maxIter=500)
membership <- nmf_clu@fit@H %>%
    t() %>%
    as.data.frame() %>%
    # Set the cluster column, V1 > V2
    mutate(cluster = ifelse(V1 >= V2, "1", "2")) %>%
    # Divide by the sum of the two 
    mutate(sum = V1 + V2) %>%
    mutate(V1 = V1 / sum, V2 = V2 / sum) %>%
    select(-sum) %>%
    mutate(cluster = ifelse(V1 >= V2, "1", "2"))

# Count the number of samples in each cluster
membership_summary <- membership %>%
    group_by(cluster) %>%
    summarise(count = n())
print(membership_summary)


write.table(membership, "membership.tsv", sep = "\t", quote = FALSE)

NMF algorithm: 'brunet'

NMF seeding method: random



Iterations: 500/500 
DONE (stopped at 500/500 iterations)
[90m# A tibble: 2 × 2[39m
  cluster count
  [3m[90m<chr>[39m[23m   [3m[90m<int>[39m[23m
[90m1[39m 1          53
[90m2[39m 2          52


ERROR: Error in rownames_to_column(., "ID"): could not find function "rownames_to_column"


In [21]:
# Replace "-" with "." in row names
membership <- membership %>%
    mutate(ID = gsub("\\.", "-", ID)) %>%
    column_to_rownames("ID")
membership



ERROR: Error in column_to_rownames(., "ID"): could not find function "column_to_rownames"
