## Multi-omics Clustering

In [150]:
suppressPackageStartupMessages(library(tidyr))
suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(NMF))

### Data Preparation

In [151]:
PATH_DATA <- "../data_linkedomics/"

valid_samples <- read.csv("sample105.list", header = FALSE, col.names = "ID")
valid_samples$ID2 <- gsub("-", ".", valid_samples$ID) 
head(valid_samples,3)

Unnamed: 0_level_0,ID,ID2
Unnamed: 0_level_1,<chr>,<chr>
1,C3L-02613,C3L.02613
2,C3L-04072,C3L.04072
3,C3N-04282,C3N.04282


In [None]:
cnv <- read.table(paste0(PATH_DATA, "SCNA_log2_gene_level.cct"), header = TRUE, row.names = 1, sep = "\t", 
    # na.strings = "NA" 
    )
cnv <- cnv %>%
    # Select only the valid sample columns in order
    select(all_of(valid_samples$ID2)) %>%
    # Remove rows where any value is "NA"
    filter(rowSums(is.na(.)) == 0) %>%
    # Subtract median from each row
    mutate(row_median = apply(., 1, median, na.rm = TRUE)) %>%
    mutate(across(-row_median, ~ . - row_median)) %>%
    select(-row_median)
tail(cnv, 3)
dim(cnv)

Unnamed: 0_level_0,C3L.02613,C3L.04072,C3N.04282,C3L.03639,C3L.01328,C3L.00277,C3L.02604,C3L.01971,C3N.01167,C3L.01031,⋯,C3N.00436,C3N.02944,C3L.03356,C3N.02589,C3L.03388,C3N.01388,C3N.00198,C3N.01380,C3N.01715,C3N.03426
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
DAZ4,-0.12735,-0.59551,-0.92239,-0.72322,-0.02908,0.07958,0.82266,0.23247,-0.45531,0.28581,⋯,-0.36377,0.30116,-0.06191,-0.05711,-0.38335,-0.12518,-0.10954,0.11612,0.40891,-0.47479
BPY2C,-0.12735,-0.59551,-0.92239,-0.72322,-0.02908,0.07958,0.82266,0.23247,-0.45531,0.28581,⋯,-0.36377,0.30116,-0.06191,-0.05711,-0.38335,-0.12518,-0.10954,0.11612,0.40891,-0.47479
CDY1,-0.12735,-0.59551,-0.92239,-0.72322,-0.02908,0.07958,0.82266,0.23247,-0.45531,0.28581,⋯,-0.36377,0.30116,-0.06191,-0.05711,-0.38335,-0.12518,-0.10954,0.11612,0.40891,-0.47479


In [185]:
prot <- read.table(paste0(PATH_DATA, "proteomics_gene_level_MD_abundance_tumor.cct"), header = TRUE, row.names = 1, sep = "\t", na.strings = "NA")
prot <- prot %>%
    # Order columns as in valid_samples
    select(valid_samples$ID2) %>%
    # sort the columns by the order in valid_samples
    select(order(match(names(.), valid_samples$ID2))) %>%
    # Remove rows with all NAs
    filter(rowSums(is.na(.)) == 0) %>%
    # # median normalization
    mutate(row_median = apply(., 1, median, na.rm = TRUE)) %>%
    mutate(across(-row_median, ~ . - row_median)) %>%
    # mutate(across(-row_median, ~ . / row_median)) %>%
    select(-row_median) #%>%
    # # Do log2 transformation
    # mutate(across(everything(), ~ log2(.)))

head(prot, 3)
dim(prot)

Unnamed: 0_level_0,C3L.02613,C3L.04072,C3N.04282,C3L.03639,C3L.01328,C3L.00277,C3L.02604,C3L.01971,C3N.01167,C3L.01031,⋯,C3N.00436,C3N.02944,C3L.03356,C3N.02589,C3L.03388,C3N.01388,C3N.00198,C3N.01380,C3N.01715,C3N.03426
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
A1BG,-0.5851688,-1.105782,0.3238525,0.318766,-0.1535164,1.6325372,-0.0473396,-0.3940588,0.05639423,-0.25300037,⋯,-0.9174415,0.04363692,0.16312039,0.46024067,-0.3420844,0.1579204,0.131738,0.7603061,0.01638153,0.2531925
A1CF,-1.6638917,-0.990823,-0.3359187,-0.5290717,-0.4312188,0.2071515,-0.6917098,0.7453636,1.0742651,-0.05608174,⋯,-0.1227688,0.56632889,0.01798806,0.23951135,-0.1611325,-0.7811726,0.8310128,-0.5372456,-0.63732554,0.4630793
A2M,-0.9019673,-1.513266,-0.3692739,-0.6457553,-0.1202553,0.5583888,0.1351343,0.7830743,0.1350427,-0.32192121,⋯,-0.3404731,0.0,1.10432467,0.09537351,-0.5975399,0.5864739,0.8100558,0.63183,-0.29299536,0.8169296


In [186]:
rna <- read.table(paste0(PATH_DATA, "mRNA_RSEM_UQ_log2_Tumor.cct"), header = TRUE, row.names = 1, sep = "\t", na.strings = "NA")
rna <- rna %>%
    # Order columns as in valid_samples
    select(valid_samples$ID2) %>%
    # sort the columns by the order in valid_samples
    select(order(match(names(.), valid_samples$ID2))) %>%
    # Remove rows with any NAs or any zero values
    filter(rowSums(is.na(.)) == 0) %>%
    filter(rowSums(. == 0) == 0) %>%
    # Subtract the median of each row 
    mutate(row_median = apply(., 1, median, na.rm = TRUE)) %>%
    mutate(across(-row_median, ~ . - row_median)) %>%
    select(-row_median)

dim(rna)
head(rna, 3)

Unnamed: 0_level_0,C3L.02613,C3L.04072,C3N.04282,C3L.03639,C3L.01328,C3L.00277,C3L.02604,C3L.01971,C3N.01167,C3L.01031,⋯,C3N.00436,C3N.02944,C3L.03356,C3N.02589,C3L.03388,C3N.01388,C3N.00198,C3N.01380,C3N.01715,C3N.03426
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
A1BG,-0.6208639,-1.5641153,-0.03598678,-1.499229,-1.030409,-0.3632095,-0.7079235,0.4973275,0.1398404,0.4582964,⋯,0.5012384,0.196264,0.45922795,0.05454697,-0.3385884,-0.06842248,0.5971399,0.72255331,0.04504042,-1.018051
A1BG-AS1,-0.8663854,-0.8302223,0.1110559,-1.422656,-1.161717,-1.8484433,-0.5696947,0.3464761,-0.2569541,0.4220428,⋯,0.3638638,0.3672824,0.31895969,-0.71812584,-0.217941,-0.09720916,0.3269861,0.70061799,0.3022495,-1.071129
A1CF,-2.8039993,-3.0594026,-2.42468317,-1.982175,-1.343388,-2.0692887,-1.9460596,1.0109679,2.3566886,-0.2918026,⋯,0.2894103,0.9205599,0.06451494,0.61939774,-1.5979746,-2.31924199,1.5137978,-0.01529477,-2.96096702,1.411194


In [187]:
# glyco <- read.table(paste0(PATH_DATA, "N-glycoproteomics_peptide_level_ratio_tumor.cct"), header = TRUE, row.names = 1, sep = "\t", na.strings = "NA")
glyco <- read.table(paste0(PATH_DATA, "N-glycoproteomics_Site_level_ratio_tumor.cct"), header = TRUE, row.names = 1, sep = "\t", na.strings = "NA")
glyco <- glyco %>%
    # Order columns as in valid_samples
    select(valid_samples$ID2) %>%
    # sort the columns by the order in valid_samples
    select(order(match(names(.), valid_samples$ID2))) %>%
    # Remove rows with all NAs
    filter(rowSums(is.na(.)) == 0) #%>%
    # Subtract the median of each row 
    # mutate(.data = ., row_median = apply(., 1, median, na.rm = TRUE)) %>%
    # mutate(across(-row_median, ~ . - row_median)) %>%
    # select(-row_median)

dim(glyco)
head(glyco, 3)

Unnamed: 0_level_0,C3L.02613,C3L.04072,C3N.04282,C3L.03639,C3L.01328,C3L.00277,C3L.02604,C3L.01971,C3N.01167,C3L.01031,⋯,C3N.00436,C3N.02944,C3L.03356,C3N.02589,C3L.03388,C3N.01388,C3N.00198,C3N.01380,C3N.01715,C3N.03426
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
n[TMT11plex]RHEEGHMLNC[Carbamidomethyl]TC[Carbamidomethyl]FGQGR-N3H4F0S0G0,0.2711286,0.3086638,1.1849827,-0.006491598,1.9091718,1.278178,1.0885962,-0.7653889,-0.9867006,-0.5941954,⋯,0.5728847,0.9236759,-0.555157,-0.24003358,-0.2509592,0.05717399,-0.16628917,-0.5030644,0.7816165,1.034915863
n[TMT11plex]LLQVVYLHSNNITK[TMT11plex]-N5H6F1S2G0,-0.1230223,-0.4341817,0.24471,0.079600075,-0.9919879,1.089329,-0.1866849,0.3223009,0.4387361,0.2199116,⋯,-0.94928102,-1.3285715,1.3734742,0.7323404,-1.2241213,-0.83878453,-0.08174441,0.7487903,0.1999972,-0.779378985
n[TMT11plex]NYTADYDK[TMT11plex]-N2H8F0S0G0,0.5089437,0.5752988,-0.2466641,-0.21386651,0.1689218,-1.305813,-0.3507612,0.3861138,-0.9076605,-0.2210551,⋯,0.08706562,-0.9358628,-0.7485728,-0.04754478,0.6560518,-0.76525368,-1.1101838,-0.9045606,-0.2454482,0.006003751


In [188]:
# psty <- read.table(paste0(PATH_DATA, "phosphoproteomics_gene_level_MD_abundance_tumor.cct"), header = TRUE, row.names = 1, sep = "\t", na.strings = "NA")
# psty <- read.table(paste0(PATH_DATA, "phosphoproteomics_site_level_MD_abundance_tumor.cct"), header = TRUE, row.names = 1, sep = "\t", na.strings = "NA")
psty <- read.table(paste0(PATH_DATA, "phosphoproteomics_MultiSite_level_MD_abundance_tumor.cct"), header = TRUE, row.names = 1, sep = "\t", na.strings = "NA")
psty <- psty %>%
    # Only select columns that are in valid_samples
    select(valid_samples$ID2) %>%
    # sort the columns by the order in valid_samples
    select(order(match(names(.), valid_samples$ID2))) %>%
    # Remove rows with all NAs or too small values
    filter(rowSums(is.na(.)) == 0) %>%
    # filter(rowSums(. < 1e-10) == 0) %>%
    # Subtract the median of each row 
    mutate(row_median = apply(., 1, median, na.rm = TRUE)) %>%
    mutate(across(-row_median, ~ . - row_median)) %>%
    # mutate(across(-row_median, ~ . / row_median)) %>%
    select(-row_median)# %>%
    # Do log2 transformation
    # mutate(across(everything(), ~ log2(pmax(., 1e-10))))
    # mutate(across(everything(), ~ log2(.)))
dim(psty)
head(psty, 3)

Unnamed: 0_level_0,C3L.02613,C3L.04072,C3N.04282,C3L.03639,C3L.01328,C3L.00277,C3L.02604,C3L.01971,C3N.01167,C3L.01031,⋯,C3N.00436,C3N.02944,C3L.03356,C3N.02589,C3L.03388,C3N.01388,C3N.00198,C3N.01380,C3N.01715,C3N.03426
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
NP_000028.3_779_783_1_1_S781,-0.04838673,-1.4066365,-0.02741978,0.96664697,-0.2366441,-1.2230832,0.92415655,1.3996,-0.03264223,-0.3562723,⋯,0.7696804,1.0986698,0.9361254,-0.9221319,-0.2817887,4.59423084,1.2870355,0.7187544,2.3389445,4.6395828
NP_000028.3_834_834_1_1_S834,-1.32582108,0.0,-0.57279872,0.64190458,-0.3305412,-0.6097459,0.6040656,1.713517,-0.13952299,-0.4036416,⋯,0.1736147,0.6609779,0.54724,-0.6240852,-0.107211,3.66382673,0.591083,0.5254788,2.5417091,3.5120566
NP_000090.1_43_43_1_1_S43,-1.07956346,-0.8113492,-0.36560071,-0.02966643,0.4662583,1.643501,0.02305836,-1.080684,-0.38535324,-0.4929099,⋯,0.2672333,-0.8759604,0.1960877,0.7051428,-0.1503937,-0.02100318,0.6130231,0.4299193,-0.7458884,-0.1079166


In [189]:
head(cnv)

Unnamed: 0_level_0,C3L.02613,C3L.04072,C3N.04282,C3L.03639,C3L.01328,C3L.00277,C3L.02604,C3L.01971,C3N.01167,C3L.01031,⋯,C3N.00436,C3N.02944,C3L.03356,C3N.02589,C3L.03388,C3N.01388,C3N.00198,C3N.01380,C3N.01715,C3N.03426
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
SAMD11,0.11468,0.01621,-0.04934,0.13109,0.00122,-0.38004,0.07121,-0.03695,0.36353,-0.21679,⋯,0.01575,0.03511,0.00414,0.02596,0.0228,-0.11879,-0.04517,0.04616,0.00432,0.0032
NOC2L,0.11468,0.01621,-0.04934,0.13109,0.00122,-0.38004,0.07121,-0.03695,0.36353,-0.21679,⋯,0.01575,0.03511,0.00414,0.02596,0.0228,-0.11879,-0.04517,0.04616,0.00432,0.0032
KLHL17,0.11468,0.01621,-0.04934,0.13109,0.00122,-0.38004,0.07121,-0.03695,0.36353,-0.21679,⋯,0.01575,0.03511,0.00414,0.02596,0.0228,-0.11879,-0.04517,0.04616,0.00432,0.0032
PLEKHN1,0.11468,0.01621,-0.04934,0.13109,0.00122,-0.38004,0.07121,-0.03695,0.36353,-0.21679,⋯,0.01575,0.03511,0.00414,0.02596,0.0228,-0.11879,-0.04517,0.04616,0.00432,0.0032
PERM1,0.11468,0.01621,-0.04934,0.13109,0.00122,-0.38004,0.07121,-0.03695,0.36353,-0.21679,⋯,0.01575,0.03511,0.00414,0.02596,0.0228,-0.11879,-0.04517,0.04616,0.00432,0.0032
HES4,0.1135,0.01503,-0.05052,0.12991,4e-05,-0.38122,0.07003,-0.03813,0.36235,0.20581,⋯,0.01457,0.03393,0.00296,0.02478,0.02162,-0.11997,-0.04635,0.04498,0.00314,0.00202


In [190]:
# Concatenate all data frames
cnv$data_type <- "cnv"
prot$data_type  <- "prot"
rna$data_type   <- "rna"
glyco$data_type <- "glyco"
psty$data_type  <- "psty"
data<- rbind(cnv, prot, rna,glyco, psty) %>%
    relocate(data_type)
feature_counts <- data %>%
    group_by(data_type) %>%
    summarise(feature_count = n()) %>%
    ungroup()
print(feature_counts)


row_sd <- apply(data[ , -1], 1, sd, na.rm = TRUE)
cutoff <- quantile(row_sd, probs = 0.05, na.rm = TRUE)
data <- data[row_sd > cutoff, ] %>%
    # Column-wise z-score normalization
    mutate(across(-data_type, ~ (.-mean(.)) / sd(.))) %>%
    # Sort columns by name
    select(data_type, sort(names(.)[-1]))

dim(data)
head(data,10)
# Count the number of features per data type
feature_counts <- data %>%
    group_by(data_type) %>%
    summarise(feature_count = n()) %>%
    ungroup()
print(feature_counts)

[90m# A tibble: 5 × 2[39m
  data_type feature_count
  [3m[90m<chr>[39m[23m             [3m[90m<int>[39m[23m
[90m1[39m cnv               [4m1[24m[4m9[24m855
[90m2[39m glyco              [4m1[24m454
[90m3[39m prot               [4m5[24m773
[90m4[39m psty               [4m2[24m243
[90m5[39m rna               [4m1[24m[4m8[24m197


Unnamed: 0_level_0,data_type,C3L.00017,C3L.00102,C3L.00277,C3L.00589,C3L.00598,C3L.00599,C3L.00622,C3L.00625,C3L.00819,⋯,C3N.03754,C3N.03780,C3N.03839,C3N.03840,C3N.03853,C3N.03884,C3N.04119,C3N.04126,C3N.04282,C3N.04283
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
SAMD11,cnv,0.1575385,-0.0892881,-0.623277,0.01894679,0.03118658,0.05725237,0.02637657,-0.3986777,-0.3310686,⋯,0.005673214,0.02089495,0.05536301,-0.3136748,0.1179187,0.1052462,0.06875376,0.03378693,-0.005422868,0.108157
NOC2L,cnv,0.1575385,-0.0892881,-0.623277,0.01894679,0.03118658,0.05725237,0.02637657,-0.3986777,-0.3310686,⋯,0.005673214,0.02089495,0.05536301,-0.3136748,0.1179187,0.1052462,0.06875376,0.03378693,-0.005422868,0.108157
KLHL17,cnv,0.1575385,-0.0892881,-0.623277,0.01894679,0.03118658,0.05725237,0.02637657,-0.3986777,-0.3310686,⋯,0.005673214,0.02089495,0.05536301,-0.3136748,0.1179187,0.1052462,0.06875376,0.03378693,-0.005422868,0.108157
PLEKHN1,cnv,0.1575385,-0.0892881,-0.623277,0.01894679,0.03118658,0.05725237,0.02637657,-0.3986777,-0.3310686,⋯,0.005673214,0.02089495,0.05536301,-0.3136748,0.1179187,0.1052462,0.06875376,0.03378693,-0.005422868,0.108157
PERM1,cnv,0.1575385,-0.0892881,-0.623277,0.01894679,0.03118658,0.05725237,0.02637657,-0.3986777,-0.3310686,⋯,0.005673214,0.02089495,0.05536301,-0.3136748,0.1179187,0.1052462,0.06875376,0.03378693,-0.005422868,0.108157
HES4,cnv,0.1555225,-0.09182637,-0.6253056,0.01616695,0.02906107,0.05501756,0.02338942,-0.4013551,-0.3338182,⋯,0.003260593,0.01816861,0.05327573,-0.3165751,0.1151789,0.1033124,0.06612978,0.03085036,-0.007479336,0.1059898
ISG15,cnv,0.1555225,-0.09182637,-0.6253056,0.01616695,0.02906107,0.05501756,0.02338942,-0.4013551,-0.3338182,⋯,0.003260593,0.01816861,0.05327573,-0.3165751,0.1151789,0.1033124,0.06612978,0.03085036,-0.007479336,0.1059898
AGRN,cnv,0.1555225,-0.09182637,-0.6253056,0.01616695,0.02906107,0.05501756,0.02338942,-0.4013551,-0.3338182,⋯,0.003260593,0.01816861,0.05327573,-0.3165751,0.1151789,0.1033124,0.06612978,0.03085036,-0.007479336,0.1059898
RNF223,cnv,0.1555225,-0.09182637,-0.6253056,0.01616695,0.02906107,0.05501756,0.02338942,-0.4013551,-0.3338182,⋯,0.003260593,0.01816861,0.05327573,-0.3165751,0.1151789,0.1033124,0.06612978,0.03085036,-0.007479336,0.1059898
C1orf159,cnv,0.1555225,-0.09182637,-0.6253056,0.01616695,0.02906107,0.05501756,0.02338942,-0.4013551,-0.3338182,⋯,0.003260593,0.01816861,0.05327573,-0.3165751,0.1151789,0.1033124,0.06612978,0.03085036,-0.007479336,0.1059898


[90m# A tibble: 5 × 2[39m
  data_type feature_count
  [3m[90m<chr>[39m[23m             [3m[90m<int>[39m[23m
[90m1[39m cnv               [4m1[24m[4m7[24m472
[90m2[39m glyco              [4m1[24m454
[90m3[39m prot               [4m5[24m773
[90m4[39m psty               [4m2[24m243
[90m5[39m rna               [4m1[24m[4m8[24m197


In [191]:
data_type <- data$data_type
data_num <- data[, -1]

positive_only_matrix <- data_num
positive_only_matrix[positive_only_matrix < 0] <- 0
# Abs values of negative values
negative_only_matrix <- data_num
negative_only_matrix[negative_only_matrix > 0] <- 0
negative_only_matrix <- abs(negative_only_matrix)

positive_only_matrix$data_type <- data_type
negative_only_matrix$data_type <- data_type

combined_matrix <- rbind(positive_only_matrix, negative_only_matrix) %>%
    relocate(data_type)

# combined_matrix <- combined_matrix[rowSums(combined_matrix) != 0, ]
# 1. numeric-only 부분 선택
numeric_part <- combined_matrix[, setdiff(names(combined_matrix), "data_type")]

# 2. rowSums 계산
nonzero_rows <- rowSums(numeric_part) != 0

# 3. 필터링
combined_matrix <- combined_matrix[nonzero_rows, ]

dim(data)
dim(combined_matrix)

In [192]:
# 숫자 데이터만 추출
mat_for_nmf <- as.matrix(combined_matrix[, setdiff(names(combined_matrix), "data_type")])

# NMF 실행
nmf_clu <- nmf(mat_for_nmf, 2, .options = list(verbose = TRUE), maxIter = 500, method ="lee")
membership <- nmf_clu@fit@H %>%
    t() %>%
    as.data.frame() %>%
    # Set the cluster column, V1 > V2
    mutate(cluster = ifelse(V1 >= V2, "1", "2")) %>%
    # Divide by the sum of the two 
    mutate(sum = V1 + V2) %>%
    mutate(V1 = V1 / sum, V2 = V2 / sum) %>%
    select(-sum) %>%
    mutate(cluster = ifelse(V1 > V2, "1", "2")) %>% 
    # substitute ""." with "-" in the sample IDs
    mutate(sample = rownames(.)) %>%
    mutate(sample = gsub("\\.", "-", sample)) %>%
    select(sample, cluster, V1, V2)
    

# Count the number of samples in each cluster
membership_summary <- membership %>%
    group_by(cluster) %>%
    summarise(count = n())
print(membership_summary)
# Exclude rownames from the output
rownames(membership) <- NULL
head(membership,5)
write.table(membership, "membership.tsv", sep = "\t", quote = FALSE, row.names = FALSE)

NMF algorithm: 'lee'

NMF seeding method: random



Iterations: 500/500 
DONE (stopped at 500/500 iterations)
[90m# A tibble: 2 × 2[39m
  cluster count
  [3m[90m<chr>[39m[23m   [3m[90m<int>[39m[23m
[90m1[39m 1          55
[90m2[39m 2          50


Unnamed: 0_level_0,sample,cluster,V1,V2
Unnamed: 0_level_1,<chr>,<chr>,<dbl>,<dbl>
1,C3L-00017,2,0.1684102,0.8315898
2,C3L-00102,2,0.4528973,0.5471027
3,C3L-00277,1,0.6670459,0.3329541
4,C3L-00589,2,0.1380543,0.8619457
5,C3L-00598,1,0.9999977,2.27666e-06


In [None]:
weights <- as.data.frame(nmf_clu@fit@W)
weights$data_type <- combined_matrix$data_type
weights <- weights %>%
    relocate(data_type)
 
# write.table(weights, "nmf_weights.tsv", sep = "\t", quote = FALSE)
head(weights,3)

Unnamed: 0_level_0,data_type,V1,V2
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>
SAMD11,cnv,3.847392e-06,9.093966e-07
NOC2L,cnv,3.738249e-06,9.44825e-07
KLHL17,cnv,3.738249e-06,9.44825e-07


In [218]:
features <- extractFeatures(nmf_clu@fit@W, method = "kim")
# feature_names <- lapply(features, function(index_vec) {rownames(nmf_clu@fit@W)[index_vec]}) # signed feature ID
all_indices <- unique(unlist(features))
filtered_weights <- weights[all_indices,]
length(features[[1]])
length(features[[2]])
# featureScore(nmf_clu@fit@W, method="kim")

filtered_acc <- filtered_weights["data_type"]
filtered_acc$names <- rownames(filtered_acc)
rownames(filtered_acc) <- NULL

filtered_acc <- filtered_acc[!duplicated(filtered_acc[, c("data_type", "names")]), ]

head(filtered_idx,6)

Unnamed: 0_level_0,data_type,names
Unnamed: 0_level_1,<chr>,<chr>
1,rna,MUC621
2,rna,SST21
3,rna,SLC30A811
4,rna,CHGA21
5,rna,SLC39A511
6,rna,S100A21


In [None]:
data$names<- rownames(data)
data_filtered <- merge(data, filtered_acc, by =c("data_type", "names"))
tail(data_filtered)
dim(data)
dim(data_filtered)
# write.table(data_filtered, "filtered_zscores.tsv", sep="\t",  quote = FALSE, row.names = FALSE)


Unnamed: 0_level_0,data_type,names,C3L.00017,C3L.00102,C3L.00277,C3L.00589,C3L.00598,C3L.00599,C3L.00622,C3L.00625,⋯,C3N.03754,C3N.03780,C3N.03839,C3N.03840,C3N.03853,C3N.03884,C3N.04119,C3N.04126,C3N.04282,C3N.04283
Unnamed: 0_level_1,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
5878,rna,ZSWIM41,-0.90064093,-0.48249818,0.03006379,-0.6274672,1.93581828,-0.1192915,0.29209976,1.43113122,⋯,0.8156068,-1.4622981,-1.2444601,-0.86064793,-0.2429108,0.16150891,0.04671981,-0.282219,-0.4391541,-0.605375
5879,rna,ZWILCH1,-0.65857339,0.07629318,-0.23292467,0.2541038,0.65130079,0.4276126,-0.33888855,-0.65703591,⋯,2.1012851,-0.5499445,0.3512714,0.05411585,0.375896,0.46966825,-0.45646435,-0.7307805,0.9044597,0.4522684
5880,rna,ZWINT1,-0.41320879,-2.52271065,0.24238348,-1.3843079,0.18375625,-1.0116353,-0.67795251,-0.52380385,⋯,1.1575259,-1.6978079,1.456381,-0.58808443,-0.6516843,0.18213774,-2.33882186,-0.3655287,-1.3449098,1.3271418
5881,rna,ZXDA1,0.24691697,-0.88318957,0.27707755,0.739102,0.15076705,1.0788258,0.41596222,-0.68513361,⋯,-0.7421219,0.1890739,1.1837669,0.2185558,0.2303327,-0.81504672,-0.35171963,0.0753987,0.6294796,0.2722895
5882,rna,ZYX2,-0.06854312,0.36218482,0.0216131,-2.0298256,0.53282893,-0.2310507,0.08716187,0.36128338,⋯,2.3202499,-0.6410885,-0.9353127,-0.46597337,-0.1939643,1.076166,-0.14514803,-0.4087984,0.4635944,-0.6519857
5883,rna,ZZEF12,-0.26012346,-0.33628484,0.65840649,-0.5606664,0.02744141,0.145948,0.19474646,-0.07891817,⋯,0.5846421,0.6741786,-0.9500286,0.58040057,0.2650774,-0.04587414,-0.24935946,-0.4100432,-0.2994478,-1.1593071
