<a href="https://colab.research.google.com/github/shannonwasson/CIS661_Group3/blob/main/CIS661_Group3_Code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
install.packages(c("umap", "maptools"))


library(GEOquery)
library(limma)
library(umap)


# Fetch series GSE47022 from GEO, requesting the annotation-GPL feature data.
gset <- getGEO("GSE47022", GSEMatrix = TRUE, AnnotGPL = TRUE)

# getGEO() hands back a list here. With several entries, choose the one whose
# name mentions platform GPL8321; otherwise use the single entry.
idx <- if (length(gset) > 1) grep("GPL8321", attr(gset, "names")) else 1
gset <- gset[[idx]]


# Turn the feature-variable labels into syntactic R names so that they line
# up with the column names reported by topTable().
fvarLabels(gset) <- make.names(fvarLabels(gset))


# Group membership: one digit per sample (the string is later attached to the
# samples as `gset$group`), split into a character vector of single codes.
gsms <- "111111110000000033333333222222555555544444444"
sml <- unlist(strsplit(gsms, split = ""))


# Log2 transformation, applied only when the value distribution suggests the
# data is still on the raw scale: either the 99th percentile exceeds 100, or
# the overall range exceeds 50 while the first quartile is positive.
ex <- exprs(gset)
qx <- as.numeric(
  quantile(ex, c(0, 0.25, 0.5, 0.75, 0.99, 1), na.rm = TRUE)
)
LogC <- qx[5] > 100 || (qx[6] - qx[1] > 50 && qx[2] > 0)
if (LogC) {
  # Non-positive values have no log2; mark them as NaN before transforming.
  ex[which(ex <= 0)] <- NaN
  exprs(gset) <- log2(ex)
}


# Turn the per-sample codes into a labelled group factor and build a
# no-intercept design matrix with one column per group.
groups <- make.names(c(
  "control diet induced obese mice",
  "control regular weight mice",
  "MS diet induced obese mice",
  "MS regular weight mice",
  "SS diet induced obese mice",
  "SS regular weight mice"
))
gs <- factor(sml)
# factor() sorts the codes, so the labels are assigned to "0", "1", ... in
# that order.
# NOTE(review): this gives code "0" the first label, whereas the t-test
# section of this file maps code "1" to "control diet induced obese mice" --
# confirm which mapping is intended.
levels(gs) <- groups
gset$group <- gs
design <- model.matrix(~ group + 0, gset)
colnames(design) <- levels(gs)


# Keep only the probes that have a value in every sample.
gset <- gset[complete.cases(exprs(gset)), ]


# Fit the per-probe linear model for the group-means design.
fit <- lmFit(gset, design)


# Contrasts of interest: every group against the next entry of `groups`, with
# the last group compared against the first (cyclic pairing). The model
# coefficients are then re-expressed in terms of those contrasts.
cts <- paste(groups, c(tail(groups, -1), head(groups, 1)), sep = "-")
cont.matrix <- makeContrasts(contrasts = cts, levels = design)
fit2 <- contrasts.fit(fit, cont.matrix)


# Moderated statistics and the table of the 250 top-ranked probes.
fit2 <- eBayes(fit2, proportion = 0.01)
tT <- topTable(fit2, adjust.method = "fdr", sort.by = "B", number = 250)


# Report columns. The annotation columns differ by platform file: the
# annotation GPL supplies "Gene.symbol"/"Gene.title", while a submitter GPL
# supplies names such as "GB_ACC"/"SPOT_ID"/"Gene.Symbol". Requesting all of
# them at once fails with "undefined columns selected", so keep only the
# requested columns that are actually present, in the requested order.
wanted_cols <- c("ID", "adj.P.Val", "P.Value", "F", "GB_ACC", "SPOT_ID",
                 "Gene.Symbol", "Gene.symbol", "Gene.title")
tT <- tT[, intersect(wanted_cols, colnames(tT)), drop = FALSE]
write.table(tT, file = stdout(), row.names = FALSE, sep = "\t")


# Quality-control views of the test results.

# Histogram of the adjusted P-values over all probes. The usual working
# assumption is that most genes are not differentially expressed.
tT2 <- topTable(fit2, adjust = "fdr", sort.by = "B", number = Inf)
hist(
  tT2$adj.P.Val,
  col = "grey", border = "white",
  xlab = "P-adj", ylab = "Number of genes",
  main = "P-adj value distribution"
)


# Classify every probe per contrast as up, down or not significant
# (FDR-adjusted p-value below 0.05, no fold-change cutoff).
dT <- decideTests(fit2, adjust.method = "fdr", p.value = 0.05, lfc = 0)


# Q-Q plots of the moderated t-statistic, using only the entries selected by
# a non-missing F statistic.
t.good <- which(!is.na(fit2$F))
t_stats <- fit2$t[t.good]
t_df <- fit2$df.total[t.good]

# Original version: default axis limits.
qqt(t_stats, t_df, main = "Moderated t statistic")

# Edited version: y-axis limited to [-10, 10].
qqt(t_stats, t_df, main = "Moderated t statistic", ylim = c(-10, 10))




# Volcano plot (log P-value vs log fold change).
colnames(fit2)  # print the available contrast names
ct <- 1         # index of the contrast to plot
# The plotting code below is meant as guidance only. It does not reproduce the
# GEO2R web display exactly, because of the many graphical options involved.

# Basic volcano plot via limma. The number of highlighted probes equals the
# number of probes decideTests() flagged as non-zero for this contrast.
n_flagged <- length(which(dT[, ct] != 0))
volcanoplot(
  fit2,
  coef = ct,
  main = colnames(fit2)[ct],
  pch = 20,
  highlight = n_flagged,
  names = rep("+", nrow(fit2))
)

# MD plot (log fold change vs mean log expression), original version.
# Probes are coloured by their decideTests() status (p-adj < 0.05).
plotMD(fit2, column = ct, status = dT[, ct], legend = FALSE, pch = 20, cex = 1)
abline(h = 0)


# MD plot, "edited" version. The call is currently the same as the original
# one above, so it draws the same figure a second time.
plotMD(fit2, column = ct, status = dT[, ct], legend = FALSE, pch = 20, cex = 1)
abline(h = 0)


################################################################
# General expression data analysis
# Re-read the expression matrix so it reflects the log2 transform and the
# complete-case filtering applied to `gset` above.
ex <- exprs(gset)


# Box-and-whisker plot of every sample, with samples ordered by group. The
# device width grows with the number of samples.
dev.new(width = 3 + ncol(gset) / 6, height = 5)
ord <- order(gs)
palette(c(
  "#1B9E77", "#7570B3", "#E7298A", "#E6AB02", "#D95F02",
  "#66A61E", "#A6761D", "#B32424", "#B324B3", "#666666"
))
par(mar = c(7, 4, 2, 1))
title <- paste0("GSE47022", "/", annotation(gset))
boxplot(
  ex[, ord],
  boxwex = 0.6, notch = TRUE, main = title,
  outline = FALSE, las = 2, col = gs[ord]
)
legend("topleft", groups, fill = palette(), bty = "n")
dev.off()


# Distribution of the expression values, grouped by `gs`.
par(mar = c(4, 4, 2, 1))
title <- paste0("GSE47022", "/", annotation(gset), " value distribution")
plotDensities(ex, group = gs, main = title, legend = "topright")


# UMAP plot (dimensionality reduction) of the samples.
ex <- na.omit(ex)            # drop probes with any missing value
ex <- ex[!duplicated(ex), ]  # drop duplicated probe rows
ump <- umap(t(ex), n_neighbors = 15, random_state = 123)
# Widen the right margin and allow drawing outside the plot region so the
# legend can sit next to the plot.
par(mar = c(3, 3, 2, 6), xpd = TRUE)
plot(ump$layout, main = "UMAP plot, nbrs=15", xlab = "", ylab = "",
     col = gs, pch = 20, cex = 1.5)
legend("topright", inset = c(-0.15, 0), legend = levels(gs), pch = 20,
       col = 1:nlevels(gs), title = "Group", pt.cex = 1.5)
# Label the points. maptools::pointLabel() places labels without overlaps,
# but an unconditional library("maptools") aborts the whole script when the
# package cannot be installed (it has been archived on CRAN). Use it when it
# is available and fall back to plain text labels above the points otherwise.
if (requireNamespace("maptools", quietly = TRUE)) {
  maptools::pointLabel(ump$layout, labels = rownames(ump$layout),
                       method = "SANN", cex = 0.6)
} else {
  text(ump$layout, labels = rownames(ump$layout), pos = 3, cex = 0.6)
}


# Mean-variance trend; helps to see whether precision weights are needed.
plotSA(fit2, main = "Mean variance trend, GSE47022")


# T-test code (written for RStudio)


# Fetch series GSE47022 from GEO together with the annotation-GPL feature
# data, then pick the GPL8321 entry when more than one entry is returned.
gset <- getGEO("GSE47022", GSEMatrix = TRUE, AnnotGPL = TRUE)
idx <- if (length(gset) > 1) grep("GPL8321", attr(gset, "names")) else 1
gset <- gset[[idx]]


# Syntactic feature-variable names, matching what topTable() reports.
fvarLabels(gset) <- make.names(fvarLabels(gset))


# Group membership: one single-digit code per sample.
gsms <- "111111110000000033333333222222555555544444444"
sml <- unlist(strsplit(gsms, split = ""))


# Human-readable label for each group code. `group_codes` and `group_labels`
# are paired by position: code "1" gets the first label, "0" the second, etc.
group_codes <- c("1", "0", "3", "2", "5", "4")
group_labels <- c(
  "control diet induced obese mice",
  "control regular weight mice",
  "MS diet induced obese mice",
  "MS regular weight mice",
  "SS diet induced obese mice",
  "SS regular weight mice"
)


sml_groups <- factor(sml, levels = group_codes, labels = group_labels)


# Debug output: the raw codes and the number of samples per group.
cat("Sample labels (sml):\n")
print(sml)
cat("Group assignments based on sml:\n")
print(table(sml_groups))


# Log2 transformation, applied only when the value distribution suggests the
# data is still on the raw scale: either the 99th percentile exceeds 100, or
# the overall range exceeds 50 while the first quartile is positive.
ex <- exprs(gset)
qx <- as.numeric(quantile(ex, c(0., 0.25, 0.5, 0.75, 0.99, 1.0), na.rm = TRUE))
LogC <- (qx[5] > 100) || (qx[6] - qx[1] > 50 && qx[2] > 0)
if (LogC) {
  # Non-positive values have no log2; mark them as NaN before transforming.
  ex[which(ex <= 0)] <- NaN
  exprs(gset) <- log2(ex)
}


# Assign samples to groups and set up the design matrix
gset$group <- sml_groups
design <- model.matrix(~group + 0, gset)
colnames(design) <- levels(sml_groups)
gset <- gset[complete.cases(exprs(gset)), ]

# Refresh the working matrix. `ex` was captured before the transform, so
# without this step compare_groups() would run its t-tests on the raw-scale
# values (with NaN markers) instead of the log2 values stored in `gset`, and
# would ignore the complete-case filter applied on the line above.
ex <- exprs(gset)


# Debug output: one line per group with its sample count, followed by the
# same information as a frequency table.
cat("Group sizes:\n")
group_counts <- vapply(
  levels(sml_groups),
  function(lbl) sum(sml_groups == lbl),
  integer(1)
)
for (lbl in names(group_counts)) {
  cat("Group:", lbl, "Size:", group_counts[[lbl]], "\n")
}
cat("Group assignments:\n")
print(table(sml_groups))


# Per-gene two-sample t-tests between two sample groups.
#
# Reads the expression matrix `ex` (genes in rows, samples in columns) and the
# per-sample group factor `sml_groups` from the enclosing environment.
#
# Arguments:
#   group1, group2  Group labels, compared against `sml_groups`.
#
# Returns a data frame with columns Gene, P.Value and adj.P.Val (FDR-adjusted),
# one row per gene that has no missing value in either group. The data frame
# is empty when a group has fewer than two samples, because t.test() cannot
# be computed then. Genes whose test cannot be computed (e.g. constant values
# in both groups) get NA p-values instead of aborting the whole comparison.
compare_groups <- function(group1, group2) {
  samples_group1 <- which(sml_groups == group1)
  samples_group2 <- which(sml_groups == group2)
  n1 <- length(samples_group1)
  n2 <- length(samples_group2)

  cat("Comparing", group1, "with", group2, "\n")
  cat("Samples in", group1, ":", n1, "\n")
  cat("Samples in", group2, ":", n2, "\n")

  empty_result <- data.frame(Gene = character(0), P.Value = numeric(0),
                             adj.P.Val = numeric(0))

  if (n1 == 0 || n2 == 0) {
    cat("Skipping comparison due to zero samples in one of the groups\n")
    return(empty_result)
  }
  if (n1 < 2 || n2 < 2) {
    cat("Skipping comparison: a t-test needs at least two samples per group\n")
    return(empty_result)
  }

  # Keep genes without missing values in the selected samples. drop = FALSE
  # keeps the matrix shape even when a single row or column is selected.
  selected <- ex[, c(samples_group1, samples_group2), drop = FALSE]
  valid_genes <- rowSums(is.na(selected)) == 0
  ex_filtered <- ex[valid_genes, , drop = FALSE]

  cat("Number of valid genes for comparison:", sum(valid_genes), "\n")

  # vapply() always yields a numeric vector, also for zero valid genes.
  p_values <- vapply(seq_len(nrow(ex_filtered)), function(i) {
    tryCatch(
      t.test(ex_filtered[i, samples_group1],
             ex_filtered[i, samples_group2])$p.value,
      error = function(e) NA_real_
    )
  }, numeric(1))
  n_failed <- sum(is.na(p_values))
  if (n_failed > 0) {
    cat("Genes whose t-test could not be computed:", n_failed, "\n")
  }

  gene_ids <- rownames(ex_filtered)
  # Naming the p-values makes data.frame() use the gene IDs as row names.
  names(p_values) <- gene_ids
  if (is.null(gene_ids)) {
    # No row names on `ex`: identify genes by their row index instead.
    gene_ids <- as.character(which(valid_genes))
  }

  t_test_results_df <- data.frame(Gene = gene_ids, P.Value = p_values)
  t_test_results_df$adj.P.Val <- p.adjust(t_test_results_df$P.Value,
                                          method = "fdr")
  t_test_results_df
}


# Pairs of group labels to compare: diet induced obese vs regular weight mice
# within each of the control, MS and SS groups.
comparisons <- list(
  c("control diet induced obese mice", "control regular weight mice"),
  c("MS diet induced obese mice", "MS regular weight mice"),
  c("SS diet induced obese mice", "SS regular weight mice")
)


# Run every comparison and keep the result tables in the same order. The
# list is also named "<group1> vs <group2>" so that results can be looked up
# by name; positional access (results_list[[1]]) keeps working.
results_list <- lapply(comparisons, function(comp) {
  compare_groups(comp[1], comp[2])
})
names(results_list) <- vapply(comparisons, paste, character(1),
                              collapse = " vs ")


# View the results for the first comparison (shown once; the call used to be
# duplicated and printed the same table twice).
head(results_list[[1]])

# K Fold Code
# NOTE(review): no cross-validation step is visible in this part of the file;
# the statements below repeat the limma pipeline from the first section --
# confirm whether the k-fold code lives elsewhere.
gset <- getGEO("GSE47022", GSEMatrix = TRUE, AnnotGPL = TRUE)
# With several returned entries, pick the one named after platform GPL8321;
# otherwise use the single entry.
idx <- if (length(gset) > 1) grep("GPL8321", attr(gset, "names")) else 1
gset <- gset[[idx]]


# Syntactic feature-variable names, matching what topTable() reports.
fvarLabels(gset) <- make.names(fvarLabels(gset))


# Group membership: one single-digit code per sample.
gsms <- "111111110000000033333333222222555555544444444"
sml <- unlist(strsplit(gsms, split = ""))


# Log2 transformation, applied only when the 99th percentile exceeds 100, or
# the overall range exceeds 50 while the first quartile is positive.
ex <- exprs(gset)
qx <- as.numeric(
  quantile(ex, c(0, 0.25, 0.5, 0.75, 0.99, 1), na.rm = TRUE)
)
LogC <- qx[5] > 100 || (qx[6] - qx[1] > 50 && qx[2] > 0)
if (LogC) {
  # Non-positive values have no log2; mark them as NaN before transforming.
  ex[which(ex <= 0)] <- NaN
  exprs(gset) <- log2(ex)
}


# Turn the per-sample codes into a labelled group factor and build a
# no-intercept design matrix with one column per group.
groups <- make.names(c(
  "control diet induced obese mice",
  "control regular weight mice",
  "MS diet induced obese mice",
  "MS regular weight mice",
  "SS diet induced obese mice",
  "SS regular weight mice"
))
gs <- factor(sml)
# factor() sorts the codes, so the labels go to "0", "1", ... in that order.
levels(gs) <- groups
gset$group <- gs
design <- model.matrix(~ group + 0, gset)
colnames(design) <- levels(gs)


# Keep only the probes that have a value in every sample.
gset <- gset[complete.cases(exprs(gset)), ]


# Fit the per-probe linear model for the group-means design.
fit <- lmFit(gset, design)


# Contrasts of interest: every group against the next entry of `groups`, with
# the last group compared against the first (cyclic pairing). The model
# coefficients are then re-expressed in terms of those contrasts.
cts <- paste(groups, c(tail(groups, -1), head(groups, 1)), sep = "-")
cont.matrix <- makeContrasts(contrasts = cts, levels = design)
fit2 <- contrasts.fit(fit, cont.matrix)


# Moderated statistics and the table of the 250 top-ranked probes.
fit2 <- eBayes(fit2, proportion = 0.01)
tT <- topTable(fit2, adjust.method = "fdr", sort.by = "B", number = 250)


# Report columns. The annotation columns differ by platform file: the
# annotation GPL supplies "Gene.symbol"/"Gene.title", while a submitter GPL
# supplies names such as "GB_ACC"/"SPOT_ID"/"Gene.Symbol". Requesting all of
# them at once fails with "undefined columns selected", so keep only the
# requested columns that are actually present, in the requested order.
wanted_cols <- c("ID", "adj.P.Val", "P.Value", "F", "GB_ACC", "SPOT_ID",
                 "Gene.Symbol", "Gene.symbol", "Gene.title")
tT <- tT[, intersect(wanted_cols, colnames(tT)), drop = FALSE]
write.table(tT, file = stdout(), row.names = FALSE, sep = "\t")

