## Normalize entire dataset
Add annotations and extract basic statistics on taxonomic and protein assignments. Use edgeR to perform trimmed mean of M-values (TMM) normalization to account for differences in library size among samples.

In [1]:
# Load the raw count data (the cells below read `wide_count_data`, presumably
# restored from this file). Original note: save the RData object in the
# DOI-linked Zenodo record.
load("raw_count_data_07262018.RData", verbose = TRUE)

In [None]:
library(edgeR)

In [None]:
# TMM-normalize the complete count table with edgeR, convert to CPM, and
# reshape the result to long format with one row per gene/sample pair.
# melt() and colsplit() come from reshape2, which no earlier cell attaches.
library(reshape2)

# Re order wide: put the two annotation columns first, followed by the sample
# columns arranged so that samples of the same group are adjacent (the order
# must match `sample_groups` below).
names(wide_count_data)
wide_ordered <- wide_count_data[c(1:8, 21:26, 34:45, 27:33, 11:12, 17:18, 13:14, 19:20, 10, 16, 9, 15)]
y <- ncol(wide_ordered)

# One group label per sample column (columns 3..y).
# Treat ALOHA july and march as replicates:
sample_groups <- c(
  rep("Catalina_surface", 6),
  rep("PortofLA_surface", 6),
  rep("SPOT_surface", 12),
  rep("SPOT_150m", 3),
  rep("SPOT_890m", 4),
  rep("ALOHA_surface", 4),
  rep("ALOHA_DCM", 4),
  rep("ALOHA_150m", 2),
  rep("ALOHA_1000m", 2)
)
# Fail early if the column reordering and the label vector get out of sync.
stopifnot(length(sample_groups) == y - 2L)

dge_obj <- DGEList(
  counts = wide_ordered[3:y],
  genes = wide_ordered[1:2],
  group = sample_groups
)
dge_obj$samples
data <- calcNormFactors(dge_obj, method = "TMM") # TMM normalization
data$samples # Normalized library values
# Obtain only CPM (not logged), computed with the normalized library sizes
cpm_data <- cpm(data, normalized.lib.sizes = TRUE, log = FALSE)
cpm_data <- as.data.frame(cpm_data)
data_CPM <- data.frame(data$genes, cpm_data)

# Reformat data: long format, keeping the two annotation columns as ids so
# melt() does not have to guess them.
melt_CPM <- melt(data_CPM, id.vars = names(data_CPM)[1:2])
head(melt_CPM)
str(melt_CPM)
unique(melt_CPM$variable)
# Split the sample name (melt's `variable` column) on "_" into its parts
tmp <- colsplit(melt_CPM$variable, "_", c("Location", "Depth", "Replicate"))
# Column order: the two id columns, variable, Location, Depth, Replicate, value
df_melt_CPM <- cbind(melt_CPM[1:3], tmp, melt_CPM[4])
str(df_melt_CPM)
#
# save(data, data_CPM, df_melt_CPM, file="Normalized_data_08022018.RData")

In [None]:
# Load from here - RData available at linked Zenodo DOI
# (presumably holds data, data_CPM and df_melt_CPM, matching the commented-out
# save() call of the normalization cell - confirm against the archive)
load("Normalized_data_08022018.RData", verbose = TRUE)

In [None]:
# Calculate averages across replicates, then annotate the averaged table with
# taxonomy levels and KEGG module information.
#
# Package order matters: plyr and dplyr both export summarise(), mutate(), etc.
# and the package attached last masks the other. In a fresh session, attaching
# plyr first keeps dplyr's grouped verbs on top; the conflicting calls below
# are namespace-qualified as well so they do not depend on attach order.
library(plyr)
library(dplyr)
library(reshape2) # colsplit()

# First, fix up labels
head(df_melt_CPM)
# gsub() already returns a character vector, so no as.character() is needed
df_melt_CPM$Location <- gsub("July", "ALOHA", df_melt_CPM$Location)
df_melt_CPM$Location <- gsub("March", "ALOHA", df_melt_CPM$Location)
df_melt_CPM$Depth <- gsub("5m", "surface", df_melt_CPM$Depth)
str(df_melt_CPM)
#
# Grouping on Location/Depth/Taxonomy/KO collapses the replicate-level rows;
# the columns that distinguish replicates (variable, Replicate) are simply
# not carried through summarise().
avg_CPM <- df_melt_CPM %>%
  dplyr::group_by(Location, Depth, Taxonomy, KO) %>%
  dplyr::summarise(mean_CPM = mean(value)) %>%
  as.data.frame()
# class(avg_CPM)
# head(avg_CPM)
# str(avg_CPM)

## Add annotations for all downstream analysis:
# Explore taxonomic composition: split the ";"-delimited Taxonomy string
tax <- colsplit(
  avg_CPM$Taxonomy, ";",
  c("Supergroup", "Phylum", "Class", "Order", "Family", "Genus", "Species")
)
df_wtax <- cbind(avg_CPM, tax)
# Strip any ";" left in the last field
df_wtax$Species <- gsub(";", "", df_wtax$Species)
head(df_wtax)

# Add KEGG pathway information - 2 flavors
load("KeggID_moduleInfo.Data", verbose = TRUE)
# head(data_CPM)
head(avg_CPM)
head(K0_all)
colnames(K0_all)[1] <- "KO"
### KOs expanded if a KO appears in more than one module
df_wKO_wdups <- plyr::join(avg_CPM, K0_all, by = "KO", type = "left", match = "all")
#
save(avg_CPM, df_wtax, df_wKO_wdups, file = "Normed_avg_annotated_08022018.RData")

In [None]:
# Re-norm data for each taxonomic group
# Obtain dataframe of counts averaged over replicates and for each taxonomic group
# Start again from the raw counts so each group is normalized on its own rows.
load("raw_count_data_07262018.RData", verbose = TRUE)
library(edgeR)
library(reshape2)
library(dplyr)

In [None]:
# Generate separate dataframe:
# For every taxonomic group in `tax`, subset the raw counts to rows whose
# Taxonomy string matches the group name, TMM-normalize that subset on its
# own, convert to CPM, average across replicates, and store the result as a
# data frame named dfnorm_<group> in the current environment.
tax <- c(
  "Dinoflagellate", "Ciliate", "Haptophyta", "Bacillariophyceae",
  "Chlorophyta", "Pelagophyceae", "MAST", "Rhizaria"
)
# Reorder columns so that samples of the same group are adjacent (the order
# must match `sample_groups` below).
names(wide_count_data)
wide_ordered <- wide_count_data[c(1:8, 21:26, 34:45, 27:33, 11:12, 17:18, 13:14, 19:20, 10, 16, 9, 15)]
# Row subsetting never changes the number of columns, so compute it once.
y <- ncol(wide_ordered)
# One group label per sample column (columns 3..y); identical for every
# taxon, so it is built once outside the loop.
# Treat ALOHA july and march as replicates:
sample_groups <- c(
  rep("Catalina_surface", 6),
  rep("PortofLA_surface", 6),
  rep("SPOT_surface", 12),
  rep("SPOT_150m", 3),
  rep("SPOT_890m", 4),
  rep("ALOHA_surface", 4),
  rep("ALOHA_DCM", 4),
  rep("ALOHA_150m", 2),
  rep("ALOHA_1000m", 2)
)
stopifnot(length(sample_groups) == y - 2L)
#
for (i in tax) {
  tmp_counts <- subset(wide_ordered, grepl(i, wide_ordered$Taxonomy))
  if (nrow(tmp_counts) == 0L) {
    # Nothing to normalize; stop with a clear message instead of failing
    # somewhere inside the normalization calls.
    stop("No rows of wide_ordered$Taxonomy match '", i, "'", call. = FALSE)
  }
  # Use tmp_counts below to perform edgeR normalization for each taxonomic group
  dge_obj <- DGEList(
    counts = tmp_counts[3:y],
    genes = tmp_counts[1:2],
    group = sample_groups
  )
  data <- calcNormFactors(dge_obj, method = "TMM") # TMM normalization
  # Obtain only CPM (not logged), computed with the normalized library sizes
  cpm_data <- cpm(data, normalized.lib.sizes = TRUE, log = FALSE)
  cpm_data <- as.data.frame(cpm_data)
  data_CPM <- data.frame(data$genes, cpm_data)
  # Long format; the two annotation columns are the ids
  melt_CPM <- melt(data_CPM, id.vars = names(data_CPM)[1:2])
  tmp2 <- colsplit(melt_CPM$variable, "_", c("Location", "Depth", "Replicate"))
  cpm_tmp <- cbind(melt_CPM, tmp2)
  cpm_tmp$Location <- gsub("July", "ALOHA", cpm_tmp$Location)
  cpm_tmp$Location <- gsub("March", "ALOHA", cpm_tmp$Location)
  cpm_tmp$Depth <- gsub("5m", "surface", cpm_tmp$Depth)
  # Calc averages. The verbs are namespace-qualified because plyr also exports
  # summarise(); if plyr is attached after dplyr, the bare name resolves to
  # plyr's version, which ignores the grouping.
  long_avg <- cpm_tmp %>%
    dplyr::group_by(Location, Depth, Taxonomy, KO) %>%
    dplyr::summarise(mean_count = mean(value)) %>%
    as.data.frame()
  long_avg$sample <- paste(long_avg$Location, long_avg$Depth, sep = "_")
  long_avg$taxa <- i
  # Later cells refer to the results by these generated names
  name <- paste("dfnorm", i, sep = "_")
  assign(name, long_avg)
  print("done with");print(i)
}

In [None]:
# Check all
# head(dfnorm_Dinoflagellate)
# head(dfnorm_Ciliate)
# head(dfnorm_Haptophyta)
# head(dfnorm_Bacillariophyceae)
# head(dfnorm_Chlorophyta)
# head(dfnorm_Pelagophyceae)
# head(dfnorm_MAST)
# head(dfnorm_Rhizaria)

In [None]:
# Save:
# Stack the eight per-taxon tables row-wise into one combined data frame.
comboTax <- do.call(
  rbind,
  list(
    dfnorm_Dinoflagellate,
    dfnorm_Ciliate,
    dfnorm_Haptophyta,
    dfnorm_Bacillariophyceae,
    dfnorm_Chlorophyta,
    dfnorm_Pelagophyceae,
    dfnorm_MAST,
    dfnorm_Rhizaria
  )
)
#
# Write every per-taxon table plus the combined one to a single RData file.
save(
  list = c(
    "dfnorm_Dinoflagellate",
    "dfnorm_Ciliate",
    "dfnorm_Haptophyta",
    "dfnorm_Bacillariophyceae",
    "dfnorm_Chlorophyta",
    "dfnorm_Pelagophyceae",
    "dfnorm_MAST",
    "dfnorm_Rhizaria",
    "comboTax"
  ),
  file = "ReNorm_bytax_08022018.RData"
)