# Make input files for GO_MWU

In [1]:
# Print the current working directory; the relative file paths used in the
# cells below are resolved against it.
pwd

/home/ksilliman/Projects/CommonG/Analysis/2bRAD/paper-oly-mbdbs-gen/analyses


In [61]:
# Load the table of loci together with their overlapping features and GO terms.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned.
loci <- read.csv("allLoci.GO.csv", header = TRUE)

In [8]:
# Preview the first six rows of the loci table.
head(loci, n = 6L)

contig.allLoci,start.allLoci,end.allLoci,feature,start.feat,end.feat,Note,GO_term
Contig0,38973,38973,mRNA,12485,93036,Note=Similar to WDR87: WD repeat-containing protein 87 (Homo sapiens OX%3D9606);,GO:0005515
Contig0,39226,39226,mRNA,12485,93036,Note=Similar to WDR87: WD repeat-containing protein 87 (Homo sapiens OX%3D9606);,GO:0005515
Contig0,39234,39234,mRNA,12485,93036,Note=Similar to WDR87: WD repeat-containing protein 87 (Homo sapiens OX%3D9606);,GO:0005515
Contig0,39252,39252,mRNA,12485,93036,Note=Similar to WDR87: WD repeat-containing protein 87 (Homo sapiens OX%3D9606);,GO:0005515
Contig0,41234,41234,mRNA,12485,93036,Note=Similar to WDR87: WD repeat-containing protein 87 (Homo sapiens OX%3D9606);,GO:0005515
Contig0,64124,64124,mRNA,12485,93036,Note=Similar to WDR87: WD repeat-containing protein 87 (Homo sapiens OX%3D9606);,GO:0005515


In [90]:
# Build the GO annotation table: one row per locus, with every distinct GO
# term for that locus joined by ";".
# Locus IDs have the form "<contig>.<start>", e.g. "Contig0.38973".
ids <- paste(loci$contig.allLoci, loci$start.allLoci, sep = ".")

# Construct the data frame directly rather than through cbind(), which first
# coerces everything to a character matrix; stringsAsFactors is set explicitly
# so the column types do not depend on the R version.
allGOs <- data.frame(
  ID = ids,
  GO_term = as.character(loci$GO_term),
  stringsAsFactors = FALSE
)

# Collapse repeated loci into a single row each. The formula interface of
# aggregate() drops rows whose GO_term is NA (default na.action = na.omit).
allGOs <- aggregate(
  GO_term ~ ID,
  data = allGOs,
  FUN = function(terms) paste(unique(terms), collapse = ";")
)
head(allGOs)

ID,GO_term
Contig0.38973,GO:0005515
Contig0.39226,GO:0005515
Contig0.39234,GO:0005515
Contig0.39252,GO:0005515
Contig0.41234,GO:0005515
Contig0.64124,GO:0005515


In [91]:
# Spot-check a single locus by ID to see its collapsed GO terms.
allGOs[allGOs$ID %in% "Contig113403.874", ]

Unnamed: 0,ID,GO_term
1635,Contig113403.874,GO:0003676;GO:0003677


In [99]:
# Write the annotation table as tab-delimited text, unquoted, with no row
# names and no header line.
# `FALSE` is spelled out because `F` is an ordinary variable that can be
# reassigned.
write.table(
  allGOs,
  file = "allLoci.GO.mwu",
  sep = "\t",
  quote = FALSE,
  row.names = FALSE,
  col.names = FALSE
)

## MACAU 

In [92]:
# Read the macau75 table and derive its locus IDs in the "<contig>.<start>"
# form.
m75 <- read.csv("macau/macau75.GO.csv")
m75.ids <- paste(
  m75$contig.macau75,
  m75$start.macau75,
  sep = "."
)
head(m75.ids)


In [93]:
# Start the significance table with one row per locus in allGOs and every
# locus flagged 0.
# It is built with data.frame() rather than cbind() so that `sig` stays
# numeric; cbind() would coerce both columns to a character matrix.
m75.sig <- data.frame(
  gene = as.character(allGOs$ID),
  sig = numeric(nrow(allGOs)),
  stringsAsFactors = FALSE
)
head(m75.sig)

gene,sig
Contig0.38973,0
Contig0.39226,0
Contig0.39234,0
Contig0.39252,0
Contig0.41234,0
Contig0.64124,0


In [94]:
# Flag every locus whose ID appears in the macau75 ID list.
# The column is indexed directly instead of assigning into a row subset:
# the row-subset form errors when the selection is empty, whereas this form
# is simply a no-op when nothing matches.
m75.sig$sig[m75.sig$gene %in% m75.ids] <- 1

In [95]:
# List the rows whose locus ID is in the macau75 ID list.
subset(m75.sig, gene %in% m75.ids)

Unnamed: 0,gene,sig
1635,Contig113403.874,1
4592,Contig16063.8073,1
7863,Contig1877.5033,1
10060,Contig19785.13546,1
10901,Contig2013.4861,1
11261,Contig202.26526,1
17758,Contig24215.9978,1
18565,Contig24745.37011,1
20348,Contig25646.1510,1
21606,Contig267.26160,1


In [101]:
# Write the significance table as comma-separated text, unquoted, with a
# header line and no row names.
# `TRUE`/`FALSE` are spelled out because `T`/`F` are ordinary variables that
# can be reassigned.
write.table(
  m75.sig,
  file = "macau/macau75.sig.csv",
  sep = ",",
  quote = FALSE,
  row.names = FALSE,
  col.names = TRUE
)

### Run GO_MWU

In [106]:
# Significance table: two columns of comma-separated values (gene id, measure
# of significance). A binary measure (0 or 1, i.e. significant or not) gives a
# standard GO enrichment analysis based on Fisher's exact test; a continuous
# measure can be used instead.
# NOTE(review): the table is written to "macau/macau75.sig.csv" earlier in
# this notebook but read here without the "macau/" prefix - confirm that a
# copy exists in the directory GO_MWU is run from.
input <- "macau75.sig.csv"

# GO annotations: two-column, tab-delimited, one line per gene, multiple GO
# terms separated by semicolons. If a gene has multiple lines, run
# nrify_GOtable.pl on the table before running this script.
goAnnotations <- "allLoci.GO.mwu"

# Ontology file; download from
# http://www.geneontology.org/GO.downloads.ontology.shtml
goDatabase <- "~/Projects/OA_Ostrea/Analysis/DGE/MWU/go.obo"

# GO division to analyse: either "MF", "BP", or "CC".
goDivision <- "MF"

# Load the function definitions from gomwu.functions.R.
source("~/Projects/OA_Ostrea/Analysis/DGE/MWU/gomwu.functions.R")

In [107]:
# Compute the GO enrichment statistics for the chosen division.
gomwuStats(
  input, goDatabase, goAnnotations, goDivision,
  # Replace with the full path to the perl executable if it is not already in
  # the system PATH.
  perlPath = "perl",
  # A GO category is not considered if it contains more than this fraction of
  # the total number of genes.
  largest = 0.15,
  # A GO category must contain at least this many genes to be considered.
  smallest = 3,
  # Threshold for merging similar (gene-sharing) terms; see the README.
  clusterCutHeight = 0.25
  # Optional arguments (add a comma after clusterCutHeight when enabling one):
  #   Alternative = "g"
  #     The MWU test is two-tailed by default; specify "g" or "l" to test for
  #     "greater" or "less" instead.
  #   Module = TRUE, Alternative = "g"
  #     For a SIGNED WGCNA module (values: 0 for genes not in the module, kME
  #     for in-module genes). In the gomwuPlot call, specify absValue = 0.001
  #     to count the number of "good genes" that fall into the module.
  #   Module = TRUE
  #     For an UNSIGNED WGCNA module.
)

Binary classification detected; will perform Fisher's test
0  GO terms at 10% FDR


In [None]:
# Plot the GO categories and keep the returned results for inspection.
results <- gomwuPlot(
  input, goAnnotations, goDivision,
  # Genes with a measure value exceeding absValue are counted as "good genes".
  # The input table here is binary (0/1, Fisher's exact test), so 0.001 is
  # used so that every non-zero gene counts. A p-value-style cutoff such as
  # -log(0.05) (about 3.0) exceeds every value in a 0/1 table and would leave
  # no good genes. For a continuous -log10(p) measure use, e.g.,
  # absValue = -log(0.05, 10) instead.
  absValue = 0.001,
  # FDR threshold for plotting. Specify level1 = 1 to plot all GO categories
  # containing genes exceeding absValue.
  level1 = 0.1,
  # FDR cutoff to print in regular (not italic) font.
  level2 = 0.05,
  # FDR cutoff to print in large bold font.
  level3 = 0.01,
  # Decrease to fit more on one page, or increase (after rescaling the plot so
  # the tree fits the text) for a better "word cloud" effect.
  txtsize = 1.4,
  # Height of the hierarchical clustering tree.
  treeHeight = 0.5
  # Default colors; to change them, add a comma after treeHeight and
  # uncomment:
  #   colors = c("dodgerblue2", "firebrick1", "skyblue", "lightcoral")
)
# Manually rescale the plot so the tree matches the text.
# If too many categories are displayed, try making it more stringent with
# level1 = 0.05, level2 = 0.01, level3 = 0.001.

# Text representation of the results, with actual adjusted p-values.
results