---
# Plot effect of CNVs on marker genes
---

/cluster/projects/pughlab/projects/BTSCs_scRNAseq/Manuscript_G607removed/NatCan_Rebuttal/MarkerGenes_CNVs

Reference notebook: Visualize_CNVs_Cluster_June2019

In [2]:
# Work from the MarkerGenes_CNVs project directory so that the relative .rds and
# figure paths used in the cells below resolve.
# NOTE(review): presumably a local mount of the /cluster/projects/pughlab path
# listed above -- confirm before running on another machine.
setwd("~/Desktop/H4H/pughlab/projects/BTSCs_scRNAseq/Manuscript_G607removed/NatCan_Rebuttal/MarkerGenes_CNVs")

----
## 1.0 Data cleaning
----

#### 1.1 Format marker gene file

#### 1.2 Make gene position file with chromosome arm

#### 1.3 Matrix of clusters x chr arm

Use cutoffs from Extended Data Figure 3

In [None]:
# Collapse the per-gene CNV scores in `avg.cnv.df` to one mean per chromosome
# arm for every column, bin each mean into a loss / neutral / gain call
# (-1 / 0 / 1), and save the binned matrix.
# Expects `gene.pos` (rownames are matched against rownames(avg.cnv.df); column
# `arm`) and `avg.cnv.df` to exist in the session already.
head(gene.pos)
# was head(avg(avg.cnv.df)); no `avg` function is defined in the visible notebook
head(avg.cnv.df)

dat <- list()
arms <- unique(gene.pos$arm)

for (i in seq_along(arms)) {

    print(arms[i])
    genes <- rownames(gene.pos[as.character(gene.pos$arm) == arms[i], ])
    sub <- avg.cnv.df[rownames(avg.cnv.df) %in% genes, ]
    dat[[arms[i]]] <- colMeans(sub)
}

# one row per arm, one column per column of avg.cnv.df
dat <- do.call(rbind, dat)
dat2 <- dat  # unbinned arm-level means, kept alongside the binned copy

# Binning cutoffs: >= gain.cutoff is a gain, <= loss.cutoff is a loss,
# everything strictly in between is neutral.
gain.cutoff <- 0.17
loss.cutoff <- -0.15
dat[dat >= gain.cutoff] <- 1
dat[dat <= loss.cutoff] <- -1
# The already-assigned calls (1 and -1) fall outside this open interval, so
# only the not-yet-binned values are set to neutral.
dat[dat > loss.cutoff & dat < gain.cutoff] <- 0
# drop the row produced for genes without an arm assignment
dat <- dat[!rownames(dat) == "NA", ]
# arms with no usable mean are treated as neutral
dat[is.na(dat)] <- 0

saveRDS(dat, file = "IntraGSCcluster_chrarm_binned.rds")

----
## 2.0 Re-plot binary CNV heatmap
---

Use cutoffs from WGS benchmarking.

In [3]:
library(pheatmap)

In [4]:
# Reload the binned chromosome-arm matrix written at the end of section 1.3.
dat <- readRDS("IntraGSCcluster_chrarm_binned.rds")

In [48]:
# Heatmap of the binned CNV calls, written directly to a JPEG.
# t(dat) puts the columns of `dat` on the rows of the plot and the chromosome
# arms on the columns; no clustering, so the stored order is kept.
# `filename` is spelled out in full (the call previously relied on `file`
# partially matching it).
pheatmap(t(dat),
         color = colorRampPalette(c("blue", "white", "red"))(50),
         cluster_cols = FALSE,
         cluster_rows = FALSE,
         filename = "BinnedChrarms_heatmap.jpeg",
         width = 6,
         height = 11.5
         )

----
## 3.0 Proportion marker genes in CNVs
---

In [5]:
AllEqual <- structure(function(
	##title<< 
	## Check if all values in a vector are the same
	##description<<
	## Tests whether a vector holds at most one distinct value once NAs are dropped. It can be used for example to check if a time series contains only 0 or NA values.
	
	x
	### numeric, character vector, or time series of type ts
) {
	# NAs are ignored, so an all-NA (or empty) input counts as "all equal".
	vals <- na.omit(as.vector(x))
	# Scalar `||`: both operands are length-one conditions.
	length(vals) == 0 || length(unique(vals)) == 1
	### The function returns TRUE if all non-NA values are equal (or none remain) and FALSE if it contains different values.
},ex=function(){
# check if all values are equal in the following vectors:
AllEqual(1:10)
AllEqual(rep(0, 10))
AllEqual(letters)
AllEqual(rep(NA, 10))
})

In [177]:
### read in marker genes, the binned arm-level CNV matrix, and the
### gene -> chromosome-arm position table
markergenes <- readRDS("Markers_Exp_CNVs.rds")
dat <- readRDS("IntraGSCcluster_chrarm_binned.rds")
genepos <- readRDS("GenePosition_Chrarm.rds")
## keep only genes that have a chromosome arm assigned
genepos <- genepos[!is.na(genepos$arm), ]

##remove marker genes not in CNV analysis
## (rows whose ChromosomeArm is NA)
markergenes <- markergenes[!is.na(markergenes$ChromosomeArm), ]
#head(markergenes)

### Figure out which regions are variable across clusters

In [180]:
# Unique sample IDs present in the marker-gene table, as character strings.
samples <- as.character(unique(markergenes$Sample))
# Bare name so the notebook displays the vector as a quick check.
samples

In [189]:
### For each sample: find the chromosome arms whose binned CNV call differs
### between that sample's columns of `dat` ("variable regions"), then for every
### cluster count the marker genes on those arms and test for enrichment with a
### one-sided Fisher's exact test.
### Results accumulate in the vectors below: one entry per cluster, except
### numRegions, which gets one entry per sample.
samples <- as.character(unique(markergenes$Sample))

clusters <- c()
numMarkers <- c()
numRegions <- c()
numCNV <- c()
fishers <- c()

for (i in seq_along(samples)){

    ### columns of dat belonging to this sample
    ### fixed = TRUE matches the sample ID literally rather than as a regex;
    ### drop = FALSE keeps a matrix even when only one column matches
    sub <- dat[ , grep(samples[i], colnames(dat), fixed = TRUE), drop = FALSE]

    ### define variable regions: arms whose call is not the same in every column
    is.constant <- vapply(seq_len(nrow(sub)),
                          function(j) AllEqual(sub[j, ]),
                          logical(1))
    ### Names are taken from the mask, not from the subsetted matrix: with
    ### exactly one variable arm, sub[!is.constant, ] collapses to a vector
    ### whose rownames() is NULL, which silently dropped that arm.
    var.reg <- rownames(sub)[!is.constant]
    sub <- sub[!is.constant, , drop = FALSE]
    numRegions <- append(numRegions, length(var.reg))
    if(length(var.reg) == 0){print(samples[i])} else {print(var.reg)}

    ### background genes inside / outside the variable regions
    ### (identical for every cluster of this sample, so computed once)
    in.var.arm <- as.character(genepos$arm) %in% var.reg
    genes.in.var <- genepos[in.var.arm, ]
    genes.NOT.in.var <- genepos[!in.var.arm, ]

    ### subset marker gene matrix to sample
    marker.sub <- markergenes[markergenes$Sample == samples[i], ]

    clust <- as.character(unique(marker.sub$Cluster))
    ### how many in variable regions with marker genes per cluster
    for (x in seq_along(clust)){

        clusters <- append(clusters, clust[x])
        d <- marker.sub[marker.sub$Cluster == clust[x], ]

        ### 2x2 table for Fisher's exact test
        ###                  marker       not marker
        ### in var region    topleft      topright
        ### not in var       bottomleft   bottomright
        topleft <- sum(d$ChromosomeArm %in% var.reg)
        bottomleft <- nrow(d) - topleft
        ### sum(!...) instead of table(...)["FALSE"]: the table lookup is NA
        ### when there are no FALSE entries (e.g. no variable region at all),
        ### which made fisher.test() fail
        topright <- sum(!(genes.in.var$Gene %in% d$Gene))
        bottomright <- sum(!(genes.NOT.in.var$Gene %in% d$Gene))

        numCNV <- append(numCNV, topleft)
        numMarkers <- append(numMarkers, nrow(d))

        con <- matrix(c(topleft, bottomleft, topright, bottomright), ncol = 2)
        fishers <- append(fishers, fisher.test(con, alternative = "greater")$p.value)
        #fishers <- append(fishers, chisq.test(con, alternative = "greater")$p)
    }

}

[1] "7p" "8q" "9p" "9q"
[1] "9p"  "20p" "19p"
[1] "7p"  "9p"  "19p" "22q"
[1] "6p"  "6q"  "7p"  "18q"
[1] "7p"  "17q" "20q"
[1] "6q"  "17p" "17q" "19q"
[1] "8q"  "19p"
[1] "4p"  "7p"  "10p" "10q" "12q" "13q" "19p" "19q"
 [1] "1q"  "6p"  "6q"  "7p"  "9q"  "10p" "10q" "13q" "16q" "20p" "19p" "19q"
[13] "21q"
[1] "7q"  "10p" "10q" "13q" "20p" "20q" "19p" "19q"
[1] "7p"  "7q"  "18q" "19q"
 [1] "3p"  "4p"  "4q"  "7p"  "7q"  "8q"  "9p"  "14q" "20p" "20q" "19p" "19q"
[1] "7q"  "9q"  "13q" "19p" "19q"
 [1] "1q"  "2p"  "2q"  "4p"  "4q"  "7p"  "9q"  "11p" "13q" "16q" "18q" "20p"
[13] "20q" "19p" "19q" "21q"
[1] "2p"  "10p" "19q"
[1] "7p"  "9q"  "12p" "12q" "13q" "18q" "19q" "21q"
[1] "6p"  "6q"  "9p"  "10p" "10q" "12q"
[1] "9q"  "18q" "19p"
[1] "7q"  "10p" "18q"
[1] "7q"  "17p" "19q"
[1] "5p"  "6q"  "10p" "10q" "15q" "17p" "17q" "22q"
 [1] "4p"  "4q"  "7p"  "7q"  "9p"  "10q" "16q" "17p" "20p" "20q" "19p" "22q"


In [190]:
# Per-cluster summary table: cluster label, number of marker genes, number of
# those on a variable chromosome arm, and the Fisher's exact test p-value.
plot.dat <- data.frame(
    clusters = clusters,
    numMarkers = numMarkers,
    numCNV = numCNV,
    fishers = fishers
)
#remove <- c("BT147_L", 
#            "BT67_L",
#            "BT84_L", 
#            "BT89_L", 
#            "G566_L", 
#            "G799_L", 
#            "G946-K_L"
#           ) #samples with no variable regions
#plot.dat$Sample <- gsub('.{0,3}$', '', plot.dat$clusters)
#plot.dat <- plot.dat[!plot.dat$Sample %in% remove, ]

# Fraction of each cluster's marker genes that lie on a variable arm.
plot.dat[["propCNV"]] <- with(plot.dat, numCNV / numMarkers)
head(plot.dat)


Unnamed: 0_level_0,clusters,numMarkers,numCNV,fishers,propCNV
Unnamed: 0_level_1,<fct>,<int>,<int>,<dbl>,<dbl>
1,BT127_L_C1,3307,339,3.155183e-06,0.10250983
2,BT127_L_C2,727,84,0.0007349326,0.11554333
3,BT48_L_C1,668,49,0.002141111,0.07335329
4,BT48_L_C2,581,18,0.983209,0.03098107
5,BT73_L_C1,3226,319,3.91937e-05,0.09888407
6,BT73_L_C2,264,35,0.002374066,0.13257576


In [202]:
# Horizontal bar plot of the proportion of marker genes on variable chromosome
# arms, one bar per cluster, written to a PDF. Bars with Fisher p < 0.05 are
# dark blue, the rest grey.
# Colours are derived from plot.dat$fishers (not the free-standing `fishers`
# vector) so they stay aligned with the bars if plot.dat is ever filtered.
cols <- ifelse(plot.dat$fishers < 0.05, "darkblue", "grey")
pdf("~/Desktop/propMarkersCNV.pdf", height = 22, width = 5)
# outer margins leave room for the rotated cluster labels
par(oma = c(4, 4, 0, 0))
barplot(plot.dat$propCNV,
        names.arg = plot.dat$clusters,
        las = 2,
        col = cols,
        horiz = TRUE,
        cex.names = 1,
        xlim = c(0, 0.6),
        xlab = "Proportion Marker Genes",
        #ylab = "Cluster",
        xpd = FALSE
       )
dev.off()