# UGT2B7 Isoform Expression Analysis

In [None]:
install.packages("htmltools")
install.packages("BiocManager")
BiocManager::install("DESeq2")

In [27]:
library(htmltools)
library( "DESeq2" )
library(ggplot2)
library(magrittr)
library(tidyr)

In [28]:
# CRAN
library(dplyr)
library(tidyr)
library(ggplot2)
theme_set(theme_bw(base_size=14) + theme(strip.background = element_blank()))

# Bioconductor
library(DESeq2)
library(airway)

ERROR: Error in library(airway): there is no package called ‘airway’


In [80]:
countData <- read.csv("input/pcawg.rnaseq.transcript.UGT2B7.expr.counts.csv", header = TRUE, sep = ",")
head(countData)

Unnamed: 0_level_0,Feature,ca74074b.a75c.434c.82dc.e7b112d192ef,X910c8543.cc94.4499.82fe.365c24eac5b1,X22f1d19e.14f8.403e.9e48.663da7508bf2,f59f0b5c.a5ac.4859.a862.3425c623341d,fd0df2dd.fa09.4ec1.8996.c6c6aca9c2a6,f4cc72cf.6f47.4ebc.a9ab.9ba15b6fa411,X91953b3b.e9ea.49be.a403.92d7a97e4ebc,d69871fb.9d93.4a3f.b4ea.f53de743b25f,X50db517a.a5c7.431e.9439.cb5cffb9b954,⋯,X4a68e62e.ae0a.4eeb.a87a.8c67db008b6b,X633ab3db.0f9a.468e.9185.9d4a9cd582ad,X2cdd6339.a1d6.4c7c.889e.6570c64de489,X2c767be0.1e3f.496f.9b43.1401a354b2a2,X1520b711.9fdb.4494.8229.b4664ca14d97,d170e848.cdad.494c.b23c.ac16634412e4,X512c7b15.19e7.435c.8981.57be0f1ffd58,X77b373e5.eed3.4b03.bd9a.3a553fb0fca6,ddb26106.900b.4080.a28b.59927df9d525,bdab7f7f.35cc.4560.a9a0.2263e62afa6c
Unnamed: 0_level_1,<chr>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,⋯,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
1,ENST00000305231.7,3268,1983,2336,2148,4620,4111,3977,2524,7529,⋯,23628,8434,1448,1651,5672,2887,3332,5255,7079,879
2,ENST00000508661.1,75,20,345,53,188,148,88,60,543,⋯,497,277,152,47,342,50,39,82,324,57
3,ENST00000502942.1,6856,3167,1376,1602,3463,6012,2119,2193,6285,⋯,2118,6332,533,2764,9667,1168,3314,5359,6730,982


In [81]:
metaData <- read.csv('input/pcawg_formatted_rna_metadata_2.csv', header = TRUE, sep = ",")
head(metaData)

Unnamed: 0_level_0,id,exp
Unnamed: 0_level_1,<chr>,<chr>
1,ca74074b-a75c-434c-82dc-e7b112d192ef,No
2,910c8543-cc94-4499-82fe-365c24eac5b1,Yes
3,22f1d19e-14f8-403e-9e48-663da7508bf2,No
4,f59f0b5c-a5ac-4859-a862-3425c623341d,No
5,fd0df2dd-fa09-4ec1-8996-c6c6aca9c2a6,Yes
6,f4cc72cf-6f47-4ebc-a9ab-9ba15b6fa411,Yes


In [82]:
dds <- DESeqDataSetFromMatrix(countData=countData, 
                              colData=metaData, 
                              design=~exp, tidy = TRUE)

“some variables in design formula are characters, converting to factors”


In [83]:
dds <- estimateSizeFactors(dds)
dds <- estimateDispersionsGeneEst(dds)
dispersions(dds) <- mcols(dds)$dispGeneEst
dds <- nbinomWaldTest(dds)

In [84]:
res <- results(dds)

In [85]:
res

log2 fold change (MLE): exp Yes vs No 
Wald test p-value: exp Yes vs No 
DataFrame with 3 rows and 6 columns
                   baseMean log2FoldChange     lfcSE      stat     pvalue
                  <numeric>      <numeric> <numeric> <numeric>  <numeric>
ENST00000305231.7  5048.533       0.315014  0.199649   1.57784 0.11460239
ENST00000508661.1   202.808      -0.951054  0.301178  -3.15779 0.00158972
ENST00000502942.1  3370.100       0.332827  0.312270   1.06583 0.28649982
                        padj
                   <numeric>
ENST00000305231.7 0.17190359
ENST00000508661.1 0.00476917
ENST00000502942.1 0.28649982

In [86]:
pdf(file = "output/plot/exp_UGT2B7.pdf",   # The directory you want to save the file in
    width = 6, # The width of the plot in inches
    height = 4)

par(mfrow=c(1,3))

plotCounts(dds, gene="ENST00000508661.1", intgroup="exp")
plotCounts(dds, gene="ENST00000305231.7", intgroup="exp")
plotCounts(dds, gene="ENST00000502942.1", intgroup="exp")

dev.off()

In [93]:
goi <- c('ENST00000508661.1', 'ENST00000305231.7', 'ENST00000502942.1')

In [94]:
# tcounts <- t(log2((counts(dds[goi, ], normalized=TRUE, replaced=FALSE)+.5))) %>%
#   merge(colData(dds), ., by="row.names") %>%
#   gather(gene, expression, (ncol(.)-length(goi)+1):ncol(.))

In [99]:
tcounts <- t((counts(dds[goi, ], normalized=TRUE, replaced=FALSE))) %>%
  merge(colData(dds), ., by="row.names") %>%
  gather(gene, expression, (ncol(.)-length(goi)+1):ncol(.))

In [100]:
tcounts %>% 
  select(Row.names, exp, gene, expression) %>% 
  head

Unnamed: 0_level_0,Row.names,exp,gene,expression
Unnamed: 0_level_1,<I<chr>>,<fct>,<chr>,<dbl>
1,a75135f0.66e9.415a.a9ae.94e54d2439f0,Yes,ENST00000508661.1,146.4229
2,ad1457f5.c6e3.4493.a216.284a49983e4f,No,ENST00000508661.1,146.4229
3,b8862635.21cd.405b.b757.8ee924a7a826,No,ENST00000508661.1,721.5827
4,bdab7f7f.35cc.4560.a9a0.2263e62afa6c,No,ENST00000508661.1,146.4229
5,bf7a15fb.0afe.49c9.ae7b.f5fc9f47531f,No,ENST00000508661.1,1185.9699
6,c67df408.b68f.43e3.91c9.b81b0d822895,Yes,ENST00000508661.1,139.2086


In [101]:
pdf(file = "output/plot/exp_UGT2B7_box_linear.pdf",   # The directory you want to save the file in
    width = 6, # The width of the plot in inches
    height = 4)

ggplot(tcounts, aes(exp, expression)) + 
  geom_boxplot() + 
  facet_wrap(~gene, scales="free_y") + 
  labs(x="rRE Expanded", 
       y="Expression (log normalized counts)")



dev.off()

In [98]:
pdf(file = "output/plot/exp_UGT2B7_violin.pdf",   # The directory you want to save the file in
    width = 6, # The width of the plot in inches
    height = 4)

ggplot(tcounts, aes(exp, expression)) + 
  geom_violin() + 
  facet_wrap(~gene, scales="free_y") + 
  labs(x="rRE Expanded", 
       y="Expression (log normalized counts)")

dev.off()