In [47]:
suppressPackageStartupMessages({
    library(dplyr)
    library(ggplot2)
    library(biomaRt)
    library(dplyr)
})

In [48]:
# Load the gzipped CCLE RSEM gene TPM table; the header row supplies the
# column names and the first column supplies the row names.
counts <- read.table(
    file = "CCLE_RNAseq_rsem_genes_tpm_20180929.txt.gz",
    row.names = 1,
    header = TRUE
)

In [49]:
# Select cells of interest: keep every column whose name contains one of the
# patterns, ignoring case.
patterns <- c("Raji", "BDCM", "Jurkat")
# A single alternation regex replaces the sapply()/apply() combination, whose
# intermediate result changes shape (matrix vs. vector) with the number of
# columns in `counts`.
selected_columns <- colnames(counts)[
    grepl(paste(patterns, collapse = "|"), colnames(counts), ignore.case = TRUE)
]
if (length(selected_columns) == 0) {
    stop("No column of `counts` matches any of: ",
         paste(patterns, collapse = ", "), call. = FALSE)
}
# drop = FALSE keeps a data frame (and its row names) even when only one
# column matches.
counts <- counts[, selected_columns, drop = FALSE]

# Clean up data: remove everything from the first "." onwards in each row
# name, then convert the table to a matrix.
rownames(counts) <- gsub("\\..*", "", rownames(counts))
counts <- as.matrix(counts)
head(counts)

Unnamed: 0,BDCM_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,JURKAT_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,RAJI_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE
ENSG00000000003,0.21,0.01,0.05
ENSG00000000005,0.0,0.0,0.0
ENSG00000000419,62.89,52.84,62.54
ENSG00000000457,8.03,4.96,3.76
ENSG00000000460,19.71,34.04,12.04
ENSG00000000938,119.26,0.54,117.44


In [50]:
# Connect to the Ensembl BioMart human gene dataset.
mart <- useMart(biomart = "ensembl", dataset = "hsapiens_gene_ensembl")

# Query the gene name for every Ensembl gene ID used as a row name of `counts`.
ensembl_annot <- getBM(
    mart = mart,
    attributes = c("ensembl_gene_id", "external_gene_name"),
    filters = "ensembl_gene_id",
    values = rownames(counts)
)
head(ensembl_annot)

Unnamed: 0_level_0,ensembl_gene_id,external_gene_name
Unnamed: 0_level_1,<chr>,<chr>
1,ENSG00000000003,TSPAN6
2,ENSG00000000005,TNMD
3,ENSG00000000419,DPM1
4,ENSG00000000457,SCYL3
5,ENSG00000000460,FIRRM
6,ENSG00000000938,FGR


In [51]:
# Replace the Ensembl IDs in the row names of `counts` with gene symbols.
# match() yields exactly one annotation row per row of `counts` (the first
# hit), so the result always has the right length even if the annotation
# contains the same Ensembl ID more than once.
gene_symbols <- ensembl_annot$external_gene_name[
    match(rownames(counts), ensembl_annot$ensembl_gene_id)
]
# Keep the Ensembl ID wherever no symbol is available (ID absent from the
# annotation, or annotated with an empty name) so those rows stay identifiable.
no_symbol <- is.na(gene_symbols) | gene_symbols == ""
gene_symbols[no_symbol] <- rownames(counts)[no_symbol]
rownames(counts) <- gene_symbols
count_df <- as.data.frame(counts)
head(count_df)

Unnamed: 0_level_0,BDCM_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,JURKAT_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,RAJI_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>
TSPAN6,0.21,0.01,0.05
TNMD,0.0,0.0,0.0
DPM1,62.89,52.84,62.54
SCYL3,8.03,4.96,3.76
FIRRM,19.71,34.04,12.04
FGR,119.26,0.54,117.44


In [56]:
# Gene symbols to pull out of the expression table.
genes <- c("CD3G", "CD274", "PDCD1", "PDCD1LG2")

In [57]:
# Keep only the rows of `count_df` whose row name is one of `genes`.
filtered_df <- count_df[is.element(rownames(count_df), genes), ]
filtered_df

Unnamed: 0_level_0,BDCM_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,JURKAT_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,RAJI_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>
CD274,32.45,1.44,0.67
CD3G,0.0,95.19,0.05
PDCD1,2.12,1.03,0.0
PDCD1LG2,9.88,0.02,0.15


In [60]:
# Rebuild the filtered table: keep the rows named in `genes`, order them with
# any "LIL"-prefixed row names first and alphabetically within each group,
# then give the three cell-line columns display names.
filtered_df <- count_df %>%
    dplyr::filter(rownames(.) %in% genes) %>%
    # FALSE sorts before TRUE, so negating the prefix test puts "LIL" rows first.
    dplyr::arrange(!startsWith(rownames(.), "LIL"), rownames(.)) %>%
    dplyr::rename(
        `BDCM Counts (tpm)` = BDCM_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,
        `JURKAT Counts (tpm)` = JURKAT_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,
        `RAJI Counts (tpm)` = RAJI_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE
    )
filtered_df

Unnamed: 0_level_0,BDCM Counts (tpm),JURKAT Counts (tpm),RAJI Counts (tpm)
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>
CD274,32.45,1.44,0.67
CD3G,0.0,95.19,0.05
PDCD1,2.12,1.03,0.0
PDCD1LG2,9.88,0.02,0.15


In [61]:
# Write the filtered table to result.csv. The gene symbols exist only as row
# names, and row.names = FALSE would drop them, so copy them into an explicit
# `gene` column first. check.names = FALSE keeps the display column names
# (which contain spaces and parentheses) unchanged.
write.table(
    data.frame(gene = rownames(filtered_df), filtered_df,
               check.names = FALSE, row.names = NULL),
    "result.csv", sep = ",", row.names = FALSE
)