# Data Exploration and Analysis
on TCGA (The Cancer Genome Atlas) and CCLE (Cancer Cell Line Encyclopedia)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os.path

## Load the data
in the same directory there has to be a folder named data with filtered_17713_gene_names.csv, CCLE_expression_full.csv and EB++AdjustPANCAN_IlluminaHiSeq_RNASeqV2.geneExp.xena in it.

In [10]:
filter_list = pd.read_csv("data/filtered_17713_gene_names.csv")
filter_list_stripped = filter_list["# Gene"].str.split(' \(').str[0]
filter_list_stripped

if not os.path.isfile("data/ccle.csv"):
    print("load ccle from full file. Code is ineficient, this takes a while")
    ccle_full = pd.read_csv("data/CCLE_expression_full.csv", index_col=0)
    ccle_full.columns = ccle_full.columns.str.split(' \(').str[0]
    filtered_columns = [col for col in ccle_full.columns if filter_list_stripped.str.contains(col).any()]
    ccle = ccle_full[filtered_columns]
    ccle.to_csv("data/ccle.csv")
    del ccle_full
else:
    ccle = pd.read_csv("data/ccle.csv", index_col=0)

if not os.path.isfile("data/tcga.csv"):
    print("load tcga from full file")
    tcga_full = pd.read_csv("data/EB++AdjustPANCAN_IlluminaHiSeq_RNASeqV2.geneExp.xena", sep="\t", index_col=0)
    tcga = tcga_full[tcga_full.index.isin(filter_list_stripped)]
    tcga = tcga.T
    tcga
    tcga.to_csv("data/tcga.csv")
    del tcga_full
else:
    tcga = pd.read_csv("data/tcga.csv", index_col=0)

del filter_list
del filter_list_stripped

ccle_metadata_full = pd.read_csv("data/sample_info.csv")
ccle_metadata = ccle_metadata_full[ccle_metadata_full["DepMap_ID"].isin(ccle.index)]
ccle_metadata = ccle_metadata.set_index(ccle_metadata["DepMap_ID"])
ccle_metadata = ccle_metadata.drop(columns="DepMap_ID")

In [6]:
tcga

Unnamed: 0,A1BG,A1CF,A2M,A2ML1,A4GALT,A4GNT,AAAS,AACS,AADAC,AADACL2,...,ZWILCH,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3
TCGA-OR-A5J1-01,4.12,0.00,13.34,5.80,7.58,0.00,11.12,10.31,0.00,0.00,...,7.53,7.21,4.44,8.46,10.04,0.57,9.34,10.85,10.18,9.22
TCGA-OR-A5J2-01,3.41,0.00,13.27,0.00,7.64,0.00,10.56,9.94,1.14,0.00,...,8.05,8.78,5.86,8.13,11.54,5.02,10.19,11.58,10.89,9.65
TCGA-OR-A5J3-01,4.44,0.67,12.81,1.47,6.25,1.75,10.30,12.34,6.76,0.00,...,6.52,7.58,5.35,8.96,9.84,0.67,9.66,11.38,10.53,8.78
TCGA-OR-A5J5-01,10.73,1.35,11.52,5.66,8.55,0.83,11.55,9.92,7.23,0.00,...,8.03,9.72,4.23,7.69,9.80,3.66,9.12,11.21,10.16,9.01
TCGA-OR-A5J6-01,9.23,0.00,13.23,1.08,10.60,0.64,10.21,10.53,8.31,1.42,...,6.03,6.00,3.79,6.89,9.81,3.14,9.64,9.47,9.64,8.90
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TCGA-CG-4472-01,4.00,,15.03,0.33,8.45,4.59,8.76,8.59,,,...,8.20,8.82,6.33,9.00,10.26,2.09,9.82,11.71,10.77,10.17
TCGA-CG-4474-01,5.28,,14.41,-0.77,7.64,5.46,9.89,8.61,,,...,9.54,10.05,5.34,8.45,9.92,4.18,9.57,12.06,11.40,9.75
TCGA-CG-4475-01,4.15,,14.35,0.97,8.92,0.75,8.95,9.48,,,...,9.12,9.36,5.91,9.12,10.36,2.25,9.86,11.60,10.42,8.89
TCGA-CG-4476-01,4.70,,15.06,0.29,8.05,6.52,8.83,9.61,,,...,9.02,9.25,5.53,8.86,10.11,0.38,9.73,11.69,10.90,10.01


In [7]:
ccle

Unnamed: 0,TSPAN6,TNMD,DPM1,SCYL3,C1orf112,FGR,CFH,FUCA2,GCLC,NFYA,...,CCDC39.3,ARHGAP11B,POLR2J3.2,POLR2J3.3,SLFN12L.2,SLFN12L.3,C2orf27A,C8orf44,NPBWR1,CDR1
ACH-001113,4.331992,0.000000,7.364397,2.792855,4.470537,0.028569,1.226509,3.042644,6.499686,4.739848,...,1.560715,1.641546,5.590362,5.612352,0.028569,0.000000,2.963474,2.786596,0.275007,0.000000
ACH-001289,4.566815,0.584963,7.106537,2.543496,3.504620,0.000000,0.189034,3.813525,4.221104,3.481557,...,1.327687,2.247928,2.989139,4.732812,0.000000,0.000000,2.472488,2.825786,0.014355,0.000000
ACH-001339,3.150560,0.000000,7.379032,2.333424,4.227279,0.056584,1.310340,6.687061,3.682573,3.273516,...,0.275007,2.056584,2.833902,4.810957,0.124328,0.028569,2.980025,2.372952,0.084064,0.042644
ACH-001538,5.085340,0.000000,7.154109,2.545968,3.084064,0.000000,5.868143,6.165309,4.489928,3.956986,...,0.555816,1.664483,3.181103,3.702658,0.275007,0.000000,2.713696,2.925999,0.028569,0.000000
ACH-000242,6.729145,0.000000,6.537607,2.456806,3.867896,0.799087,7.208381,5.569856,7.127014,4.568032,...,0.333424,3.173127,4.141596,4.785551,0.042644,0.000000,1.275007,1.130931,0.464668,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ACH-000285,0.056584,0.000000,6.604071,3.264536,4.972693,0.411426,0.097611,0.704872,4.829850,5.178715,...,0.695994,3.667892,3.673556,6.325171,0.000000,0.014355,3.095924,1.739848,0.000000,0.042644
ACH-002669,3.109361,0.000000,7.031219,1.541019,3.664483,0.014355,3.624101,6.805292,4.472488,4.397118,...,0.150560,1.687061,4.553975,5.073392,0.000000,0.000000,2.493135,2.073820,0.014355,0.000000
ACH-001858,4.390943,0.000000,7.013127,1.887525,3.252476,0.028569,3.286881,6.902074,5.410748,3.401903,...,0.310340,1.867896,5.085340,4.492494,0.250962,0.000000,3.687061,3.095924,1.464668,0.526069
ACH-001997,5.057017,0.000000,7.814935,2.538538,3.893362,0.028569,4.078951,6.971429,4.469886,3.463361,...,0.443607,1.263034,4.871844,4.595146,0.014355,0.000000,2.424922,2.553361,0.000000,0.000000


comments regarding tcga and ccle: 
tcga data has 11069 rows. On the website they write once it has 11069 entries and once 11060. I have currently not found any metadata regarding which samples are cancer and which not.

ccle has metadata regarding which samples belong to which cell line. Also, ccle has fewer genes as some were _not_ present in this dataset.

## Data Analysis of CCLE


In [18]:
ccle_metadata

Unnamed: 0_level_0,cell_line_name,stripped_cell_line_name,CCLE_Name,alias,COSMICID,sex,source,RRID,WTSI_Master_Cell_ID,sample_collection_site,...,lineage_sub_subtype,lineage_molecular_subtype,default_growth_pattern,model_manipulation,model_manipulation_details,patient_id,parent_depmap_id,Cellosaurus_NCIt_disease,Cellosaurus_NCIt_id,Cellosaurus_issues
DepMap_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ACH-000016,SLR 21,SLR21,SLR21_KIDNEY,,,,Academic lab,CVCL_V607,,kidney,...,,,,,,PT-JnARLB,,Clear cell renal cell carcinoma,C4033,
ACH-000032,MHH-CALL-3,MHHCALL3,MHHCALL3_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,,,Female,DSMZ,CVCL_0089,,bone_marrow,...,b_cell,,,,,PT-p2KOyI,,Childhood B acute lymphoblastic leukemia,C9140,
ACH-000033,NCI-H1819,NCIH1819,NCIH1819_LUNG,,,Female,Academic lab,CVCL_1497,,lymph_node,...,NSCLC_adenocarcinoma,,,,,PT-9p1WQv,,Lung adenocarcinoma,C3512,
ACH-000043,Hs 895.T,HS895T,HS895T_FIBROBLAST,,,Female,ATCC,CVCL_0993,,fibroblast,...,,,2D: adherent,,,PT-rTUVZQ,,Melanoma,C3224,
ACH-000049,HEK TE,HEKTE,HEKTE_KIDNEY,,,,Academic lab,CVCL_WS59,,kidney,...,,,,immortalized,,PT-qWYYgr,,,,No information is available about this cell li...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ACH-002045,HO-1-u-1,HO1U1,HO1U1_UPPER_AERODIGESTIVE_TRACT,,,Male,RIKEN,CVCL_2784,,upper_aerodigestive_tract,...,oral,,2D: adherent,,,PT-a3mx1w,,Floor of mouth squamous cell carcinoma,C4041,
ACH-002059,P30/OHK,P30OHK,P30OHK_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,,,Female,RIKEN,CVCL_1631,,bone_marrow,...,b_cell,,2D: suspension,,,PT-reOMOW,,Childhood B acute lymphoblastic leukemia,C9140,
ACH-002062,SLVL,SLVL,SLVL_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,,,Female,RIKEN,CVCL_3169,,haematopoietic_and_lymphoid_tissue,...,b_cell,,2D: suspension,,,PT-2hETfh,,Splenic marginal zone lymphoma,C4663,
ACH-002067,NOS-1,NOS1,NOS1_BONE,,,Male,RIKEN,CVCL_1610,,bone,...,,,2D: adherent,,,PT-EcVEYs,,Osteosarcoma,C9145,
