# Data Exploration and Analysis
on TCGA (The Cancer Genome Atlas) and CCLE (Cancer Cell Line Encyclopedia)

In [3]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pacmap import PaCMAP
import matplotlib.colors as mcolors
from src.raw_data_loader import *
from src.data_visualization import combined_data_pacmap

## Load the data
The directory containing this notebook must have a folder named `data` that holds `filtered_17713_gene_names.csv`, `CCLE_expression_full.csv` and `EB++AdjustPANCAN_IlluminaHiSeq_RNASeqV2.geneExp.xena`.

In [4]:
# Load both datasets together with their metadata tables (four objects are returned).
# NOTE(review): ccle_tcga_loader presumably comes from the star import of src.raw_data_loader — confirm.
ccle, tcga, ccle_metadata, tcga_metadata = ccle_tcga_loader()

In [3]:
# List the distinct disease labels used in the CCLE metadata (the keys of the mapping defined further down).
ccle_metadata["primary_disease"].unique()

In [4]:
# List the distinct primary-site labels used in the TCGA metadata (the naming scheme CCLE is mapped onto).
tcga_metadata["cgc_case_primary_site"].unique()

In [5]:
# Show which metadata columns are available for CCLE.
ccle_metadata.columns

In [6]:
# Translation table from CCLE disease names to the TCGA primary-site naming scheme.
# IMPORTANT: this mapping is not 100% correct.

# CCLE diseases for which a corresponding TCGA primary site was chosen.
mapping = dict([
    ("Ovarian Cancer", "Ovary"),
    ("Leukemia", "Blood"),
    ("Colon/Colorectal Cancer", "Colorectal"),
    ("Skin Cancer", "Skin"),
    ("Bladder Cancer", "Bladder"),
    ("Lung Cancer", "Lung"),
    ("Kidney Cancer", "Kidney"),
    ("Breast Cancer", "Breast"),
    ("Pancreatic Cancer", "Pancreas"),
    ("Myeloma", "Bone Marrow"),
    ("Brain Cancer", "Brain"),
    ("Sarcoma", "Mesenchymal"),
    ("Lymphoma", "Lymph Nodes"),
    ("Bone Cancer", "Bone"),
    ("Thyroid Cancer", "Thyroid"),
    ("Neuroblastoma", "Nervous System"),
    ("Prostate Cancer", "Prostate"),
    ("Gastric Cancer", "Stomach"),
    ("Endometrial/Uterine Cancer", "Uterus"),
    ("Head and Neck Cancer", "Head and Neck"),
    ("Bile Duct Cancer", "Bile Duct"),
    ("Esophageal Cancer", "Esophagus"),
    ("Liver Cancer", "Liver"),
    ("Cervical Cancer", "Cervix"),
    ("Eye Cancer", "Eye"),
    ("Adrenal Cancer", "Adrenal Gland"),
])

# CCLE diseases without a direct TCGA match keep their own name.
# 'Unknown' is also the label that NaNs in tcga are changed to.
mapping.update(
    (unmatched, unmatched)
    for unmatched in (
        "Non-Cancerous",
        "Rhabdoid",
        "Unknown",
        "Gallbladder Cancer",
        "Liposarcoma",
        "Embryonal Cancer",
        "Teratoma",
    )
)


In [7]:
# Build the TCGA part of the combined table: a copy of the TCGA frame plus
# harmonised columns (dataset, gender, age, primary_disease).
tcga_combined = tcga.copy()

# Label all entries as tcga. pandas broadcasts the scalar to every row.
# np.full_like(index, "tcga") inherited the dtype of the index, so it raised
# for numeric indices and could truncate the label for fixed-width string
# dtypes.
tcga_combined["dataset"] = "tcga"

# Gender: pandas aligns the metadata column on the row index, so rows missing
# from tcga_metadata end up as NaN.
tcga_combined["gender"] = tcga_metadata["gdc_cases.demographic.gender"]

# Age at diagnosis, taken as-is from the metadata.
tcga_combined["age"] = tcga_metadata["cgc_case_age_at_diagnosis"]

# Disease: the primary site is used as label; missing sites become 'Unknown'.
tcga_combined["primary_disease"] = tcga_metadata['cgc_case_primary_site'].fillna('Unknown')

In [8]:
# Build the CCLE part of the combined table: a copy of the CCLE frame plus the
# same harmonised columns as for TCGA (dataset, gender, age, primary_disease).
ccle_combined = ccle.copy()

# Label all entries as ccle. pandas broadcasts the scalar to every row.
# np.full_like(index, "ccle") inherited the dtype of the index, so it raised
# for numeric indices and could truncate the label for fixed-width string
# dtypes.
ccle_combined["dataset"] = "ccle"

# Gender: lower-cased.
# NOTE(review): presumably this is to match the spelling used by the TCGA gender column — confirm.
ccle_combined["gender"] = ccle_metadata["sex"].str.lower()

# Age: entries that cannot be parsed as a number are coerced to NaN.
ccle_combined["age"] = pd.to_numeric(ccle_metadata["age"], errors="coerce")

# Disease: translate the CCLE names into the TCGA naming scheme. Names that are
# not a key of `mapping` (including NaN) become NaN.
ccle_combined["primary_disease"] = ccle_metadata['primary_disease'].map(mapping)

In [9]:
# List the distinct TCGA project ids present in the metadata.
tcga_metadata["gdc_cases.project.project_id"].unique()

In [None]:
# Print the distinct values of every CCLE metadata column. Values are cast to
# str before calling np.unique so that columns mixing NaN and strings can be
# sorted. A plain loop replaces the list comprehension, which was used only for
# its side effects and left a list of None values as the cell output.
for col in ccle_metadata.columns:
    print(col, np.unique(ccle_metadata[col].astype(str)))

In [None]:
# Inspect the "source" column of the CCLE metadata.
ccle_metadata["source"]

In [None]:
# Display the TCGA table including the added dataset/gender/age/primary_disease columns.
tcga_combined

In [None]:
# Display the CCLE table including the added dataset/gender/age/primary_disease columns.
ccle_combined

In [9]:
# Stack the TCGA and CCLE rows into one table. pd.concat uses an outer join on
# the columns by default, so a column present in only one of the two frames is
# filled with NaN for the rows of the other frame.
data = pd.concat([tcga_combined, ccle_combined], axis=0)
data

In [None]:
# Persist the combined table. The path is relative, so the notebook has to be
# run from the directory that contains the data/ folder.
data.to_csv("data/data.csv")

Comments regarding TCGA and CCLE:
The TCGA data has 11069 rows. The website states 11069 entries in one place and 11060 in another. So far I have not found any metadata indicating which samples are cancerous and which are not.

ccle has metadata regarding which samples belong to which cell line. Also, ccle has fewer genes as some were _not_ present in this dataset.

## Basic Data Analysis

In [15]:

# Bar chart comparing the number of columns (genes) of the two frames.
tcga_genes, ccle_genes = tcga.shape[1], ccle.shape[1]
gene_counts = (tcga_genes, ccle_genes)

plt.bar(x=("TCGA", "CCLE"), height=gene_counts)

# Write the exact count just above each bar.
for position in range(len(gene_counts)):
    count = gene_counts[position]
    plt.text(position, count + 0.05, str(count), ha='center', va='bottom')

# Axis label and title.
plt.ylabel('Number of Genes')
plt.title('Genes in Dataset')

In [16]:
# Bar chart comparing the number of rows (samples) of the two frames.
fig, ax1 = plt.subplots()

tcga_samples, ccle_cell_lines = tcga.shape[0], ccle.shape[0]
sample_counts = (tcga_samples, ccle_cell_lines)
bar_labels = (
    "TCGA samples (reduced to the \n samples available in metadata)",
    "CCLE cell line samples",
)

plt.bar(x=bar_labels, height=sample_counts)

# Write the exact count just above each bar.
for position in range(len(sample_counts)):
    count = sample_counts[position]
    plt.text(position, count + 0.05, str(count), ha='center', va='bottom')

# Axis label and title.
plt.ylabel('Samples')
plt.title('Samples for each dataset')

In [12]:
# Put the number of missing values into perspective against all values.
number_nans = ccle.isna().sum().sum() + tcga.isna().sum().sum()

# DataFrame.size is rows * columns, so each frame contributes its own cell
# count. The previous formula multiplied the combined row count by
# ccle.shape[1], which is wrong whenever TCGA and CCLE differ in their number
# of columns.
total_values = ccle.size + tcga.size

plt.bar(x=("{} NaN".format(number_nans), "{} total values".format(total_values)), height=[number_nans, total_values])
plt.title('Number of NaNs in CCLE and TCGA')



## Data Analysis of CCLE


In [17]:
# Display the CCLE metadata table.
ccle_metadata

In [18]:
# Show the available CCLE metadata columns.
ccle_metadata.columns
# most interesting: lineage, lineage_subtype, sex, ...

In [19]:
# Report how many CCLE rows and identifier values are unique.
n_rows = len(ccle)
n_unique_rows = len(ccle.drop_duplicates())
print("Gene expression dataset entries: {}, unique dataset entries: {}".format(n_rows, n_unique_rows))

n_depmap_ids = len(ccle_metadata.index.unique())
print("unique DepMap_IDs: {}".format(n_depmap_ids))

# The two name columns are reported the same way, each with its own message.
for template, column in (
    ("unique cell lines: {}", "cell_line_name"),
    ("unique stripped lines: {}", "stripped_cell_line_name"),
):
    print(template.format(len(ccle_metadata[column].unique())))

`cell_line_name` is not unique (it contains NaN and one duplicate: U-251 MG), whereas `stripped_cell_line_name` is. The two U-251 MG entries come from different tissues, have different `stripped_cell_line_name` values and different gene expression, so they are not real duplicates.

In [20]:

# 2-D PaCMAP embedding of the CCLE frame, drawn as a scatter plot with one
# color per value of a chosen metadata column.
# A fixed random_state makes the embedding reproducible between runs.
embedding = PaCMAP(n_components=2, n_neighbors=None, MN_ratio=0.5, FP_ratio=2.0, random_state=42)
ccle_transformed = embedding.fit_transform(ccle, init="pca")

fig, ax = plt.subplots(1, 1, figsize=(6, 6))

# select what should be color coded, e.g. sex, disease, ...
to_highlight = "primary_or_metastasis"

# Samples without an annotation get their own 'Unknown' group. A comparison
# with NaN is always False, so these samples were silently missing from the
# plot before (while 'nan' still showed up in the legend).
# NOTE(review): the boolean mask is applied positionally, which assumes the
# rows of ccle_metadata are in the same order as the rows of ccle — confirm.
highlight_labels = ccle_metadata[to_highlight].fillna('Unknown')
id_mapping = {category: i for i, category in enumerate(highlight_labels.unique())}

for category in id_mapping:
    subset_indices = (highlight_labels == category).to_numpy()
    subset_transformed = ccle_transformed[subset_indices]
    # No cmap argument: without per-point `c` values matplotlib ignores it (and
    # warns); each group takes the next color of the default color cycle.
    ax.scatter(subset_transformed[:, 0], subset_transformed[:, 1], label=category, s=0.6)

# Create a legend; with too many groups only the column name is shown as title
if len(id_mapping) < 40:
    ax.legend(title=to_highlight, markerscale=10, bbox_to_anchor=(1.05, 1), loc='upper left')
else:
    plt.title(to_highlight)

## Data Analysis of TCGA
TCGA contains NaN values, so rows with missing values are dropped before the embedding is computed.

In [21]:
# 2-D PaCMAP embedding of the TCGA frame, drawn as a scatter plot with one
# color per value of a chosen metadata column.

# Drop rows with NaN values and select the matching metadata rows so that both
# tables stay aligned row by row.
tcga_clean = tcga.dropna()
tcga_metadata_clean = tcga_metadata.loc[tcga_clean.index]

# A fixed random_state makes the embedding reproducible between runs.
embedding = PaCMAP(n_components=2, n_neighbors=None, MN_ratio=0.5, FP_ratio=2.0, random_state=42)
tcga_transformed = embedding.fit_transform(tcga_clean, init="pca")

fig, ax = plt.subplots(1, 1, figsize=(6, 6))

# select what should be color coded, e.g. sex, disease, ...
to_highlight = "cgc_case_primary_site"

# Fill missing annotations once, instead of on every loop iteration.
highlight_labels = tcga_metadata_clean[to_highlight].fillna('Unknown')
id_mapping = {category: i for i, category in enumerate(highlight_labels.unique())}

for category in id_mapping:
    subset_indices = (highlight_labels == category).to_numpy()
    subset_transformed = tcga_transformed[subset_indices]
    # No cmap argument: without per-point `c` values matplotlib ignores it (and
    # warns); each group takes the next color of the default color cycle.
    ax.scatter(subset_transformed[:, 0], subset_transformed[:, 1], label=category, s=0.6)

# Create a legend; with too many groups only the column name is shown as title
if len(id_mapping) < 40:
    ax.legend(title=to_highlight, markerscale=10, bbox_to_anchor=(1.05, 1), loc='upper left')
else:
    plt.title(to_highlight)

## Data Analysis on the combined dataset

In [22]:
# Bar chart of how often each gender value occurs in the combined table.
occurrences = data["gender"].value_counts()

# pandas draws the bars and hands back the axes, which then receive the labels.
gender_ax = occurrences.plot.bar()
gender_ax.set_xlabel('Values')
gender_ax.set_ylabel('Occurrences')
gender_ax.set_title('Occurrences of Gender')
plt.show()

### pacmap plots on combined data
Select the labeling and how the data should be preprocessed.

In [10]:
# Preprocessing options handed to the PaCMAP visualisation of the combined data.
viz_preprocessing = dict(
    only_most_variant=5000,   # can be a number or None
    z_score_norm="per_gene",  # can be None, per_gene or per_sample
)

In [37]:
# Using tab20 colormap with 20 colors
colormap = plt.colormaps.get_cmap('tab20')

# Generating colors from the colormap
num_categories = 20
colors = [colormap(i) for i in range(num_categories)]
print(colors)

In [41]:
# Run the project's PaCMAP helper on the combined table with the chosen preprocessing options.
# NOTE(review): presumably this draws the embedding plot(s) — confirm in src.data_visualization.
combined_data_pacmap(data, viz_preprocessing)