<a href="https://colab.research.google.com/github/sbpatel2009/NGS-Panel-Analyzer/blob/master/TCGA_Glioma_Clinical_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **INTRODUCTION**
This file uploads, views, examines, and cleans the combined low grade glioma and glioblastoma clinical dataset generated by the TCGA.

The clinical data was downloaded from cBioPortal:  https://www.cbioportal.org/study/clinicalData?id=lgggbm_tcga_pub

**Relevant publications**

1.   [Comprehensive genomic characterization defines human glioblastoma genes and core pathways.
Nature. 2008;455(7216):1061-1068. doi:10.1038/nature07385](https://www.nature.com/articles/nature07385)
2.   [The somatic genomic landscape of glioblastoma. Cell. 2013;155(2):462-477. doi:10.1016/j.cell.2013.09.034](https://www.cell.com/cell/fulltext/S0092-8674(13)01208-7)
3.   [Comprehensive, Integrative Genomic Analysis of Diffuse Lower-Grade Gliomas.
N Engl J Med. 2015;372(26):2481-2498. doi:10.1056/NEJMoa1402121](https://www.nejm.org/doi/full/10.1056/NEJMoa1402121)
4.  [Molecular Profiling Reveals Biologically Discrete Subsets and Pathways of Progression in Diffuse Glioma. Cell. 2016;164(3):550-563. doi:10.1016/j.cell.2015.12.028](https://doi.org/10.1016/j.cell.2015.12.028)

In [1]:
!pip install scikit-survival  #https://scikit-survival.readthedocs.io/en/stable/index.html



In [2]:
#LOAD THE RELEVANT PACKAGES
# Import supporting modules
import numpy as np
import glob, os
import pandas as pd
import seaborn as sns
%matplotlib inline
import matplotlib.pylab as plt
import sklearn  # Scikit-learn, one of the best and most documented machine learning libraries for Python
import sksurv

In [None]:
# Load the clinical table directly from Google Drive into `data`.
file_name = "lgggbm_tcga_pub_clinical_data.tsv"
directory = "https://drive.google.com/drive/folders/1W50b4nOhoVZv6JQDdZjteyfVuEU3La65"
# Kept for reference only: a Drive folder URL joined with a file name is not a
# downloadable location, so `file_path` is not used for loading.
file_path = directory + "/" + file_name
# A ".../file/d/<id>/view" share link serves an HTML preview page, not the
# file contents, so the direct-download form of the link is used instead.
# The file must be shared as "anyone with the link" for this to work.
file_id = "1J6vWyVk4_f83qrgFsMEu-kLZe775y7GA"
url = "https://drive.google.com/uc?export=download&id=" + file_id
data = pd.read_csv(filepath_or_buffer=url, sep="\t")

In [None]:
# Alternative to the Drive download: upload the file from the local machine.
# `google.colab` is only importable inside a Colab runtime.
from google.colab import files
# `uploaded` is indexed by file name when the table is read further down.
uploaded = files.upload()

In [None]:
# List the names of the uploaded files to confirm the expected file is present.
uploaded.keys()

In [None]:
import io

# Parse the uploaded bytes into a DataFrame.  The file is tab-separated
# (.tsv), so the separator has to be given explicitly; with the default
# comma separator every row would be read into a single column.
data = pd.read_csv(io.BytesIO(uploaded["lgggbm_tcga_pub_clinical_data.tsv"]), sep="\t")
data

In [None]:
# Display the loaded table.
data

In [None]:
# Report the table dimensions as (rows, columns) and the total number of cells.
print(f"shape = {data.shape}")
print(f"size = {data.size}")

In [None]:
# Print the column names, non-null counts and dtypes.
data.info()

In [None]:
# Show summary statistics for the table's columns.
data.describe()

In [None]:
# Percentage of missing values in every column: missing count / row count * 100.
print(data.isna().sum().div(data.shape[0]).mul(100))

In [None]:
# These columns indicate whether or not a given test was performed.
assay_columns = [
    "HM27",
    "HM450",
    "Whole Exome Sequencing",
    "Whole Genome Sequencing",
    "RPPA",
    "SNP6",
    "U133a",
]
for assay in assay_columns:
    print(data[assay].unique())
print(data["RNA-SEQ Data"].unique(), "\n\n")

# Count the distinct values in each column, then show the columns that hold
# exactly one distinct value.
distinct_counts = data.nunique()
print(distinct_counts, "\n\n")
single_valued = distinct_counts[distinct_counts == 1].index
print(data[single_valued])

In [None]:
# Draw a 3x3 overview figure: two histograms and seven pie charts.
fig, axes = plt.subplots(3, 3, figsize=(20, 15))
fig.suptitle("Glioma Dataset Summary")

# Histograms of the numeric variables: (axis, column, panel title).
# sns.histplot is used because sns.distplot is deprecated in seaborn.
histogram_panels = [
    (axes[0, 0], "Diagnosis Age", "Age distribution"),
    (axes[0, 2], "Karnofsky Performance Score", "Karnofsky Performance Score"),
]
for ax, column, title in histogram_panels:
    sns.histplot(data[column], ax=ax)
    ax.set_title(title)
    ax.set_ylabel("Number of patients")

# Pie charts of the categorical variables:
# (axis, column, panel title, number of pastel palette colours to use).
pastel = sns.color_palette("pastel")
pie_panels = [
    (axes[0, 1], "Sex", "Sex distribution", 2),
    (axes[1, 0], "Neoplasm Histologic Type Name", "Neoplasm Histologic Type Name", 4),
    (axes[1, 1], "Neoplasm Histologic Grade", "Neoplasm Histologic Grade", 3),
    (axes[1, 2], "Original Subtype", "Original Subtype", 9),
    (axes[2, 0], "IDH status", "IDH status", 4),
    (axes[2, 1], "IDH-1P10Q Subtype", "IDH-1P10Q Subtype", 4),
    (axes[2, 2], "MGMT promoter status", "MGMT promoter status", 2),
]
for ax, column, title, n_colors in pie_panels:
    # Compute the counts once and reuse them for both wedge sizes and labels.
    counts = data[column].value_counts()
    ax.pie(counts, labels=counts.index, colors=pastel[0:n_colors], autopct="%.0f%%")
    ax.set_title(title)

plt.show()

In [None]:
# Encode the survival status as a boolean event indicator (True = deceased),
# drop rows without survival information, and build `y` as a list of
# (event, months) tuples.
status_column = "Overall Survival Status"
time_column = "Overall Survival (Months)"

# The result is assigned back to the DataFrame instead of calling
# replace(..., inplace=True) on the selected column: that chained in-place form
# is deprecated in recent pandas and does not update `data` under Copy-on-Write.
# The dtype guard keeps the cell safe to run more than once.
if not pd.api.types.is_bool_dtype(data[status_column]):
    data[status_column] = data[status_column].map({"1:DECEASED": True, "0:LIVING": False})

data.dropna(axis=0, inplace=True, subset=[status_column, time_column])
# With the missing rows removed the indicator can be stored as a real boolean
# column instead of an object column.
data[status_column] = data[status_column].astype(bool)
y = list(zip(data[status_column], data[time_column]))

In [None]:
# SimpleImputer is not imported anywhere else in the notebook.
from sklearn.impute import SimpleImputer

# Fill missing categorical values with each column's most frequent value.
categorical_columns = ["Sex", "Neoplasm Histologic Type Name", "Neoplasm Histologic Grade", "Original Subtype", "IDH status", "MGMT promoter status"]
categorical_imputer = SimpleImputer(strategy="most_frequent", copy=False)
# fit_transform returns a plain array, so the column names are re-attached;
# the new frame gets a fresh default index.
categorical_imputed = pd.DataFrame(
    categorical_imputer.fit_transform(data[categorical_columns]),
    columns=categorical_columns,
)
categorical_imputed

In [None]:
# Neither class is imported anywhere else in the notebook.
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import Normalizer

# Fill missing numeric values with the column mean, then normalize.
numerical_columns = ["Diagnosis Age", "Karnofsky Performance Score"]
numerical_imputer = SimpleImputer(strategy="mean", copy=False)
# fit_transform returns a plain array, so the column names are re-attached;
# the new frames get a fresh default index.
numerical_imputed = pd.DataFrame(
    numerical_imputer.fit_transform(data[numerical_columns]),
    columns=numerical_columns,
)
# NOTE(review): Normalizer rescales each row (sample) to unit norm; it does not
# scale each column.  If per-feature scaling was intended, a column-wise scaler
# is needed instead -- confirm.
num_normalized = pd.DataFrame(
    Normalizer().fit_transform(numerical_imputed),
    columns=numerical_imputed.columns,
)
num_normalized

In [None]:
# Place the imputed categorical columns and the normalized numeric columns
# side by side to form the feature table.
X = pd.concat([categorical_imputed, num_normalized], axis="columns")

In [None]:
# Display the combined feature table.
X

In [None]:
# OrdinalEncoder is not imported anywhere else in the notebook.
from sklearn.preprocessing import OrdinalEncoder

# Encode the tumour grade as an ordered integer code (G2 < G3 < G4).
# The column is selected by name, not by position, so the cell keeps working
# if the column order of X changes.
grade_column = ["Neoplasm Histologic Grade"]
ordinal = OrdinalEncoder(categories=[["G2", "G3", "G4"]])
ordinalEnc = pd.DataFrame(ordinal.fit_transform(X[grade_column]), columns=grade_column)
ordinalEnc

In [None]:
# OneHotEncoder is not imported anywhere else in the notebook.
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the nominal (unordered) categorical columns.
# The `sparse` keyword was renamed to `sparse_output` and later removed from
# scikit-learn, so the default sparse result is converted with .toarray(),
# which works regardless of the installed version.
OHE = OneHotEncoder()
OHencoded = pd.DataFrame(
    OHE.fit_transform(
        X.loc[:, ["Sex", "Neoplasm Histologic Type Name", "Original Subtype", "IDH status", "MGMT promoter status"]]
    ).toarray()
)
OHencoded

In [None]:
# Preview the fully encoded feature table: ordinal grade, one-hot columns and
# normalized numeric columns joined column-wise.
pd.concat([ordinalEnc, OHencoded, num_normalized], axis="columns")

In [None]:
# Every scikit-learn class used in this cell is imported here; none of them is
# imported anywhere else in the notebook.
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Normalizer, OneHotEncoder, OrdinalEncoder

# Column groups, each handled by its own preprocessing pipeline.
numerical_columns = ["Diagnosis Age", "Karnofsky Performance Score"]
nominal_columns = ["Sex", "Neoplasm Histologic Type Name", "Original Subtype", "IDH status", "MGMT promoter status"]
ordinal_columns = ["Neoplasm Histologic Grade"]
grade = ["G2", "G3", "G4"]  # ordered from lowest to highest grade

# Numeric columns: mean imputation followed by normalization.
numerical_transformer = Pipeline(steps=[
    ("imputer1", SimpleImputer(strategy="mean")),
    ("norm", Normalizer()),
])

# Nominal columns: most-frequent imputation followed by one-hot encoding.
nominal_transformer = Pipeline(steps=[
    ("imputer2", SimpleImputer(strategy="most_frequent")),
    ("OneHot", OneHotEncoder()),
])

# Ordinal column: most-frequent imputation followed by ordered integer codes.
ordinal_transformer = Pipeline(steps=[
    ("imputer3", SimpleImputer(strategy="most_frequent")),
    ("Ordinal", OrdinalEncoder(categories=[grade])),
])

# Route each column group to its transformer.
preprocessor = ColumnTransformer(transformers=[
    ("numerical", numerical_transformer, numerical_columns),
    ("nominal", nominal_transformer, nominal_columns),
    ("ordinal", ordinal_transformer, ordinal_columns),
])

# NOTE(review): `my_pipeline` is not defined in any visible cell; it presumably
# should be a Pipeline combining `preprocessor` with a model -- define it before
# running this line.  Also confirm that "neg_mean_absolute_error" is the
# intended metric for a target made of (event, months) tuples.
# The keyword is `cv` (lower case); `CV=5` raises a TypeError.
scores = -1 * cross_val_score(my_pipeline, X, y, cv=5, scoring="neg_mean_absolute_error")

In [None]:
# kaplan_meier_estimator is not imported anywhere else in the notebook.
from sksurv.nonparametric import kaplan_meier_estimator

# Estimate and plot the Kaplan-Meier survival curve for the whole cohort.
time, survival_prob = kaplan_meier_estimator(data["Overall Survival Status"], data["Overall Survival (Months)"])
# Draw the curve as a step function that stays flat until the next time point.
plt.step(time, survival_prob, where="post")
# Raw string: "\h" is not a valid escape sequence in a normal string literal.
plt.ylabel(r"est. probability of survival $\hat{S}(t)$")
plt.xlabel("time $t$")

In [None]:
pip install scikit-survival