# Gene Expressions for different types of tumor
<div style="text-align: center;">
    <img src="images/gene_protein_cancer.jpg" alt="image" width="300" height="200">
</div>
[image src: https://www.cancer.gov/about-cancer/causes-prevention/genetics]

This project aims to identify the gene expression patterns associated with five types of tumor:
- BRCA (Breast Cancer): the BRCA gene family (BRCA1 and BRCA2) are known tumor suppressors, but mutations in these genes can cause cancer.
- KIRC (Kidney Renal Clear Cell Carcinoma)
- COAD (Colon Adenocarcinoma)
- LUAD (Lung Adenocarcinoma)
- PRAD (Prostate Adenocarcinoma)

The dataset is sourced from https://archive.ics.uci.edu/dataset/401/gene+expression+cancer+rna+seq.
The original dataset is published at https://www.synapse.org/Synapse:syn300013/discussion/threadId=5455. The gene names in the dataset are dummy (placeholder) names; per this discussion thread (https://www.synapse.org/Synapse:syn300013/discussion/threadId=5455), the actual gene names can be looked up at https://www.ncbi.nlm.nih.gov/gene.

In [3]:
import os
import warnings

# NOTE(review): this silences every warning for the rest of the session,
# including deprecation warnings raised by the libraries used below.
warnings.filterwarnings("ignore")

# Directory that receives the CSV artefacts written by later cells.
DATA_ANALYSIS_DIR = "data-analysis/"

# DataFrame.to_csv does not create missing parent directories, so make sure the
# output directory exists before any cell tries to write into it.
os.makedirs(DATA_ANALYSIS_DIR, exist_ok=True)

## Data Loading


In [4]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

from src.utils.ArrayUtils import get_list_of_items_in_both_lists


In [None]:
# Load the two CSV files of the local TCGA-PANCAN-HiSeq-801x20531 copy:
# data.csv goes into `df`, labels.csv into `tumor_df`.
#
# Alternative source for data.csv: the shared Google Drive file
#   https://drive.google.com/file/d/1VXyhDXpYT8G2Buhkc6kBjw93CLG1y1f0/view?usp=drive_link
# To use it, keep only the file id (second-to-last path segment) and point
# url_reconstructed at 'https://drive.google.com/uc?id=' + <file id>.
url_reconstructed = 'TCGA-PANCAN-HiSeq-801x20531/data.csv'

df = pd.read_csv(filepath_or_buffer=url_reconstructed)
tumor_df = pd.read_csv(filepath_or_buffer='TCGA-PANCAN-HiSeq-801x20531/labels.csv')

# Last expression of the cell: rendered as the cell output.
df

In [None]:
# Print pandas' concise summary of the freshly loaded frame.
df.info()


In [None]:
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called as a statement; interpolating it into an f-string would print
# the summary first and then the literal text "None".
print("Dataframe before adding the class column:")
df.info()
print(f"Total # of columns: {len(df.columns)}")

In [None]:
# Copy the Class column from the labels frame into the main dataframe.
#
# The guard makes the cell safe to re-run: once 'Class' has been attached the
# 'sample' column is gone, so the merge must not be attempted a second time.
if 'Class' not in df.columns:
    df = df.rename(columns={'Unnamed: 0': 'sample'})
    tumor_df = tumor_df.rename(columns={'Unnamed: 0': 'sample'})

    # Join on the sample id instead of comparing the two files row by row:
    # a positional comparison silently produces wrong labels as soon as the
    # row order (or row count) of the two files differs.
    # validate='one_to_one' raises if a sample id is duplicated in either file.
    df = df.merge(
        tumor_df[['sample', 'Class']],
        on='sample',
        how='left',
        validate='one_to_one',
    )
    assert df['Class'].notna().all(), "some samples have no matching label in labels.csv"

    # The sample id is only a join key, not a feature.
    df = df.drop(columns=['sample'])

df.head()


## Exploratory Data Analysis (EDA)
### Data Cleaning and PreProcessing

Data : 
- No NANs as stated on the data source page


In [9]:
# True if at least one cell anywhere in the frame is missing.
df.isna().to_numpy().any()

np.False_

In [None]:
# Find the columns whose value is 0 in every row and remove them.
zero_cols = df.columns[(df == 0).all()]
# Report the count the message announces (the column names stay in zero_cols).
print(f"# of columns with all 0s in them : {len(zero_cols)}")

df = df.drop(columns=zero_cols)

# Re-check after the drop; this should report 0.
remaining_zero_cols = df.columns[(df == 0).all()]
print(f"# of columns with all 0s in them after cleanup : {len(remaining_zero_cols)}")

In [11]:
# Persist the cleaned frame so the post-cleanup state can be inspected on disk.
df.to_csv(f"{DATA_ANALYSIS_DIR}df_PostColumnCleanup.csv")


In [None]:
# Class analysis: the label column must contain exactly five distinct values.
class_unique_vals = df['Class'].unique()
assert len(class_unique_vals) == 5

# Show how many samples fall into each class.
class_counts = df['Class'].value_counts()
print(f"Distribution of Class values: \n{class_counts}")


In [13]:
# Separate the features from the target, then hold out 20% of the samples.
y = df['Class']
X = df.drop(columns='Class')

# Stratify on the target so the class proportions are kept in both partitions;
# the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)


### Univariate and Multivariate Analysis
Given the large number of variables (columns), charting each one against the others is impractical. Instead, key insights such as summary statistics are written to file for inspection.

In [14]:
from scipy import stats

# Summary statistics of the frame; written transposed to the CSV.
df_desc = df.describe()
df_desc_t = df_desc.T
df_desc_t.to_csv(f"{DATA_ANALYSIS_DIR}df_describe.csv")

# NOTE(review): stats.zscore is applied to each column of the describe() table,
# i.e. it standardises across the summary rows rather than across the raw
# values. The output file name suggests a z-score of the raw data was
# intended - confirm before relying on rawdata_zscore.csv.
df_desc_zscore = df_desc.apply(stats.zscore)
df_desc_zscore.T.to_csv(f"{DATA_ANALYSIS_DIR}rawdata_zscore.csv")

# TODO: 3-sigma bounds (mean +/- 3*std) per feature are not computed yet.


###### Identify Correlation across features / columns

In [24]:
# --- NOTE: this cell takes quite some time to run (11+ minutes).
# Pairwise correlation between all feature columns, saved for offline inspection.
corr_df = X.corr()
corr_df.to_csv(f"{DATA_ANALYSIS_DIR}features_corr_matrix.csv")


In [None]:
# Are there any features whose correlation with every other feature is NaN?
# -- In total 267 columns held only 0 values; since those were deleted earlier,
#    the list is expected to be empty.
all_nan_mask = corr_df.isna().all()
corr_nan_list = list(corr_df.columns[all_nan_mask])
print(corr_nan_list)

corr_df.head()

### Feature Selection


### Dimensionality Reduction

In [None]:
# Here we run without reducing any dimensions, i.e directly on all of the data.



#### Check Point EDA : Observations thus far
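- Data is clean (no missing values)
- Some of the columns contain only 0 values, which means they carry no information for separating the classes; these columns were dropped during cleaning.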


## Modeling

#### Random Forest

In [21]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import plot_tree
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import auc
from sklearn.preprocessing import label_binarize
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

import plotly.graph_objects as go
from plotly.subplots import make_subplots




In [17]:
# --- Baseline model: a RandomForest with default hyper-parameters.
# oob_score=True additionally makes the out-of-bag score available after fitting.
rd_classifier = RandomForestClassifier(oob_score=True, random_state=42)
rd_classifier.fit(X_train, y_train)

# Hard class predictions and per-class probabilities on the held-out set.
rd_preds = rd_classifier.predict(X_test)
rd_preds_prob = rd_classifier.predict_proba(X_test)
report = classification_report(y_test, rd_preds)

# Fitted attributes kept for the evaluation cells below.
rd_oob_score = rd_classifier.oob_score_
rd_feature_imp = rd_classifier.feature_importances_



##### Model Evaluation

In [22]:
def plot_confusion_matrix(conf_matrix, labels, title="Confusion Matrix"):
    """Render a confusion matrix as an annotated plotly heatmap.

    The x axis is titled "Predicted Class" and the y axis "Actual Class".
    Both the matrix rows and the y-axis labels are reversed before plotting,
    so each row keeps its own label.

    Args:
        conf_matrix: 2-D matrix of counts; must support ``[::-1]`` slicing.
        labels: Class labels for the axes; must support ``[::-1]`` slicing.
        title: Figure title.

    Returns:
        None. The figure is displayed with ``fig.show()``.
    """
    # Reverse rows and y labels together to align the labels with the way the
    # confusion matrix is output.
    rows_reversed = conf_matrix[::-1]
    y_labels = labels[::-1]

    heatmap = go.Heatmap(
        z=rows_reversed,
        x=labels,
        y=y_labels,
        colorscale='Rainbow',
        texttemplate="%{z}",  # print the cell value inside each tile
        textfont={"size": 10},
    )
    fig = go.Figure(data=heatmap)

    layout_options = dict(
        title_text=title,
        xaxis_title="Predicted Class",
        yaxis_title="Actual Class",
        width=500,
        height=500,
    )
    fig.update_layout(**layout_options)

    fig.show()

In [None]:
# - interpreting Results
print(f"Out of Bag score: {rd_oob_score}")

# ndarray.sort() sorts in place and returns None, which would print "None" and
# detach the importances from their features. Build a Series labelled with the
# training columns instead and rank it, leaving rd_feature_imp untouched.
# (With default display options pandas abbreviates a long Series when printed.)
feature_imp_ranked = pd.Series(rd_feature_imp, index=X_train.columns).sort_values(ascending=False)
print(f"Feature Importances: \n{feature_imp_ranked}")

print(f"Accuracy : {rd_classifier.score(X_test, y_test)}")
print("Classification report: \n", report)

# Use one label order for both the matrix and the plot so rows/columns line up.
cm = confusion_matrix(y_true=y_test, y_pred=rd_preds, labels=class_unique_vals)
plot_confusion_matrix(cm, labels=class_unique_vals, title='RandomForest without PCA : Confusion Matrix')


In [None]:
# Area under Curve - REF : https://www.geeksforgeeks.org/interpreting-random-forest-classification-results/
#
# One-vs-rest ROC curve for every tumor class.
#
# The columns of predict_proba follow rd_classifier.classes_, so the class
# order is taken from the fitted model. y_test.unique() returns labels in order
# of appearance, which would attach the wrong class name to each curve.
target_vals = rd_classifier.classes_

# Fit the binarizer on the model's classes (not on y_test) so its columns are
# guaranteed to line up with the probability columns.
label_binzer = LabelBinarizer()
label_binzer.fit(target_vals)
assert list(label_binzer.classes_) == list(target_vals)
y_test_bin = label_binzer.transform(y_test)

fpr = dict()
tpr = dict()
roc_auc = dict()

for index in range(len(target_vals)):
    # Column `index` of both arrays refers to class target_vals[index].
    fpr[index], tpr[index], _ = roc_curve(y_test_bin[:, index], rd_preds_prob[:, index])
    roc_auc[index] = auc(fpr[index], tpr[index])

# Plot ROC curve
plt.figure()
for index, class_name in enumerate(target_vals):
    plt.plot(fpr[index], tpr[index], lw=2, label=f"ROC curve of class {class_name} (area = {roc_auc[index]:.2f})")

plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characterstic for Tumor classes')
plt.legend(loc="lower right")
plt.show()

