# General Preamble Code

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import mglearn
from IPython.display import display
import urllib.request
import tarfile
import io
import seaborn as sns
import warnings
import time
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA

# Additional code for GE Dataset

In [2]:
# For GE, additional import code:
def load_tcga_data(url=None, timeout=60.0):
    """Download the TCGA PANCAN HiSeq archive and return features and labels.

    The gzip-compressed tarball is fetched fully into memory, then
    ``data.csv`` and ``labels.csv`` are read from its
    ``TCGA-PANCAN-HiSeq-801x20531/`` folder with their first column as index.

    Args:
        url: Location of the ``.tar.gz`` archive. ``None`` (the default)
            uses the UCI repository URL. Any scheme accepted by
            ``urllib.request.urlopen`` works, so a mirror or a local
            ``file://`` copy can be substituted.
        timeout: Seconds to wait on each blocking network operation before
            giving up, so a stalled connection cannot hang the notebook.
            Pass ``None`` to wait indefinitely.

    Returns:
        ``(X, y)`` where ``X`` is the DataFrame read from ``data.csv`` and
        ``y`` is the ``'Class'`` column of ``labels.csv``. On any failure the
        error is printed and ``(None, None)`` is returned instead of raising,
        so callers must check for ``None``.
    """
    default_url = (
        'https://archive.ics.uci.edu/ml/machine-learning-databases/00401/'
        'TCGA-PANCAN-HiSeq-801x20531.tar.gz'
    )
    archive_root = 'TCGA-PANCAN-HiSeq-801x20531'
    if url is None:
        url = default_url
    try:
        # Read the whole payload first so the connection is released before
        # the (comparatively slow) CSV parsing starts.
        with urllib.request.urlopen(url, timeout=timeout) as response:
            payload = io.BytesIO(response.read())
        with tarfile.open(fileobj=payload, mode="r:gz") as tar:
            # Each extracted member is closed as soon as it has been parsed.
            with tar.extractfile(f'{archive_root}/data.csv') as data_file:
                X = pd.read_csv(data_file, index_col=0)
            with tar.extractfile(f'{archive_root}/labels.csv') as labels_file:
                labels_df = pd.read_csv(labels_file, index_col=0)
        y = labels_df['Class']
        return X, y
    except Exception as e:
        # Deliberate best-effort boundary: report the problem and hand back
        # sentinels rather than propagating the exception.
        print(f"Failed to download or process data. Error: {e}")
        return None, None
# Runs the download when this cell executes. load_tcga_data() prints the
# error and returns (None, None) on failure, so X and y may both be None.
X, y = load_tcga_data()

# Question 1
- Standardize the training and test sets using StandardScaler, storing the standardized training set as X_train_scaled. How many samples are there in your training dataset?
- Train a KNeighborsClassifier (n_neighbors=5) on the standardized, full-dimensional training data. Report the model's accuracy on the standardized test set. Also use the %%time magic command to measure and report the total time it takes to fit the model and make predictions.
- Now repeat the step above, training 2 additional KNNs on only the first 300 samples and on only the first 450 samples of the training data. Plot the accuracy and the time taken, with the sample sizes 300, 450, and 600 on the x-axis and the corresponding values of time and accuracy on the y-axis.
- Does this plot allow you to guesstimate the time and accuracy when there are 50,000 patients in the dataset? What challenges does a distance-based algorithm like KNN still face on a scaled, but very high-dimensional, dataset?

In [3]:
# Placeholder cell for Question 1: the two banner prints bracket the
# (currently empty) spot where the answer code is presumably meant to go.
print("############ Assignment 7 Question 1 BEGIN ############")

print("############# Assignment 7 Question 1 END #############")

############ Assignment 7 Question 1 BEGIN ############
############# Assignment 7 Question 1 END #############


# Question 2
- Fit an instance of PCA on the standardized training data (X_train_scaled). Do not specify the number of components yet.
- Generate a "scree plot" by creating a bar chart of the explained_variance_ratio_ for the first 50 principal components. 
- Generate a cumulative explained variance plot for the first 50 components. On the same plot, add a horizontal line at y=0.90 to represent the 90% threshold.
- Based on your cumulative plot, approximately how many principal components are required to capture 90% of the total variance?


In [4]:
# Placeholder cell for Question 2: the two banner prints bracket the
# (currently empty) spot where the answer code is presumably meant to go.
print("############ Assignment 7 Question 2 BEGIN ############")

print("############# Assignment 7 Question 2 END #############")

############ Assignment 7 Question 2 BEGIN ############
############# Assignment 7 Question 2 END #############


# Question 3
- Use the PCA model from Question 2 to create the following 2D scatter plots. Color each point in each scatter plot according to its true cancer type (y_test).
    - X-axis is PC1, y-axis is PC2
    - X-axis is PC1, y-axis is PC3
    - X-axis is PC2, y-axis is PC3
    - X-axis is PC1, y-axis is PC4
- Based on your plots, how well does each combination of principal components separate the different cancer types? Why would it be impossible to create such an informative plot using any two of the original 20,000+ genes?


In [5]:
# Placeholder cell for Question 3: the two banner prints bracket the
# (currently empty) spot where the answer code is presumably meant to go.
print("############ Assignment 7 Question 3 BEGIN ############")

print("############# Assignment 7 Question 3 END #############")

############ Assignment 7 Question 3 BEGIN ############
############# Assignment 7 Question 3 END #############


# Question 4
- Create a new PCA instance, setting n_components to the number you identified in Question 2d to capture 90% of the variance. Fit this new PCA on the standardized training data and use it to transform both the training and test sets.
- Train the same KNeighborsClassifier from Question 1 on this new, lower-dimensional PCA-transformed training data again for 300, 450, and 600 samples.
- Report the accuracy and use %%time to report the total time for fitting and prediction. Compare the accuracy and total time of this model to your baseline model in Question 1, and draw the same plot of accuracy and time against the different sample sizes.
- What does this "apples-to-apples" comparison tell you about the specific advantages of PCA for an algorithm like KNN, especially if extrapolated to a situation where we might have 50,000 patients in the data?


In [6]:
# Placeholder cell for Question 4: the two banner prints bracket the
# (currently empty) spot where the answer code is presumably meant to go.
print("############ Assignment 7 Question 4 BEGIN ############")

print("############# Assignment 7 Question 4 END #############")

############ Assignment 7 Question 4 BEGIN ############
############# Assignment 7 Question 4 END #############


# Question 5
- Extract the loadings for the first principal component (PC1) from the components_ attribute of the PCA model fit in Question 2. 
- Find the top 10 genes with the largest absolute loading values and report their loading values. 
- Create a horizontal bar chart to visualize the loadings of these top 10 genes. 
- In a biological context, what does it mean for these specific genes to have high loading values on the first principal component? 

In [7]:
# Placeholder cell for Question 5: the two banner prints bracket the
# (currently empty) spot where the answer code is presumably meant to go.
print("############ Assignment 7 Question 5 BEGIN ############")

print("############# Assignment 7 Question 5 END #############")

############ Assignment 7 Question 5 BEGIN ############
############# Assignment 7 Question 5 END #############


# Question 6
- Use the PCA model from Question 4 (explaining 90% variance) to inverse_transform your X_train_pca data back to the ~20k dimensions. Calculate the Mean Squared Error (MSE) between the original X_train_scaled and this new reconstructed data. What does this error represent?
- To visualize this, create a line plot showing the expression levels of the first 200 genes for the first patient in the original X_train_scaled data. On the same plot, show the expression levels for the same patient from the reconstructed data.

In [8]:
# Placeholder cell for Question 6: the two banner prints bracket the
# (currently empty) spot where the answer code is presumably meant to go.
print("############ Assignment 7 Question 6 BEGIN ############")

print("############# Assignment 7 Question 6 END #############")

############ Assignment 7 Question 6 BEGIN ############
############# Assignment 7 Question 6 END #############


# Question 7
- The Necessity of Scaling for PCA: Explain, with reference to how PCA's algorithm is based on maximizing variance, why failing to scale features like gene expression levels would lead to a meaningless result. How does this differ from decision trees?


In [9]:
# Placeholder cell for Question 7: the two banner prints bracket the
# (currently empty) spot where the answer is presumably meant to go.
print("############ Assignment 7 Question 7 BEGIN ############")

print("############# Assignment 7 Question 7 END #############")

############ Assignment 7 Question 7 BEGIN ############
############# Assignment 7 Question 7 END #############


# Question 8
- Interpretability: Like Question 5b, now write out the top 10 genes of each of the first 5 principal components trained in Question 2, along with their corresponding loading values. If you're a data scientist with no biological background, and you are asked to explain the meaning of each of the principal components, would you be able to explain it? How might a biologist use their expertise to explain each principal component better?

In [10]:
# Placeholder cell for Question 8: the two banner prints bracket the
# (currently empty) spot where the answer is presumably meant to go.
print("############ Assignment 7 Question 8 BEGIN ############")

print("############# Assignment 7 Question 8 END #############")

############ Assignment 7 Question 8 BEGIN ############
############# Assignment 7 Question 8 END #############


# Question 9
- Bioinformatics: This lab demonstrates a standard bioinformatics workflow: scaling -> PCA -> algorithmic processing. Explain why this approach is so powerful for finding meaningful patterns in massive genomic datasets and its importance for cancer research.

In [11]:
# Placeholder cell for Question 9: the two banner prints bracket the
# (currently empty) spot where the answer is presumably meant to go.
print("############ Assignment 7 Question 9 BEGIN ############")

print("############# Assignment 7 Question 9 END #############")

############ Assignment 7 Question 9 BEGIN ############
############# Assignment 7 Question 9 END #############
