# Gene Expression Cancer Classification
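This notebook walks through a typical transcriptomics ML workflow: load a cancer gene-expression dataset, standardize the features, reduce them with PCA, visualize the samples with UMAP, train and evaluate a Random Forest classifier, explain it with SHAP, and save the trained model.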

In [None]:
!pip install datasets scikit-learn pandas numpy shap umap-learn torch matplotlib joblib

## Step 1: Load Dataset

In [None]:
import pandas as pd
from datasets import load_dataset

# Load the dataset named below through the `datasets` library and turn its
# "train" split into a pandas DataFrame for the rest of the notebook.
dataset = load_dataset("openlifescienceai/cancer-gene-expression")
train_split = dataset["train"]
df = pd.DataFrame(train_split)

# Last expression of the cell, so the notebook displays the first rows.
df.head()

## Step 2: Split Features & Labels

In [None]:
# Positional split of the table: every column except the last becomes the
# feature matrix X, and the last column becomes the label vector y.
# NOTE(review): assumes the label is the final column of df — confirm against
# the dataset schema.
X, y = df.iloc[:, :-1], df.iloc[:, -1]

## Step 3: Standardize Features (zero mean, unit variance)

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise each feature column with StandardScaler: learn the per-column
# statistics first, then apply them to the same matrix.
# NOTE(review): the scaler is fitted on all samples, i.e. before the
# train/test split performed in Step 6, so test-set statistics influence the
# preprocessing. Consider fitting on the training portion only.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

## Step 4: PCA Reduction

In [None]:
from sklearn.decomposition import PCA

# Project the standardised matrix onto its leading principal components.
# - The component count is capped at min(n_samples, n_features) because PCA
#   cannot extract more components than that; 50 is used whenever the data
#   allows it.
# - random_state pins the randomized SVD solver that scikit-learn may pick for
#   large inputs, so reruns yield the same components (42 is the same seed the
#   train/test split uses).
# NOTE(review): PCA is fitted on all samples, before the train/test split in
# Step 6 — consider fitting on the training portion only.
n_components = min(50, *X_scaled.shape)
pca = PCA(n_components=n_components, random_state=42)
X_pca = pca.fit_transform(X_scaled)

## PCA Variance Plot

In [None]:
import matplotlib.pyplot as plt

# Line plot of the fraction of total variance captured by each principal
# component, in component order.
variance_fig, variance_ax = plt.subplots()
variance_ax.plot(pca.explained_variance_ratio_)
variance_ax.set_title("PCA Variance Explained")
plt.show()

## Step 5: UMAP Visualization

In [None]:
import umap

# Embed the PCA-reduced samples with UMAP and plot the first two embedding
# dimensions, coloured by class label.
# UMAP's layout is stochastic; random_state makes the picture repeatable
# between runs (42 is the same seed the train/test split uses).
reducer = umap.UMAP(random_state=42)
embedding = reducer.fit_transform(X_pca)

# One integer code per distinct label so matplotlib can map classes to colours.
label_codes = y.astype('category').cat.codes
plt.scatter(embedding[:, 0], embedding[:, 1], c=label_codes)
plt.title("UMAP Visualization")
plt.show()

## Step 6: Train/Test Split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for evaluation; random_state fixes the shuffle.
# stratify=y keeps each class's proportion the same in both partitions, so the
# evaluation is not skewed (or missing a class entirely) when the classes are
# imbalanced.
# NOTE(review): stratification needs at least two samples of every class.
X_train, X_test, y_train, y_test = train_test_split(
    X_pca,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

## Step 7: Train Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fit a 300-tree random forest on the training partition.
# random_state seeds the forest's internal sampling so that the trained model,
# and therefore the metrics and SHAP values computed from it, are reproducible
# (42 is the same seed the train/test split uses).
model = RandomForestClassifier(n_estimators=300, random_state=42)
model.fit(X_train, y_train)

## Step 8: Evaluation

In [None]:
from sklearn.metrics import classification_report

# Predict the held-out samples, then print scikit-learn's text report
# comparing those predictions with the true labels.
preds = model.predict(X_test)
report = classification_report(y_true=y_test, y_pred=preds)
print(report)

## Step 9: SHAP Explainability

In [None]:
import shap

# Explain the forest on the first 100 training samples
# (presumably a subset to limit run time — adjust as needed).
X_explain = X_train[:100]
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_explain)

# Depending on the SHAP version, a multi-class model yields either a list of
# per-class 2-D arrays or a single 3-D array with the class on the last axis.
# summary_plot may interpret a 3-D array as interaction values, so convert it
# to the per-class list form before plotting.
if getattr(shap_values, "ndim", 2) == 3:
    shap_values = [shap_values[..., k] for k in range(shap_values.shape[-1])]

# The model was trained on PCA components, not raw genes, so label the
# features as principal components instead of anonymous indices.
feature_names = [f"PC{i + 1}" for i in range(X_explain.shape[1])]
shap.summary_plot(shap_values, X_explain, feature_names=feature_names)

## Step 10: Save Model

In [None]:
from pathlib import Path

import joblib

# Persist the trained classifier to disk.
# NOTE(review): only the classifier is saved; the fitted `scaler` and `pca`
# from the earlier steps are needed to prepare new samples for it and are not
# persisted here.
model_path = "../models/rf_model.pkl"

# joblib.dump does not create missing directories, so make sure the target
# folder exists before writing (no-op when it is already there).
Path(model_path).parent.mkdir(parents=True, exist_ok=True)
joblib.dump(model, model_path)