# Gene Expression Prediction: ALL vs AML Classification
## Model Deployment and Testing

This notebook loads trained models and provides prediction functionality for classifying gene expression data as ALL (Acute Lymphoblastic Leukemia) or AML (Acute Myeloid Leukemia).

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import joblib
import os
import json
from typing import List, Dict, Any

## 1. Load Trained Models and Artifacts

In [2]:
# Define paths
models_dir = '../models'

# Load model artifacts
model = joblib.load(f'{models_dir}/best_model_svm_rbf.pkl')
scaler = joblib.load(f'{models_dir}/scaler.pkl')
top_genes_indices = joblib.load(f'{models_dir}/top_genes_indices.pkl')
gene_metadata = joblib.load(f'{models_dir}/gene_metadata.pkl')

print("Model loaded successfully!")
print(f"\nModel: {gene_metadata['model_name']}")
print(f"Test Accuracy: {gene_metadata['test_accuracy']:.4f}")
print(f"CV Accuracy: {gene_metadata['cv_accuracy']:.4f}")
print(f"\nNumber of features (genes): {gene_metadata['n_features']}")
print(f"\nTop 10 genes used:")
for i, gene_name in enumerate(gene_metadata['top_genes_names'][:10], 1):
    print(f"  {i}. {gene_name[:70]}")

Model loaded successfully!

Model: SVM (RBF)
Test Accuracy: 0.6176
CV Accuracy: 0.9750

Number of features (genes): 50

Top 10 genes used:
  1. Leukotriene C4 synthase (LTC4S) gene
  2. Zyxin
  3. FAH Fumarylacetoacetate
  4. LYN V-yes-1 Yamaguchi sarcoma viral related oncogene homolog
  5. LEPR Leptin receptor
  6. CD33 CD33 antigen (differentiation antigen)
  7. Liver mRNA for interferon-gamma inducing factor(IGIF)
  8. PRG1 Proteoglycan 1; secretory granule
  9. GB DEF = Homeodomain protein HoxA9 mRNA
  10. DF D component of complement (adipsin)


## 2. Load Test Data

In [3]:
# Load test dataset
test_df = pd.read_csv('../data/data_set_ALL_AML_independent.csv')
labels_df = pd.read_csv('../data/actual.csv')

# Get expression columns
gene_info_cols = ['Gene Description', 'Gene Accession Number']
test_cols = test_df.columns.tolist()
test_expr_cols = [col for col in test_cols if col not in gene_info_cols and 'call' not in col.lower()]

# Extract expression matrix
X_test = test_df[test_expr_cols].T

# Get labels for test samples
test_labels = labels_df[labels_df['patient'] > 38].copy()
y_test = test_labels['cancer'].values

print(f"Test data loaded: {X_test.shape[0]} samples, {X_test.shape[1]} genes")
print(f"\nClass distribution:")
print(pd.Series(y_test).value_counts())

Test data loaded: 34 samples, 7129 genes

Class distribution:
ALL    20
AML    14
Name: count, dtype: int64


## 3. Define Prediction Function

In [4]:
def predict_cancer_type(gene_expression_data: np.ndarray) -> Dict[str, Any]:
    """
    Predict cancer type from gene expression data.
    
    Parameters:
    -----------
    gene_expression_data : np.ndarray
        Array of gene expression values (must have 7129 genes)
    
    Returns:
    --------
    dict : Prediction results with probabilities
    """
    # Validate input
    if gene_expression_data.shape[0] != 7129:
        raise ValueError(f"Expected 7129 genes, got {gene_expression_data.shape[0]}")
    
    # Select top genes
    X_selected = gene_expression_data[top_genes_indices].reshape(1, -1)
    
    # Scale features
    X_scaled = scaler.transform(X_selected)
    
    # Make prediction
    prediction = model.predict(X_scaled)[0]
    probabilities = model.predict_proba(X_scaled)[0]
    
    # Get class labels
    classes = model.classes_
    
    # Create result dictionary
    result = {
        'prediction': prediction,
        'confidence': float(max(probabilities)),
        'probabilities': {
            classes[0]: float(probabilities[0]),
            classes[1]: float(probabilities[1])
        }
    }
    
    return result

print("Prediction function defined successfully!")

Prediction function defined successfully!


## 4. Test Predictions on Sample Data

In [5]:
# Test on first 5 samples from test set
print("Testing predictions on sample data:")
print("=" * 80)

for i in range(min(5, len(X_test))):
    sample_data = X_test.iloc[i].values
    true_label = y_test[i]
    
    result = predict_cancer_type(sample_data)
    
    print(f"\nSample {i+1}:")
    print(f"  True Label:     {true_label}")
    print(f"  Prediction:     {result['prediction']}")
    print(f"  Confidence:     {result['confidence']:.2%}")
    print(f"  ALL Probability: {result['probabilities']['ALL']:.2%}")
    print(f"  AML Probability: {result['probabilities']['AML']:.2%}")
    print(f"  Correct:        {'✓' if result['prediction'] == true_label else '✗'}")

Testing predictions on sample data:

Sample 1:
  True Label:     ALL
  Prediction:     ALL
  Confidence:     95.37%
  ALL Probability: 95.37%
  AML Probability: 4.63%
  Correct:        ✓

Sample 2:
  True Label:     ALL
  Prediction:     ALL
  Confidence:     80.66%
  ALL Probability: 80.66%
  AML Probability: 19.34%
  Correct:        ✓

Sample 3:
  True Label:     ALL
  Prediction:     AML
  Confidence:     76.09%
  ALL Probability: 23.91%
  AML Probability: 76.09%
  Correct:        ✗

Sample 4:
  True Label:     ALL
  Prediction:     ALL
  Confidence:     94.23%
  ALL Probability: 94.23%
  AML Probability: 5.77%
  Correct:        ✓

Sample 5:
  True Label:     ALL
  Prediction:     ALL
  Confidence:     98.08%
  ALL Probability: 98.08%
  AML Probability: 1.92%
  Correct:        ✓


## 5. Evaluate Model Performance

In [6]:
# Evaluate on entire test set
predictions = []
true_labels = []

for i in range(len(X_test)):
    sample_data = X_test.iloc[i].values
    result = predict_cancer_type(sample_data)
    predictions.append(result['prediction'])
    true_labels.append(y_test[i])

# Calculate accuracy
correct = sum([1 for i in range(len(predictions)) if predictions[i] == true_labels[i]])
accuracy = correct / len(predictions)

print("\nModel Performance on Test Set:")
print("=" * 80)
print(f"Total Samples:     {len(predictions)}")
print(f"Correct:           {correct}")
print(f"Incorrect:         {len(predictions) - correct}")
print(f"Accuracy:          {accuracy:.2%}")

# Confusion matrix
from sklearn.metrics import confusion_matrix, classification_report

cm = confusion_matrix(true_labels, predictions, labels=['ALL', 'AML'])
print("\nConfusion Matrix:")
print(pd.DataFrame(cm, index=['True ALL', 'True AML'], columns=['Pred ALL', 'Pred AML']))

print("\nDetailed Classification Report:")
print(classification_report(true_labels, predictions))


Model Performance on Test Set:
Total Samples:     34
Correct:           21
Incorrect:         13
Accuracy:          61.76%

Confusion Matrix:
          Pred ALL  Pred AML
True ALL        14         6
True AML         7         7

Detailed Classification Report:
              precision    recall  f1-score   support

         ALL       0.67      0.70      0.68        20
         AML       0.54      0.50      0.52        14

    accuracy                           0.62        34
   macro avg       0.60      0.60      0.60        34
weighted avg       0.61      0.62      0.62        34



## 6. Single Sample Prediction Example

This demonstrates how to make a prediction for a single patient's gene expression profile.

In [7]:
# Example: Predict for a specific sample
sample_index = 0  # Change this to test different samples
sample_patient = X_test.iloc[sample_index]

print(f"Patient Sample #{sample_index + 1}")
print("=" * 80)

# Make prediction
result = predict_cancer_type(sample_patient.values)

print(f"\nPrediction Results:")
print(f"  Predicted Cancer Type: {result['prediction']}")
print(f"  Confidence Level:      {result['confidence']:.2%}")
print(f"\nDetailed Probabilities:")
for cancer_type, prob in result['probabilities'].items():
    print(f"  {cancer_type}: {prob:.4f} ({prob*100:.2f}%)")

# Show top contributing genes for this prediction
print(f"\nTop 5 Gene Expression Values Used:")
selected_genes = sample_patient.values[top_genes_indices]
for i in range(5):
    gene_name = gene_metadata['top_genes_names'][i]
    expression_value = selected_genes[i]
    print(f"  {i+1}. {gene_name[:60]}: {expression_value:.2f}")

Patient Sample #1

Prediction Results:
  Predicted Cancer Type: ALL
  Confidence Level:      95.37%

Detailed Probabilities:
  ALL: 0.9537 (95.37%)
  AML: 0.0463 (4.63%)

Top 5 Gene Expression Values Used:
  1. Leukotriene C4 synthase (LTC4S) gene: 1122.00
  2. Zyxin: 178.00
  3. FAH Fumarylacetoacetate: 627.00
  4. LYN V-yes-1 Yamaguchi sarcoma viral related oncogene homolog: 164.00
  5. LEPR Leptin receptor: 465.00


## 7. Export Prediction Function for API

In [8]:
# Save prediction function as a module
prediction_code = '''
import numpy as np
import joblib
from typing import Dict, Any

# Load model artifacts (done once at startup)
models_dir = 'models'
model = joblib.load(f'{models_dir}/best_model_svm_rbf.pkl')
scaler = joblib.load(f'{models_dir}/scaler.pkl')
top_genes_indices = joblib.load(f'{models_dir}/top_genes_indices.pkl')
gene_metadata = joblib.load(f'{models_dir}/gene_metadata.pkl')

def predict_cancer_type(gene_expression_data: np.ndarray) -> Dict[str, Any]:
    """
    Predict cancer type from gene expression data.
    
    Parameters:
    -----------
    gene_expression_data : np.ndarray or list
        Array of gene expression values (must have 7129 genes)
    
    Returns:
    --------
    dict : Prediction results with probabilities
    """
    # Convert to numpy array if needed
    if isinstance(gene_expression_data, list):
        gene_expression_data = np.array(gene_expression_data)
    
    # Validate input
    if gene_expression_data.shape[0] != 7129:
        raise ValueError(f"Expected 7129 genes, got {gene_expression_data.shape[0]}")
    
    # Select top genes
    X_selected = gene_expression_data[top_genes_indices].reshape(1, -1)
    
    # Scale features
    X_scaled = scaler.transform(X_selected)
    
    # Make prediction
    prediction = model.predict(X_scaled)[0]
    probabilities = model.predict_proba(X_scaled)[0]
    
    # Get class labels
    classes = model.classes_
    
    # Create result dictionary
    result = {
        'prediction': prediction,
        'confidence': float(max(probabilities)),
        'probabilities': {
            classes[0]: float(probabilities[0]),
            classes[1]: float(probabilities[1])
        },
        'model_info': {
            'name': gene_metadata['model_name'],
            'accuracy': gene_metadata['test_accuracy'],
            'n_features': gene_metadata['n_features']
        }
    }
    
    return result

def get_model_info() -> Dict[str, Any]:
    """Get information about the loaded model."""
    return {
        'model_name': gene_metadata['model_name'],
        'test_accuracy': gene_metadata['test_accuracy'],
        'cv_accuracy': gene_metadata['cv_accuracy'],
        'n_features': gene_metadata['n_features'],
        'total_genes_required': 7129,
        'top_genes': gene_metadata['top_genes_names']
    }
'''

# Save to file
with open('../predictor.py', 'w') as f:
    f.write(prediction_code)

print("Prediction module saved to: predictor.py")
print("This file can be imported by the FastAPI application")

Prediction module saved to: predictor.py
This file can be imported by the FastAPI application


## Summary

This notebook demonstrates:
1. Loading trained models and artifacts
2. Making predictions on new gene expression data
3. Evaluating model performance
4. Exporting prediction functionality for deployment

The prediction function can now be integrated into a FastAPI web application for production use.