In [8]:
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from ClassificationReport import ClassificationMetrics

import ClassificationReport

In [3]:
def calculate_precision(tp, fp):
    """Return the precision for a single class.

    Precision is the share of predicted positives that are correct:
    ``tp / (tp + fp)``.

    Args:
        tp: Number of true positives.
        fp: Number of false positives.

    Returns:
        The precision ratio, or ``0`` when ``tp + fp`` is not greater than zero.
    """
    predicted_positives = tp + fp
    # Guard clause: with no predicted positives the ratio is undefined.
    if not predicted_positives > 0:
        return 0
    return tp / predicted_positives

def calculate_recall(tp, fn):
    """Return the recall for a single class.

    Recall is the share of actual positives that were found:
    ``tp / (tp + fn)``.

    Args:
        tp: Number of true positives.
        fn: Number of false negatives.

    Returns:
        The recall ratio, or ``0`` when ``tp + fn`` is not greater than zero.
    """
    actual_positives = tp + fn
    return tp / actual_positives if actual_positives > 0 else 0

def calculate_f1_score(precision, recall):
    """Return the F1 score, the harmonic mean of precision and recall.

    Args:
        precision: Precision value for the class.
        recall: Recall value for the class.

    Returns:
        ``2 * (precision * recall) / (precision + recall)``, or ``0`` when
        ``precision + recall`` is not greater than zero.
    """
    denominator = precision + recall
    if denominator > 0:
        numerator = 2 * (precision * recall)
        return numerator / denominator
    # Both inputs are zero (or the sum is non-positive): define F1 as 0.
    return 0

def calculate_accuracy(true_labels, pred_labels):
    """Return the fraction of predictions that equal the true labels.

    Args:
        true_labels: Sequence or array of ground-truth labels.
        pred_labels: Sequence or array of predicted labels, aligned by
            position with ``true_labels``.

    Returns:
        Correct predictions divided by the number of labels, or ``0.0`` when
        there are no labels (the same zero-denominator convention as the
        other metric helpers in this notebook).

    Raises:
        ValueError: If the two inputs do not have the same shape.
    """
    # Coerce to arrays so the comparison is element-wise. With plain lists,
    # ``==`` yields a single bool and the "count" would be just 0 or 1.
    true_arr = np.asarray(true_labels)
    pred_arr = np.asarray(pred_labels)
    if true_arr.shape != pred_arr.shape:
        raise ValueError(
            "true_labels and pred_labels must have the same shape, got "
            f"{true_arr.shape} and {pred_arr.shape}"
        )

    total_predictions = len(true_arr)
    if total_predictions == 0:
        # Avoid a 0/0 division (NaN plus a RuntimeWarning) on empty input.
        return 0.0

    correct_predictions = np.sum(true_arr == pred_arr)
    return correct_predictions / total_predictions  # Correct predictions over all predictions

def confusion_matrix(true_labels, pred_labels, classes):
    """Count (true, predicted) label pairs in a square matrix.

    Args:
        true_labels: Iterable of ground-truth labels.
        pred_labels: Iterable of predicted labels, paired with
            ``true_labels`` by position.
        classes: Ordered collection of class labels; a label's position
            determines its row/column index.

    Returns:
        Integer array of shape ``(len(classes), len(classes))`` where entry
        ``[i, j]`` counts samples whose true label is ``classes[i]`` and
        whose predicted label is ``classes[j]``.

    Raises:
        KeyError: If a label is not present in ``classes``.
    """
    n_classes = len(classes)
    position_of = dict(zip(classes, range(n_classes)))  # label -> row/column index
    counts = np.zeros((n_classes, n_classes), dtype=int)
    for actual, predicted in zip(true_labels, pred_labels):
        row = position_of[actual]
        col = position_of[predicted]
        counts[row, col] += 1  # rows are true classes, columns are predictions
    return counts

def classification_report_categorical(true_labels, pred_labels):
    """Print per-class precision, recall and F1, then the overall accuracy.

    The set of classes is the sorted union of the labels that occur in
    either input. One table row is printed per class, followed by a blank
    line and the overall accuracy.

    Args:
        true_labels: Array-like of ground-truth labels.
        pred_labels: Array-like of predicted labels, aligned by position
            with ``true_labels``.

    Returns:
        None. The report is written to standard output.
    """
    classes = np.unique(np.concatenate([true_labels, pred_labels]))
    cm = confusion_matrix(true_labels, pred_labels, classes)
    accuracy = calculate_accuracy(true_labels, pred_labels)

    # Per-class totals, computed once instead of inside the loop:
    # column sums = samples predicted as the class, row sums = samples
    # that truly belong to the class.
    true_positives = cm.diagonal()
    predicted_totals = cm.sum(axis=0)
    actual_totals = cm.sum(axis=1)

    print(f"{'Class':<30}{'Precision':<10}{'Recall':<10}{'F1-Score':<10}")
    for class_name, tp, predicted_total, actual_total in zip(
        classes, true_positives, predicted_totals, actual_totals
    ):
        precision = calculate_precision(tp, predicted_total - tp)
        recall = calculate_recall(tp, actual_total - tp)
        f1_score = calculate_f1_score(precision, recall)
        print(f"{class_name:<30}{precision:<10.2f}{recall:<10.2f}{f1_score:<10.2f}")
    print(f"\n{'Overall Accuracy:':<30}{accuracy:.2f}")

In [4]:
# Load the processed feature matrix (X) and label vector (y) from disk.
X = np.load('../data/processed/X.npy')
# NOTE(review): allow_pickle=True lets np.load unpickle object arrays, which can
# execute arbitrary code if the file has been tampered with. Only load trusted
# files, or consider re-saving the labels as a plain string array so that
# pickling is not needed.
y = np.load('../data/processed/y.npy', allow_pickle=True)

In [5]:
# Sanity check: report the dimensions of the loaded arrays.
print("Shape of X:", X.shape)
print("Shape of y:", y.shape)

# List every distinct label present in y (np.unique returns them sorted).
unique_types = np.unique(y)
print("Unique types:", unique_types)

Shape of X: (563, 57915)
Shape of y: (563,)
Unique types: ['ER+' 'ER+HER2+' 'ER+HER2+ LN metastasis' 'HER2+' 'TNBC'
 'TNBC LN metastasis']


In [9]:
# Hold out the test set first: anything learned below (including the scaling
# statistics) must come from training rows only, otherwise test information
# leaks into preprocessing and the evaluation is optimistically biased.
# NOTE(review): the split is not stratified; with six subtypes and 563 samples
# consider passing stratify=y so every class is represented in the test set.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardize the features using statistics from the training rows only
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Retained so that any other cell still reading X_scaled keeps working.
# It is scaled with the training statistics; do not use it for evaluation.
X_scaled = scaler.transform(X)

# Initialize the Random Forest classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Perform 5-fold cross-validation. Scaling lives inside the pipeline so each
# fold fits its own scaler on that fold's training part. cross_val_score fits
# clones of the estimator, so rf_classifier itself is still unfitted afterwards.
cv_pipeline = make_pipeline(StandardScaler(), rf_classifier)
cv_scores = cross_val_score(cv_pipeline, X, y, cv=5)
print("Cross-Validation Scores:", cv_scores)
print("Mean CV Score:", cv_scores.mean())

# Train the classifier on the training data
rf_classifier.fit(X_train, y_train)

# Predict on the testing data
y_pred = rf_classifier.predict(X_test)

# Evaluate the classifier performance on the held-out test set
metrics = ClassificationMetrics(y_test, y_pred)
metrics.report()


Cross-Validation Scores: [0.7079646  0.84070796 0.84070796 0.83928571 0.61607143]
Mean CV Score: 0.7689475347661189
Class                         Precision Recall    F1-Score  
ER+                           1.00      1.00      1.00      
ER+HER2+                      0.75      0.50      0.60      
ER+HER2+ LN metastasis        0.78      0.70      0.74      
HER2+                         0.96      0.89      0.92      
TNBC                          0.76      0.97      0.85      
TNBC LN metastasis            0.83      0.45      0.59      

Overall Accuracy:             0.86
