In [7]:
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import StandardScaler


In [4]:
# Load the data
X = np.load('./data/processed/X.npy')
y = np.load('./data/processed/y.npy', allow_pickle=True)

In [10]:
print("Shape of X:", X.shape)
print("Shape of y:", y.shape)

# list all unique subtypes
unique_types = np.unique(y)
print("Unique types:", unique_types)

Shape of X: (563, 57915)
Shape of y: (563,)
Unique types: ['ER+' 'ER+HER2+' 'ER+HER2+ LN metastasis' 'HER2+' 'TNBC'
 'TNBC LN metastasis']


In [8]:
# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Initialize the Random Forest classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)  # 100 trees in the forest

# Train the classifier on the training data
rf_classifier.fit(X_train, y_train)

# Predict on the testing data
y_pred = rf_classifier.predict(X_test)

# Evaluate the classifier performance
print("Accuracy on test data: ", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))


Accuracy on test data:  0.8584070796460177

Classification Report:
                         precision    recall  f1-score   support

                   ER+       1.00      1.00      1.00        23
              ER+HER2+       0.75      0.50      0.60         6
ER+HER2+ LN metastasis       0.78      0.70      0.74        10
                 HER2+       0.96      0.89      0.92        27
                  TNBC       0.76      0.97      0.85        36
    TNBC LN metastasis       0.83      0.45      0.59        11

              accuracy                           0.86       113
             macro avg       0.85      0.75      0.78       113
          weighted avg       0.87      0.86      0.85       113


Confusion Matrix:
 [[23  0  0  0  0  0]
 [ 0  3  0  0  3  0]
 [ 0  1  7  1  1  0]
 [ 0  0  0 24  3  0]
 [ 0  0  0  0 35  1]
 [ 0  0  2  0  4  5]]
