In [47]:
from sklearn.model_selection import train_test_split, RepeatedKFold
from sklearn.preprocessing import StandardScaler

from sklearn.feature_selection import RFECV

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score, matthews_corrcoef, confusion_matrix
from sklearn.base import clone
import pandas as pd
import numpy as np

In [48]:
df = pd.read_csv("radiomic_features.csv")

In [49]:
X = df.drop(["ID", "idh_mutated", "gbma"], axis=1)
y = df["idh_mutated"]

In [50]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

## Feature filtering

In [51]:
def sensit_recall(res):
    tn, fp, fn, tp = res.ravel()
    return tp / (tp + fn)

def precision(res):
    tn, fp, fn, tp = res.ravel()
    return tp/ (tp + fp)

def specificity(res):
    tn, fp, fn, tp = res.ravel()
    return tn / (tn + fp)

def f1(res):
    prec = precision(res)
    recall= sensit_recall(res)
    return (2 * prec * recall) / (precision + recall)

def accuracy(res):
    tn, fp, fn, tp = res.ravel()
    return (tp + tn) / (tp + fp + fn + tn)

def calculate_mcc(res):
    tn, fp, fn, tp = res.ravel()
    nominator = tp * tn - fp * fn
    denum = ((tp + fp)*(tp + fn)*(tn + fp) * (tn+fn)) ** (1/2)
    return nominator / denum

def print_metrics(conf_matrix):
    print(f"Accuracy: {accuracy(conf_matrix):.3f}")
    print(f"Specificity: {specificity(conf_matrix):.3f}")
    print(f"Precision: {precision(conf_matrix):.3f}")
    print(f"Sensitivity (Recall): {sensit_recall(conf_matrix):.3f}")

    mcc = calculate_mcc(conf_matrix)
    print(f"\nMCC: {mcc:.3f}")


In [52]:
class ModelTraining:
    def __init__(self, estimator, scaler, data_X, data_y, random_state=42):
        self.data_X, self.data_y = data_X, data_y
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(data_X, data_y, test_size=0.33, random_state=random_state)
        
        self.original_estimator = clone(estimator)
        self.estimator = estimator
        self.original_scaler = clone(scaler)
        self.scaler = scaler

        self.feature_columns = data_X.columns

    def fit_scaler(self, X=None):
        if X is None:
            X = self.X_train
        return self.scaler.fit_transform(X)

    def scale_features(self, X=None):
        if X is None:
            X = self.X_test
        
        return self.scaler.transform(X)
    
    def train(self, X=None, y=None):
        if X is None or y is None:
            X = self.X_train
            y = self.y_train

        self.estimator.fit(X, y)

    def cross_validate(self, X=None, y=None, cv_model=None, random_state=42, select_features=10, features_strict=True, repeats=2):
        if X is None or y is None:
            X = self.data_X
            y = self.data_y
        if cv_model is None:
            cv_model = RepeatedKFold(n_splits=3, n_repeats=repeats, random_state=random_state)
        
        confusion_matrices = []

        self.estimator = clone(self.original_estimator)

        for i, (train, test) in enumerate(cv_model.split(X, y)):
            train_X, train_y = X.iloc[train, :], y[train]
            test_X, test_y = X.iloc[test, :], y[test]

            self.scaler = clone(self.original_scaler)
            train_X = self.fit_scaler(train_X)
            test_X = self.scale_features(test_X)

            # Feature Selection
            self.estimator = clone(self.original_estimator)
            print(f"{i+1}. Split: Selecting features")
            feature_mask = self.select_features(train_X, train_y, select_features=select_features, features_strict=features_strict, random_seed=random_state)
            train_X = train_X[:, feature_mask]
            test_X = test_X[:, feature_mask]

            estimator = clone(self.original_estimator)

            estimator.fit(train_X, train_y)
            test_predictions = estimator.predict(test_X)

            confusion_matrices.append(confusion_matrix(test_y, test_predictions))
        
        accuracies = [accuracy(conf_matrix) for conf_matrix in confusion_matrices]
        print("Results on test splits:")
        print("Accuracies:", accuracies)

        conf_matrix = np.array(confusion_matrices).mean(axis=0)
        print("Averaged confusion matrix:\n", conf_matrix)
        
        print_metrics(conf_matrix)
        return conf_matrix

    
    def predict(self, X=None):
        if X is None:
            X = self.X_test
        return self.estimator.predict(X)
    
    def select_features(self, X=None, y=None, step=0.01, select_features=10, features_strict=True, random_seed=None):
        # By default, RFECV returns at least min_features_to_select (but can return more)
        # This can be avoided by setting features_strict=True

        if X is None or y is None:
            X = self.X_train
            y = self.y_train

        self.selector = RFECV(self.estimator, step=step, min_features_to_select=select_features)
        self.selector.fit(X, y)

        if features_strict:
            worst_ranking = np.sort(self.selector.ranking_)[select_features]
            best_features_mask = self.selector.ranking_ <= worst_ranking
            if random_seed is None:
                # Important: this can remove optimal features
                return best_features_mask & (np.cumsum(best_features_mask) <= select_features)
            else:
                np.random.seed(random_seed)
                true_indices = np.where(best_features_mask)[0]
                selected_indices = np.random.choice(true_indices, size=select_features, replace=False)
                new_mask = np.zeros_like(best_features_mask, dtype=bool)
                new_mask[selected_indices] = True
                return new_mask

        return self.selector.support_


    def train_and_evaluate(self, select_features=10, features_strict=True):
        self.X_train = self.fit_scaler()
        print("Scaler fitted")
        self.X_test = self.scale_features()
        print("Test features scaled")

        # Feature selection
        if select_features is not None:
            feature_mask = self.select_features(self.X_train, select_features=select_features, features_strict=features_strict)
            self.X_train = self.X_train[:, feature_mask]
            self.X_test = self.X_test[:, feature_mask]
            
            print(f"Selected {sum(feature_mask)} features.")

            if len(feature_mask) == len(self.feature_columns):
                print("Selected features:")
                print("\n".join(self.feature_columns[feature_mask]))

        # Model Training
        self.estimator = clone(self.original_estimator)
        self.train(self.X_train, self.y_train)
        print("Training finished")
        
        # Training results
        train_predictions = self.predict(self.X_train)
        train_matrix = confusion_matrix(self.y_train, train_predictions)
        print("\nTraining results:")
        print_metrics(train_matrix)

        # Test results
        test_predictions = self.predict(self.X_test)
        test_matrix = confusion_matrix(self.y_test, test_predictions)
        print("\n\nTest:")
        print("Confusion Matrix:")
        print(test_matrix)

        print_metrics(test_matrix)

## Logistic Regression

In [53]:
conf_matrices_log_reg = []
solvers_penalties = [
    ("saga", "elasticnet"),
    ("saga", "l1"),
    ("lbfgs", "l2"),
    ("sag", "l2"), 
    ("liblinear", "l2"),
    ("liblinear", "l1")
]

for solver, penalty in solvers_penalties:
    
    
    if penalty == "elasticnet":
        l1_ratio = 0.3
    else:
        l1_ratio = None

    #for random_state in range(20, 150, 10):
    log_reg = ModelTraining(LogisticRegression(solver=solver, penalty=penalty, l1_ratio=l1_ratio), StandardScaler(), data_X=X, data_y=y)

    print("Logistic Regression:")
    conf_matrix_log_reg = log_reg.cross_validate(X, y, select_features=140, random_state=42, repeats=5)
    
    conf_matrices_log_reg.append(np.round(conf_matrix_log_reg, 2))

    print_metrics(conf_matrix_log_reg)


Logistic Regression:
1. Split: Selecting features




2. Split: Selecting features




3. Split: Selecting features




4. Split: Selecting features




5. Split: Selecting features




6. Split: Selecting features




7. Split: Selecting features




8. Split: Selecting features




9. Split: Selecting features




10. Split: Selecting features




11. Split: Selecting features




12. Split: Selecting features




13. Split: Selecting features




14. Split: Selecting features




15. Split: Selecting features




Results on test splits:
Accuracies: [0.8, 0.7, 0.75, 0.6, 0.85, 0.8, 0.7, 0.75, 0.7, 0.7, 0.75, 0.6, 0.8, 0.65, 0.8]
Averaged confusion matrix:
 [[7.73333333 2.26666667]
 [3.13333333 6.86666667]]
Accuracy: 0.730
Specificity: 0.773
Precision: 0.752
Sensitivity (Recall): 0.687

MCC: 0.462
Accuracy: 0.730
Specificity: 0.773
Precision: 0.752
Sensitivity (Recall): 0.687

MCC: 0.462
Logistic Regression:
1. Split: Selecting features




2. Split: Selecting features




3. Split: Selecting features




4. Split: Selecting features




5. Split: Selecting features




6. Split: Selecting features




7. Split: Selecting features




8. Split: Selecting features




9. Split: Selecting features




10. Split: Selecting features




11. Split: Selecting features




12. Split: Selecting features




13. Split: Selecting features




14. Split: Selecting features




15. Split: Selecting features




Results on test splits:
Accuracies: [0.8, 0.75, 0.7, 0.6, 0.75, 0.75, 0.8, 0.75, 0.7, 0.65, 0.65, 0.65, 0.8, 0.7, 0.75]
Averaged confusion matrix:
 [[7.4 2.6]
 [3.  7. ]]
Accuracy: 0.720
Specificity: 0.740
Precision: 0.729
Sensitivity (Recall): 0.700

MCC: 0.440
Accuracy: 0.720
Specificity: 0.740
Precision: 0.729
Sensitivity (Recall): 0.700

MCC: 0.440
Logistic Regression:
1. Split: Selecting features
2. Split: Selecting features
3. Split: Selecting features
4. Split: Selecting features
5. Split: Selecting features
6. Split: Selecting features
7. Split: Selecting features
8. Split: Selecting features
9. Split: Selecting features
10. Split: Selecting features
11. Split: Selecting features
12. Split: Selecting features
13. Split: Selecting features
14. Split: Selecting features
15. Split: Selecting features
Results on test splits:
Accuracies: [0.75, 0.7, 0.8, 0.7, 0.75, 0.8, 0.7, 0.75, 0.65, 0.7, 0.8, 0.65, 0.75, 0.65, 0.75]
Averaged confusion matrix:
 [[7.4        2.6       ]
 [2.866666



2. Split: Selecting features




3. Split: Selecting features




4. Split: Selecting features




5. Split: Selecting features




6. Split: Selecting features




7. Split: Selecting features




8. Split: Selecting features




9. Split: Selecting features




10. Split: Selecting features




11. Split: Selecting features




12. Split: Selecting features




13. Split: Selecting features




14. Split: Selecting features




15. Split: Selecting features




Results on test splits:
Accuracies: [0.75, 0.7, 0.8, 0.7, 0.65, 0.8, 0.7, 0.75, 0.7, 0.7, 0.8, 0.65, 0.75, 0.7, 0.8]
Averaged confusion matrix:
 [[7.33333333 2.66666667]
 [2.73333333 7.26666667]]
Accuracy: 0.730
Specificity: 0.733
Precision: 0.732
Sensitivity (Recall): 0.727

MCC: 0.460
Accuracy: 0.730
Specificity: 0.733
Precision: 0.732
Sensitivity (Recall): 0.727

MCC: 0.460
Logistic Regression:
1. Split: Selecting features
2. Split: Selecting features
3. Split: Selecting features
4. Split: Selecting features
5. Split: Selecting features
6. Split: Selecting features
7. Split: Selecting features
8. Split: Selecting features
9. Split: Selecting features
10. Split: Selecting features
11. Split: Selecting features
12. Split: Selecting features
13. Split: Selecting features
14. Split: Selecting features
15. Split: Selecting features
Results on test splits:
Accuracies: [0.75, 0.75, 0.8, 0.75, 0.7, 0.8, 0.75, 0.8, 0.7, 0.7, 0.75, 0.65, 0.75, 0.7, 0.8]
Averaged confusion matrix:
 [[7.5333333

In [54]:
for matrix, params in zip(conf_matrices_log_reg, solvers_penalties):
    print(params)
    print_metrics(matrix)

('saga', 'elasticnet')
Accuracy: 0.730
Specificity: 0.773
Precision: 0.752
Sensitivity (Recall): 0.687

MCC: 0.462
('saga', 'l1')
Accuracy: 0.720
Specificity: 0.740
Precision: 0.729
Sensitivity (Recall): 0.700

MCC: 0.440
('lbfgs', 'l2')
Accuracy: 0.727
Specificity: 0.740
Precision: 0.733
Sensitivity (Recall): 0.713

MCC: 0.453
('sag', 'l2')
Accuracy: 0.730
Specificity: 0.733
Precision: 0.731
Sensitivity (Recall): 0.727

MCC: 0.460
('liblinear', 'l2')
Accuracy: 0.743
Specificity: 0.753
Precision: 0.748
Sensitivity (Recall): 0.733

MCC: 0.486
('liblinear', 'l1')
Accuracy: 0.730
Specificity: 0.740
Precision: 0.735
Sensitivity (Recall): 0.720

MCC: 0.460


## Support Vector Machines

In [55]:
conf_matrices_svm = []

#penalties_losses = [("l2", "squared_hinge"), ("l2", "hinge"), ("l1", "squared_hinge")]
# l1 penalty with squared_hinge loss does not converge

penalties_losses = [("l2", "squared_hinge"), ("l2", "hinge")]

tol = 0.0001
select_features = 140

for penalty, loss in penalties_losses:
    dual = "auto"
    if penalty == "l1" or loss == "hinge":
        dual = False

    if penalty == "l2":
        dual = True
    

    print(f"Training with penalty {penalty} and loss {loss}...")
    svm = ModelTraining(LinearSVC(penalty=penalty, loss=loss, dual=dual, tol=tol), StandardScaler(), data_X=X, data_y=y)

    conf_matrix = svm.cross_validate(X, y, select_features=select_features, random_state=42, repeats=9)
    print_metrics(conf_matrix)
    print("\n\n")
    conf_matrices_svm.append(conf_matrix)

Training with penalty l2 and loss squared_hinge...
1. Split: Selecting features
2. Split: Selecting features
3. Split: Selecting features
4. Split: Selecting features
5. Split: Selecting features
6. Split: Selecting features
7. Split: Selecting features
8. Split: Selecting features
9. Split: Selecting features
10. Split: Selecting features
11. Split: Selecting features
12. Split: Selecting features
13. Split: Selecting features
14. Split: Selecting features
15. Split: Selecting features
16. Split: Selecting features
17. Split: Selecting features
18. Split: Selecting features
19. Split: Selecting features
20. Split: Selecting features
21. Split: Selecting features
22. Split: Selecting features
23. Split: Selecting features
24. Split: Selecting features
25. Split: Selecting features
26. Split: Selecting features
27. Split: Selecting features
Results on test splits:
Accuracies: [0.8, 0.7, 0.8, 0.65, 0.75, 0.75, 0.7, 0.75, 0.7, 0.75, 0.8, 0.6, 0.75, 0.7, 0.75, 0.8, 0.7, 0.75, 0.7, 0.75, 0.

In [56]:
for matrix, params in zip(conf_matrices_svm, penalties_losses):
    print(params)
    print_metrics(matrix)
    print("\n\n")

('l2', 'squared_hinge')
Accuracy: 0.728
Specificity: 0.737
Precision: 0.732
Sensitivity (Recall): 0.719

MCC: 0.456



('l2', 'hinge')
Accuracy: 0.731
Specificity: 0.767
Precision: 0.749
Sensitivity (Recall): 0.696

MCC: 0.464





## Random Forest Classifier

In [57]:
clf = RandomForestClassifier(max_depth=4, n_estimators=80, random_state=42)
rfc = ModelTraining(clf, StandardScaler(), data_X=X, data_y=y)

print("Random Forest Classifiers:")
conf_matrix_rfc = rfc.cross_validate(X, y, select_features=10, random_state=42)

Random Forest Classifiers:
1. Split: Selecting features
2. Split: Selecting features
3. Split: Selecting features
4. Split: Selecting features
5. Split: Selecting features
6. Split: Selecting features
Results on test splits:
Accuracies: [0.7, 0.65, 0.7, 0.75, 0.65, 0.8]
Averaged confusion matrix:
 [[6.5        3.5       ]
 [2.33333333 7.66666667]]
Accuracy: 0.708
Specificity: 0.650
Precision: 0.687
Sensitivity (Recall): 0.767

MCC: 0.420


In [58]:
print("Random Forest Classifier:")
clf = RandomForestClassifier(max_depth=4, n_estimators=25, random_state=42)
rfc = ModelTraining(clf, StandardScaler(), data_X=X, data_y=y)
rfc.train_and_evaluate(select_features=10)

Random Forest Classifier:
Scaler fitted
Test features scaled
Selected 10 features.
Selected features:
original_firstorder_Kurtosis_DWI_bias_brain_segmentation
original_firstorder_Kurtosis_DWI_bias_tumor_segmentation
original_firstorder_Kurtosis_T1c_bias_brain_segmentation
original_firstorder_Kurtosis_T1c_bias_tumor_segmentation
original_firstorder_Maximum_ADC_brain_segmentation
original_firstorder_Maximum_ADC_tumor_segmentation
original_firstorder_Maximum_DTI_eddy_MD_brain_segmentation
original_firstorder_Maximum_DTI_eddy_MD_tumor_segmentation
original_firstorder_Maximum_DWI_bias_brain_segmentation
original_firstorder_Maximum_DWI_bias_tumor_segmentation
Training finished

Training results:
Accuracy: 0.975
Specificity: 1.000
Precision: 1.000
Sensitivity (Recall): 0.952

MCC: 0.951


Test:
Confusion Matrix:
[[7 4]
 [3 6]]
Accuracy: 0.650
Specificity: 0.636
Precision: 0.600
Sensitivity (Recall): 0.667

MCC: 0.302
