In [None]:
import datetime
import os
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pydicom as py
import radiomics
import scipy
import SimpleITK as sitk
import skimage as sk
from radiomics import featureextractor
from scipy import stats
from scipy.stats import skew, kurtosis
from scipy.stats import gmean
from umap import UMAP

from sklearn.decomposition import IncrementalPCA
from sklearn.ensemble import RandomForestClassifier
#from sklearn import metrics
# Parenthesised so the name list may legally span several lines.
from sklearn.metrics import (
    make_scorer, confusion_matrix, precision_score, recall_score, accuracy_score,
    f1_score, roc_auc_score, balanced_accuracy_score, matthews_corrcoef,
)
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler

In [None]:
# Parent directory whose numerically named sub-folders hold the processed files.
root = "the root/path of the folders of the processed files"

# Keep only the all-digit entries and order them numerically (2 before 10).
folders = [int(entry) for entry in os.listdir(root) if entry.isdigit()]
folders.sort()

print("\nFolder names:", folders)
print("\nNumber of images/folders:", len(folders))

In [None]:
def _numbered_npy_files(directory):
    """Return the sorted integer stems of the ``<integer>.npy`` files in `directory`."""
    stems = []
    for name in os.listdir(directory):
        stem, ext = os.path.splitext(name)
        # Skip sub-directories and stray files (e.g. "5.txt"); only "<digits>.npy" counts.
        if ext == ".npy" and stem.isdecimal():
            stems.append(int(stem))
    return sorted(stems)


def read(f, base_dir=None):
    """Load the image and mask arrays stored on disk for one folder.

    Layout read from disk (``<n>`` is an integer file stem):

        <base_dir>/<f>/<n>.npy                   images
        <base_dir>/<f>/modified_masks/<n>.npy    masks

    Files are loaded in ascending numeric order of ``<n>``. Anything that is
    not a ``.npy`` file with an all-decimal stem is ignored.

    Parameters
    ----------
    f : str or int
        Folder name below ``base_dir``.
    base_dir : str, optional
        Parent directory. Defaults to the notebook-level ``root``.

    Returns
    -------
    tuple(list, list)
        ``(images, masks)``. Loading is best-effort: problems are printed
        rather than raised, and each list holds whatever was loaded before the
        problem occurred. Masks are only attempted once every image loaded.
    """
    print("\n")
    if base_dir is None:
        base_dir = root
    # os.path.join works whether or not base_dir ends with a separator.
    path = os.path.join(base_dir, str(f))
    mask_dir = os.path.join(path, "modified_masks")
    images, masks = [], []

    try:
        image_ids = _numbered_npy_files(path)
    except OSError as e:
        print(f"No files exist in the given path= {path}. Error: {e}")
        return images, masks

    try:
        for n in image_ids:
            images.append(np.load(os.path.join(path, f"{n}.npy")))
    except Exception as e:
        print(f"Folder {f}: images not loaded. Error: {e}")
        return images, masks
    print(f"Folder {f}: images loaded.")

    try:
        mask_ids = _numbered_npy_files(mask_dir)
    except OSError as e:
        print(f"No masks exist in the given path= {mask_dir}. Error: {e}")
        return images, masks

    try:
        for n in mask_ids:
            masks.append(np.load(os.path.join(mask_dir, f"{n}.npy")))
    except Exception as e:
        print(f"Folder {f}: masks not loaded. Error: {e}")
        return images, masks
    print(f"Folder {f}: masks loaded.")

    # Images and masks are paired by position later on, so flag unequal counts.
    if len(images) != len(masks):
        print(f"Folder {f}: WARNING: {len(images)} images but {len(masks)} masks.")
    return images, masks

In [None]:
#Read images and masks for every folder, then keep them in two separate lists.

# Each entry of `files` is the (images, masks) pair returned by read() for one folder.
files = [read(str(folder)) for folder in folders]

# Parallel lists: position k in both belongs to folders[k].
images_th = [pair[0] for pair in files]
masks_th = [pair[1] for pair in files]

In [None]:
#Check total images: number of loaded image arrays summed over all folders.

total_images = sum(len(folder_images) for folder_images in images_th)

total_images

In [None]:
#Check total masks: number of loaded mask arrays summed over all folders.

total_masks = sum(len(folder_masks) for folder_masks in masks_th)

total_masks

In [None]:
#Example plotting of an image and its mask.
# NOTE(review): the same index selects both the folder and the slice inside it
# (folder 30 / slice 30, folder 31 / slice 31, ...) - confirm this is intended.

for idx in range(30, 50):
    fig, axes = plt.subplots(
        1, 2,
        sharex=True, sharey=True,
        figsize=(10, 9),
        constrained_layout=True,
    )

    # Left panel: image, right panel: its mask.
    panels = ((images_th, "Image"), (masks_th, "Mask"))
    for ax, (stacks, title) in zip(axes, panels):
        ax.imshow(stacks[idx][idx], cmap="gray")
        ax.set_title(title)

    plt.show()

In [None]:
#Images flattened for train data, i.e. the inputs: one flattened slice per row.

flat_slices = [sl.flatten() for image in images_th for sl in image]
x = np.array(flat_slices).astype(np.float32)

print(np.shape(x))
print("Done")

In [None]:
                                                #CHECKS FOR INPUT DATA: IMAGES

In [None]:
#Checking if any image has the same value throughout: Dark images.

if x.ndim != 2:
    raise ValueError(f"Expected x to be a 2-D (slices, pixels) array, got shape {x.shape}")

# Per-row extremes in one vectorised pass instead of copying every row through
# a Python list and re-stacking it.
row_min = x.min(axis=1)
row_max = x.max(axis=1)
is_dark = row_min == row_max

# Plain Python ints, so the list prints and compares like the enumerate() indices did.
dark_x_index = np.flatnonzero(is_dark).tolist()
for k in dark_x_index:
    print(f"Dark image at overall index= {k}: Minimum= {row_min[k]}, Maximum= {row_max[k]}")

# Boolean indexing already returns a new array; copy=False avoids a second copy.
x2 = x[~is_dark].astype(np.float32, copy=False)

print(np.shape(x2))
print("Done")

In [None]:
# Number of slice indices recorded in dark_x_index.
len(dark_x_index)

In [None]:
                                                #CHECKS FOR TARGET DATA: ANNOTATIONS

In [None]:
#To check how many pixels are annotated in each of the modified masks:
# `annotations` holds np.sum of every mask, in folder order then slice order.

annotations = [np.sum(m) for mask in masks_th for m in mask]

print("Done")

In [None]:
# Distinct per-mask sums in `annotations` and how many masks have each sum.
np.unique(annotations, return_counts= True)

In [None]:
# For number of annotated pixels in each of the modified masks, display the first 30 unique values and their counts

unique_sums, mask_counts = np.unique(annotations, return_counts=True)

for pixel_total, mask_total in zip(unique_sums[:30], mask_counts[:30]):
    print("Number of annotated pixels in a mask:", pixel_total, ", Total number of such masks:", mask_total)

In [None]:
#Per-image label lists: a mask is labelled 1 when its pixel sum is >= 10, otherwise 0.

annots = [
    [int(np.sum(m) >= 10) for m in mask]
    for mask in masks_th
]

In [None]:
#See how many are actual masks with bleed (1) per image, next to the total number of masks.
# len(a) alone only gives the slice count; sum(a) counts the entries labelled 1.

for a in annots:
    print(f"{sum(a)} of {len(a)}")

In [None]:
#Annotations are turned into the Target variable: the per-image label lists flattened in order.

y = [label for image_labels in annots for label in image_labels]

np.unique(y, return_counts=True)

In [None]:
                                                    #AFTER REMOVING DARK IMAGES

In [None]:
print(f"\nDark images indices:\n{dark_x_index}")
print(f"\nSize of x after removal of dark images: {np.shape(x2)}")

# Labels are matched to image rows purely by position, so the two sequences
# must be equally long before any index-based removal is meaningful.
if len(y) != len(x):
    raise ValueError(
        f"x has {len(x)} rows but y has {len(y)} labels; images and masks are not aligned."
    )

# Set lookup instead of scanning the index list once per label.
dark_lookup = set(dark_x_index)

y2 = []
for index, value in enumerate(y):
    if index in dark_lookup:
        # Show the label that is dropped together with its dark image.
        print(index, value)
    else:
        y2.append(value)

print(f"Size of y after removal of dark images: {np.shape(y2)}\n")

In [None]:
# Distinct values in y2 and the number of occurrences of each.
np.unique(y2, return_counts= True)

In [None]:
# One column per pixel position, named by its position as a string ("0", "1", ...).
pixel_names = [str(position) for position in range(x2.shape[1])]
x2_df = pd.DataFrame(x2, columns=pixel_names)
x2_df

In [None]:
                                                #PYRADIOMICS for radiomics features

In [None]:
#Reshaping the images to 512x512 format for its use in Pyradiomics

reshaped_x = [flat.reshape((512, 512)) for flat in x2]

np.shape(reshaped_x)

In [None]:
#PYRADIOMICS: first-order, GLCM, GLRLM, GLSZM, GLDM and NGTDM features.
# One helper replaces six copies of the same extraction loop.

background_value = -350


def extract_radiomics_class(images, feature_class, background=background_value):
    """Compute one PyRadiomics feature class for every image.

    Parameters
    ----------
    images : iterable of arrays
        Images to process, one at a time.
    feature_class : str
        Name passed to ``enableFeatureClassByName`` (e.g. 'firstorder', 'glcm').
    background : number, optional
        Pixels equal to this value are excluded from the extraction mask.

    Returns
    -------
    list of list of float
        One row per input image, in input order. If extraction fails for an
        image the error is printed and its row is filled with NaN, so row k
        always belongs to image k.
    """
    # The extractor configuration does not depend on the image: build it once.
    extractor = featureextractor.RadiomicsFeatureExtractor()
    extractor.disableAllFeatures()
    extractor.enableFeatureClassByName(feature_class)

    # Feature keys embed the class name (e.g. "original_glcm_Contrast");
    # selecting by name avoids relying on how many entries trail the result.
    key_tag = f"_{feature_class}_"

    rows = []
    failed = []
    for idx, image_array in enumerate(images):
        try:
            # Step 1: mask of non-background pixels.
            foreground_mask = (image_array != background).astype(np.uint8)

            # Step 2: convert image and mask to SimpleITK images.
            image_sitk = sitk.GetImageFromArray(image_array)
            mask_sitk = sitk.GetImageFromArray(foreground_mask)

            # Step 3: extract and keep only this class's feature values.
            features = extractor.execute(image_sitk, mask_sitk)
            rows.append([float(value) for name, value in features.items() if key_tag in name])
        except Exception as e:
            print(f"Image {idx} ({feature_class}): {e}")
            failed.append(idx)
            rows.append(None)  # placeholder, filled once the row width is known

    # Pad failed images with NaN instead of dropping them, which would shift
    # every later row relative to the image and label order.
    width = next((len(row) for row in rows if row is not None), 0)
    rows = [row if row is not None else [np.nan] * width for row in rows]
    if failed:
        print(f"{feature_class}: extraction failed for image indices {failed}; rows filled with NaN.")

    print("\nDone.")
    return rows


fo_features = extract_radiomics_class(reshaped_x, 'firstorder')
glcm_features = extract_radiomics_class(reshaped_x, 'glcm')
glrlm_features = extract_radiomics_class(reshaped_x, 'glrlm')
glszm_features = extract_radiomics_class(reshaped_x, 'glszm')
gldm_features = extract_radiomics_class(reshaped_x, 'gldm')
ngtdm_features = extract_radiomics_class(reshaped_x, 'ngtdm')

In [None]:
# Wrap each radiomics feature list in a DataFrame before concatenating.
# Column names are "<class>_<position>" strings so they stay unique across
# classes and do not mix integer and string labels with the pixel columns.
feature_lists = {
    "firstorder": fo_features,
    "glcm": glcm_features,
    "glrlm": glrlm_features,
    "glszm": glszm_features,
    "gldm": gldm_features,
    "ngtdm": ngtdm_features,
}

feature_frames = {}
for class_name, rows in feature_lists.items():
    # Rows are joined to x2_df by position; unequal lengths would pair
    # features with the wrong image.
    if len(rows) != len(x2_df):
        raise ValueError(
            f"{class_name}: {len(rows)} feature rows for {len(x2_df)} images; "
            "feature extraction did not produce one row per image."
        )
    frame = pd.DataFrame(rows)
    frame.columns = [f"{class_name}_{position}" for position in frame.columns]
    feature_frames[class_name] = frame

first_order, glcm, glrlm, glszm, gldm, ngtdm = (feature_frames[name] for name in feature_lists)

all_features= pd.concat([first_order, glcm, glrlm, glszm, gldm, ngtdm], axis=1)
x2_all= pd.concat([x2_df, all_features], axis= 1)
x2_all

In [None]:
#Dividing the all features-appended data to training, validation and testing
# 70% goes to training; the remaining 30% is split 70/30 again, i.e. about
# 21% validation and 9% testing of the full data. Both splits are stratified.
# NOTE(review): rows are split individually - confirm whether rows from the same
# folder may appear in more than one subset.

xtrain, xtemp, ytrain, ytemp = train_test_split(
    x2_all, y2,
    test_size=0.3,
    stratify=y2,
    random_state=11,
)

xval, xtest, yval, ytest = train_test_split(
    xtemp, ytemp,
    test_size=0.3,
    stratify=ytemp,
    random_state=31,
)

print(f"\nOriginal size= {x2_all.shape}")
print(f"Training size= {xtrain.shape}")
print(f"Validation size= {xval.shape}")
print(f"Testing size= {xtest.shape}")

# Class balance of the full target and of each subset.
print(f"\ny= {np.unique(y2, return_counts= True)}")
print(f"ytrain= {np.unique(ytrain, return_counts= True)}")
print(f"yval= {np.unique(yval, return_counts= True)}")
print(f"ytest= {np.unique(ytest, return_counts= True)}\n")

In [None]:
# Scaling using StandardScaler

# Statistics are learned from the training data only ...
std_scaler = StandardScaler().fit(xtrain)

# ... and then applied unchanged to the training, validation and testing data.
xtrain_std, xval_std, xtest_std = (
    std_scaler.transform(subset) for subset in (xtrain, xval, xtest)
)

print("Standard Scaling Done")

In [None]:
#PCA on the training set and transformation on the validation and testing sets

pca = IncrementalPCA()

# PCA is applied to the first 262144 columns only (the pixel data).
pixel_columns = slice(0, 262144)

# Components are fitted on the training data ...
pca_xtrain = pca.fit_transform(xtrain_std[:, pixel_columns])
print(np.shape(pca_xtrain))

# ... and reused to project the validation data ...
pca_xval = pca.transform(xval_std[:, pixel_columns])
print(np.shape(pca_xval))

# ... and the testing data.
pca_xtest = pca.transform(xtest_std[:, pixel_columns])
print(np.shape(pca_xtest))

In [None]:
# Training frame = PCA-reduced pixel data + scaled radiomics columns
# (similarly do for validation and testing data).

# Columns from position 262144 onwards of the scaled array, with the names
# they carry in xtrain.
data = xtrain_std[:, 262144:]
column_names = xtrain.columns[262144:]

# PCA components get their position as a string column name ("0", "1", ...).
component_names = [str(position) for position in range(pca_xtrain.shape[1])]
pca_xtrain2 = pd.DataFrame(pca_xtrain, columns=component_names)

xtrain_std_f = pd.DataFrame(data, columns=column_names)

# Place both parts side by side.
xtrain_df = pd.concat([pca_xtrain2, xtrain_std_f], axis=1)

xtrain_df