# Validate Masks and Extract Features
## Purpose
- Denoise masks by choosing the largest contiguous segment (the myocardium) from each mask
- Save these masks as DCM images compatible with Pyradiomics
- Drop cases of poor imaging quality or segmentation failure before analysis
- Run Pyradiomics with chosen feature extraction parameters
## Prerequisites
- Please segment the T1 map MRIs from their original `dcm` files before running this notebook. `medsam` was used in this study.
- Ensure segmentations are stored as single-channel, binary `png` masks.
## Data Import
- Import latest phenotyping classifications for the studies
- Import the contours generated by the latest (v4) finetuned MedSAM algorithm
- Specify output directory

In [None]:
from radiomics import featureextractor
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import cv2
import os
import re
import SimpleITK as sitk
from matplotlib.colors import LinearSegmentedColormap
import ipywidgets as widgets
from tqdm import tqdm

In [None]:
# --- User configuration: fill in these paths before running the notebook ---
eligible_patients_path = ''  # CSV listing the UK Biobank EIDs of eligible patients.
imagemask_read_path = ''  # Root directory holding the patient image directories, each named by EID.
outpath = ''  # Directory where results will be stored.
pyradiomics_params_path = 'radiomic_extraction_parameters.yaml'

eligible_patients = pd.read_csv(eligible_patients_path)
os.makedirs(outpath, exist_ok=True)

# Collect every file below the input root. The list is sorted because
# os.walk yields entries in filesystem order, and the review cell below
# records exclusions by position in `all_mask_paths`; sorting keeps those
# positions reproducible between runs and machines.
all_filepaths = sorted(
    os.path.join(root, fname)
    for root, _dirnames, fnames in os.walk(imagemask_read_path)
    for fname in fnames
)
# Literal suffix test: the later cells split paths on exactly this suffix,
# so a regex with an unescaped '.' would accept names they cannot handle.
all_mask_paths = [p for p in all_filepaths if p.endswith('_myocardium.png')]

## Narrow to Largest Contiguous Mask from Each Model Output; Manually Define Image Quality and Segmentation Failures
- Please verify any cases of failed image quality (wrong axis or plane; excessive noise) or failed image segmentation (myocardium not captured in the largest contiguous contour).
    - Place these into the list below and justify.

In [None]:
# Two-stop RGBA colormap used to overlay masks on the grayscale image:
# the low end is fully transparent white, the high end is yellow at 0.7 alpha.
cmap_name = 'transparent_yellow'
colors = [
    (1, 1, 1, 0),
    (1, 1, 0, 0.7),
]
custom_cmap = LinearSegmentedColormap.from_list(cmap_name, colors)

def select_largest_contiguous(mask:np.ndarray) -> np.ndarray:
    """
    `select_largest_contiguous`
    Selects the single largest contiguous mask area from a mask segmentation. Returns this as an `ndarray` of equal size.

    - `mask`: single-channel mask in the 8-bit format `cv2.connectedComponents` accepts; every nonzero pixel counts as foreground.
    - Returns a `uint8` array of the same shape holding 1 inside the largest connected region and 0 everywhere else.
    - A mask without any foreground pixel yields an all-zero array (there is nothing to select) instead of failing inside `np.argmax`.
    - If several regions tie for the largest area, the one with the lowest component label is kept.
    """
    # Nothing segmented: return an empty mask so callers can inspect or flag
    # the case rather than crash on an empty argmax.
    if not np.any(mask):
        return np.zeros(mask.shape, dtype=np.uint8)

    # Label 0 is the background; only the per-pixel label image is needed.
    _n_labels , label_image = cv2.connectedComponents(mask)
    # Pixel count of every foreground label (background excluded by the filter).
    labels , label_areas_px = np.unique(label_image[label_image > 0] , return_counts = True)
    largest_label_key = labels[np.argmax(label_areas_px)]
    return (label_image == largest_label_key).astype(np.uint8)
def plot_largest_contiguous(value) -> None:
    """
    `plot_largest_contiguous`
    Draw the mask stored at position `value` of `all_mask_paths` on top of its source DICOM for visual confirmation.
    The left panel shows the mask as read from disk; the right panel shows only its largest contiguous region.
    """
    mask_path = all_mask_paths[value]
    raw_mask = cv2.imread(mask_path)[:,:,0]  # first channel of the loaded PNG
    largest_region = select_largest_contiguous(raw_mask)

    # The source DICOM shares the mask's path up to the '_myocardium.png' suffix.
    dcm_path = mask_path.split('_myocardium.png')[0] + '.dcm'
    dcm_volume = sitk.GetArrayFromImage(sitk.ReadImage(dcm_path))

    plt.cla()
    background = dcm_volume[0,:,:]  # first slice along the leading axis
    for panel , overlay in enumerate((raw_mask , largest_region) , start = 1):
        plt.subplot(1 , 2 , panel)
        plt.imshow(background , cmap='gray')
        plt.imshow(overlay , cmap=custom_cmap)

# Slider positions are indices into `all_mask_paths`, so the last valid
# position is len - 1 (clamped at 0 so min <= max still holds for an empty
# list). Start at 0 so the first mask is shown and the initial value is in
# range even when only one mask exists.
slider = widgets.IntSlider(min=0, max=max(len(all_mask_paths) - 1, 0), value=0)
widgets.interactive(plot_largest_contiguous, value=slider)

In [None]:
# Positions in `all_mask_paths` (the slider values of the viewer) of the cases
# that failed visual review. Give one entry per line with its justification.
value_indices_to_exclude = [
    -1,#put justification here
]

# Reject entries that are not valid list positions before they are counted or
# applied: such an entry (including the unedited placeholder above) would
# otherwise inflate the reported rate and then fail in the `.loc` assignment.
n_masks = len(all_mask_paths)
invalid_indices = [i for i in value_indices_to_exclude if not 0 <= i < n_masks]
if invalid_indices:
    raise ValueError(
        f'value_indices_to_exclude contains entries that are not valid positions: {invalid_indices} '
        f'({n_masks} masks found, valid positions are 0 to {n_masks - 1}). '
        'Replace the placeholder with the slider positions of the failed cases, or leave the list empty.'
    )

n_excluded = len(value_indices_to_exclude)
# Guard the percentage against an empty mask list.
failure_pct = n_excluded / n_masks * 100 if n_masks else 0.0
print('Segmentation or image quality failure rate:')
print(f'{n_excluded} / {n_masks} failure rate ({failure_pct}%)')

# One row per mask, initially all passing. The frame has the default integer
# index, so row labels coincide with positions in `all_mask_paths`.
shmolli_quality_exclusion_df = pd.DataFrame({
    'mask_path': all_mask_paths,
    'passable_shmolli_quality': np.ones(n_masks , dtype = bool),
})
shmolli_quality_exclusion_df.loc[value_indices_to_exclude , 'passable_shmolli_quality'] = False
# The EID is taken from the name of the directory that contains the mask file.
shmolli_quality_exclusion_df['eid'] = shmolli_quality_exclusion_df.mask_path.apply(
    lambda x : os.path.basename(os.path.dirname(x))
)
pd.set_option('display.max_colwidth', None)  # show full paths in the table below
display(shmolli_quality_exclusion_df[~shmolli_quality_exclusion_df.passable_shmolli_quality])
shmolli_quality_exclusion_df.to_csv(os.path.join(outpath , 'shmolli_quality_exclusion.csv'))

## Generate DCM files for the binary masks with the same physical dimensions as a patient's original DICOM

In [None]:
# Maps each source DICOM path to the path of the DICOM mask written for it;
# the feature-extraction cell iterates over these pairs.
image_mask_pairs = {}
for m in tqdm(all_mask_paths):
    # The source DICOM shares the mask's path up to the '_myocardium.png' suffix.
    dcm_impth = m.split('_myocardium.png')[0] + '.dcm'
    dcm_img = sitk.ReadImage(dcm_impth)
    dcm_size = dcm_img.GetSize()
    dcm_origin = dcm_img.GetOrigin()
    dcm_spacing = dcm_img.GetSpacing()
    dcm_direction = dcm_img.GetDirection()

    # Keep the largest region of the PNG's first channel and add a leading
    # axis of length 1 so the array is 3-D before conversion to an image.
    refinedmask = select_largest_contiguous(cv2.imread(m)[:,:,0])[np.newaxis,:,:]
    refinedmask_dcm = sitk.GetImageFromArray(refinedmask)
    # Explicit check instead of `assert`, which is skipped under `python -O`.
    if refinedmask_dcm.GetSize() != dcm_size:
        raise ValueError(
            f'Mask {m} has size {refinedmask_dcm.GetSize()} but its DICOM {dcm_impth} has size {dcm_size}'
        )
    # Copy the physical geometry of the source image onto the mask.
    refinedmask_dcm.SetOrigin(dcm_origin)
    refinedmask_dcm.SetSpacing(dcm_spacing)
    refinedmask_dcm.SetDirection(dcm_direction)

    # Mirror the mask's location relative to the input root under `outpath`,
    # replacing the extension with '.dcm'. Built with os.path so the result
    # does not depend on trailing separators in either configured path.
    relative_stem = os.path.splitext(os.path.relpath(m , imagemask_read_path))[0]
    output_dcm_mask_path = os.path.join(outpath , relative_stem + '.dcm')
    os.makedirs(os.path.dirname(output_dcm_mask_path) , exist_ok = True)
    sitk.WriteImage(refinedmask_dcm , fileName = output_dcm_mask_path)
    image_mask_pairs[dcm_impth] = output_dcm_mask_path

## Extract features using pyradiomics
- This uses the feature extraction parameters specified in `radiomic_extraction_parameters.yaml`
- !! IMPORTANT: Please run this **twice.**
- After your first run: tune two hyperparameters in `radiomic_extraction_parameters.yaml`:
    - Update your `binWidth` based on `range` such that $n_{bins} \in (16, 128)$
    - Update your `voxelArrayShift` such that $voxelArrayShift=3\sigma$
- Then, rerun extraction with these refined parameters. This extraction should yield a better result.

In [None]:
extractor = featureextractor.RadiomicsFeatureExtractor(pyradiomics_params_path)

# Run the extractor once per (image, mask) pair and keep each result as a
# Series named after the image path. The table is assembled in a single step
# afterwards; joining one column at a time copies the growing table on every
# iteration.
results_by_image = {}
for image , mask in tqdm(image_mask_pairs.items()):
    results_by_image[image] = pd.Series(extractor.execute(image , mask) , name = image)

# Rows are the keys returned by the extractor, columns are the image paths.
result_table = pd.DataFrame(results_by_image)
result_table.to_csv(os.path.join(outpath , 'extracted_features_table.csv'))