In [41]:
import sys; sys.path.append("..")
import warnings; warnings.filterwarnings('ignore')

from core import * 
from data_manipulation import Transform, RandomRotation, Flip, RandomCrop, normalize_imagenet, normalize_mura, center_crop
from utils import save_model, load_model, lr_loss_plot
from architectures import DenseNet121
from train_functions import OptimizerWrapper, TrainingPolicy, FinderPolicy, validate_multilabel, lr_finder, validate_binary, TTA_binary
import json

# Experiment configuration.
# EPOCHS, FREEZE, GRADUAL_UNFREEZING and n_samples are not referenced by the
# cells visible in this part of the notebook; presumably they are consumed by
# later training cells -- TODO confirm.
SEED = 42
R_PIX = 8
# Position of the target finding inside the space-separated `Label` string.
IDX = 10 # Emphysema
BATCH_SIZE = 16
EPOCHS = 30
# Augmentations handed to the data loaders; RandomCrop is configured with R_PIX.
TRANSFORMATIONS = [RandomRotation(arc_width=20), Flip(), RandomCrop(r_pix=R_PIX)]
NORMALIZE = True # ImageNet
FREEZE = True
GRADUAL_UNFREEZING = True
n_samples = [50,100,200,400,600,800,1000]



# Data locations, expressed relative to the notebook's working directory.
BASE_PATH = Path('../..')
PATH = BASE_PATH/'data'
CHESTXRAY_FOLDER = PATH/'ChestXRay-250'
# NOTE(review): 'ChesXPert-250' is spelled differently from the
# 'CheXpert-v1.0-small' CSV directory used below -- confirm the folder name on disk.
CHEXPERT_FOLDER = PATH/'ChesXPert-250'

SAVE_DIRECTORY = Path('./models')

In [31]:
# Labeled splits: CSV files stored directly under PATH.
chesxray_train_df = pd.read_csv(PATH/"train_df.csv")
chesxray_valid_df = pd.read_csv(PATH/"val_df.csv")
chesxray_test_df = pd.read_csv(PATH/"test_df.csv")


# CheXpert index files (used further down as the unlabeled pool).
chexpert_train_df = pd.read_csv(PATH/"CheXpert-v1.0-small/train.csv")
chexpert_valid_df = pd.read_csv(PATH/"CheXpert-v1.0-small/valid.csv")

# Keep frontal views only. The filter is applied to the CheXpert frames loaded
# just above: the previous code filtered `train_df` / `valid_df`, names that are
# never defined in this notebook, so the cell failed on a fresh kernel.
chexpert_train_df = chexpert_train_df[chexpert_train_df['Frontal/Lateral']=="Frontal"]
chexpert_valid_df = chexpert_valid_df[chexpert_valid_df['Frontal/Lateral']=="Frontal"]

# Subsetting the labeled data frames

In [64]:
def decode_labels(df_col):
    """Convert a column of space-separated label strings into an integer array.

    Args:
        df_col: pandas Series of strings such as ``"0 1 0"``.

    Returns:
        Integer ``np.ndarray`` with one row per entry of ``df_col`` and one
        column per space-separated token.
    """
    token_lists = df_col.str.split(' ')
    rows = [np.array(tokens) for tokens in token_lists]
    return np.array(rows).astype(int)

def subset_df(df, amt=None, idx=IDX, random_state=SEED):
    """Draw a class-balanced subset of `df` for the label at position `idx`.

    Args:
        df: data frame with a `Label` column of space-separated label strings.
        amt: total number of rows wanted; `amt // 2` negatives and `amt // 2`
            positives are sampled (so an odd `amt` yields `amt - 1` rows).
            When None, twice the sum of the label column is used.
        idx: position of the target label inside each `Label` string.
        random_state: seed forwarded to `DataFrame.sample`. Defaults to the
            notebook-wide SEED so the subsets are identical on every run;
            pass None to get a fresh random draw.

    Returns:
        Data frame holding the sampled negatives followed by the sampled
        positives, with a fresh 0..n-1 index.

    Raises:
        ValueError: raised by pandas when either class has fewer than
            `amt // 2` rows (sampling is without replacement).
    """
    lbls = decode_labels(df.Label)

    if amt is None: amt = 2*lbls[:,idx].sum()
    per_class = amt//2

    pos_idxs = lbls[:,idx].astype(bool)

    # Seeded sampling: without it the "balanced" validation/test sets change
    # between runs and metrics are not comparable.
    neg = df[~pos_idxs].sample(n=per_class, replace=False, random_state=random_state)
    pos = df[pos_idxs].sample(n=per_class, replace=False, random_state=random_state)

    return pd.concat([neg, pos]).reset_index(drop=True)

In [65]:
# Validation and test: `amt` is left at its default (None), so subset_df
# derives the subset size from the number of positives in each frame.
chesxray_valid_df_balanced = subset_df(chesxray_valid_df, idx=IDX)
chesxray_test_df_balanced = subset_df(chesxray_test_df, idx=IDX)

# Training: balanced subset with a fixed total size.
amt = 1000
chesxray_train_df_balanced = subset_df(chesxray_train_df, amt=amt, idx=IDX)

# Datasets

In [66]:
class LabeledDataSet(Dataset):
    """
    Image dataset with one binary label per image.

    Args:
        df: data frame with an `ImageIndex` column (image file names) and a
            `Label` column (space-separated label strings).
        image_path: directory (path-like supporting `/`) containing the images.
        idx: position inside each `Label` string of the label to expose.
    """

    def __init__(self, df, image_path, idx):
        self.image_files = df["ImageIndex"].values
        # The attribute keeps its historical spelling (`lables`) so existing
        # code reading it keeps working.
        self.lables = np.array([obs.split(" ")[idx]
                                for obs in df.Label]).astype(np.float32)
        self.image_path = image_path

    def __getitem__(self, index):
        """Return `(image, label)`: RGB float32 image divided by 255, and its label.

        Raises:
            FileNotFoundError: if the image file is missing or cannot be decoded.
        """
        path = self.image_path / self.image_files[index]
        img = cv2.imread(str(path))
        # cv2.imread signals failure by returning None instead of raising;
        # fail here with the offending path rather than with an opaque
        # AttributeError on `.astype`.
        if img is None:
            raise FileNotFoundError(f"Could not read image: {path}")
        x = cv2.cvtColor(img.astype(np.float32), cv2.COLOR_BGR2RGB) / 255
        y = self.lables[index]
        return x, y

    def __len__(self):
        return len(self.image_files)
    
    
class UnlabeledDataSet(Dataset):
    """
    Image dataset without labels.

    Args:
        df: data frame with a `Path` column of '/'-separated relative paths.
            The first path component is dropped and the remaining ones are
            joined with '_' to form the file name looked up in `image_path`.
        image_path: directory (path-like supporting `/`) containing the images.
    """

    def __init__(self, df, image_path):
        self.image_files = ['_'.join(p.split('/')[1:]) for p in df["Path"].values]
        self.image_path = image_path

    def __getitem__(self, index):
        """Return `(image, None)`: RGB float32 image divided by 255, no label.

        Raises:
            FileNotFoundError: if the image file is missing or cannot be decoded.
        """
        path = self.image_path / self.image_files[index]
        img = cv2.imread(str(path))
        # cv2.imread signals failure by returning None instead of raising;
        # fail here with the offending path rather than with an opaque
        # AttributeError on `.astype`.
        if img is None:
            raise FileNotFoundError(f"Could not read image: {path}")
        x = cv2.cvtColor(img.astype(np.float32), cv2.COLOR_BGR2RGB) / 255

        return x, None

    def __len__(self):
        return len(self.image_files)

# Transformations

In [85]:
class UnlabeledTransform():
    """ Wraps an unlabeled image dataset and yields (augmented, reference) pairs.

    For every index the wrapped image is returned twice: once after the random
    transformations (or a center crop when no transformations are given) and
    once after a plain center crop. Both are optionally normalized and returned
    channel-first.

    Args:

        dataset: base dataset whose items are `(image, label)` tuples; the label
            is ignored
        transforms: list of transformation objects involving randomness, each
            exposing `set_random_choices(N, x_shape)` and being callable as
            `f(image, **choices_for_index)`; None disables augmentation
        normalize: True selects `normalize_imagenet`, 'MURA' selects
            `normalize_mura`, anything else disables normalization
        seed: value passed to `np.random.seed` (global NumPy RNG state)
        r_pix: forwarded to `center_crop`

        Ex:
            ds_transform = UnlabeledTransform(ds, [RandomRotation(arc_width=20), Flip()])

    """

    def __init__(self, dataset, transforms=None, normalize=True, seed=42, r_pix=8):
        self.dataset, self.transforms = dataset, transforms

        if normalize is True: self.normalize = normalize_imagenet
        elif normalize=='MURA': self.normalize = normalize_mura
        else: self.normalize = False

        self.center_crop = partial(center_crop, r_pix=r_pix)

        np.random.seed(seed)

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, index):
        """
        Build the (augmented, reference) pair for `index`.

        When transformations are configured, `set_random_choices` must have
        been called beforehand: the per-index arguments of every transformation
        are read from `self.choices`.

        Returns:
            Tuple `(out, data)`, both with the channel axis moved to the front.
        """
        data, label = self.dataset[index]

        out = np.copy(data)

        if self.transforms:
            for choices, f in zip(self.choices, self.transforms):
                # Select this index's pre-drawn random arguments.
                args = {k: v[index] for k, v in choices.items()}
                out = f(out, **args)
        else:
            out=self.center_crop(im=out)

        data = self.center_crop(data)

        if self.normalize:
            out = self.normalize(out)
            data = self.normalize(data)

        return np.rollaxis(out, 2), np.rollaxis(data, 2)


    def set_random_choices(self):
        """
        To be called at the beginning of every epoch to generate the random
        numbers for all iterations and transformations.

        With no transformations configured this only resets `self.choices` to
        an empty list (previously it raised TypeError when `transforms` was None).
        """
        self.choices = []
        if not self.transforms:
            return

        x_shape = self.dataset[0][0].shape
        N = len(self)

        for t in self.transforms:
            self.choices.append(t.set_random_choices(N, x_shape))
  

# Wrapper & DataLoader

In [86]:
class DataBatches:
    '''
    Builds a dataset and a DataLoader from a data frame.

    Args:
        df: data frame describing the images (columns depend on `problem_type`).
        transforms: list of random transformations, or None.
        shuffle: forwarded to the DataLoader.
        img_folder_path: directory containing the image files.
        idx: label position used for the supervised dataset.
        batch_size, num_workers, drop_last: forwarded to the DataLoader.
        r_pix, normalize, seed: forwarded to the transform wrapper.
        problem_type: 'supervised' wraps a LabeledDataSet in Transform;
            'unsupervised' wraps an UnlabeledDataSet in UnlabeledTransform.

    Raises:
        ValueError: if `problem_type` is neither 'supervised' nor 'unsupervised'.
    '''

    def __init__(self, df, transforms, shuffle, img_folder_path, idx=IDX, batch_size=16, num_workers=8,
                 drop_last=False, r_pix=8, normalize=True, seed=42, problem_type='supervised'):

        if problem_type=='supervised':
            self.dataset = Transform(LabeledDataSet(df, image_path=img_folder_path, idx=idx),
                                     transforms=transforms, normalize=normalize, seed=seed, r_pix=r_pix)
        elif problem_type=='unsupervised':
            self.dataset = UnlabeledTransform(UnlabeledDataSet(df, image_path=img_folder_path),
                                     transforms=transforms, normalize=normalize, seed=seed, r_pix=r_pix)
        else:
            # Previously an unknown value left `self.dataset` unset and failed
            # on the next line with an unrelated AttributeError.
            raise ValueError(
                f"problem_type must be 'supervised' or 'unsupervised', got {problem_type!r}")
        self.dataloader = DataLoader(
            self.dataset, batch_size=batch_size, num_workers=num_workers, pin_memory=True,
            shuffle=shuffle, drop_last=drop_last
        )

    # Each batch is moved to the GPU and cast to float32 on the fly.
    def __iter__(self): return ((x.cuda().float(), y.cuda().float()) for (x, y) in self.dataloader)

    # Number of batches, not number of samples.
    def __len__(self): return len(self.dataloader)

    def set_random_choices(self):
        '''Redraw the random augmentation parameters if the dataset supports it.'''
        if hasattr(self.dataset, "set_random_choices"): self.dataset.set_random_choices()


In [87]:
# Labeled training loader. Batch size, crop margin, normalization and seed come
# from the configuration cell instead of repeating their literal values here,
# so editing the configuration actually changes the loader.
labeled_training = DataBatches(
    chesxray_train_df_balanced, TRANSFORMATIONS, idx=IDX, shuffle=True,
    img_folder_path=CHESTXRAY_FOLDER, batch_size=BATCH_SIZE, num_workers=8,
    drop_last=False, r_pix=R_PIX, normalize=NORMALIZE, seed=SEED,
    problem_type='supervised',
)

In [88]:
# Smoke test: draw the augmentation parameters, pull one batch and show the
# shapes of the image tensor and the label tensor.
labeled_training.set_random_choices()
x, y = next(iter(labeled_training))
print(*(t.shape for t in (x, y)))

torch.Size([16, 3, 234, 234]) torch.Size([16])


In [89]:
# Unlabeled training loader. Batch size, crop margin, normalization and seed
# come from the configuration cell instead of repeating their literal values
# here, so editing the configuration actually changes the loader.
unlabeled_training = DataBatches(
    chexpert_train_df, TRANSFORMATIONS, shuffle=True,
    img_folder_path=CHEXPERT_FOLDER, batch_size=BATCH_SIZE, num_workers=8,
    drop_last=False, r_pix=R_PIX, normalize=NORMALIZE, seed=SEED,
    problem_type='unsupervised',
)

In [90]:
# Smoke test: the per-index augmentation parameters have to be drawn before the
# first batch is requested; then show the shapes of the two returned tensors.
unlabeled_training.set_random_choices()
x, y = next(iter(unlabeled_training))
print(*(t.shape for t in (x, y)))

torch.Size([16, 3, 184, 230]) torch.Size([16, 3, 184, 230])
