In [None]:
import numpy as np
import pandas as pd
import os
import pickle
import torch
import sys

from PIL import Image
from skimage.io import imread
from skimage.transform import resize
from matplotlib import pyplot as plt
from tabulate import tabulate


In [None]:
# Mount Google Drive so the project folder is reachable from the Colab runtime
from google.colab import drive
drive.mount('/content/drive')

# Remember where the kernel started, make the parent directory and the project
# folder importable, then work from inside the project folder so that the
# relative paths used below ("Files/...", "K_fold/...") resolve.
original_path = os.getcwd()
sys.path.extend([
    os.path.join('.', '..'),
    '/content/drive/My Drive/Deep_Learning_Project12/',
])
os.chdir(sys.path[-1])

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Data Import and Wrangling

In [None]:
# Build the `labels` frame (one row per image) and the `domain` frame (one row
# per diagnosis) and derive the target columns used by the dataset class:
#   ts              - one-hot diagnosis
#   features        - multi-hot feature vector taken from the label columns
#   features_label  - integer id of each distinct feature vector
#   domain_features - feature vector that the domain table lists for the diagnosis
# NOTE(review): `data_files` is not referenced again in the visible cells.
data_files = os.listdir("Files")
  
labels = pd.read_csv("Files/dermx_labels.csv")
# Absolute image path, built from the current working directory and image_id.
labels["image_path"] = [os.path.join(os.getcwd(),"Files", "images", f"{x}.jpeg") for x in labels["image_id"]]
labels.drop(columns = "Unnamed: 0", inplace = True)

# NOTE(review): the result of this expression is discarded, so `labels` keeps
# any rows containing NaN and its original index. If dropping was intended, be
# aware that the K-fold cell selects rows by index label from a saved split
# file, so changing the rows/index here may invalidate those splits - verify.
labels.dropna().reset_index(drop = True)
# One-hot encode the "area" column (adds one "area_*" column per value).
labels = pd.get_dummies(labels, columns = ["area"])
# Binarise open_comedo: any value > 0 becomes 1, everything else 0.
labels["open_comedo"] = (labels["open_comedo"] > 0).astype(int)

# NOTE(review): `features_target` is not referenced again in the visible cells;
# the same CSV is read a second time into `domain` below.
features_target = pd.read_csv("Files/diseases_characteristics.csv")
features_target.rename(columns={"Unnamed: 0":"disease"},inplace=True)

# create one-hot for diagnosis and get features
one_hot = pd.get_dummies(labels["diagnosis"])
one_hot_encoding = [list(x) for x in one_hot.values]

labels["ts"] = one_hot_encoding

# get features as multi hot
# The 11 feature columns are picked by POSITION (2..8 and 10..13), so this
# depends on the column order of the CSV after the steps above.
# TODO confirm which columns these positions correspond to.
features_touse = list(labels.columns[list(range(2,9)) + [10,11,12,13]])
labels["features"] = labels.loc[:, features_touse].values.tolist()

# map feature sequences to value
# Each distinct feature vector gets an integer id in order of first appearance.
features_map = {}
for idx, feat in enumerate(labels["features"].apply(tuple).unique()):
  features_map[str(feat)] = idx

labels["features_label"] = labels["features"].apply(tuple).apply(str).map(features_map)

# get domain
domain = pd.read_csv("Files/diseases_characteristics.csv")
domain.rename(columns={"Unnamed: 0":"diagnosis"},inplace=True)
domain = pd.get_dummies(domain, columns = ["area"])
same_sort = ["diagnosis"] + features_touse
domain = domain[same_sort]  # same sorting

domain_one_hot = pd.get_dummies(domain["diagnosis"])

domain_one_hot_encoding = [list(x) for x in domain_one_hot.values]
domain["ts"] = domain_one_hot_encoding
# Columns 1..11 are exactly `features_touse`: column 0 is "diagnosis" and "ts"
# was appended after them.
feature_cols = domain.columns[1:12]
domain["features"] = domain.loc[:,feature_cols].values.tolist()

# add domain features (domain knowledge) to dataframe
# For every image row, look up the feature vector of its diagnosis in `domain`
# (takes the first match; raises IndexError if the diagnosis is not in `domain`).
tf = []
for i, row in labels.iterrows():
  disease = row["diagnosis"]
  true_features = domain.loc[domain.diagnosis == disease].features.tolist()[0]
  tf.append(true_features)
labels["domain_features"] = tf 

domain = domain.sort_values(by="diagnosis").reset_index(drop=True)

# Working copy used by the following cells.
data = labels.copy()

In [None]:
#@title Some helpful functions
from sklearn.utils import class_weight
import ast

def add_no_match(df: pd.DataFrame):
  """Augment `df` with "no_match" rows and rebuild the one-hot target column.

  For every row and every distinct `domain_features` vector that differs from
  the row's own vector, a copy of the row is added with
  `diagnosis = "no_match"` and `domain_features` set to that other vector.
  The "ts" column of the result is then recomputed as the one-hot encoding of
  the (extended) `diagnosis` column.

  Args:
    df: frame with at least the columns "diagnosis" and "domain_features"
      (each cell of "domain_features" being a list).

  Returns:
    A new DataFrame (the input is not modified) with the original rows first,
    followed by the generated "no_match" rows, and a fresh integer index.
  """
  unique_data = [list(x) for x in set(tuple(x) for x in df.domain_features)]

  # Collect the new rows as plain dicts and build the frame once.
  records = []
  for _, row in df.iterrows():
    for candidate in unique_data:
      if row["domain_features"] == candidate:
        continue
      record = row.to_dict()
      record["diagnosis"] = "no_match"
      record["domain_features"] = candidate
      records.append(record)

  # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
  if records:
    extra_rows = pd.DataFrame(records, columns=df.columns)
    updated_df = pd.concat([df, extra_rows], ignore_index=True)
  else:
    updated_df = df.reset_index(drop=True)

  # Update targets "ts". The dtype is pinned so the targets stay integer
  # (pandas >= 2 would otherwise produce booleans).
  new_dummies = pd.get_dummies(updated_df["diagnosis"], dtype="uint8")
  updated_df["ts"] = [list(x) for x in new_dummies.values]

  return updated_df

def unique_lists(data: list):
  """Return the distinct inner sequences of `data` as lists (order not guaranteed)."""
  distinct = {tuple(item) for item in data}
  return list(map(list, distinct))

def calc_multiclass_weights(df: pd.DataFrame, device):
  """Compute 'balanced' class weights for the diagnoses in `df`.

  Args:
    df: frame with a "diagnosis" column.
    device: torch device (or device string) the weight tensor is moved to.

  Returns:
    1-D float32 tensor with one weight per distinct diagnosis, ordered by the
    sorted diagnosis names.
  """
  targets = df.diagnosis.to_list()
  # scikit-learn expects `classes` as an ndarray; recent releases validate the
  # argument type and reject a plain Python list.
  class_names = np.array(sorted(df.diagnosis.unique()))
  csw = class_weight.compute_class_weight('balanced', classes = class_names, y = targets)
  class_weights = torch.tensor(csw,dtype=torch.float).to(device)

  return class_weights

def calc_multilabel_weights(df: pd.DataFrame, device):
  """Return the per-feature negative/positive count ratio as a tensor on `device`.

  The "features" column of `df` is stacked into a (rows x features) matrix;
  for each feature column the weight is (#zeros / #ones).
  """
  feature_matrix = np.vstack(df["features"])
  positives = feature_matrix.sum(axis = 0)
  negatives = len(df) - positives
  return torch.tensor(negatives / positives).to(device)



def save_splits(skf, x, y):
  """Write the train/test index lists produced by `skf.split(x, y)` to a CSV.

  The file "K_fold/splits.csv" gets one row per fold with the columns
  "train" and "test"; the directory is created if it does not exist.

  Args:
    skf: any object with a `split(x, y)` method yielding
      (train_indices, test_indices) pairs.
    x, y: passed through to `skf.split`.
  """
  train_idxs = []
  test_idxs = []
  for train_idx, test_idx in skf.split(x, y):
    # Store plain Python lists: they are written as "[0, 1, 2]", which
    # read_splits() can parse with ast.literal_eval. numpy arrays would be
    # written without commas (and abbreviated with "..." when long).
    train_idxs.append(np.asarray(train_idx).tolist())
    test_idxs.append(np.asarray(test_idx).tolist())

  splits = pd.DataFrame({"train": train_idxs, "test": test_idxs})

  save_name = "K_fold/splits.csv"
  os.makedirs(os.path.dirname(save_name), exist_ok=True)
  splits.to_csv(save_name)
    

def read_splits(path):
  """Load a splits CSV, parsing columns 1 and 2 as Python literals."""
  literal_columns = {position: ast.literal_eval for position in (1, 2)}
  return pd.read_csv(path, converters=literal_columns)

  


# Define Dataset Class for Images



In [None]:
from tqdm import tqdm
from torchvision import transforms
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split

class NaturalImageDataset(Dataset):
  """In-memory image dataset yielding (image, diagnosis target, feature target).

  All images listed in `data["image_path"]` are opened, transformed and kept
  in memory when the dataset is constructed.

  Args:
    data: DataFrame with the columns "image_path", "ts", "features" and
      (when `augment` is True) "features_label".
    augment: when True, the rows are first upsampled (sampling with
      replacement) so that every "features_label" group is as large as the
      largest one, and random flip / colour-jitter / affine transforms are
      added to the preprocessing pipeline.

  NOTE: the transform pipeline is applied once per stored image in __init__,
  so the random augmentations are fixed for the lifetime of the dataset and
  are not re-sampled on every __getitem__ call.
  """

  def __init__(self, data, augment = False):

    dictator = 'features_label'    # What variable we use to upsample to match
    # upsample if augment
    if augment:
      data = self._upsample(data, dictator)

    self.dataframe = data
    # "imgage_path" is a misspelling kept for backward compatibility;
    # `image_path` is the correctly spelled alias.
    self.imgage_path = data["image_path"].values
    self.image_path = self.imgage_path
    self.labels = data["ts"].values
    self.features = data["features"].values

    self.transform = self._build_transform(augment)

    self.images = [self._load_image(img_path) for img_path in tqdm(data["image_path"])]

  @staticmethod
  def _upsample(data, column):
    """Return `data` plus randomly re-drawn rows so all `column` groups match the largest."""
    values = data[column].to_numpy()
    groups = np.unique(values)
    sample_count = {g: np.count_nonzero(values == g) for g in groups}
    maxcount = np.max(list(sample_count.values()))

    pieces = [data]
    for g in groups:
      gapnum = maxcount - sample_count[g]
      picked = np.random.choice(np.where(values == g)[0], size = gapnum)
      pieces.append(data.iloc[picked])

    # DataFrame.append was removed in pandas 2.0; concatenate once instead.
    return pd.concat(pieces, ignore_index = True)

  @staticmethod
  def _build_transform(augment):
    """Build the preprocessing pipeline; random augmentations are added when `augment` is True."""
    steps = [
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
    ]
    if augment:
      steps += [
          transforms.RandomHorizontalFlip(p = 0.5),
          transforms.RandomVerticalFlip(p=0.5),
          transforms.ColorJitter(brightness = 0.1, contrast = 0.1),
          transforms.RandomAffine(degrees = 50, translate = (0.1, 0.1), scale = (0.9, 1.1)),
      ]
    steps.append(transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]))
    return transforms.Compose(steps)

  def _load_image(self, img_path):
    """Open one image, force 3-channel RGB and run it through `self.transform`."""
    # The context manager closes the file handle; convert("RGB") keeps
    # grayscale / CMYK / RGBA files compatible with the 3-channel Normalize.
    with Image.open(img_path) as img:
      return self.transform(img.convert("RGB"))

  def __len__(self):
    return (len(self.images))

  def __getitem__(self, i):
    image = self.images[i]
    label = self.labels[i]
    feature = self.features[i]
    return image, torch.tensor(label, dtype=torch.long), torch.tensor(feature, dtype=torch.long)

# Model `DiseaseNet` for learning Domain Knowledge

In [None]:
# create the MTL network
from torch import nn
from torch import optim
import torchvision.models as models

# create the MTL network
from torch import nn
from torch import optim
import torchvision.models as models

class DiseaseNet(nn.Module):
    """Pretrained ResNet-50 trunk followed by a single linear diagnosis head.

    Trainable parameters are: everything under `layer4`, every BatchNorm2d
    layer of the trunk, and the new head. All other pretrained weights are
    frozen. `forward` returns raw class logits of shape (batch, num_classes).

    `num_features` is only stored; the multi-label features head it belonged
    to is currently disabled.
    """

    def __init__(self, num_classes, num_features):
        super().__init__()

        self.num_classes = num_classes
        self.num_features = num_features

        backbone = models.resnet50(pretrained=True)

        # Among the pretrained weights only `layer4` stays trainable ...
        for param_name, weight in backbone.named_parameters():
            weight.requires_grad = param_name.startswith("layer4")

        # ... together with the parameters of every BatchNorm2d layer.
        for layer in backbone.modules():
            if isinstance(layer, nn.BatchNorm2d):
                for weight in layer.parameters():
                    weight.requires_grad = True

        # Width of the features that used to feed the original classifier
        head_in = backbone.fc.in_features

        # Keep everything except the final fully connected layer. The double
        # nn.Sequential wrapping is kept so parameter names stay the same.
        trunk = nn.Sequential(*list(backbone.children())[:-1])
        self.base_model = nn.Sequential(trunk)

        # Diagnosis head: heavy dropout followed by one linear layer
        self.labels_head = nn.Sequential(
            nn.Dropout(p=0.8),
            nn.Linear(in_features=head_in, out_features=num_classes, bias=True),
        )

    def forward(self, x):
        # shared trunk, then collapse everything but the batch dimension
        pooled = self.base_model(x)
        flat = torch.flatten(pooled, 1)

        # diagnosis logits only (features head disabled)
        return self.labels_head(flat)
    
    

# Define train loop for `DiseaseNet`

In [None]:
# Train the net
from HelperFunctions.project_utils import MTLTracker, Tracker, plot_MTL_progress, plot_tracker
from sklearn.metrics import accuracy_score, f1_score


def _labels_batch_step(net, criterion_labels, batch, device, tracker):
  """Forward one batch through `net` and log its loss and weighted F1 on `tracker`.

  Returns the loss tensor so that the caller can back-propagate it.
  """
  input_batch, labels, _ = batch
  input_batch, labels = input_batch.to(device), labels.to(device)

  output_labels = net(input_batch)

  # Targets arrive one-hot encoded; the criterion expects class indices.
  target_labels = torch.argmax(labels, dim=1)
  probabilities = nn.functional.softmax(output_labels, dim = 1)
  preds_labels = torch.argmax(probabilities, dim=1)

  loss_labels = criterion_labels(output_labels, target_labels)
  tracker.batch_loss.append(loss_labels.item() / input_batch.size(0))

  # Despite the tracker's attribute name, the logged score is a weighted F1.
  score = f1_score(target_labels.cpu(), preds_labels.cpu(), average='weighted')
  tracker.batch_acc.append(score)

  return loss_labels


def train_labels_net(net: nn.Module, criterion_labels,
                    optimizer, device, trainloader: DataLoader,
                    validationloader: DataLoader = None, validation_on: bool = False,
                    num_epoch = 100, eval_every = 3,
                    plt_on: bool = False):
  """Train `net` on the diagnosis labels and return the filled Tracker.

  Args:
    net: model returning class logits for a batch of images.
    criterion_labels: loss taking (logits, class-index targets).
    optimizer: optimizer over the parameters to update.
    device: device the batches are moved to.
    trainloader: yields (images, one-hot labels, features) batches.
    validationloader: same format; required when `validation_on` is True.
    validation_on: run validation every `eval_every` epochs and on the last epoch.
    num_epoch: number of training epochs.
    eval_every: validation interval in epochs.
    plt_on: plot the tracker after each validation pass. Plotting only happens
      on validation epochs, so it has no effect when `validation_on` is False.

  Returns:
    The Tracker holding the recorded training (and validation) history.

  Raises:
    ValueError: if `validation_on` is True but no `validationloader` is given.
  """
  if validation_on and validationloader is None:
    raise ValueError("validation_on=True requires a validationloader")

  # Initialize trackers
  labels_tracker = Tracker()
  last_epoch = num_epoch - 1

  for epoch in range(num_epoch):

    # Train
    net.train()
    for batch in trainloader:
      loss_labels = _labels_batch_step(net, criterion_labels, batch, device, labels_tracker)

      optimizer.zero_grad()
      loss_labels.backward()
      optimizer.step()

    # Update training values with batch results
    labels_tracker.train_update(epoch)

    # Validate
    is_eval_epoch = (epoch % eval_every == 0) or (epoch == last_epoch)
    if validation_on and is_eval_epoch:
      net.eval()
      with torch.no_grad():
        for batch in validationloader:
          _labels_batch_step(net, criterion_labels, batch, device, labels_tracker)

      labels_tracker.val_update(epoch)

      # plot status
      if plt_on:
        plot_tracker(labels_tracker, num_epoch)

  return labels_tracker



# Test `DiseaseNet`



In [None]:

def test_labels_net(net: DiseaseNet, testloader: DataLoader, device):

  label_probs = []
  label_preds = []
  label_targets = []

  feature_probs = []
  feature_preds = []
  feature_targets = []

  threshold = 0.5            

  for i, x in enumerate(testloader):
    input_batch, labels, _ = x
    input_batch, labels = input_batch.to(device), labels.to(device)

    output_labels = net(input_batch)
    tmp_batch_size = input_batch.size()[0]

    # labels ------------------------------------------------------------
    target_labels = torch.argmax(labels,dim=1)
    prob_labels = nn.functional.softmax(output_labels, dim = 1) 
    preds_labels = torch.argmax(prob_labels,dim=1)

    label_probs = [*label_probs,*prob_labels.cpu().detach().numpy()]
    label_preds = [*label_preds,*preds_labels.cpu().detach().numpy()]
    label_targets = [*label_targets,*target_labels.cpu().detach().numpy()]


  label_results = {"probs": label_probs,
                   "preds": label_preds,
                   "targets": label_targets}


  return label_results

    

# K-Fold Cross Validation of `DiseaseNet`



In [None]:
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
from HelperFunctions.project_utils import plot_tracker
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn import utils
import json

# K-fold cross validation of DiseaseNet: for every fold in the saved split
# file a fresh network is trained on the (upsampled, augmented) training rows,
# evaluated on the held-out rows, and the run is dumped to a JSON file.

# move to GPU if possible
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Make a copy of labels dataset
data_df = data.copy()

NUM_FEATURES = len(data_df.features.to_list()[0])
NUM_CLASSES = len(data_df.diagnosis.unique())
BATCH_SIZE = 16


# Use this if creating new splits!
# ================================= #
# k = 5
# skf = StratifiedKFold(n_splits=k)
# _x = np.zeros(len(data_df))
# _y = pd.DataFrame(data_df.ts.to_list()).idxmax(axis=1)
# save_splits(skf, _x, _y)
# ================================= #

# READ IN FINAL SPLITS
splits = read_splits("K_fold/splits_FINAL_NA.csv")

k_epochs=35
k_idx = 0

for i, row in splits.iterrows():

  train_idx = row.train
  test_idx = row.test

  # Split train/test (rows are selected by index label)
  train_df = data_df.loc[train_idx]
  test_df = data_df.loc[test_idx]

  # Load into the image dataset class
  trainset = NaturalImageDataset(train_df, augment=True)
  testset = NaturalImageDataset(test_df)

  # Split into batches via DataLoader; the test set is served as ONE batch,
  # which the evaluation below relies on.
  trainloader = DataLoader(trainset, batch_size=BATCH_SIZE, shuffle=True)
  testloader = DataLoader(testset, batch_size=len(testset))

  # Initialize Net (the features head is disabled, hence 0 features)
  net = DiseaseNet(NUM_CLASSES, 0)
  net.to(device)

  # Only hand the trainable parameters to the optimizer
  params_to_update = [param for param in net.parameters() if param.requires_grad]

  class_weights = calc_multiclass_weights(trainset.dataframe, device)  # Adjust for data imbalance
  criterion_labels = nn.CrossEntropyLoss(weight = class_weights) #loss for labels

  optimizer = optim.SGD(params_to_update, lr = 0.00035, momentum = 0.9, weight_decay=0.05)

  # Train net
  # NOTE(review): train_labels_net only plots on validation epochs, so
  # plt_on=True has no effect while validation_on=False.
  l_tracker  = train_labels_net(net, criterion_labels, optimizer,
                              device, trainloader, validation_on=False,
                              num_epoch = k_epochs, plt_on = True)

  # Test net
  net.eval()
  with torch.no_grad():
    for input_batch, labels, _ in testloader:
      input_batch, labels = input_batch.to(device), labels.to(device)

      test_labels = net(input_batch).cpu().detach().numpy()

  run_info = {
      "k_run":                k_idx,
      "k_epochs":             k_epochs,

      "train_idx":            train_idx,
      "test_idx":             test_idx,

      "batch_size":           BATCH_SIZE,
      "scalars":              [],

      "labels_tracker":       l_tracker.toJSON(),
      "features_tracker":     Tracker().toJSON(),
      "total_tracker":        Tracker().toJSON(),

      "test_labels":          test_labels.tolist(),
      "test_labels_targets":  labels.cpu().detach().numpy().tolist(),

      # NOTE(review): the features head is disabled, so "test_features" is
      # empty and "test_features_targets" holds the diagnosis targets again,
      # not the feature targets - confirm downstream readers expect this.
      "test_features":          [],
      "test_features_targets":  labels.cpu().detach().numpy().tolist(),

  }

  save_name = f"K_fold/DiseaseNet_FINAL_kfold_NA_{i}.json"
  with open(save_name, "w") as f:
    json.dump(run_info,f)


  k_idx += 1

  


Using device: cuda:0


100%|██████████| 2380/2380 [01:42<00:00, 23.24it/s]
100%|██████████| 91/91 [00:03<00:00, 23.63it/s]
Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /root/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth


  0%|          | 0.00/97.8M [00:00<?, ?B/s]

100%|██████████| 2210/2210 [01:03<00:00, 35.02it/s]
100%|██████████| 91/91 [00:03<00:00, 28.96it/s]
100%|██████████| 2520/2520 [01:26<00:00, 29.24it/s]
100%|██████████| 91/91 [00:02<00:00, 38.72it/s]
100%|██████████| 2747/2747 [01:33<00:00, 29.23it/s]
100%|██████████| 90/90 [00:01<00:00, 57.15it/s]
100%|██████████| 2665/2665 [01:33<00:00, 28.51it/s]
100%|██████████| 90/90 [00:01<00:00, 74.00it/s]
