In [None]:
import os
import pickle
import torch
import sys

from PIL import Image
from skimage.io import imread
from skimage.transform import resize
from matplotlib import pyplot as plt
from tabulate import tabulate

In [None]:
# Mount Google Drive so the project folder is reachable from the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

# Fix the path: remember the directory the kernel started in, add the parent
# directory and the project folder to the import path, then change the working
# directory to the project folder (the last entry appended to sys.path) so the
# relative "Files/..." and "K_fold/..." paths used below resolve inside it.
original_path = os.getcwd()
sys.path.append(os.path.join('.', '..'))
sys.path.append('/content/drive/My Drive/Deep_Learning_Project12/')
os.chdir(sys.path[-1])

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Import Data and Wrangling

In [None]:
import numpy as np
import pandas as pd

# Directory listing of the data folder (not used again in this cell).
data_files = os.listdir("Files")

# Load the per-image labels and derive the absolute path of every image file.
labels = pd.read_csv("Files/dermx_labels.csv")
labels["image_path"] = [os.path.join(os.getcwd(),"Files", "images", f"{x}.jpeg") for x in labels["image_id"]]
# Drop the unnamed column that the CSV carries (presumably a saved index).
labels.drop(columns = "Unnamed: 0", inplace = True)

# NOTE(review): the result of this expression is never assigned, so `labels` is
# left unchanged and rows containing NaN are kept. Confirm whether NaN rows
# should really be dropped; the saved K-fold splits select rows of `labels` by
# index label, so changing the rows here would require regenerating them.
labels.dropna().reset_index(drop = True)
# One-hot encode the categorical "area" column into area_* indicator columns.
labels = pd.get_dummies(labels, columns = ["area"])
# Binarise "open_comedo": any value above zero becomes 1, everything else 0.
labels["open_comedo"] = (labels["open_comedo"] > 0).astype(int)

# Disease characteristics table, with its first (unnamed) column renamed to
# "disease". NOTE(review): `features_target` is not used again in this cell; the
# same file is re-read as `domain` further down.
features_target = pd.read_csv("Files/diseases_characteristics.csv")
features_target.rename(columns={"Unnamed: 0":"disease"},inplace=True)

# Create the one-hot encoding of the diagnosis; each row becomes a list with one
# entry per distinct diagnosis.
one_hot = pd.get_dummies(labels["diagnosis"])
one_hot_encoding = [list(x) for x in one_hot.values]

# "ts" holds the one-hot target list for every image.
labels["ts"] = one_hot_encoding

# Get the features as a multi-hot list per row.
# NOTE(review): the feature columns are selected by position (2-8 and 10-13), so
# this silently picks the wrong columns if the CSV layout changes - confirm
# against the file or select by name.
features_touse = list(labels.columns[list(range(2,9)) + [10,11,12,13]])
labels["features"] = labels.loc[:, features_touse].values.tolist()

# Map every distinct feature combination to an integer id (in order of first
# appearance); the stringified tuple is used as the dictionary key.
features_map = {}
for idx, feat in enumerate(labels["features"].apply(tuple).unique()):
  features_map[str(feat)] = idx

# Per-row id of its feature combination; the dataset classes use this column as
# the default key for upsampling.
labels["features_label"] = labels["features"].apply(tuple).apply(str).map(features_map)

# Get the domain knowledge: one row per diagnosis with its characteristic
# features, encoded the same way as `labels` (area one-hot encoded).
domain = pd.read_csv("Files/diseases_characteristics.csv")
domain.rename(columns={"Unnamed: 0":"diagnosis"},inplace=True)
domain = pd.get_dummies(domain, columns = ["area"])
# Reorder the columns to "diagnosis" followed by the feature columns in exactly
# the order used for labels["features"].
same_sort = ["diagnosis"] + features_touse
domain = domain[same_sort]  # same sorting

# One-hot target list per diagnosis row.
domain_one_hot = pd.get_dummies(domain["diagnosis"])

domain_one_hot_encoding = [list(x) for x in domain_one_hot.values]
domain["ts"] = domain_one_hot_encoding
# Columns 1..11 are the feature columns (column 0 is "diagnosis").
feature_cols = domain.columns[1:12]
domain["features"] = domain.loc[:,feature_cols].values.tolist()

# Add domain features (domain knowledge) to the dataframe: every image row gets
# the feature list of the domain row whose diagnosis equals the image's own
# diagnosis, i.e. "domain_features" is a function of the true label.
tf = []
for i, row in labels.iterrows():
  disease = row["diagnosis"]
  # First (presumably only) domain row for this diagnosis; raises IndexError if
  # the diagnosis is missing from the characteristics file.
  true_features = domain.loc[domain.diagnosis == disease].features.tolist()[0]
  tf.append(true_features)
labels["domain_features"] = tf 

# Keep the domain table sorted by diagnosis name with a fresh 0..n-1 index.
domain = domain.sort_values(by="diagnosis").reset_index(drop=True)

# Working copy of the fully prepared label table.
data = labels.copy()


In [None]:
def add_domain(df: pd.DataFrame):
  """Attach the diagnosis-level domain feature list to every row of ``df``.

  Reads ``Files/diseases_characteristics.csv``, one-hot encodes its ``area``
  column, orders the columns like the module-level ``labels`` frame and stores
  the matching per-diagnosis feature list in ``df["domain_features"]``.

  Args:
    df: Frame with a ``diagnosis`` column. It is modified in place.

  Returns:
    The same ``df`` object, now carrying a ``domain_features`` column.
  """
  knowledge = pd.read_csv("Files/diseases_characteristics.csv")
  knowledge = knowledge.rename(columns={"Unnamed: 0": "diagnosis"})
  knowledge = pd.get_dummies(knowledge, columns=["area"])

  # The column order is taken from the global `labels` frame (positions 1-8 and
  # 10-13; position 1 is presumably "diagnosis" - confirm), not from `df`.
  ordered_cols = list(labels.columns[[*range(1, 9), 10, 11, 12, 13]])
  knowledge = knowledge[ordered_cols]  # same sorting

  # One-hot target list per diagnosis row.
  knowledge["ts"] = [list(encoded) for encoded in pd.get_dummies(knowledge["diagnosis"]).values]
  # Columns 1..11 hold the features (column 0 is the diagnosis name).
  knowledge["features"] = knowledge.loc[:, knowledge.columns[1:12]].values.tolist()

  # Look up, for each row of df, the feature list of its own diagnosis.
  df["domain_features"] = [
      knowledge.loc[knowledge.diagnosis == disease].features.tolist()[0]
      for disease in df["diagnosis"]
  ]

  return df

# Reduce features


In [None]:
from HelperFunctions.project_utils import Tracker
from sklearn.utils import class_weight
import ast

def add_no_match(df: pd.DataFrame):
  """Augment ``df`` with synthetic ``"no_match"`` rows.

  For every row, one extra row is generated per distinct domain-feature vector
  that differs from the row's own ``domain_features``. The copy keeps all other
  columns but gets ``diagnosis = "no_match"`` and the foreign vector. The
  one-hot target column ``ts`` is then rebuilt over the enlarged diagnosis set.

  Args:
    df: Frame with ``diagnosis`` and list-valued ``domain_features`` columns.
      It is not modified.

  Returns:
    A new frame holding the original rows followed by the generated rows,
    re-indexed from 0, whose ``ts`` column contains one-hot lists in the column
    order produced by ``pd.get_dummies`` on ``diagnosis``.
  """
  unique_data = [list(x) for x in set(tuple(x) for x in df.domain_features)]

  mismatched = []
  for _, row in df.iterrows():
    for candidate in unique_data:
      if row["domain_features"] == candidate:
        continue
      # Copy the row only when a new one is actually emitted.
      new_row = row.copy()
      new_row["diagnosis"] = "no_match"
      new_row["domain_features"] = candidate
      mismatched.append(new_row)

  # DataFrame.append was removed in pandas 2.0; build the extra rows as a frame
  # and concatenate instead (this is what append did internally).
  if mismatched:
    extra = pd.DataFrame(mismatched).reindex(columns=df.columns)
    updated_df = pd.concat([df, extra], ignore_index=True)
  else:
    updated_df = df.reset_index(drop=True)

  # Rebuild the targets "ts": the class set now includes "no_match". dtype=int
  # keeps the entries 0/1 integers regardless of the pandas default dtype.
  one_hot = pd.get_dummies(updated_df["diagnosis"], dtype=int)
  updated_df["ts"] = one_hot.to_numpy().tolist()

  return updated_df

def unique_lists(data: list):
  """Return the distinct inner sequences of ``data`` as lists.

  Duplicates are removed through a set of tuples, so the order of the result is
  the set's iteration order, not the input order.
  """
  distinct = {tuple(item) for item in data}
  return list(map(list, distinct))

def map_domain_knowledge(df: pd.DataFrame):
  """Map every diagnosis in ``df`` to the domain-feature list of its first row.

  Args:
    df: Frame with ``diagnosis`` and ``domain_features`` columns.

  Returns:
    dict of ``diagnosis -> domain_features``, keyed in order of first
    appearance.
  """
  # Use the frame that was passed in (the previous version filtered with the
  # module-level `data` frame) and avoid shadowing the builtin `map`.
  first_rows = df.drop_duplicates(subset="diagnosis", keep="first")
  return dict(zip(first_rows["diagnosis"], first_rows["domain_features"]))

def calc_multiclass_weights(df: pd.DataFrame, device):
  """Compute 'balanced' per-class loss weights from ``df.diagnosis``.

  NOTE(review): a function with this name is defined a second time further down
  in this cell; the later definition is the one in effect at runtime.

  Args:
    df: Frame with a ``diagnosis`` column.
    device: Torch device (or device string) the weights are moved to.

  Returns:
    1-D float32 tensor with one weight per distinct diagnosis, ordered by
    sorted diagnosis value.
  """
  y = df.diagnosis.to_list()
  # Recent scikit-learn releases validate `classes` as a NumPy array and reject
  # a plain list; np.unique returns the sorted distinct labels as an array.
  classes = np.unique(y)
  csw = class_weight.compute_class_weight('balanced', classes = classes, y = y)
  class_weights = torch.tensor(csw,dtype=torch.float).to(device)

  return class_weights

def calc_multilabel_weights(df: pd.DataFrame, device):
  """Return the per-feature ratio of zero-rows to one-rows as a tensor.

  Args:
    df: Frame whose ``features`` column holds equal-length 0/1 lists.
    device: Torch device (or device string) the result is moved to.

  Returns:
    1-D tensor with ``(#rows where feature == 0) / (#rows where feature == 1)``
    for every feature position. A feature that is never 1 divides by zero.
  """
  stacked = np.vstack(df["features"])
  positives = stacked.sum(axis = 0)
  negatives = len(df) - positives
  return torch.tensor(negatives / positives).to(device)



def save_splits(skf, x, y, save_name="K_fold/splits_FINAL_NA.csv"):
  """Materialise the folds produced by ``skf`` and write them to a CSV file.

  Args:
    skf: Splitter exposing ``split(x, y)`` that yields ``(train_idx, test_idx)``
      pairs of index arrays (anything with ``.tolist()``).
    x: Samples passed through to ``skf.split``.
    y: Targets passed through to ``skf.split``.
    save_name: Destination CSV path. Defaults to the path used so far, so
      existing calls keep writing to the same file.

  The file has one row per fold with an index column followed by the ``train``
  and ``test`` columns, each cell holding the list of indices for that fold;
  ``read_splits`` parses these two columns by position.
  """
  train_idxs = []
  test_idxs = []
  for train_idx, test_idx in skf.split(x, y):
    train_idxs.append(train_idx.tolist())
    test_idxs.append(test_idx.tolist())

  splits = pd.DataFrame({"train": train_idxs, "test": test_idxs})
  # Let pandas open the file itself so newline handling is correct everywhere.
  splits.to_csv(save_name)
    

def read_splits(path):
  """Load a splits CSV, parsing the cells of columns 1 and 2 as Python literals.

  Column 0 is the saved index; columns 1 and 2 are the ``train`` and ``test``
  index lists written by ``save_splits``.
  """
  parse_literal = ast.literal_eval
  list_columns = {position: parse_literal for position in (1, 2)}
  return pd.read_csv(path, converters=list_columns)

def plt_tracker(tracker: Tracker, num_epoch):
    """Plot training/validation loss and accuracy curves side by side.

    Args:
        tracker: Object exposing ``train_iter``/``val_iter`` (x values) and
            ``train_loss``, ``val_loss``, ``train_acc``, ``val_acc`` (y values).
        num_epoch: Number of epochs; x ticks are drawn every 5 epochs up to it.
    """
    plt.figure(figsize=(14,8))
    epoch_ticks = range(0,num_epoch + 1, 5)

    # (panel title / y label, tracker attribute suffix, legend labels)
    panels = (
        ("Loss", "loss", 'Training loss', 'Validation loss'),
        ("Accuracy", "acc", 'Training accuracy', 'Validation accuracy'),
    )

    for position, (heading, suffix, train_label, val_label) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plt.plot(tracker.train_iter, getattr(tracker, "train_" + suffix), label=train_label)
        plt.plot(tracker.val_iter, getattr(tracker, "val_" + suffix), label=val_label)
        plt.title(heading)
        plt.ylabel(heading)
        plt.xlabel("Epoch")
        plt.xticks(epoch_ticks)
        plt.legend()
        plt.grid()

    plt.tight_layout()
    plt.show()


def calc_multiclass_weights(df: pd.DataFrame, device):
  """Compute 'balanced' per-class loss weights from ``df.diagnosis``.

  NOTE(review): this repeats a definition of the same name earlier in this cell
  and shadows it; consider keeping a single copy.

  Args:
    df: Frame with a ``diagnosis`` column.
    device: Torch device (or device string) the weights are moved to.

  Returns:
    1-D float32 tensor with one weight per distinct diagnosis, ordered by
    sorted diagnosis value.
  """
  y = df.diagnosis.to_list()
  # Recent scikit-learn releases validate `classes` as a NumPy array and reject
  # a plain list; np.unique returns the sorted distinct labels as an array.
  classes = np.unique(y)
  csw = class_weight.compute_class_weight('balanced', classes = classes, y = y)
  class_weights = torch.tensor(csw,dtype=torch.float).to(device)

  return class_weights

def feature_intersect(domain, features):
    """Return an integer 0/1 vector marking positions equal to 1 in both inputs.

    The result has the length of ``domain``.
    """
    expected = np.asarray(domain)
    observed = np.asarray(features)

    # Indices that are switched on in both vectors.
    shared = np.intersect1d(np.flatnonzero(expected == 1), np.flatnonzero(observed == 1))

    marks = np.zeros(len(expected), dtype=int)
    marks[shared] = 1
    return marks


# Define Dataset Class for features


In [None]:
from tqdm import tqdm
from torchvision import transforms
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split

def get_similarity(data, domain):
  """Return the fraction of positions where both vectors are 1 or both are 0.

  The count of agreeing positions is divided by the length of ``data``.
  """
  observed = np.array(data)
  expected = np.array(domain)

  def _agreeing(value):
    # Indices at which both vectors hold `value`.
    return np.intersect1d(np.where(observed == value), np.where(expected == value))

  agreements = sum(len(_agreeing(value)) for value in (1, 0))
  return agreements / len(observed)

class DomainDatset(Dataset):
  """Dataset that combines observed features with diagnosis-level domain features.

  Every item is ``(input, target)`` where ``input`` is the element-wise sum of
  the row's ``domain_features`` and ``features`` lists (float32) and ``target``
  is the row's ``ts`` list (int64).

  Args:
    data: Frame with ``features``, ``domain_features`` and ``ts`` columns (and
      the ``dictator`` column when ``augment`` is set).
    augment: When true, rows are randomly duplicated (with replacement, via
      ``np.random.choice``) until every value of ``dictator`` occurs as often
      as the most frequent one.
    dictator: Name of the column whose values are balanced by the upsampling.
  """
  def __init__(self, data: pd.DataFrame, augment = False, dictator = "features_label"):

    # upsample if augment
    if augment:
      data = self._upsample(data, dictator)

    self.dataframe = data
    # Positional 0..n-1 index so that __getitem__(i) works for any input index.
    self.data_input = data["features"].reset_index(drop=True)
    self.domain_input = data["domain_features"].reset_index(drop=True)
    self.target = data["ts"].reset_index(drop=True)

  @staticmethod
  def _upsample(data, dictator):
    """Return ``data`` plus randomly re-drawn rows so all groups are equally large."""
    groups = data[dictator].to_numpy()
    values, counts = np.unique(groups, return_counts=True)
    if len(values) == 0:
      return data

    maxcount = counts.max()
    extras = []
    for value, count in zip(values, counts):
      gapnum = maxcount - count
      if gapnum == 0:
        continue
      positions = np.flatnonzero(groups == value)
      extras.append(data.iloc[np.random.choice(positions, size = gapnum)])

    # DataFrame.append was removed in pandas 2.0; concatenate once instead of
    # growing the frame inside the loop.
    return pd.concat([data, *extras], ignore_index = True)

  def __len__(self):
    return (len(self.data_input))

  def __getitem__(self, i):

    target = self.target[i]

    domain_input = self.domain_input[i]
    data_input = self.data_input[i]

    # Element-wise sum of the two vectors. Alternatives that were tried before:
    #combined = [*data_input, *domain_input]
    #combined = feature_intersect(domain_input, data_input)
    combined = np.array(domain_input) + np.array(data_input)

    return torch.tensor(combined, dtype=torch.float), torch.tensor(target, dtype=torch.long)

# Model `DomainNet` for learning Domain Knowledge

In [None]:
# Define the DomainNet network
from torch import nn
from torch import optim
import torchvision.models as models

from tqdm import tqdm
from torchvision import transforms
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split


class DomainNet(nn.Module):
    """Two-layer perceptron mapping a feature vector to class scores.

    Args:
        num_classes: Size of the output layer.
        num_features: Size of the input vector.
        num_hidden: Width of the hidden layer.
    """

    def __init__(self, num_classes, num_features, num_hidden = 256):
        super().__init__()

        self.num_features = num_features
        self.num_classes = num_classes

        # Hidden projection followed by a ReLU.
        hidden = nn.Linear(num_features, num_hidden)
        self.layer_1 = nn.Sequential(hidden, nn.ReLU())

        # Dropout on the hidden activations, then the output projection.
        head = nn.Linear(num_hidden, num_classes)
        self.layer_2 = nn.Sequential(nn.Dropout(0.5), head)

    def forward(self, x):
        """Return the raw (un-normalised) class scores for the batch ``x``."""
        return self.layer_2(self.layer_1(x))
    

# Dataset for MTL Net


In [None]:


class NaturalImageDataset(Dataset):
  """Image dataset yielding ``(image, label, feature)`` tensors.

  Args:
    data: Frame with ``image_path``, ``ts`` and ``features`` columns (and the
      ``dictator`` column when ``augment`` is set).
    augment: When true, rows are randomly duplicated until every value of
      ``dictator`` is equally frequent, and random flips / colour jitter /
      affine transforms are added to the preprocessing pipeline.
    load_img: When true, all images are opened and transformed up front and
      kept in memory; otherwise each image is loaded on access.
    dictator: Column whose values are balanced by the upsampling.

  NOTE(review): with ``load_img=True`` the random transforms are applied once
  at construction, so every epoch sees the same augmented tensors - confirm
  this is intended.
  """
  def __init__(self, data, augment = False, load_img=True, dictator="features_label"):

    # upsample if augment (the `dictator` argument is honoured; it used to be
    # overwritten with its default value here)
    if augment:
      data = self._upsample(data, dictator)

    self.dataframe = data
    self.imgage_path = data["image_path"].values  # (sic) name kept for existing callers
    self.image_path = self.imgage_path            # correctly spelled alias
    self.labels = data["ts"].values
    self.features = data["features"].values

    # transform image: resize/crop -> tensor -> (random augmentation) -> normalise
    steps = [
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
    ]
    if augment:
      steps += [
          transforms.RandomHorizontalFlip(p = 0.5),
          transforms.RandomVerticalFlip(p=0.5),
          transforms.ColorJitter(brightness = 0.1, contrast = 0.1),
          transforms.RandomAffine(degrees = 50, translate = (0.1, 0.1), scale = (0.9, 1.1)),
      ]
    steps.append(transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]))
    self.transform = transforms.Compose(steps)

    # None means "not preloaded"; __getitem__ then loads lazily.
    self.images = None
    if load_img:
      self.images = [self._load(img_path) for img_path in tqdm(data["image_path"])]

  @staticmethod
  def _upsample(data, dictator):
    """Return ``data`` plus randomly re-drawn rows so all groups are equally large."""
    groups = data[dictator].to_numpy()
    values, counts = np.unique(groups, return_counts=True)
    if len(values) == 0:
      return data

    maxcount = counts.max()
    extras = []
    for value, count in zip(values, counts):
      gapnum = maxcount - count
      if gapnum == 0:
        continue
      positions = np.flatnonzero(groups == value)
      extras.append(data.iloc[np.random.choice(positions, size = gapnum)])

    # DataFrame.append was removed in pandas 2.0; concatenate once instead.
    return pd.concat([data, *extras], ignore_index = True)

  def _load(self, img_path):
    """Open one image, run it through the transform and close the file again."""
    with Image.open(img_path) as img:
      return self.transform(img)

  def __len__(self):
    # Based on the labels so the length is defined even without preloaded images.
    return (len(self.labels))

  def __getitem__(self, i):
    image = self.images[i] if self.images is not None else self._load(self.imgage_path[i])
    label = self.labels[i]
    feature = self.features[i]
    return image, torch.tensor(label, dtype=torch.long), torch.tensor(feature, dtype=torch.long)

# Define train loop for `DomainNet`

In [None]:
# Train the net
from HelperFunctions.project_utils import Tracker, plot_tracker
from sklearn.metrics import accuracy_score, f1_score

def _domain_batch_metrics(net, criterion, input_batch, targets, device):
  """Run one batch through ``net`` and return ``(loss tensor, weighted F1)``."""
  input_batch, targets = input_batch.to(device), targets.to(device)

  output = net(input_batch)

  # Targets arrive one-hot encoded; the criterion is given class indices.
  true_class = torch.argmax(targets,dim=1)
  # argmax over the logits equals argmax over their softmax.
  preds = torch.argmax(output,dim=1)

  loss = criterion(output, true_class)
  score = f1_score(true_class.cpu(), preds.cpu(), average='weighted')
  return loss, score


def train_domain_net(net: DomainNet, criterion, optimizer, device,
                     trainloader: DataLoader, validationloader: DataLoader = None,
                     validation_on: bool = False, num_epoch = 100, eval_every = 3,
                     plt_on: bool = False):
  """Train ``net`` and record per-epoch loss and weighted F1 in a ``Tracker``.

  Args:
    net: Model to optimise (modified in place).
    criterion: Loss called as ``criterion(logits, class_indices)``.
    optimizer: Optimizer over ``net``'s parameters.
    device: Device the batches are moved to.
    trainloader: Yields ``(input_batch, one_hot_targets)`` pairs.
    validationloader: Same format; required when ``validation_on`` is true.
    validation_on: Whether to run validation passes.
    num_epoch: Number of training epochs.
    eval_every: Validate every ``eval_every`` epochs and after the last epoch.
    plt_on: Plot the tracked curves with ``plt_tracker`` when done.

  Returns:
    The ``Tracker`` holding the recorded metrics. The values stored under
    "acc" are weighted F1 scores.

  Raises:
    ValueError: If ``validation_on`` is true but no ``validationloader`` is given.
  """
  if validation_on and validationloader is None:
    raise ValueError("validation_on=True requires a validationloader")

  # Initialize tracker
  tracker = Tracker()

  for epoch in tqdm(range(num_epoch)):
    # Train
    net.train()
    for input_batch, targets in trainloader:
      loss, acc = _domain_batch_metrics(net, criterion, input_batch, targets, device)

      # NOTE(review): the loss value is additionally divided by the batch size
      # before being tracked - confirm this matches the criterion's reduction.
      tracker.batch_loss.append(loss.item() / input_batch.size(0))
      tracker.batch_acc.append(acc)

      optimizer.zero_grad()
      loss.backward()
      optimizer.step()

    # Update training values with batch results
    tracker.train_update(epoch)

    # Validate
    is_eval_epoch = (epoch % eval_every == 0) or (epoch == num_epoch - 1)
    if validation_on and is_eval_epoch:
      net.eval()
      with torch.no_grad():
        # Unpack the validation batch itself; the previous code re-used the
        # last training batch here, so "validation" metrics were training ones.
        for input_batch, targets in validationloader:
          loss, acc = _domain_batch_metrics(net, criterion, input_batch, targets, device)

          tracker.batch_loss.append(loss.item() / input_batch.size(0))
          tracker.batch_acc.append(acc)

      tracker.val_update(epoch)

  if plt_on: plt_tracker(tracker, num_epoch)
  return tracker



# Test `DomainNet`



In [None]:

def test_domain_net(net: DomainNet, testloader: DataLoader, device):

  test_probs = []
  test_preds = []
  test_targets = []

  net.eval()
  with torch.no_grad():
    for i, x in enumerate(testloader):
        input_batch, one_hot_target = x
        input_batch = input_batch.to(device)

        output = net(input_batch)

        targets = torch.argmax(one_hot_target,dim=1)
        probs = nn.functional.softmax(output, dim = 1) 
        preds = torch.argmax(probs,dim=1)

        test_probs = [*test_probs, *probs.cpu().detach().numpy()]
        test_preds = [*test_preds, *preds.cpu().detach().numpy()]
        test_targets = [*test_targets, *targets.cpu().detach().numpy()]

        return {"probs": test_probs, "preds": test_preds, "targets": test_targets}

    

# K-Fold Cross Validation of `DomainNet`



In [None]:
# from sklearn.model_selection import train_test_split, StratifiedKFold, StratifiedShuffleSplit
# from sklearn import utils
# # Make a copy of labels dataset
# data_df = labels.copy()

# k = 5
# skf = StratifiedKFold(n_splits=k)
# _x = np.zeros(len(data_df))
# _y = pd.DataFrame(data_df.ts.to_list()).idxmax(axis=1)
# save_splits(skf, _x, _y)

In [None]:
from sklearn.model_selection import train_test_split, StratifiedKFold, StratifiedShuffleSplit
from sklearn import utils

# move to GPU if possible
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Make a copy of labels dataset
data_df = labels.copy()

# One Tracker and one result dict per fold are collected here.
trackers = []
results = []

# Pre-computed fold indices (one row per fold with "train" and "test" lists).
splits = read_splits("K_fold/splits_FINAL_NA.csv")

# Input width = length of a feature list; output width = number of diagnoses.
NUM_FEATURES = len(data_df.features.to_list()[0])
NUM_CLASSES = len(data_df.diagnosis.unique())
BATCH_SIZE = 32

k_epochs=25
# Fold counter; it is only incremented below and not read in this cell.
k_idx = 0

#for train_idx, test_idx in skf.split(_x, _y):
for i, row in splits.iterrows():

  train_idx = row.train
  test_idx = row.test

  # Split train/test. The stored indices are used as index *labels* (.loc), so
  # this relies on data_df having the same index as when the splits were saved.
  train_df = data_df.loc[train_idx]
  test_df = data_df.loc[test_idx]

  # Load into DomainDataSet class (only the training set is upsampled).
  # NOTE(review): DomainDatset feeds "domain_features" into the network input,
  # and that column is looked up from each row's true diagnosis in the data
  # wrangling cell. This presumably leaks the label and would explain perfect
  # test scores - confirm this is the intended experiment.
  trainset = DomainDatset(train_df, augment = True)
  testset = DomainDatset(test_df)

  # Split into batches via DataLoader; the test set is served as one batch.
  # NOTE(review): the training loader does not shuffle, and upsampled rows are
  # appended at the end of the frame - consider shuffle=True.
  trainloader = DataLoader(trainset, batch_size=BATCH_SIZE)
  testloader = DataLoader(testset, batch_size=len(testset.dataframe))

  # Initialize Net (a fresh model per fold)
  domain_net = DomainNet(NUM_CLASSES, NUM_FEATURES, 256)
  domain_net.to(device)

  # Define loss and optimizer; the class weights are computed on the upsampled
  # training frame.
  class_weights = calc_multiclass_weights(trainset.dataframe, device)  # Adjust for data imbalance
  criterion = nn.CrossEntropyLoss(weight=class_weights)
  optimizer = optim.Adam(domain_net.parameters(), lr=3e-4, weight_decay = 0.01)

  # Train net (no validation pass, no plotting)
  k_tracker = train_domain_net(domain_net, criterion, optimizer, device, 
                               trainloader, validation_on=False, num_epoch = k_epochs, 
                               plt_on = False)
  trackers.append(k_tracker)

  # Test net on the held-out fold
  k_results = test_domain_net(domain_net, testloader, device)
  results.append(k_results)

  k_idx += 1


Using device: cuda:0


100%|██████████| 25/25 [00:08<00:00,  2.82it/s]
100%|██████████| 25/25 [00:08<00:00,  3.02it/s]
100%|██████████| 25/25 [00:09<00:00,  2.69it/s]
100%|██████████| 25/25 [00:10<00:00,  2.46it/s]
100%|██████████| 25/25 [00:09<00:00,  2.51it/s]


# K-fold results for `DomainNet`

In [None]:
from sklearn.metrics import classification_report, f1_score

# Weighted F1 score of every fold, computed from the predictions collected
# during K-fold testing. Iterating over `results` (instead of a fixed
# range(5)) keeps the table correct for any number of folds.
k_acc = []
for k, res in enumerate(results):
  k_acc.append(f1_score(res["targets"], res["preds"], average = "weighted"))

# One row per fold, followed by the mean and standard deviation across folds.
print(tabulate({
    "k": list(range(len(k_acc))) + ["Mean", "Std"],
    "F1 score": k_acc + [np.mean(k_acc), np.std(k_acc)]
}))

----  -
0     1
1     1
2     1
3     1
4     1
Mean  1
Std   0
----  -
