In [1]:
import os
import pickle
import torch
import sys

from PIL import Image
from skimage.io import imread
from skimage.transform import resize
from matplotlib import pyplot as plt
from tabulate import tabulate

In [2]:
# Mount Google Drive in the Colab runtime
from google.colab import drive
drive.mount('/content/drive')

# Remember the starting directory, then make the project folder both an
# import location and the working directory (relative paths such as
# "Files/..." and "K_fold/..." used later are resolved against it).
original_path = os.getcwd()
project_dir = '/content/drive/My Drive/Deep_Learning_Project12/'
sys.path.extend([os.path.join('.', '..'), project_dir])
os.chdir(project_dir)

Mounted at /content/drive


# Data Import and Wrangling

In [3]:
import numpy as np
import pandas as pd

data_files = os.listdir("Files")

# --- Image-level labels -------------------------------------------------------
labels = pd.read_csv("Files/dermx_labels.csv")
labels["image_path"] = [
    os.path.join(os.getcwd(), "Files", "images", f"{x}.jpeg") for x in labels["image_id"]
]
labels = labels.drop(columns="Unnamed: 0")

# Drop incomplete rows. dropna() returns a NEW frame, so the result must be
# assigned back; a bare `labels.dropna()...` call leaves `labels` unchanged.
labels = labels.dropna().reset_index(drop=True)
labels = pd.get_dummies(labels, columns=["area"])
labels["open_comedo"] = (labels["open_comedo"] > 0).astype(int)  # binarise

features_target = pd.read_csv("Files/diseases_characteristics.csv")
features_target = features_target.rename(columns={"Unnamed: 0": "disease"})

# One-hot encoding of the diagnosis, stored as one list per row
one_hot = pd.get_dummies(labels["diagnosis"])
one_hot_encoding = [list(x) for x in one_hot.values]
labels["ts"] = one_hot_encoding

# Multi-hot feature vector per row. The columns are picked by POSITION, so this
# depends on the column layout produced by the steps above.
features_touse = list(labels.columns[list(range(2, 9)) + [10, 11, 12, 13]])
labels["features"] = labels.loc[:, features_touse].values.tolist()

# Map every distinct feature combination to an integer id (order of first appearance)
feature_keys = labels["features"].apply(tuple).apply(str)
features_map = {key: idx for idx, key in enumerate(feature_keys.unique())}
labels["features_label"] = feature_keys.map(features_map)

# --- Domain knowledge: one feature vector per diagnosis -------------------------
# Same CSV as `features_target`; reuse it instead of reading the file a second time.
domain = features_target.rename(columns={"disease": "diagnosis"})
domain = pd.get_dummies(domain, columns=["area"])
same_sort = ["diagnosis"] + features_touse
# Same column order as the image-level features; copy so the column assignments
# below write to an independent frame rather than to a slice.
domain = domain[same_sort].copy()

domain_one_hot = pd.get_dummies(domain["diagnosis"])
domain_one_hot_encoding = [list(x) for x in domain_one_hot.values]
domain["ts"] = domain_one_hot_encoding

# The feature columns directly follow "diagnosis" (see `same_sort`)
feature_cols = domain.columns[1:1 + len(features_touse)]
domain["features"] = domain.loc[:, feature_cols].values.tolist()

# Add the domain features (domain knowledge) of each row's diagnosis to the dataframe.
# First occurrence wins if a diagnosis is listed more than once; a diagnosis that is
# missing from the domain table raises KeyError.
domain_lookup = (
    domain.drop_duplicates(subset="diagnosis")
          .set_index("diagnosis")["features"]
          .to_dict()
)
labels["domain_features"] = [domain_lookup[diagnosis] for diagnosis in labels["diagnosis"]]

domain = domain.sort_values(by="diagnosis").reset_index(drop=True)

data = labels.copy()


# Check corrections

In [22]:
from HelperFunctions.project_utils import KFoldResult
from sklearn.metrics import classification_report, f1_score
import seaborn as sns

# Summarise, for each of the k folds, how the correction step changed the MTL
# predictions, and compare the weighted F1 score before and after correction.
k = 5
fold_file_template = "K_fold/Preds_Correction_FINAL_kfold_NA_{}.json"

# Per-fold indicator columns and the matching headers of the summary table
outcome_cols = ["change", "TTnC", "FTC", "TFC", "FFC", "FFnC"]
summary_cols = ["Change", "TTnC", "FTC", "TFC", "FFC", "FFnC"]

corr_results = []                                      # one detail frame per fold
corr_counts = np.zeros(shape=(k, len(outcome_cols)))   # one row of counts per fold
corr_f1 = []
mtl_f1 = []

test_len = []

# Sorted class names; the integer targets are used as indices into this list.
# It does not depend on the fold, so it is computed once.
target_labels = sorted(data.diagnosis.unique())

for i in range(k):

  # COLLECT RUN VARIABLES
  k_name = fold_file_template.format(i)
  res = KFoldResult(k_name)
  test_len.append(len(res.test_idx))
  true_labels = res.test_labels_targets

  mtl_preds = res.labels_preds
  correction_preds = np.array(res.corrected_preds)  # cast to np.array
  change = (mtl_preds != correction_preds).astype(int)

  mtl_right = (mtl_preds == true_labels)
  mtl_wrong = (mtl_preds != true_labels)
  corr_right = (correction_preds == true_labels)
  corr_wrong = (correction_preds != true_labels)

  fold_df = pd.DataFrame()
  fold_df["disease"] = [target_labels[x] for x in true_labels]  # Targets as class names
  fold_df["target"] = true_labels                               # Targets as class indices
  fold_df["mtl_pred"] = mtl_preds                               # MTL predictions
  fold_df["corrected_pred"] = correction_preds                  # Correction predictions
  fold_df["change"] = change                                    # Did it change? [0/1]

  # Effect of the correction on each sample (MTL => corrected):
  # CORRECT => CORRECT (GOOD)   [CHANGE = 0] - TTnC
  # WRONG   => CORRECT (GOOD)   [CHANGE = 1] - FTC
  # CORRECT => WRONG   (BAD)    [CHANGE = 1] - TFC
  # WRONG   => WRONG   (BAD)    [CHANGE = 1] - FFC  (changed to another wrong class)
  # WRONG   => SAME    (BAD)    [CHANGE = 0] - FFnC
  fold_df["TTnC"] = (mtl_right & corr_right & (change == 0)).astype(int)
  fold_df["FTC"]  = (mtl_wrong & corr_right & (change == 1)).astype(int)
  fold_df["TFC"]  = (mtl_right & corr_wrong & (change == 1)).astype(int)
  fold_df["FFC"]  = (mtl_wrong & corr_wrong & (change == 1)).astype(int)
  fold_df["FFnC"] = (mtl_wrong & corr_wrong & (change == 0)).astype(int)

  corr_results.append(fold_df)
  corr_counts[i] = fold_df[outcome_cols].sum().to_numpy()
  corr_f1.append(f1_score(true_labels, correction_preds, average="weighted"))
  mtl_f1.append(f1_score(true_labels, mtl_preds, average="weighted"))

# Fold-level summary; the following cell reads `corr_df` and `test_len`
corr_df = pd.DataFrame(data=corr_counts, columns=summary_cols)
corr_df["F1 MTL"] = mtl_f1
corr_df["F1 Correction"] = corr_f1

tab = tabulate(corr_df, headers=corr_df.columns.to_list(), floatfmt=".2f")
print(tab)

print("\nF1 score: Mean and std:\n")
print(tabulate({
    " ": ["MTL", "Correction"],
    "mean": [np.mean(mtl_f1), np.mean(corr_f1)],
    "std": [np.std(mtl_f1), np.std(corr_f1)]
}, headers = "keys", floatfmt=".2f"))


      Change    TTnC    FTC    TFC    FFC    FFnC    F1 MTL    F1 Correction
--  --------  ------  -----  -----  -----  ------  --------  ---------------
 0      3.00   64.00   2.00   1.00   0.00   24.00      0.71             0.72
 1      6.00   65.00   3.00   3.00   0.00   20.00      0.74             0.74
 2      7.00   66.00   4.00   3.00   0.00   18.00      0.75             0.76
 3      5.00   69.00   0.00   3.00   2.00   16.00      0.80             0.77
 4      4.00   65.00   2.00   1.00   1.00   21.00      0.73             0.74

F1 score: Mean and std:

              mean    std
----------  ------  -----
MTL           0.75   0.03
Correction    0.75   0.02


# Generate the correction effect as percentage

In [23]:
# Outcome counts only: every summary column except the two trailing F1 columns
count_cols = corr_df.columns.to_list()[:-2]
cts = corr_df.loc[:, count_cols].to_numpy()
lens = np.array(test_len)

# Divide each fold's row of counts by that fold's test-set size
# (the column vector broadcasts across the outcome columns)
perc = cts / lens[:, np.newaxis]

# Mean and standard deviation over the folds
mp, sd = perc.mean(axis=0), perc.std(axis=0)

d = pd.DataFrame(perc, columns=count_cols)
tab = tabulate(perc, headers=list(d.columns))
print(tab)
print("\nmean:")
print(np.round(100 * mp, 2))
print("std:")
print(100 * sd)

   Change      TTnC        FTC        TFC        FFC      FFnC
---------  --------  ---------  ---------  ---------  --------
0.032967   0.703297  0.021978   0.010989   0          0.263736
0.0659341  0.714286  0.032967   0.032967   0          0.21978
0.0769231  0.725275  0.043956   0.032967   0          0.197802
0.0555556  0.766667  0          0.0333333  0.0222222  0.177778
0.0444444  0.722222  0.0222222  0.0111111  0.0111111  0.233333

mean:
[ 5.52 72.63  2.42  2.43  0.67 21.85]
std:
[1.54739735 2.1543583  1.45715331 1.07977937 0.88888889 2.95116374]


# How would the correction perform if the MTLNet predicted the features with 100% accuracy?

In [24]:
from HelperFunctions.project_utils import KFoldResult
from sklearn.metrics import classification_report, f1_score
import seaborn as sns

# Same fold-by-fold summary as for the regular correction runs, but read from the
# "TrueFeatures" result files.
# NOTE(review): the saved output of this cell is a FileNotFoundError - presumably the
# TrueFeatures_* fold files were not present under K_fold/ when it was last run.
# Confirm they exist before relying on this cell or the one after it.
k = 5
fold_file_template = "K_fold/TrueFeatures_Correction_FINAL_kfold_NA_{}.json"

# Per-fold indicator columns and the matching headers of the summary table
outcome_cols = ["change", "TTnC", "FTC", "TFC", "FFC", "FFnC"]
summary_cols = ["Change", "TTnC", "FTC", "TFC", "FFC", "FFnC"]

corr_results = []                                      # one detail frame per fold
corr_counts = np.zeros(shape=(k, len(outcome_cols)))   # one row of counts per fold
corr_f1 = []
mtl_f1 = []

test_len = []

# Sorted class names; the integer targets are used as indices into this list.
# It does not depend on the fold, so it is computed once.
target_labels = sorted(data.diagnosis.unique())

for i in range(k):

  # COLLECT RUN VARIABLES
  k_name = fold_file_template.format(i)
  res = KFoldResult(k_name)
  test_len.append(len(res.test_idx))
  true_labels = res.test_labels_targets

  mtl_preds = res.labels_preds
  correction_preds = np.array(res.corrected_preds)  # cast to np.array
  change = (mtl_preds != correction_preds).astype(int)

  mtl_right = (mtl_preds == true_labels)
  mtl_wrong = (mtl_preds != true_labels)
  corr_right = (correction_preds == true_labels)
  corr_wrong = (correction_preds != true_labels)

  fold_df = pd.DataFrame()
  fold_df["disease"] = [target_labels[x] for x in true_labels]  # Targets as class names
  fold_df["target"] = true_labels                               # Targets as class indices
  fold_df["mtl_pred"] = mtl_preds                               # MTL predictions
  fold_df["corrected_pred"] = correction_preds                  # Correction predictions
  fold_df["change"] = change                                    # Did it change? [0/1]

  # Effect of the correction on each sample (MTL => corrected):
  # CORRECT => CORRECT (GOOD)   [CHANGE = 0] - TTnC
  # WRONG   => CORRECT (GOOD)   [CHANGE = 1] - FTC
  # CORRECT => WRONG   (BAD)    [CHANGE = 1] - TFC
  # WRONG   => WRONG   (BAD)    [CHANGE = 1] - FFC  (changed to another wrong class)
  # WRONG   => SAME    (BAD)    [CHANGE = 0] - FFnC
  fold_df["TTnC"] = (mtl_right & corr_right & (change == 0)).astype(int)
  fold_df["FTC"]  = (mtl_wrong & corr_right & (change == 1)).astype(int)
  fold_df["TFC"]  = (mtl_right & corr_wrong & (change == 1)).astype(int)
  fold_df["FFC"]  = (mtl_wrong & corr_wrong & (change == 1)).astype(int)
  fold_df["FFnC"] = (mtl_wrong & corr_wrong & (change == 0)).astype(int)

  corr_results.append(fold_df)
  corr_counts[i] = fold_df[outcome_cols].sum().to_numpy()
  corr_f1.append(f1_score(true_labels, correction_preds, average="weighted"))
  mtl_f1.append(f1_score(true_labels, mtl_preds, average="weighted"))

# Fold-level summary; the following cell reads `corr_df` and `test_len`
corr_df = pd.DataFrame(data=corr_counts, columns=summary_cols)
corr_df["F1 MTL"] = mtl_f1
corr_df["F1 Correction"] = corr_f1

tab = tabulate(corr_df, headers=corr_df.columns.to_list(), floatfmt=".2f")
print(tab)

print("\nF1 score: Mean and std:\n")
print(tabulate({
    " ": ["MTL", "Correction"],
    "mean": [np.mean(mtl_f1), np.mean(corr_f1)],
    "std": [np.std(mtl_f1), np.std(corr_f1)]
}, headers = "keys", floatfmt=".2f"))


FileNotFoundError: ignored

In [None]:
# Keep only the outcome-count columns (drop the two trailing F1 columns)
count_cols = corr_df.columns.to_list()[:-2]
cts = corr_df.loc[:, count_cols].to_numpy()
lens = np.array(test_len)

# Normalise each fold's counts by the size of that fold's test set;
# the column vector of sizes broadcasts across the outcome columns
perc = cts / lens[:, np.newaxis]

# Mean and standard deviation over the folds
mp, sd = perc.mean(axis=0), perc.std(axis=0)

d = pd.DataFrame(perc, columns=count_cols)
tab = tabulate(perc, headers=list(d.columns))
print(tab)
print("\nmean:")
print(np.round(100 * mp, 2))
print("std:")
print(100 * sd)