### Import Libraries

In [90]:
import datetime, os, math, random
from random import shuffle
import numpy as np
import pickle
from tqdm import tqdm
from statistics import mode

# scikit-learn
from sklearn.metrics.pairwise import euclidean_distances, manhattan_distances, cosine_distances
from sklearn.metrics import auc, roc_curve, davies_bouldin_score, silhouette_score
from sklearn.cluster import KMeans

# Pandas
import pandas as pd

### Select Model

In [91]:
selected_model = 1 #int(input("Enter the number for: \n 1) VGG16 \n 2) Resnet101  \n 3) Densenet169 "))

### For traning speed, define DATASET_SIZE

In [92]:
DATASET_SIZE = 1

### Open extracted feature in pickle file

In [93]:
filepath = "../../pickle_files/al/x_ray/"
if selected_model == 1:
  filename = "x_ray_pca_vgg16.pickle"
elif selected_model == 2:
  filename = "x_ray_resnet101.pickle"
elif selected_model == 3:
  filename = "x_ray_densenet169.pickle"

file = filepath + filename
with open(file, 'rb') as handle:
  all_ft_dataset = pickle.load(handle)

### Sample the data and shuffle

In [94]:
# suffle the data
random.seed(42)
all_ft_dataset = all_ft_dataset[:4400]
shuffle(all_ft_dataset)

print("Total Dataset: {}".format(len(all_ft_dataset)))
print("Sample feature dataset Covid: {}".format(all_ft_dataset[0]))
print("Sample feature dataset Non-Covid: {}".format(all_ft_dataset[2325]))

# shrink for minimize training time
original_data_size = len(all_ft_dataset)
new_data_size = int(original_data_size * DATASET_SIZE)
ft_dataset = all_ft_dataset[:new_data_size]

print("Selected Dataset: {}".format(len(ft_dataset)))

Total Dataset: 4400
Sample feature dataset Covid: {'id': 3094, 'filepath': '../../dataset/xray/resized\\Noncovid\\NORMAL(447).png', 'image': array([ 6.2504688e+02, -4.0303204e+01,  9.9337494e+01, ...,
        7.3082373e-11,  1.7737714e-10,  7.5504554e-11], dtype=float32), 'label': 0}
Sample feature dataset Non-Covid: {'id': 4033, 'filepath': '../../dataset/xray/resized\\Noncovid\\PNEUMONIA(392).jpg', 'image': array([ 2.9877567e+02, -2.0280376e+02, -2.0804192e+02, ...,
        7.2032998e-11,  1.8084165e-10,  7.4162385e-11], dtype=float32), 'label': 0}
Selected Dataset: 4400


### Calculate mean features

In [95]:
# Function that returns the mean of each subclusters. (required as mean is the representative of that subcluster)
def mean_features(c_pos_features, c_neg_features):
  mpos_features = np.array([np.mean(i, axis=0) for i in c_pos_features])  # Mean of all positive subclusters
  mneg_features = np.array([np.mean(i, axis=0) for i in c_neg_features])  # Mean of all negative subclusters
  return mpos_features, mneg_features

### Function to update subcluster

In [96]:
# Function that updates the subcluster by concatenating the test data sample to the most similar subcluster.
def update_subclusters(query, closest_labels_from_model, id_pred, label_pred, n_neighbours, c_features, distances_of_data_to_cc, cluster_index):
  nearest_subcluster_index = np.argmin(distances_of_data_to_cc) # find nearest subcluster of the query
  # add query data to the nearest/most-similar subcluster
  c_features[nearest_subcluster_index] = np.concatenate((c_features[nearest_subcluster_index], np.expand_dims(query["image"], axis=0)), axis=0)

  id_pred[cluster_index].append(query["id"])
  # closest_labels_from_model.count(1)/n_neighbours --> Percentage that the model predict the data as positive (required to calculate AUC ROC value)
  label_pred[cluster_index].append((query['id'], closest_labels_from_model.count(1)/n_neighbours))
  return c_features, id_pred, label_pred

### Find the class (pos or neg) from max frequency

In [97]:
def get_label_with_max_freq(closest_labels_from_model):
    return mode(closest_labels_from_model)

### Define a function to correct mispredictions

In [98]:
# Function to check the model predicted label with the ground truth and corrects only if it is a mistake
def correct_mispredictions(query, closest_labels_from_model, c_pos_features, c_neg_features, distances_of_data_to_neg_cc, distances_of_data_to_pos_cc, data_frame_mistake, mistake_index, corrected_count):
  if get_label_with_max_freq(closest_labels_from_model) != query["label"]:  # Misclassification: if model's decision is different than the ground truth.
    corrected_count += 1
    data_frame_mistake["Image name"].append(query["filepath"].split("/")[-1])  # Recording to save it as csv file
    data_frame_mistake["Mistake ID"].append(query['id'])
    data_frame_mistake["Original label"].append(query['label'])
    data_frame_mistake["Predicted label"].append(get_label_with_max_freq(closest_labels_from_model))
    data_frame_mistake["Mistake index"].append(mistake_index)
    if query["label"] == 0:
      c_neg_features.append(np.expand_dims(query["image"], axis=0))  # Appending negative image to negative cluster
    else:
      c_pos_features.append(np.expand_dims(query["image"], axis=0))  # Appending positive image to positive cluster

  else: # Correct classification by model: concatenating the feature to the closest subsample.
    if query['label'] == 0:
      nearest_subcluster_index = np.argmin(distances_of_data_to_neg_cc)  # find nearest subcluster of the query
      # add query data to the nearest/most-similar subcluster
      c_neg_features[nearest_subcluster_index] = np.concatenate((c_neg_features[nearest_subcluster_index], np.expand_dims(query["image"], axis=0)), axis=0)
    else:
      nearest_subcluster_index = np.argmin(distances_of_data_to_pos_cc)  # find nearest subcluster of the query
      # add query data to the nearest/most-similar subcluster
      c_pos_features[nearest_subcluster_index] = np.concatenate((c_pos_features[nearest_subcluster_index], np.expand_dims(query["image"], axis=0)), axis=0)

  return corrected_count, data_frame_mistake, c_pos_features, c_neg_features

### Define a function to calcuate distance

In [99]:
# query: raw dictionary (from pickle file)
# cluster_centers_dict: dictionary of {0: [], 1:[]} ==> {0: cc_neg_features, 1: cc_pos_features}
# distance_type: 1. Eucliddean, 2. Manhattan, 3. Cosine
# label_pred: predicted label
# c_pos_features
# c_neg_features
# n_neighbours: no. of neighbour
# corrected_count: count of coorection of mispredictions
# mistake_index: index of data to track the mistaken data
# data_frame_mistake: to save data_frame in CSV
# mentored_data: if mentored data or not
def distance(query,
             cluster_centers_dict,
             distance_type, id_pred,
             label_pred,
             c_pos_features,
             c_neg_features,
             n_neighbours,
             corrected_count,
             mistake_index,
             data_frame_mistake,
             mentored_data):
  expnd_query = np.expand_dims(query['image'], axis=0)
  distances_of_data_to_pos_cc, distances_of_data_to_neg_cc = [], []

  # len(cluster_centers_dict[0]) should have at least the number of sub-cluster
  # Calculating the distance using numpy (axis=1) to calculate all at ones
  if distance_type == 1: # Euclidean distance
    if len(cluster_centers_dict[0]) > 1:
      distances_of_data_to_neg_cc = np.linalg.norm(query['image'] - cluster_centers_dict[0], axis=1)
    elif len(cluster_centers_dict[0]) == 1:
      distances_of_data_to_neg_cc = [np.linalg.norm(query['image'] - cluster_centers_dict[0], axis=1)]
    if len(cluster_centers_dict[1]) > 1:
      distances_of_data_to_pos_cc = np.linalg.norm(query['image'] - cluster_centers_dict[1], axis=1)
    elif len(cluster_centers_dict[1]) == 1:
      distances_of_data_to_pos_cc = [np.linalg.norm(query['image'] - cluster_centers_dict[1], axis=1)]

  elif distance_type == 2: # Manhattan distance
    if len(cluster_centers_dict[0]) > 1:
      distances_of_data_to_neg_cc = np.squeeze(manhattan_distances(cluster_centers_dict[0], expnd_query))
    elif len(cluster_centers_dict[0]) == 1:
      distances_of_data_to_neg_cc = [np.squeeze(manhattan_distances(cluster_centers_dict[0], expnd_query))]
    if len(cluster_centers_dict[1]) > 1:
      distances_of_data_to_pos_cc = np.squeeze(manhattan_distances(cluster_centers_dict[1], expnd_query))
    elif len(cluster_centers_dict[1]) == 1:
      distances_of_data_to_pos_cc = [np.squeeze(manhattan_distances(cluster_centers_dict[1], expnd_query))]

  elif distance_type == 3: # Cosine distance
    if len(cluster_centers_dict[0]) > 1:
      distances_of_data_to_neg_cc = np.squeeze(cosine_distances(expnd_query, cluster_centers_dict[0]))
    elif len(cluster_centers_dict[0]) == 1:
      distances_of_data_to_neg_cc = [np.squeeze(cosine_distances(expnd_query, cluster_centers_dict[0]))]
    if len(cluster_centers_dict[1]) > 1:
      distances_of_data_to_pos_cc = np.squeeze(cosine_distances(expnd_query, cluster_centers_dict[1]))
    elif len(cluster_centers_dict[1]) == 1:
      distances_of_data_to_pos_cc = [np.squeeze(cosine_distances(expnd_query, cluster_centers_dict[1]))]

  pos_ditances_tup_list, neg_distances_tup_list = [], []
  for dist_single in distances_of_data_to_pos_cc:
    pos_ditances_tup_list.append((dist_single, 1))

  for dist_single in distances_of_data_to_neg_cc:
    neg_distances_tup_list.append((dist_single, 0))

  # concat all distances
  pos_ditances_tup_list.extend(neg_distances_tup_list)
  # sort distances from min to max result: ((0.1, 1), (0.2, 1), (0.3, 0), (0.4, 0), (0.5, 1))
  all_distances_tup = sorted(pos_ditances_tup_list)[:n_neighbours]

  # filter only n_neighbours elements
  # all_distances_tup = all_distances_tup[:n_neighbours]

  closest_labels_from_model = [label for (distance, label) in all_distances_tup]

  if mentored_data:
    (corrected_count,
     data_frame_mistake,
     c_pos_features,
     c_neg_features) = correct_mispredictions(query,
                                              closest_labels_from_model,
                                              c_pos_features,
                                              c_neg_features,
                                              distances_of_data_to_neg_cc,
                                              distances_of_data_to_pos_cc,
                                              data_frame_mistake,
                                              mistake_index,
                                              corrected_count
                                              )

  else:
    # label from model is negative
    if len(closest_labels_from_model) > 0 and get_label_with_max_freq(closest_labels_from_model) == 0:
      c_neg_features, id_pred, label_pred = update_subclusters(query,
                                                               closest_labels_from_model,
                                                               id_pred,
                                                               label_pred,
                                                               n_neighbours,
                                                               # neg params
                                                               c_neg_features,
                                                               distances_of_data_to_neg_cc,
                                                               cluster_index=0
                                                              )
    else:
      # label from model is positive
      c_pos_features, id_pred, label_pred = update_subclusters(query,
                                                               closest_labels_from_model,
                                                               id_pred,
                                                               label_pred,
                                                               n_neighbours,
                                                               # pos params
                                                               c_pos_features,
                                                               distances_of_data_to_pos_cc,
                                                               cluster_index=1
                                                              )

  return data_frame_mistake, corrected_count, id_pred, label_pred, c_pos_features, c_neg_features


### Define a function to calculate the classification metrices

In [100]:
def classification_metrices(id_gt, id_pred):
  TP, FP, FN, TN = 0, 0, 0, 0

  # TP --> when correctly classified covid
  for tp in id_pred[1]:
    if tp in id_gt[1]:
      TP += 1

  # TN --> when correctly classified healthy (non-covid)
  for tn in id_pred[0]:
    if tn in id_gt[0]:
      TN += 1

  # FP --> when incorrectly classified healthy (Classified healthy as covid)
  for fp in id_pred[1]:
    if fp in id_gt[0]:
      FP += 1

  # FN --> when missed covid classification (Covid cases missed)
  for fn in id_pred[0]:
    if fn in id_gt[1]:
      FN += 1

  accuracy = round((TP + TN) / (TP + TN + FP + FN), 3)
  if (TN + FP) > 0:
    specificity = round(TN / (TN + FP), 3)
  else:
    specificity = 0 # Infinity

  if (TP + FN) > 0:
    sensitivity = round((TP) / (TP + FN), 3)
  else:
    sensitivity = 0 # Infinity

  # f1_score = (2*precision*recall)/(precision + recall)

  print("TP: {}  FP: {}".format(TP, FP))
  print("FN: {}  TN: {}".format(FN, TN))

  return accuracy, specificity, sensitivity, TP, TN, FP, FN

### Define a function to calculate ROC AUC Curve

In [101]:
def roc_auc_curve(label_gt, label_pred):
  # contains (id, labels) tuple of binary class
  gt_labels = sorted(label_gt[0] + label_gt[1])

  # contains (id, labels) tuple of binary class --> sorted to match each element in gt_labels and pred_labels
  pred_labels = sorted(label_pred[0] + label_pred[1])
  y_test = [y for (x,y) in gt_labels] # Get only the labels
  y_scores = [y for (x,y) in pred_labels]
  fpr, tpr, threshold = roc_curve(y_test, y_scores)
  roc_auc = round(auc(fpr, tpr), 3)
  return roc_auc

### Define a function to calculate Cluster metrices

In [102]:
def cluster_metrices(neg_features, pos_features):
  print("Calculating Dunn's index...")
  dunn_index, davies_bouldin_index, silhouette_index = "NA", "NA", "NA"
  if len(neg_features) > 0 and len(pos_features) > 0:
    intra_dist1 = euclidean_distances(neg_features).max()
    intra_dist2 = euclidean_distances(pos_features).max()
    inter_dist = euclidean_distances(neg_features, pos_features).min()

    if intra_dist1 > intra_dist2:
      max_intra_dist= intra_dist1
    else:
      max_intra_dist = intra_dist2

    dunn_index = round(inter_dist / max_intra_dist, 3)

  print("dunn_index: ", dunn_index)

  # Davies Bouldin and Silhouette score from sklearn library.
  if len(neg_features) > 0 and len(pos_features) > 0:
    feature_all = np.concatenate((neg_features, pos_features))
    neg_labels = np.zeros(shape=(len(neg_features)),dtype=int)
    pos_labels = np.ones(shape=(len(pos_features)),dtype=int)

    label_all = np.concatenate((neg_labels, pos_labels))
    print("Calculating Davies Bouldin index...")
    davies_bouldin_index = round(davies_bouldin_score(feature_all, label_all), 3)
    print("davies_bouldin_index: ", davies_bouldin_index)

    print("Calculating Silhouette index...")
    silhouette_index = round(silhouette_score(feature_all, label_all), 3)
    print("silhouette_index: ", silhouette_index)

  return dunn_index, davies_bouldin_index, silhouette_index

### Function to flatter the features

In [103]:
# Required to calculate the clustering indices
def flatten_features(features):
  all_features = []
  for feature in features:
    for index in feature:
      all_features.append(index)
  return all_features

### Function to find sub cluster

In [104]:
# Method to create subclusters
def sub_clusters(features, n_clusters=5):
  # Number of cluster defined from elbow method
  # kmeans = KMeans(n_clusters=int(70*DATASET_SIZE), random_state=0, n_init="auto").fit(features)
  kmeans = KMeans(n_clusters=n_clusters, random_state=0, n_init="auto").fit(features)
  # list of labels for elements occuring in each cluster
  out_labels = kmeans.labels_
  # Form clusters of deep features of image
  # clusters = [np.squeeze(np.array(features)[[np.where(out_labels == i)[0]]], axis=0) for i in range(len(np.unique(out_labels)))]
  clusters = [np.array(features)[np.where(out_labels == i)[0]] for i in range(len(np.unique(out_labels)))]
  return kmeans.cluster_centers_, clusters

### Define a function to load dataset into three different segment (k-way n-shot)

In [105]:
# Method to return three sets (n, 1500, 3000)  of labeled dataset for experiment
def data_loader(dataset, n):
  labeled_data, unlabeled_data = [], []

  l_data = dataset[:n]                          # First case (0-40) // labeled + mentored
  ul_data = dataset[n:]                         # First case (40-1000) // unlabeled
  labeled_data.append(l_data)                   # labeled_data[0] => dataset[0-40]
  unlabeled_data.append(ul_data)                # unlabeled_data[0] => dataset[40-1000]

  size_second_set = int(1500 * DATASET_SIZE) # 1500 * 0.1 = 150
  sss = size_second_set
  l_data = dataset[sss: n + sss]                # Second case (150-190) // labeled + mentored
  ul_data = dataset[:sss] + dataset[n + sss:]   # Second case (0-150) + (190-1000) // unlabeled
  labeled_data.append(l_data)                   # labeled_data[1] => dataset[150-190]
  unlabeled_data.append(ul_data)                # unlabeled_data[1] => dataset[0-150] + dataset[190-1000]

  size_second_set = int(3000 * DATASET_SIZE) # 3000 * 0.1 = 300
  sss = size_second_set
  l_data = dataset[sss: n + sss]                # Third case (300-340) // labeled + mentored
  ul_data = dataset[:sss] + dataset[n + sss:]   # Third case (0-300) + (340-1000) // unlabeled
  labeled_data.append(l_data)                   # labeled_data[2] => dataset[300-340]
  unlabeled_data.append(ul_data)                # unlabeled_data[2] => dataset[0-300] + dataset[340-1000]
  return labeled_data, unlabeled_data

### Define a function to separate data into positive and negative samples

In [106]:
# Function that selects number of data samples and removes the selected data from the dataset. (required to select balanced positive and negative samples)
def data_separation2(dataset, taken_data_idx, label=None, data_sample=100):
  add_data = []
  for i, data in enumerate(dataset):
    if i in taken_data_idx:
      continue
    if label > -1 and dataset[i]["label"] == label:
      add_data.append(data)
      taken_data_idx.append(i)
    else:
      add_data.append(data)
      taken_data_idx.append(i)
    if len(add_data) == data_sample:
      break
  return add_data, taken_data_idx

### Alternative function to separate data

In [107]:
# Function that selects number of data samples and removes the selected data from the dataset. (required to select balanced positive and negative samples)
def data_separation(dataset, label, data_sample=100):
  add_data = []
  for i, data in enumerate(dataset):
    if data["label"] == label:
      add_data.append(data['image'])
      del dataset[i]
    if len(add_data) == data_sample:
      break

  return add_data

### Distance type (Euclidean Manhattan or Consine)

In [108]:
distance_type = 3 # int(input("Enter the number for: \n 1) Euclidean  \n 2) Manhattan \n 3) Cosine"))

### Model and Distance Name

In [109]:
if selected_model == 1:
  s_model = 'vgg16'
elif selected_model == 2:
  s_model = 'resnet101'
elif selected_model == 3:
  s_model = 'densenet169'

if distance_type == 1:
  s_distance = 'euclidean'
elif distance_type == 2:
  s_distance = 'manhattan'
elif distance_type == 3:
  s_distance = 'cosine'

### Active Learning Process

In [110]:
CURRENT_TIME =  str(round(datetime.datetime.now().timestamp()))
os.mkdir(f"./test_{s_model}_{s_distance}_{CURRENT_TIME}")
n_neighbours = int(15 * DATASET_SIZE)
labeled_size = [200, 400, 800, 1550]
labeled_size = [int(size * DATASET_SIZE) for size in labeled_size]
data_frame_metrix = {
  "Labeled data": [],
  "Dataset": [],
  "Accuracy": [],
  "Specificity": [],
  "Sensitivity": [],
  "AUC":[],
  "Dunn index": [],
  "Davies Bouldin": [],
  "Silhouette index":[],
  "TP":[],
  "TN":[],
  "FP":[],
  "FN":[],
  "pos_labeled_img":[],
  "neg_labeled_img":[],
  "corrected_count":[]
}

global_count = 0

for size in labeled_size:
  print("{} training with {} size of labled data{}".format('*'*15, size, '*'*15))
  labeled_data_sets, unlabeled_data_sets = data_loader(ft_dataset, size)

  # labeled_data_sets ==> three sets: [d1, d2, d3] ==> eg: [0-40, 320-360, 640-680]
  for dataset_type, labeled_data in enumerate(labeled_data_sets):
    global_count += 1
    print(f"============================== {global_count}/{len(labeled_size) * len(labeled_data_sets)} ==============================")
    data_frame_mistake = {
      "Image name": [],
      "Mistake index": [],
      "Mistake ID": [],
      "Original label": [],
      "Predicted label": []
    }

    pos_img, neg_img = 0, 0

    # collect the ground truth (label) of all the predicting images =>> key: 0 & 1 (class), value: tuple (data['id'], data['label']), required to calulate TP, FP, FN, TN
    label_gt = {0: [], 1: []}
    # collect the ground truth (id) of all the predicting images =>> key: 0 & 1 (class), value: ground truth id
    id_gt = {0: [], 1: []}

    # collect the predicted label for all the images =>> key: 0 & 1 (class), value: tuple(query['id'], decision_list.count(1)/n_neighbours)
    # Percentage of predicted positive class, required to calculate AUC/ROC value
    label_pred = {0: [], 1: []}
    # collect the predicted id for all the images =>> key: 0 & 1 (class), value: predicted ids, required to calulate TP, FP, FN, TN
    id_pred = {0: [],  1: []}

    # feature label =>> key: 0 & 1 (class), value: deep feature of image
    cluster_centers_dict = {0: [], 1: []}

    print(f"labeled data: {len(labeled_data)}, unlabled data: {len(unlabeled_data_sets[dataset_type])}")

    neg_labeled_img, pos_labeled_img = 0, 0
    for data in labeled_data:
        if data['label'] == 0:
            neg_labeled_img += 1
        else:
            pos_labeled_img += 1

    # select balanced labeled data (50% from positive and 50% from negative)
    sample_size = int(20 * DATASET_SIZE) # sample size of balanced_data
    fpositive = data_separation(labeled_data, 1, sample_size)  # Get the 'sample_size' positive features from 'labeled_data'
    fnegative = data_separation(labeled_data, 0, sample_size)  # Get the 'sample_size' negative features from 'labeled_data'

    print(f"balanced data: {2 * sample_size}, fpositive: {len(fpositive)}, fnegative: {len(fnegative)}")
    print(f"mentored data: {len(labeled_data)}")

    n_sub_clusters = math.ceil(5 * DATASET_SIZE) if DATASET_SIZE > 0.5 else 2
    print("Number of subclusters: {}".format(n_sub_clusters))
    cc_neg_features, c_neg_features = sub_clusters(fnegative, n_sub_clusters)  # Get the cluster center and negative clusters (Using K-means algorithm)
    cc_pos_features, c_pos_features = sub_clusters(fpositive, n_sub_clusters)  # Get the cluster center and positive clusters (Using K-means algorithm)

    corrected_count, mistake_index = 0, 2 * sample_size

    print(f" {'#' * 15} Mentoring {len(labeled_data)} data {'#' * 15}")
    # loop is for the mentored data --> Notice mentored_data=True in argument of the function call distance.
    for data in labeled_data:
      cluster_centers_dict = {0: cc_neg_features, 1: cc_pos_features}

      (data_frame_mistake,
       corrected_count,
       _,
       label_pred,
       c_pos_features,
       c_neg_features) = distance(data,
                                 cluster_centers_dict,
                                 distance_type,
                                 id_pred, # not being used in this case
                                 label_pred,
                                 c_pos_features,
                                 c_neg_features,
                                 n_neighbours,
                                 corrected_count,
                                 mistake_index,
                                 data_frame_mistake,
                                 mentored_data=True)

      cc_pos_features, cc_neg_features = mean_features(c_pos_features, c_neg_features)  # Get the mean of the features
      mistake_index += 1

    print(f" {'#' * 15} Mentoring {len(labeled_data)} data DONE!!! {'#' * 15}")

    data_f_mistake = pd.DataFrame.from_dict(data_frame_mistake)
    data_f_mistake.to_csv(f"./test_{s_model}_{s_distance}_{CURRENT_TIME}/mistake_{size}_d{dataset_type + 1}.csv", index=False)

    print(f" {'#' * 15} Training {len(unlabeled_data_sets[dataset_type])} unlabeled data {'#' * 15}")
    # loop is for the test data --> Notice mentored_data=False in argument of the function call distance.
    for data in tqdm(unlabeled_data_sets[dataset_type]):
      if data["label"] == 1:
        id_gt[1].append(data['id'])
        label_gt[1].append((data['id'], data['label'])) # Required to calulate TP, FP, FN, TN
      else:
        id_gt[0].append(data['id'])
        label_gt[0].append((data['id'], data['label']))

      cluster_centers_dict = {0: cc_neg_features, 1: cc_pos_features}

      (_,
       _,
       id_pred,
       label_pred,
       c_pos_features,
       c_neg_features) = distance(data,
                                 cluster_centers_dict,
                                 distance_type,
                                 id_pred,
                                 label_pred,
                                 c_pos_features,
                                 c_neg_features,
                                 n_neighbours,
                                 corrected_count, # not being used in this case
                                 mistake_index,
                                 data_frame_mistake,
                                 mentored_data=False)

      cc_pos_features, cc_neg_features = mean_features(c_pos_features, c_neg_features)   # Get the mean of the features

    print(f" {'#' * 15} Training {len(unlabeled_data_sets[dataset_type])} unlabeled data DONE!!! {'#' * 15}")

    accuracy, specificity, sensitivity, TP, TN, FP, FN = classification_metrices(id_gt, id_pred)

    # Flattened as required to calculate clustering indices
    flattened_neg_features = flatten_features(c_neg_features)
    flattened_pos_features = flatten_features(c_pos_features)

    dunn_index, davies_bouldin_index, silhouette_index = cluster_metrices(flattened_pos_features, flattened_neg_features)
    cl_auc = roc_auc_curve(label_gt, label_pred)

    data_frame_metrix["Labeled data"].append(size)
    data_frame_metrix["Dataset"].append(f"d_{dataset_type + 1}")
    data_frame_metrix["Accuracy"].append(accuracy)
    data_frame_metrix["Specificity"].append(specificity)
    data_frame_metrix["Sensitivity"].append(sensitivity)
    data_frame_metrix["AUC"].append(cl_auc)
    data_frame_metrix["Dunn index"].append(dunn_index)
    data_frame_metrix["Davies Bouldin"].append(davies_bouldin_index)
    data_frame_metrix["Silhouette index"].append(silhouette_index)
    data_frame_metrix["TP"].append(TP)
    data_frame_metrix["TN"].append(TN)
    data_frame_metrix["FP"].append(FP)
    data_frame_metrix["FN"].append(FN)
    data_frame_metrix["neg_labeled_img"].append(neg_labeled_img)
    data_frame_metrix["pos_labeled_img"].append(pos_labeled_img)
    data_frame_metrix["corrected_count"].append(corrected_count)

    print(f"Dataset: d_{dataset_type + 1} \t\t\t Labeled image: {size} \t\t Corrected count: {corrected_count}")
    print(f"Accuracy: {accuracy} \t\t Specificity: {specificity} \t\t Sensitivity: {sensitivity}")
    print(f"Dunn index: {dunn_index}")
    print(f"Davies Bouldin: {davies_bouldin_index}")
    print(f"Silhouette index: {silhouette_index}")
    print(f"AUC: {cl_auc}")

data_f_matrix = pd.DataFrame.from_dict(data_frame_metrix)
data_f_matrix.to_csv(f"./test_{s_model}_{s_distance}_{CURRENT_TIME}/model_evaluation.csv", index=False)

*************** training with 200 size of labled data***************
labeled data: 200, unlabled data: 4200
balanced data: 40, fpositive: 20, fnegative: 20
mentored data: 160
Number of subclusters: 5
 ############### Mentoring 160 data ###############
 ############### Mentoring 160 data DONE!!! ###############
 ############### Training 4200 unlabeled data ###############


100%|█████████████████████████████████████████████████████████████████████████████| 4200/4200 [00:28<00:00, 148.34it/s]


 ############### Training 4200 unlabeled data DONE!!! ###############
TP: 2010  FP: 249
FN: 19  TN: 1922
Calculating Dunn's index...
dunn_index:  0.162
Calculating Davies Bouldin index...
davies_bouldin_index:  2.599
Calculating Silhouette index...
silhouette_index:  0.124
Dataset: d_1 			 Labeled image: 200 		 Corrected count: 8
Accuracy: 0.936 		 Specificity: 0.885 		 Sensitivity: 0.991
Dunn index: 0.16200000047683716
Davies Bouldin: 2.599
Silhouette index: 0.12399999797344208
AUC: 0.94
labeled data: 200, unlabled data: 4200
balanced data: 40, fpositive: 20, fnegative: 20
mentored data: 160
Number of subclusters: 5
 ############### Mentoring 160 data ###############
 ############### Mentoring 160 data DONE!!! ###############
 ############### Training 4200 unlabeled data ###############


100%|█████████████████████████████████████████████████████████████████████████████| 4200/4200 [00:28<00:00, 148.98it/s]


 ############### Training 4200 unlabeled data DONE!!! ###############
TP: 2001  FP: 225
FN: 22  TN: 1952
Calculating Dunn's index...
dunn_index:  0.131
Calculating Davies Bouldin index...
davies_bouldin_index:  2.605
Calculating Silhouette index...
silhouette_index:  0.123
Dataset: d_2 			 Labeled image: 200 		 Corrected count: 8
Accuracy: 0.941 		 Specificity: 0.897 		 Sensitivity: 0.989
Dunn index: 0.13099999725818634
Davies Bouldin: 2.605
Silhouette index: 0.12300000339746475
AUC: 0.946
labeled data: 200, unlabled data: 4200
balanced data: 40, fpositive: 20, fnegative: 20
mentored data: 160
Number of subclusters: 5
 ############### Mentoring 160 data ###############
 ############### Mentoring 160 data DONE!!! ###############
 ############### Training 4200 unlabeled data ###############


100%|█████████████████████████████████████████████████████████████████████████████| 4200/4200 [00:29<00:00, 144.73it/s]


 ############### Training 4200 unlabeled data DONE!!! ###############
TP: 2004  FP: 37
FN: 21  TN: 2138
Calculating Dunn's index...
dunn_index:  0.121
Calculating Davies Bouldin index...
davies_bouldin_index:  2.659
Calculating Silhouette index...
silhouette_index:  0.114
Dataset: d_3 			 Labeled image: 200 		 Corrected count: 4
Accuracy: 0.986 		 Specificity: 0.983 		 Sensitivity: 0.99
Dunn index: 0.12099999934434891
Davies Bouldin: 2.659
Silhouette index: 0.11400000005960464
AUC: 0.5
*************** training with 400 size of labled data***************
labeled data: 400, unlabled data: 4000
balanced data: 40, fpositive: 20, fnegative: 20
mentored data: 360
Number of subclusters: 5
 ############### Mentoring 360 data ###############
 ############### Mentoring 360 data DONE!!! ###############
 ############### Training 4000 unlabeled data ###############


100%|█████████████████████████████████████████████████████████████████████████████| 4000/4000 [00:28<00:00, 141.84it/s]


 ############### Training 4000 unlabeled data DONE!!! ###############
TP: 1910  FP: 53
FN: 27  TN: 2010
Calculating Dunn's index...
dunn_index:  0.0
Calculating Davies Bouldin index...
davies_bouldin_index:  2.652
Calculating Silhouette index...
silhouette_index:  0.114
Dataset: d_1 			 Labeled image: 400 		 Corrected count: 15
Accuracy: 0.98 		 Specificity: 0.974 		 Sensitivity: 0.986
Dunn index: 0.0
Davies Bouldin: 2.652
Silhouette index: 0.11400000005960464
AUC: 0.995
labeled data: 400, unlabled data: 4000
balanced data: 40, fpositive: 20, fnegative: 20
mentored data: 360
Number of subclusters: 5
 ############### Mentoring 360 data ###############
 ############### Mentoring 360 data DONE!!! ###############
 ############### Training 4000 unlabeled data ###############


100%|█████████████████████████████████████████████████████████████████████████████| 4000/4000 [00:28<00:00, 142.39it/s]


 ############### Training 4000 unlabeled data DONE!!! ###############
TP: 1904  FP: 65
FN: 23  TN: 2008
Calculating Dunn's index...
dunn_index:  0.035
Calculating Davies Bouldin index...
davies_bouldin_index:  2.646
Calculating Silhouette index...
silhouette_index:  0.115
Dataset: d_2 			 Labeled image: 400 		 Corrected count: 19
Accuracy: 0.978 		 Specificity: 0.969 		 Sensitivity: 0.988
Dunn index: 0.03500000014901161
Davies Bouldin: 2.646
Silhouette index: 0.11500000208616257
AUC: 0.997
labeled data: 400, unlabled data: 4000
balanced data: 40, fpositive: 20, fnegative: 20
mentored data: 360
Number of subclusters: 5
 ############### Mentoring 360 data ###############
 ############### Mentoring 360 data DONE!!! ###############
 ############### Training 4000 unlabeled data ###############


100%|█████████████████████████████████████████████████████████████████████████████| 4000/4000 [00:28<00:00, 138.56it/s]


 ############### Training 4000 unlabeled data DONE!!! ###############
TP: 1901  FP: 140
FN: 27  TN: 1932
Calculating Dunn's index...
dunn_index:  0.162
Calculating Davies Bouldin index...
davies_bouldin_index:  2.628
Calculating Silhouette index...
silhouette_index:  0.119
Dataset: d_3 			 Labeled image: 400 		 Corrected count: 13
Accuracy: 0.958 		 Specificity: 0.932 		 Sensitivity: 0.986
Dunn index: 0.16200000047683716
Davies Bouldin: 2.628
Silhouette index: 0.11900000274181366
AUC: 0.992
*************** training with 800 size of labled data***************
labeled data: 800, unlabled data: 3600
balanced data: 40, fpositive: 20, fnegative: 20
mentored data: 760
Number of subclusters: 5
 ############### Mentoring 760 data ###############
 ############### Mentoring 760 data DONE!!! ###############
 ############### Training 3600 unlabeled data ###############


100%|█████████████████████████████████████████████████████████████████████████████| 3600/3600 [00:26<00:00, 133.57it/s]


 ############### Training 3600 unlabeled data DONE!!! ###############
TP: 1724  FP: 65
FN: 18  TN: 1793
Calculating Dunn's index...
dunn_index:  0.133
Calculating Davies Bouldin index...
davies_bouldin_index:  2.652
Calculating Silhouette index...
silhouette_index:  0.115
Dataset: d_1 			 Labeled image: 800 		 Corrected count: 21
Accuracy: 0.977 		 Specificity: 0.965 		 Sensitivity: 0.99
Dunn index: 0.13300000131130219
Davies Bouldin: 2.652
Silhouette index: 0.11500000208616257
AUC: 0.995
labeled data: 800, unlabled data: 3600
balanced data: 40, fpositive: 20, fnegative: 20
mentored data: 760
Number of subclusters: 5
 ############### Mentoring 760 data ###############
 ############### Mentoring 760 data DONE!!! ###############
 ############### Training 3600 unlabeled data ###############


100%|█████████████████████████████████████████████████████████████████████████████| 3600/3600 [00:27<00:00, 130.30it/s]


 ############### Training 3600 unlabeled data DONE!!! ###############
TP: 1728  FP: 51
FN: 12  TN: 1809
Calculating Dunn's index...
dunn_index:  0.035
Calculating Davies Bouldin index...
davies_bouldin_index:  2.65
Calculating Silhouette index...
silhouette_index:  0.115
Dataset: d_2 			 Labeled image: 800 		 Corrected count: 26
Accuracy: 0.983 		 Specificity: 0.973 		 Sensitivity: 0.993
Dunn index: 0.03500000014901161
Davies Bouldin: 2.65
Silhouette index: 0.11500000208616257
AUC: 0.998
labeled data: 800, unlabled data: 3600
balanced data: 40, fpositive: 20, fnegative: 20
mentored data: 760
Number of subclusters: 5
 ############### Mentoring 760 data ###############
 ############### Mentoring 760 data DONE!!! ###############
 ############### Training 3600 unlabeled data ###############


100%|█████████████████████████████████████████████████████████████████████████████| 3600/3600 [00:28<00:00, 127.84it/s]


 ############### Training 3600 unlabeled data DONE!!! ###############
TP: 1708  FP: 84
FN: 23  TN: 1785
Calculating Dunn's index...
dunn_index:  0.0
Calculating Davies Bouldin index...
davies_bouldin_index:  2.64
Calculating Silhouette index...
silhouette_index:  0.116
Dataset: d_3 			 Labeled image: 800 		 Corrected count: 22
Accuracy: 0.97 		 Specificity: 0.955 		 Sensitivity: 0.987
Dunn index: 0.0
Davies Bouldin: 2.64
Silhouette index: 0.11599999666213989
AUC: 0.997
*************** training with 1550 size of labled data***************
labeled data: 1550, unlabled data: 2850
balanced data: 40, fpositive: 20, fnegative: 20
mentored data: 1510
Number of subclusters: 5
 ############### Mentoring 1510 data ###############
 ############### Mentoring 1510 data DONE!!! ###############
 ############### Training 2850 unlabeled data ###############


100%|█████████████████████████████████████████████████████████████████████████████| 2850/2850 [00:25<00:00, 112.83it/s]


 ############### Training 2850 unlabeled data DONE!!! ###############
TP: 1357  FP: 17
FN: 22  TN: 1454
Calculating Dunn's index...
dunn_index:  0.128
Calculating Davies Bouldin index...
davies_bouldin_index:  2.667
Calculating Silhouette index...
silhouette_index:  0.112
Dataset: d_1 			 Labeled image: 1550 		 Corrected count: 38
Accuracy: 0.986 		 Specificity: 0.988 		 Sensitivity: 0.984
Dunn index: 0.12800000607967377
Davies Bouldin: 2.667
Silhouette index: 0.1120000034570694
AUC: 0.999
labeled data: 1550, unlabled data: 2850
balanced data: 40, fpositive: 20, fnegative: 20
mentored data: 1510
Number of subclusters: 5
 ############### Mentoring 1510 data ###############
 ############### Mentoring 1510 data DONE!!! ###############
 ############### Training 2850 unlabeled data ###############


100%|█████████████████████████████████████████████████████████████████████████████| 2850/2850 [00:25<00:00, 112.55it/s]


 ############### Training 2850 unlabeled data DONE!!! ###############
TP: 1377  FP: 20
FN: 12  TN: 1441
Calculating Dunn's index...
dunn_index:  0.129
Calculating Davies Bouldin index...
davies_bouldin_index:  2.667
Calculating Silhouette index...
silhouette_index:  0.113
Dataset: d_2 			 Labeled image: 1550 		 Corrected count: 38
Accuracy: 0.989 		 Specificity: 0.986 		 Sensitivity: 0.991
Dunn index: 0.1289999932050705
Davies Bouldin: 2.667
Silhouette index: 0.11299999803304672
AUC: 0.999
labeled data: 1400, unlabled data: 3000
balanced data: 40, fpositive: 20, fnegative: 20
mentored data: 1360
Number of subclusters: 5
 ############### Mentoring 1360 data ###############
 ############### Mentoring 1360 data DONE!!! ###############
 ############### Training 3000 unlabeled data ###############


100%|█████████████████████████████████████████████████████████████████████████████| 3000/3000 [00:28<00:00, 106.23it/s]


 ############### Training 3000 unlabeled data DONE!!! ###############
TP: 1414  FP: 16
FN: 20  TN: 1550
Calculating Dunn's index...
dunn_index:  0.111
Calculating Davies Bouldin index...
davies_bouldin_index:  2.673
Calculating Silhouette index...
silhouette_index:  0.112
Dataset: d_3 			 Labeled image: 1550 		 Corrected count: 41
Accuracy: 0.988 		 Specificity: 0.99 		 Sensitivity: 0.986
Dunn index: 0.11100000143051147
Davies Bouldin: 2.673
Silhouette index: 0.1120000034570694
AUC: 0.999
