In [None]:
# Mount Google Drive inside the Colab runtime at /content/gdrive.
# The data-loading helpers in this notebook read their .npy files from under this mount point.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
import numpy as np
import seaborn as sn
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from sklearn.preprocessing import normalize
import sklearn.metrics as metrics

In [None]:
def load_training(data_dir='/content/gdrive/MyDrive/Serao_Barigliano/data'):
  '''
    Load the training images and their labels from .npy files.
    Params: data_dir (optional) - folder containing 'train_tensor.npy' and 'train_labels.npy'.
            Defaults to the project data folder on the mounted Google Drive.
    Returns: images, labels (exactly as returned by np.load).
  '''
  import os  # local import so this cell does not depend on a change to the imports cell

  images = np.load(os.path.join(data_dir, 'train_tensor.npy'))
  labels = np.load(os.path.join(data_dir, 'train_labels.npy'))
  return images, labels

def load_test(data_dir='/content/gdrive/MyDrive/Serao_Barigliano/data'):
  '''
    Load the public test images and their labels from .npy files.
    Params: data_dir (optional) - folder containing 'public_test_tensor.npy' and
            'public_test_labels.npy'. Defaults to the project data folder on the mounted Google Drive.
    Returns: images, labels (exactly as returned by np.load).
  '''
  import os  # local import so this cell does not depend on a change to the imports cell

  images = np.load(os.path.join(data_dir, 'public_test_tensor.npy'))
  labels = np.load(os.path.join(data_dir, 'public_test_labels.npy'))
  return images, labels

In [None]:
def frequency(dataset):
  '''
    Count how many times each distinct value occurs in dataset.
    Params: dataset.
    Returns: a plain list of counts, one per distinct value, in the order np.unique reports them.
  '''
  occurrences = np.unique(dataset, return_counts=True)[1]
  return occurrences.tolist()

def binarize(images, labels, classes):
  '''
    Split the dataset for binary classification. Classes must be 'mass_calcification' or 'benign_malignant'.
    Samples labelled 0 are dropped; the remaining labels are regrouped into two classes:
      - 'mass_calcification': labels 1, 2 -> class 0 and labels 3, 4 -> class 1
      - 'benign_malignant':   labels 1, 3 -> class 0 and labels 2, 4 -> class 1
    Params: images (array, one sample per entry along axis 0), labels (one label per sample), classes.
    Returns: res_dataset, res_labels - every class-0 sample followed by every class-1 sample
             (input order kept inside each class) and the matching int16 labels.
             With no matching sample both results are empty; res_dataset keeps the per-sample shape.
    Raises: ValueError if classes is not one of the two supported names.
  '''
  label_groups = {
      "mass_calcification": ((1, 2), (3, 4)),
      "benign_malignant": ((1, 3), (2, 4)),
  }
  if classes not in label_groups:
    # Fail loudly instead of printing and silently returning an empty dataset.
    raise ValueError("Error: Wrong classes. Choose 'mass_calcification' or 'benign_malignant'.")
  first_ids, second_ids = label_groups[classes]

  images = np.asarray(images)
  n_samples = images.shape[0]
  # One label per image: accepts (n,) or (n, 1) labels, ignores labels beyond the
  # last image and rejects any other layout.
  labels = np.asarray(labels)[:n_samples].reshape(n_samples)

  # Boolean masks select each group in a single pass, preserving input order.
  first = images[np.isin(labels, first_ids)]
  second = images[np.isin(labels, second_ids)]

  res_dataset = np.concatenate((first, second))
  res_labels = np.concatenate((np.zeros(len(first), dtype=np.int16),
                               np.ones(len(second), dtype=np.int16)))
  return res_dataset, res_labels

def rescale(dataset):
  '''
    Rescale every image of the dataset by a factor of 1/65535.
    Dataset must contain grayscale images.
    Params: dataset.
    Returns: a list holding one rescaled image per input image.
  '''
  factor = 1./65535
  scaled = []
  for image in dataset:
    scaled.append(image*factor)
  return scaled

def shuffle_dataset(data, labels, random_state=None):
  '''
    Shuffle the dataset and labels in a consistent way (same permutation for both).
    Params: data, labels, random_state (optional) - forwarded to sklearn.utils.shuffle;
            pass an int to make the shuffle reproducible. The default None keeps the
            previous unseeded behaviour.
    Returns: data, labels (shuffled).
  '''
  data, labels = shuffle(data, labels, random_state=random_state)
  return data, labels

In [None]:
def properties(tensor):
  '''
    Print a short summary of a tensor: shape, minimum, maximum, value range and element type.
    Params: tensor
  '''
  # Each value is computed lazily, right before its line is printed.
  fields = (
      ('Shape', lambda: tensor.shape),
      ('Min', lambda: tensor.min()),
      ('Max', lambda: tensor.max()),
      ('Diff', lambda: tensor.max()-tensor.min()),
      ('Type of elements', lambda: tensor.dtype),
  )
  print('Properties: ')
  for caption, getter in fields:
    print(f'{caption}: {getter()}')

In [None]:
def plot_sample(images, title):
  '''
    Show the first five images of (images) side by side in grayscale.
    (title) is written as the y-axis label of the leftmost image.
    Params: images, title
  '''
  _, panels = plt.subplots(1, 5, figsize=(10,14))
  for position, panel in enumerate(panels):
    panel.imshow(images[position], cmap='gray')
  panels[0].set_ylabel(title)
  plt.tight_layout()
  plt.show()

In [None]:
def plot_training(acc, loss, val_acc, val_loss, epochs, path='training_results_plot.png'):
  '''
    Draw training and validation curves for accuracy (left panel) and loss (right panel),
    then save the figure to (path).
    Accuracy and Loss data must be an array with one value per epoch.
    Params: acc, loss, val_acc, val_loss, epochs, path.
  '''
  figure, panels = plt.subplots(1,2, figsize=(12,5))
  epoch_axis = range(epochs)

  # one entry per panel: (training curve, validation curve, name, legend position)
  panel_specs = (
      (acc, val_acc, 'Accuracy', 'lower right'),
      (loss, val_loss, 'Loss', 'upper right'),
  )

  for panel, (train_curve, val_curve, name, legend_loc) in zip(panels, panel_specs):
    # curves
    panel.plot(epoch_axis, train_curve, color='tab:orange', linestyle='-', label='training', linewidth=1)
    panel.plot(epoch_axis, val_curve, color='tab:blue', linestyle='-', label='validation', linewidth=1)

    # style options
    panel.grid(axis='y', linestyle='-', linewidth=0.5)
    panel.set_xlabel('Epochs')
    panel.set_ylabel(name)
    panel.set_title(name, fontdict={'fontsize':14}, pad=20)
    panel.legend(loc=legend_loc)

  # layout adjustment, then write the figure to disk
  plt.tight_layout()
  figure.savefig(path)

In [None]:
def plot_confusion_matrix(true_labels, predicted, class_names, path='confusion_matrix.png'):
  '''
    Draw the confusion matrix as an annotated heatmap and save it to (path).
    Rows are the true classes, columns the predicted ones.
    Params: true_labels, predicted, class_names, path.
  '''
  matrix = metrics.confusion_matrix(true_labels, predicted)

  # No axes is passed to seaborn, so it draws on the currently active one: the axes just created here.
  figure, _ = plt.subplots()
  heat_ax = sn.heatmap(
      matrix,
      annot=True,
      xticklabels=class_names,
      yticklabels=class_names,
      fmt='d',
      cmap='Blues',
  )

  heat_ax.set_title('Confusion Matrix')
  heat_ax.set_xlabel('Predicted')
  heat_ax.set_ylabel('True')

  plt.tight_layout()
  figure.savefig(path)

In [None]:
def evaluate_classifier(true_labels, predicted):
  '''
    Print the classifier performances: accuracy, then weighted precision, recall,
    F1-score and AUC, all computed from the predicted labels.
    Params: true_labels, predicted.
  '''
  print('Performances: ')
  print('Accuracy: ', metrics.accuracy_score(true_labels, predicted))

  # the remaining metrics all take the same 'weighted' averaging option
  weighted_metrics = (
      ('Precision: ', metrics.precision_score),
      ('Recall: ', metrics.recall_score),
      ('F1-score: ', metrics.f1_score),
      ('AUC: ', metrics.roc_auc_score),
  )
  for caption, scorer in weighted_metrics:
    print(caption, scorer(true_labels, predicted, average='weighted'))