In [None]:
import inspect

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
from sklearn.manifold import TSNE
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, ConfusionMatrixDisplay
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
from torchvision.models.inception import InceptionOutputs

In [None]:
def data_sampling(indices):
  """Wrap ``indices`` in a ``SubsetRandomSampler``.

  Args:
    indices: Sequence of dataset indices the sampler may draw from.

  Returns:
    A ``torch.utils.data.SubsetRandomSampler`` restricted to ``indices``.
  """
  subset_sampler = torch.utils.data.SubsetRandomSampler(indices)
  return subset_sampler

In [None]:
#Data Preprocessing
def data_preprocess(data_path, sample_ratio, batch_size, seed = None, num_workers = 4):
  """Build train/test/validation loaders over a random sample of an image folder.

  Images are read from ``"{}data".format(data_path)``; ``data_path`` is
  concatenated as-is, so it must already end with a path separator.  A
  fraction ``sample_ratio`` of the images is drawn without replacement and
  split 70% / 20% / remainder into training, test and validation indices.

  Only the training loader applies the random horizontal flip; the test and
  validation loaders read the same images through a deterministic transform
  so that evaluation metrics are not perturbed by augmentation.

  Args:
    data_path: Prefix of the dataset directory (see above).
    sample_ratio: Fraction of the dataset to use, in the range (0, 1].
    batch_size: Batch size shared by the three loaders.
    seed: Optional integer seed for the index sampling.  When ``None`` the
      global NumPy random state is used, exactly as before.
    num_workers: Number of worker processes per loader.

  Returns:
    Tuple ``(dataset, train_loader, train_indices, test_loader, test_indices,
    val_loader, val_indices)``.  ``dataset`` is the ``ImageFolder`` carrying
    the training transform.

  Raises:
    ValueError: If ``sample_ratio`` is not in the range (0, 1].
  """
  if not 0 < sample_ratio <= 1:
    raise ValueError("sample_ratio must be in the range (0, 1], got {!r}".format(sample_ratio))

  # Shared preprocessing; only the training pipeline adds the random flip.
  resize = transforms.Resize((224, 224))
  normalize = transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
  train_transforms = transforms.Compose([
    resize,
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    normalize])
  eval_transforms = transforms.Compose([
    resize,
    transforms.ToTensor(),
    normalize])

  # Two views of the same folder: both are built from the same root, so an
  # index refers to the same image in each of them.
  root = "{}data".format(data_path)
  dataset = datasets.ImageFolder(root = root, transform = train_transforms)
  eval_dataset = datasets.ImageFolder(root = root, transform = eval_transforms)

  # Get a sample of the data randomly
  num_samples = int(len(dataset) * sample_ratio)
  rng = np.random if seed is None else np.random.RandomState(seed)
  indices = rng.choice(len(dataset), num_samples, replace = False)

  # Split the sample into training, test, and validation indices; the
  # validation split takes whatever is left after the 70% / 20% cuts.
  train_size = int(0.7 * num_samples)
  test_size = int(0.2 * num_samples)

  train_indices = indices[ : train_size]
  test_indices = indices[train_size : train_size + test_size]
  val_indices = indices[train_size + test_size : ]

  # Create data loaders for training, test, and validation sets
  loader_options = dict(batch_size = batch_size, num_workers = num_workers, pin_memory = True)
  train_loader = DataLoader(dataset, sampler = data_sampling(train_indices), **loader_options)
  test_loader = DataLoader(eval_dataset, sampler = data_sampling(test_indices), **loader_options)
  val_loader = DataLoader(eval_dataset, sampler = data_sampling(val_indices), **loader_options)

  return dataset, train_loader, train_indices, test_loader, test_indices, val_loader, val_indices

In [1]:
def evaluate_model(model, dataloader, data_size, dtype, criterion, data_path, model_name):
  """Evaluate ``model`` on ``dataloader``, print its metrics and return them.

  The model is switched to eval mode and is left in that mode on return.
  Batches are moved to the notebook-level ``device``; for the ``"TEST"`` split
  the class names are read from the notebook-level ``dataset``.

  Args:
    model: Network to evaluate.
    dataloader: Iterable yielding ``(inputs, labels)`` batches.
    data_size: Sized collection whose length is the denominator of the
      average loss (callers in this notebook pass the split's indices).
    dtype: Split name printed in front of the metrics.  ``"TEST"``
      additionally draws the confusion matrix.
    criterion: Loss callable invoked as ``criterion(outputs, labels)``.
    data_path: Unused; kept so existing calls keep working.
    model_name: Unused; kept so existing calls keep working.

  Returns:
    Tuple ``(accuracy, average_loss)`` for every split, including ``"TEST"``.
  """
  _loss, _pred, _true = 0.0, [], []
  model.eval()

  with torch.no_grad():
    for inputs, labels in dataloader:
      inputs = inputs.to(device)
      labels = labels.to(device)

      outputs = model(inputs)
      loss = criterion(outputs, labels)

      # Weight the batch loss by the batch size so the final division yields
      # a per-sample average (presumably criterion averages over the batch).
      _loss += loss.item() * inputs.size(0)
      _, predicted = torch.max(outputs, 1)
      _pred.extend(predicted.cpu().numpy())
      _true.extend(labels.cpu().numpy())

  _loss /= len(data_size)
  _accuracy = accuracy_score(_true, _pred)
  _recall = recall_score(_true, _pred, average='macro')
  _precision = precision_score(_true, _pred, average='macro')
  _fscore = f1_score(_true, _pred, average='macro')

  print('{}: Accuracy: {:.4f} | Loss: {:.4f} | Recall: {:.4f} | Precision: {:.4f} | F-score: {:.4f}'.format(dtype, _accuracy, _loss, _recall, _precision, _fscore))
  print("")

  if dtype == "TEST":
    class_names = dataset.classes
    # Pin the label set so the matrix always has one row/column per class,
    # even when a class never occurs in this split; otherwise its size
    # would not match display_labels.
    cm = confusion_matrix(_true, _pred, labels = np.arange(len(class_names)))
    # ConfusionMatrixDisplay.plot() opens its own figure unless it is given
    # an axes, so the large canvas has to be passed in explicitly.
    fig, ax = plt.subplots(figsize = (20, 20))
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels = class_names)
    disp.plot(ax = ax)
    plt.show()

  return _accuracy, _loss

In [None]:
def train_model(model, criterion, optimizer, model_name, num_epochs):
  """Train ``model`` for ``num_epochs`` epochs, validating after each epoch.

  Batches come from the notebook-level ``train_loader`` and are moved to the
  notebook-level ``device``.  After every epoch the model is validated with
  ``evaluate_model`` on the notebook-level ``val_loader`` / ``val_indices``.
  When all epochs are done, the training curves are handed to
  ``save_metrics`` (defined elsewhere).

  Args:
    model: Network to train.  Models returning ``InceptionOutputs`` are
      supported; only their main ``logits`` are used for loss and accuracy.
    criterion: Loss callable invoked as ``criterion(logits, labels)``.
    optimizer: Optimizer stepping the model's parameters.
    model_name: Name forwarded to ``evaluate_model`` and ``save_metrics``.
    num_epochs: Number of passes over ``train_loader``.

  Returns:
    Tuple ``(losses, accuracies, v_accuracies, v_losses)`` with one entry
    per epoch in each list.
  """
  losses, accuracies, v_accuracies, v_losses = [], [], [], []
  for epoch in range(num_epochs):
    # evaluate_model() leaves the model in eval mode, so training behaviour
    # has to be re-enabled at the start of every epoch.
    model.train()

    # Per-epoch accumulators; resetting them keeps the progress-bar metrics
    # from mixing predictions of earlier epochs into the current one.
    train_loss, correct, seen = 0.0, 0, 0
    true, pred = [], []

    with tqdm(total=len(train_loader), desc=f'Epoch {epoch+1}/{num_epochs}', unit='batch') as pbar:
      for inputs, labels in train_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        logits = outputs.logits if isinstance(outputs, InceptionOutputs) else outputs
        _, preds = torch.max(logits, dim = 1)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()

        batch_size = inputs.size(0)
        train_loss += loss.item() * batch_size
        correct += (preds == labels).sum().item()
        seen += batch_size
        pred.extend(preds.cpu().numpy())
        true.extend(labels.cpu().numpy())

        # Running figures are normalised by the samples processed so far, so
        # they are meaningful from the first batch on.  zero_division=0 only
        # silences the warning for classes not predicted yet.
        pbar.set_postfix({
          'Accuracy': correct / max(seen, 1),
          'Loss': train_loss / max(seen, 1),
          'Precision': precision_score(true, pred, average='macro', zero_division=0),
          'Recall': recall_score(true, pred, average='macro', zero_division=0),
          'F1 Score': f1_score(true, pred, average = 'macro', zero_division=0)})
        pbar.update()

    val_accuracy, val_loss = evaluate_model(model, val_loader, val_indices, 'VALIDATION', criterion, data_path, model_name)

    v_accuracies.append(val_accuracy)
    v_losses.append(val_loss)
    # max(..., 1) keeps an empty loader from raising ZeroDivisionError.
    losses.append(train_loss / max(seen, 1))
    accuracies.append(correct / max(seen, 1))
  print(losses, v_losses)
  save_metrics(losses, accuracies, model_name)
  return losses, accuracies, v_accuracies, v_losses

In [None]:
def plot_TSNE(train_loader, device, model):
  """Embed the model's outputs for ``train_loader`` in 2-D with t-SNE and plot them.

  The model is run in eval mode under ``torch.no_grad()``; its previous
  train/eval mode is restored afterwards.

  Args:
    train_loader: Iterable yielding ``(images, targets)`` batches.
    device: Device the image batches are moved to before the forward pass.
    model: Network whose outputs are used as the features to embed.
  """
  features = []
  labels = []

  # Feature extraction must not update batch-norm statistics or apply
  # dropout, so force eval mode and put the caller's mode back when done.
  was_training = model.training
  model.eval()
  try:
    with torch.no_grad():
      for images, targets in train_loader:
        output = model(images.to(device))
        # Flatten per sample so outputs with extra dimensions still give the
        # 2-D (samples, features) matrix t-SNE expects.
        features.append(output.reshape(output.size(0), -1).cpu().numpy())
        labels.append(targets.cpu().numpy())
  finally:
    model.train(was_training)

  features = np.vstack(features)
  labels = np.concatenate(labels)

  # Newer scikit-learn releases call the iteration count max_iter; older
  # ones call it n_iter.  Pick whichever the installed TSNE accepts.
  tsne_params = inspect.signature(TSNE.__init__).parameters
  iter_keyword = 'max_iter' if 'max_iter' in tsne_params else 'n_iter'
  tsne = TSNE(n_components=2, perplexity = 25, learning_rate = 600, **{iter_keyword: 900})
  tsne_features = tsne.fit_transform(features)

  tsne_df = pd.DataFrame(data=tsne_features, columns=['t-SNE 1', 't-SNE 2'])
  tsne_df['label'] = labels

  # Plot the t-SNE plot with seaborn
  sns.scatterplot(data=tsne_df, x='t-SNE 1', y='t-SNE 2', hue='label', palette='tab10')
  plt.title('t-SNE Plot')
  plt.show()

In [None]:
def plot_within_class_variance(dataset):
  """Bar-plot the number of images per class with the mean and a +/- 1 std band.

  Args:
    dataset: Object exposing ``classes`` (the class names) and ``targets``
      (one class index per image; assumed to be non-negative integers that
      index into ``classes``).
  """
  #Get the class labels and the number of classes
  class_labels = dataset.classes
  num_classes = len(class_labels)

  #Count the images of every class in a single pass over the targets.
  #minlength keeps classes without images at zero; the slice drops any
  #target index that has no entry in class_labels.
  targets = np.asarray(dataset.targets, dtype = np.int64)
  num_images_per_class = np.bincount(targets, minlength = num_classes)[:num_classes]

  #Compute the mean and standard deviation of the images per class
  mean_num_images = np.mean(num_images_per_class)
  std_num_images = np.std(num_images_per_class)

  #Plot the class counts; the shaded band spans one standard deviation on
  #either side of the mean, so it is labelled as such.
  fig, ax = plt.subplots()
  ax.bar(class_labels, num_images_per_class)
  ax.axhline(y=mean_num_images, linestyle='--', color='r', label='Mean')
  ax.axhspan(mean_num_images - std_num_images, mean_num_images + std_num_images,
            alpha=0.2, color='y', label='Mean +/- 1 Std Dev')
  ax.legend()
  ax.tick_params(axis = 'x', rotation = 0)
  ax.set_ylabel('Number of Images')
  ax.set_xlabel('Classes')
  ax.set_title('Within-Class Variance Plot')
  plt.show()

In [None]:
def plot_model_curves(losses, accuracies, v_accuracies, v_losses):
  """Draw the loss and accuracy curves side by side and show the figure.

  Args:
    losses: Training loss values, plotted in order on the left panel.
    accuracies: Training accuracy values, plotted on the right panel.
    v_accuracies: Validation accuracy values, plotted on the right panel.
    v_losses: Validation loss values, plotted on the left panel.
  """
  figure, (loss_axis, accuracy_axis) = plt.subplots(1, 2, figsize=(12, 5))

  # One row per panel: axes, training series, validation series, the two
  # legend labels, the y-axis label and the panel title.
  panels = [
    (loss_axis, losses, v_losses,
     "Training Loss", "Validation Loss", 'Loss', 'Training and Validation Loss Curve'),
    (accuracy_axis, accuracies, v_accuracies,
     "Training Accuracy", "Validation Accuracy", 'Accuracy', 'Training and Validation Accuracy Curve'),
  ]

  for axis, train_series, val_series, train_label, val_label, y_label, title in panels:
    axis.plot(train_series, label = train_label)
    axis.plot(val_series, label = val_label)
    axis.set_xlabel('Epoch')
    axis.set_ylabel(y_label)
    axis.set_title(title)
    axis.legend()

  plt.show()