<a href="https://colab.research.google.com/github/sofia4009/Oral-disease/blob/main/Train_GoogleDS_Test_KaggleDS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Author: Sofia
Date: March 26
Subject: Training and getting experience with deep learning
Classification of the oral-diseases dataset from Kaggle (dataset A): https://www.kaggle.com/datasets/salmansajid05/oral-diseases
  - use stratified k-fold cross-validation (with k = 5) to split dataset A;
  - use the deep neural networks listed below for training, testing and comparing their performance;
  - do not use any data augmentation for now; just normalise all image pixel values to the range [0, 1];
  - resize all images to 112x112x3 to speed up training;
  - use categorical cross-entropy as the loss function and the F1 score as the evaluation metric;
  - use models pretrained on ImageNet:
    - ResNet18, ResNet50, ConvNeXt, EfficientNetB0, Transformers (i.e., ViT)

# Data Preparation

In [1]:
import pandas as pd
import numpy as np
import os
from PIL import Image
import cv2

import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt
from pathlib import Path
from openpyxl import load_workbook
from openpyxl.drawing.image import Image as xlImage
from openpyxl import Workbook
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.models import load_model
import h5py
import time


# Dataset Kaggle

In [2]:
# Initializations
# Target size used both when resizing images on disk and in transforms.Resize at load time
# (square, so the width/height order does not matter)
image_size = (112,112)

# Root directory the Kaggle archive is extracted into (raw images)
main_DSA_directory = '/content/oral-diseases'

# Output directory that receives the resized copies, mirroring the raw sub-folder layout
Preprocessed_DSA_directory = '/content/decreased_oral_diseases'

# NOTE: despite the "_directory" suffix, both values below are full paths to .xlsx FILES
# on the mounted Google Drive (per-fold results table and the workbook holding the plots)
results_directory = '/content/drive/My Drive/QM/results.xlsx'
Plot_directory = '/content/drive/My Drive/QM/plots.xlsx'

In [3]:
# Install the kaggle API client (Colab shell command; version is not pinned)
!pip install -q kaggle

# Mount Google Drive so results/plots/checkpoints can be written under /content/drive
from google.colab import drive
drive.mount('/content/drive')

from google.colab import files

files.upload() # Opens an upload prompt; the user is expected to upload kaggle.json (API credentials)

# The kaggle API client expects the credentials file to be in ~/.kaggle,
# so create that folder and copy the uploaded file into it
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/

# Restrict the credentials file to owner read/write.
# NOTE: this uses the absolute /root path while the copy above used ~ (the two coincide when running as root)
!chmod 600 /root/.kaggle/kaggle.json

# Print the working directory; the archive below is downloaded into it
!pwd

# Download the required dataset from kaggle (produces oral-diseases.zip in the working directory)
!kaggle datasets download -d salmansajid05/oral-diseases

# Extract the downloaded archive into main_DSA_directory (defined in the configuration cell)
import zipfile

with zipfile.ZipFile('oral-diseases.zip', "r") as z:
    z.extractall(main_DSA_directory)

Mounted at /content/drive


Saving kaggle.json to kaggle.json
/content
Dataset URL: https://www.kaggle.com/datasets/salmansajid05/oral-diseases
License(s): unknown
Downloading oral-diseases.zip to /content
 96% 226M/235M [00:02<00:00, 75.7MB/s]
100% 235M/235M [00:02<00:00, 98.7MB/s]


In [4]:
# Resize an image to the target size (pixel scaling to [0, 1] has to happen at load time)
def normalize_and_resize_image(image, target_size):
    """Return a copy of ``image`` resized to ``target_size``.

    The earlier version divided every pixel by 255, multiplied by 255 again and
    cast back to ``uint8`` before returning a PIL image. That round trip cannot
    store values in [0, 1] (the result is an 8-bit image that is subsequently
    saved as JPEG); at best it is a no-op, and because ``astype(np.uint8)``
    truncates, any floating-point rounding error below the integer would lower
    a pixel by one level. It has therefore been removed.

    Scaling to [0, 1] must be done when the saved file is loaded, e.g. by
    ``transforms.ToTensor()`` in the loading pipeline.

    Args:
        image: A PIL image (the callers in this notebook pass an RGB image).
        target_size: Size tuple forwarded to ``image.resize``.

    Returns:
        The resized image, as returned by ``image.resize``.
    """
    return image.resize(target_size)

# Saving the resized images into a new directory in Google colab with the same subdirectories and structure
def process_images_in_directory(
    directory,
    Preprocessed_DS_directory,
    target_size=None,
    excluded_substrings=(
        'augmented',
        'Caries_Gingivitus_ToothDiscoloration_Ulcer-yolo_annotated-Dataset',
    ),
):
    """Resize every accepted image under ``directory`` into a mirrored output tree.

    Args:
        directory: Root folder that is walked recursively.
        Preprocessed_DS_directory: Root of the output tree; each image is written
            to the same relative sub-folder it was found in.
        target_size: Size passed to ``normalize_and_resize_image``. When ``None``
            the notebook-level ``image_size`` is used (the previous, implicit
            behaviour).
        excluded_substrings: An image is skipped when any of these strings occurs
            anywhere in its full path. The default reproduces the previously
            hard-coded filter.

    Returns:
        None. Images are written to disk as a side effect.
    """
    if target_size is None:
        # Fall back to the global defined in the configuration cell
        target_size = image_size

    image_extensions = ('.png', '.jpg', '.jpeg', '.gif', '.bmp')

    for root, _dirs, files in os.walk(directory):
        for file in files:
            # Only files with an image extension are considered
            if not file.lower().endswith(image_extensions):
                continue

            image_path = os.path.join(root, file)
            if any(marker in image_path for marker in excluded_substrings):
                continue

            # The context manager closes the underlying file handle; convert()
            # returns an independent in-memory image, so it stays usable afterwards.
            with Image.open(image_path) as source_image:
                rgb_image = source_image.convert("RGB")

            processed_image = normalize_and_resize_image(rgb_image, target_size)

            # Mirror the input sub-folder below the output root
            output_subdirectory = os.path.relpath(root, directory)
            output_path = os.path.join(Preprocessed_DS_directory, output_subdirectory)
            os.makedirs(output_path, exist_ok=True)

            # NOTE: the original file name (including its extension) is kept even
            # though the content is always encoded as JPEG.
            filename = os.path.join(output_path, file)
            processed_image.save(filename, format='JPEG')

In [5]:
# Preprocess dataset A: walk main_DSA_directory and write the resized copies
# (same sub-folder layout) below Preprocessed_DSA_directory
process_images_in_directory(main_DSA_directory, Preprocessed_DSA_directory)

In [6]:
# Count every file (of any type, not only images) found recursively under the raw dataset root
file_count = sum(len(files) for _, _, files in os.walk(main_DSA_directory))
print(f"Number of Original Images: {file_count}")

# Same recursive count for the preprocessed tree; it is smaller because preprocessing
# skips non-image files and paths matching its exclusion filter
file_count = sum(len(files) for _, _, files in os.walk(Preprocessed_DSA_directory))
print(f"Number of Preprocessed Images: {file_count}")

Number of Original Images: 15373
Number of Preprocessed Images: 5563


In [7]:
# Loading pipeline applied to every image when it is read from disk.
# The images were already resized during preprocessing, so Resize is a safeguard here.
transform = transforms.Compose([
    transforms.Resize(image_size),
    # Convert a PIL Image or ndarray to a tensor (per the torchvision docs this also
    # scales 8-bit pixel values to floats in [0, 1])
    transforms.ToTensor()
])

# ImageFolder derives one class per sub-folder of the root directory
dataset1 = torchvision.datasets.ImageFolder(root = Preprocessed_DSA_directory, transform=transform)

# Number of classes and list of class names in dataset A
num_classes1 = len(dataset1.classes)
classes1 = dataset1.classes

# Number of folds for cross-validation
k_folds = 5

# Fixed random_state so the shuffled stratified split is reproducible between runs
skf_ds1 = StratifiedKFold(n_splits=k_folds, shuffle=True, random_state=42)

# Hyperparameter grid (currently a single combination)
learning_rates = [0.0001] # only one learning rate is searched
batch_sizes = [32] # 128 was tried/considered earlier
optimizers = [ 'Adam'] # 'SGD' and 'RMSprop' are also handled by the training cell

# NOTE(review): the ResNet50 training cell in this notebook sets and uses its own
# `num_epochs`; confirm whether `epochs` is still read anywhere
epochs = 25

# Placeholders for model selection.
# NOTE(review): `best_f1` is reset inside the training cell; `best_model` and
# `best_hyperparameters` are not assigned in the cells visible here
best_model = None
best_f1 = 0.0
best_hyperparameters = None

# Path in Google Drive for the saved model weights (reassigned by later cells)
checkpoint_path = '/content/drive/My Drive/QM/model.h5'


# Dataset Attached

In [8]:
# Initializations (dataset B); image_size, results_directory and Plot_directory repeat
# the values already set for dataset A
image_size = (112,112)

# Root directory the dataset B archive is extracted into (previous value kept in the trailing comment)
main_DSB_directory = '/content/New_Dataset' #'/content/oral-diseases'

# Output ROOT for the preprocessed images: each image lands in
# '/content/<path relative to main_DSB_directory>'
Preprocessed_DSB_directory = '/content' #'/content/decreased_oral_diseases'

# NOTE: despite the "_directory" suffix, both values below are full paths to .xlsx FILES
results_directory = '/content/drive/My Drive/QM/results.xlsx'
Plot_directory = '/content/drive/My Drive/QM/plots.xlsx'

In [9]:
import shutil
import zipfile
from google.colab import drive

# Mount Google Drive (Colab reports "already mounted" if an earlier cell did this)
drive.mount('/content/drive/')

# Extract dataset B into main_DSB_directory, skipping the macOS resource-fork
# entries ("__MACOSX/...") that Finder adds to zip archives
with zipfile.ZipFile('/content/drive/MyDrive/QM/New_Dataset.zip') as zipObj:
  members = [file for file in zipObj.namelist() if "__MACOSX" not in file]
  zipObj.extractall(main_DSB_directory, members=members)

# Remove a __MACOSX directory left over from an earlier, unfiltered extraction.
# shutil.rmtree is used because os.rmdir raises OSError on a non-empty directory.
macosx_folder = os.path.join(main_DSB_directory, "__MACOSX")
if os.path.isdir(macosx_folder):
    shutil.rmtree(macosx_folder)


Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [10]:
# Resize an image to the target size (pixel scaling to [0, 1] has to happen at load time)
def normalize_and_resize_image(image, target_size):
    """Return a copy of ``image`` resized to ``target_size``.

    The earlier version divided every pixel by 255, multiplied by 255 again and
    cast back to ``uint8`` before returning a PIL image. That round trip cannot
    store values in [0, 1] (the result is an 8-bit image that is subsequently
    saved as JPEG); at best it is a no-op, and because ``astype(np.uint8)``
    truncates, any floating-point rounding error below the integer would lower
    a pixel by one level. It has therefore been removed.

    Scaling to [0, 1] must be done when the saved file is loaded, e.g. by
    ``transforms.ToTensor()`` in the loading pipeline.

    Args:
        image: A PIL image (the callers in this notebook pass an RGB image).
        target_size: Size tuple forwarded to ``image.resize``.

    Returns:
        The resized image, as returned by ``image.resize``.
    """
    return image.resize(target_size)

# Saving the resized images into a new directory in Google colab with the same subdirectories and structure
def process_images_in_directory(
    directory,
    Preprocessed_DS_directory,
    target_size=None,
    excluded_substrings=(
        'Decay',
        'Caries_Gingivitus_ToothDiscoloration_Ulcer-yolo_annotated-Dataset',
    ),
):
    """Resize every accepted image under ``directory`` into a mirrored output tree.

    This redefinition replaces the earlier function of the same name; its default
    filter skips paths containing 'Decay' instead of 'augmented'.

    Args:
        directory: Root folder that is walked recursively.
        Preprocessed_DS_directory: Root of the output tree; each image is written
            to the same relative sub-folder it was found in.
        target_size: Size passed to ``normalize_and_resize_image``. When ``None``
            the notebook-level ``image_size`` is used (the previous, implicit
            behaviour).
        excluded_substrings: An image is skipped when any of these strings occurs
            anywhere in its full path. The default reproduces the previously
            hard-coded filter.

    Returns:
        None. Images are written to disk as a side effect.
    """
    if target_size is None:
        # Fall back to the global defined in the configuration cell
        target_size = image_size

    image_extensions = ('.png', '.jpg', '.jpeg', '.gif', '.bmp')

    for root, _dirs, files in os.walk(directory):
        for file in files:
            # Only files with an image extension are considered
            if not file.lower().endswith(image_extensions):
                continue

            image_path = os.path.join(root, file)
            if any(marker in image_path for marker in excluded_substrings):
                continue

            # The context manager closes the underlying file handle; convert()
            # returns an independent in-memory image, so it stays usable afterwards.
            with Image.open(image_path) as source_image:
                rgb_image = source_image.convert("RGB")

            processed_image = normalize_and_resize_image(rgb_image, target_size)

            # Mirror the input sub-folder below the output root
            output_subdirectory = os.path.relpath(root, directory)
            output_path = os.path.join(Preprocessed_DS_directory, output_subdirectory)
            os.makedirs(output_path, exist_ok=True)

            # NOTE: the original file name (including its extension) is kept even
            # though the content is always encoded as JPEG.
            filename = os.path.join(output_path, file)
            processed_image.save(filename, format='JPEG')

In [11]:
# Preprocess dataset B: walk main_DSB_directory and write the resized copies below
# Preprocessed_DSB_directory, keeping each image's path relative to the input root
process_images_in_directory(main_DSB_directory, Preprocessed_DSB_directory)

In [12]:
# Point at the folder that actually holds the preprocessed images.
# NOTE(review): presumably the archive's top-level folder is named 'DataSet B', so the
# preprocessing step (output root '/content') created this path — confirm against the zip
Preprocessed_DSB_directory = '/content/DataSet B'

# Count every file (of any type, not only images) found recursively under the raw dataset root
file_count = sum(len(files) for _, _, files in os.walk(main_DSB_directory))
print(f"Number of Original Images: {file_count}")

# Same recursive count for the preprocessed tree
file_count = sum(len(files) for _, _, files in os.walk(Preprocessed_DSB_directory))
print(f"Number of Preprocessed Images: {file_count}")

Number of Original Images: 539
Number of Preprocessed Images: 532


In [13]:
# Loading pipeline applied to every image when it is read from disk
# (re-creates the same `transform` object as the dataset A cell)
transform = transforms.Compose([
    transforms.Resize(image_size),
    # Convert a PIL Image or ndarray to a tensor (per the torchvision docs this also
    # scales 8-bit pixel values to floats in [0, 1])
    transforms.ToTensor()
])

# Folder holding the preprocessed dataset B images (repeated from the counting cell)
Preprocessed_DSB_directory = '/content/DataSet B'

# ImageFolder derives one class per sub-folder of the root directory
dataset2 = torchvision.datasets.ImageFolder(root = Preprocessed_DSB_directory, transform=transform)

# Number of classes and list of class names in dataset B
num_classes2 = len(dataset2.classes)
classes2 = dataset2.classes

# Number of folds for cross-validation
k_folds = 5

# Fixed random_state so the shuffled stratified split is reproducible between runs
skf_ds2 = StratifiedKFold(n_splits=k_folds, shuffle=True, random_state=42)

# Hyperparameter grid (currently a single combination); overwrites the values set for dataset A
learning_rates = [0.0001] # only one learning rate is searched
batch_sizes = [32] # 128 was tried/considered earlier
optimizers = [ 'Adam'] # 'SGD' and 'RMSprop' are also handled by the training cell

# NOTE(review): the ResNet50 training cell in this notebook sets and uses its own
# `num_epochs`; confirm whether `epochs` is still read anywhere
epochs = 20

# Placeholders for model selection.
# NOTE(review): `best_f1` is reset inside the training cell; `best_model` and
# `best_hyperparameters` are not assigned in the cells visible here
best_model = None
best_f1 = 0.0
best_hyperparameters = None

# Path in Google Drive for the saved model weights (reassigned again by the training cell)
checkpoint_path = '/content/drive/My Drive/QM/combined_model_fold1.h5'


In [14]:
# Number of classes and list of class names, taken from dataset1 (dataset A).
# NOTE(review): the ResNet50 training cell sizes its output layer with `num_classes1`,
# not `num_classes` — confirm which cells still read these two names
num_classes = len(dataset1.classes)
classes = dataset1.classes

# Training on the dataset using models pre-trained on ImageNet

# ResNet50

In [15]:
from torchvision.models import resnet50

In [None]:
# Save the results as a DataFrame to be saved in an Excel file
def Save_Results_to_Drive(results, results_directory):
    """Append ``results`` to the Excel file at ``results_directory`` and return all rows.

    Args:
        results: Records accepted by ``pd.DataFrame`` (the notebook passes a list
            of dicts, one per row).
        results_directory: Full path of the ``.xlsx`` file (a file path, despite
            the parameter name).

    Returns:
        The DataFrame that was written: the rows already in the file (if it
        existed) followed by the new rows, with a fresh 0..n-1 index.
    """
    df = pd.DataFrame(results)

    # Prepend whatever is already stored so earlier rows are preserved
    if os.path.isfile(results_directory):
        existing_file = pd.read_excel(results_directory)
        df = pd.concat([existing_file, df], ignore_index=True)

    # Make sure the target folder exists before writing
    parent_dir = os.path.dirname(results_directory)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # to_excel overwrites the target itself. The old file is deliberately NOT
    # deleted first: if the write failed after a delete, all earlier results
    # would be lost.
    df.to_excel(results_directory, index=False)

    return df

In [None]:
# Train ResNet50 on folds of dataset2 and validate on folds of dataset1,
# recording the best macro-F1 reached in each fold.
checkpoint_path = '/content/drive/My Drive/QM/combined_model_fold2.h5'

# Rows destined for the results Excel file
results = []
f1_vals = [0] * k_folds
f1_vals1 = [0] * k_folds
f1_vals2 = [0] * k_folds
results.append({'Model': 'ResNet50'})

# Index of the first entry of `results` not yet written to the Excel file.
# Save_Results_to_Drive APPENDS to the existing file, so only the new rows may be
# passed on each call; passing the whole list again would duplicate earlier rows.
saved_upto = 0

# Per-epoch training/validation losses and accuracies (accumulated across all folds)
train_losses = []
val_losses = []
train_accuracies = []
val_accuracies = []

val_losses1 = []
val_accuracies1 = []

val_losses2 = []
val_accuracies2 = []

num_epochs = 20
counter = 0

# Device (GPU if available, otherwise CPU); it does not change between folds
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Each ImageFolder builds its own class-name -> index mapping. Training on one
# dataset and validating on the other is only meaningful when both mappings agree.
if dataset1.class_to_idx != dataset2.class_to_idx:
    print("WARNING: dataset1 and dataset2 do not share the same class-to-index mapping; "
          "validation labels will not line up with the labels used for training.")

for lr in learning_rates:
    for batch_size in batch_sizes:
        for optimizer_name in optimizers:
            print(f"Training with optimizer: {optimizer_name}, learning rate: {lr}, batch size: {batch_size}")
            # Pair fold i of dataset1 with fold i of dataset2
            for fold, ((train1_idx, val1_idx), (train2_idx, val2_idx)) in enumerate(zip(skf_ds1.split(dataset1.imgs, dataset1.targets), skf_ds2.split(dataset2.imgs, dataset2.targets))):

                # Train on the training part of dataset2, validate on the held-out part of dataset1
                train_sampler = torch.utils.data.SubsetRandomSampler(train2_idx)
                val_sampler = torch.utils.data.SubsetRandomSampler(val1_idx)

                train_loader = torch.utils.data.DataLoader(dataset2, batch_size=batch_size, sampler=train_sampler)
                val_loader = torch.utils.data.DataLoader(dataset1, batch_size=batch_size, sampler=val_sampler)

                # Fresh ImageNet-pretrained network per fold, with a new classification head
                model = resnet50(weights='ResNet50_Weights.DEFAULT')
                num_ftrs = model.fc.in_features
                model.fc = nn.Linear(num_ftrs, num_classes1)
                model = model.to(device)

                # Loss function
                criterion = nn.CrossEntropyLoss()

                best_f1 = 0

                # Optimizer; an unknown name is rejected instead of silently reusing
                # the optimizer (and parameters) of a previous fold
                if optimizer_name == 'Adam':
                    optimizer = optim.Adam(model.parameters(), lr=lr)
                elif optimizer_name == 'SGD':
                    optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9)
                elif optimizer_name == 'RMSprop':
                    optimizer = optim.RMSprop(model.parameters(), lr=lr)
                else:
                    raise ValueError(f"Unsupported optimizer: {optimizer_name}")

                # Train and evaluate
                for epoch in range(num_epochs):
                    # ---- Training loop ----
                    start_time = time.time()
                    model.train()
                    running_loss = 0.0
                    correct = 0
                    total = 0

                    for inputs, labels in train_loader:
                        inputs, labels = inputs.to(device), labels.to(device)  # Move inputs and labels to the device
                        optimizer.zero_grad()
                        outputs = model(inputs)
                        loss = criterion(outputs, labels)
                        loss.backward()
                        optimizer.step()
                        running_loss += loss.item()
                        _, preds = torch.max(outputs, 1)
                        total += labels.size(0)
                        correct += (preds == labels).sum().item()
                    # Mean of the per-batch losses, and sample-level accuracy
                    epoch_train_loss = running_loss / len(train_loader)
                    epoch_train_accuracy = correct / total
                    train_losses.append(epoch_train_loss)
                    train_accuracies.append(epoch_train_accuracy)

                    # ---- Validation loop ----
                    model.eval()
                    all_preds = []
                    all_labels = []
                    running_loss = 0.0
                    correct = 0
                    total = 0

                    with torch.no_grad():
                        for inputs, labels in val_loader:
                            inputs, labels = inputs.to(device), labels.to(device)  # Move inputs and labels to the device
                            outputs = model(inputs)
                            loss = criterion(outputs, labels)
                            running_loss += loss.item()
                            _, preds = torch.max(outputs, 1)
                            total += labels.size(0)
                            correct += (preds == labels).sum().item()
                            all_preds.extend(preds.cpu().numpy())
                            all_labels.extend(labels.cpu().numpy())

                    # Wall-clock time of this epoch (training + validation)
                    end_time = time.time()
                    duration = end_time - start_time

                    epoch_val_loss = running_loss / len(val_loader)
                    epoch_val_accuracy = correct / total
                    val_losses.append(epoch_val_loss)
                    val_accuracies.append(epoch_val_accuracy)

                    # Macro-averaged F1 over the validation predictions of this epoch
                    f1 = f1_score(all_labels, all_preds, average='macro')
                    print(f"Fold {fold+1}, Epoch {epoch+1}, F1 Score: {f1}, Duration: {duration} s")

                    # Keep the best F1 seen in this fold
                    if f1 > best_f1:
                        best_f1 = f1

                print('end of epoch: ', str(epoch+1))
                f1_vals[fold] = best_f1

                # NOTE: 'Duration' holds the duration of the LAST epoch of the fold
                results.append({
                    'Optimizer': optimizer_name,
                    'batch_size': batch_size,
                    'learning_rate': lr,
                    'Fold': fold+1,
                    'F1 Score': best_f1,
                    'Duration': duration,
                })

                # Persist only the rows added since the previous save
                df = Save_Results_to_Drive(results[saved_upto:], results_directory)
                saved_upto = len(results)

                # Save the weights of the model as it is after the final epoch of this fold.
                # The same path is reused, so each fold overwrites the previous checkpoint.
                torch.save(model.state_dict(), checkpoint_path)

            # Summary over all folds for this hyperparameter combination
            results.append({
                'min f1': min(f1_vals),
                'max f1': max(f1_vals),
                'average f1': sum(f1_vals)/len(f1_vals)
            })
            df = Save_Results_to_Drive(results[saved_upto:], results_directory)
            saved_upto = len(results)

# -------------------- Plot the error/epoch plot ------------------------

# Path of the Excel workbook that receives the plot image
excel_filename = Plot_directory

# Create the containing directory if it doesn't exist
os.makedirs(os.path.dirname(excel_filename), exist_ok=True)

# Reuse the workbook when it already exists, otherwise start a new one
if os.path.exists(excel_filename):
    wb = load_workbook(excel_filename)
else:
    wb = Workbook()
ws = wb.active

# Collect the per-epoch history in its own table. The lists have one entry per
# trained epoch (across all folds), which does not match the number of rows of the
# results DataFrame `df`, so they must not be assigned as columns of `df`.
# Wrapping each list in a Series pads with NaN if a run was interrupted mid-epoch.
history_df = pd.DataFrame({
    'Train Loss': pd.Series(train_losses, dtype=float),
    'Val Loss': pd.Series(val_losses, dtype=float),
    'Train Accuracy': pd.Series(train_accuracies, dtype=float),
    'Val Accuracy': pd.Series(val_accuracies, dtype=float),
})

# x axis: 1..number of recorded epochs (previously the list itself was used as the
# count, which made range(1, num_epochs + 1) raise a TypeError)
total_epochs = len(history_df)
epoch_axis = range(1, total_epochs + 1)

# Plotting the error/epoch plot
fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(10, 5))

loss_ax.plot(epoch_axis, history_df['Train Loss'], label='Training Loss')
loss_ax.plot(epoch_axis, history_df['Val Loss'], label='Validation Loss')
loss_ax.set_xlabel('Epoch')
loss_ax.set_ylabel('Loss')
loss_ax.set_title('Training and Validation Losses ResNet50')
loss_ax.legend()

acc_ax.plot(epoch_axis, history_df['Train Accuracy'], label='Training Accuracy')
acc_ax.plot(epoch_axis, history_df['Val Accuracy'], label='Validation Accuracy')
acc_ax.set_xlabel('Epoch')
acc_ax.set_ylabel('Accuracy')
acc_ax.set_title('Training and Validation Accuracies ResNet50')
acc_ax.legend()

fig.tight_layout()

# Save the plot as an image (PNG)
plot_image_path = '/content/plot_ResNet50.png'
fig.savefig(plot_image_path)

# Insert the image into the active worksheet, anchored at cell A1
img = xlImage(plot_image_path)
ws.add_image(img, 'A1')

# Save the Excel file
wb.save(excel_filename)
# ----------------- End of Plot the error/epoch plot ---------------------

Training with optimizer: Adam, learning rate: 0.0001, batch size: 32
Fold 1, Epoch 1, F1 Score: 0.07320847160121109, Duration: 160.71682357788086 s


In [18]:
# Sanity check for the most recently executed fold: report how many indices were drawn
# from each dataset (the two print statements used to be duplicated by copy-paste)
print(f"Fold {fold + 1} - Train2 indices: {len(train2_idx)}, Val1 indices: {len(val1_idx)}")
print(f"Dataset1 size: {len(dataset1)}, Dataset2 size: {len(dataset2)}")

# Training indices address dataset2 and validation indices address dataset1,
# so each set is checked against the size of its own dataset
if max(train2_idx) >= len(dataset2) or max(val1_idx) >= len(dataset1):
  raise IndexError("Index out of range in the dataset")


Fold 1 - Train2 indices: 425, Val1 indices: 1113
Dataset1 size: 5563, Dataset2 size: 532
Fold 1 - Train2 indices: 425, Val1 indices: 1113
Dataset1 size: 5563, Dataset2 size: 532
