# Import necessary libraries

In [2]:
%matplotlib inline
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
import os
import PIL
import PIL.Image
import tensorflow_datasets as tfds
import pathlib
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
from sklearn.utils.class_weight import compute_class_weight

# Set the default figure size for matplotlib
plt.rcParams['figure.figsize'] = (7,7) # Make the figures a bit bigger

# Load data and preprocess it

## Upload the data to drive:
Download the dataset from Brigthspace, and upload it to the content folder in Google Colab.


###Option 1: To avoid having to upload the dataset every time, you can upload the dataset to your google drive, and then connect this virtual machine to your drive, and make a copy of your data

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Replace the path /content/drive/MyDrive/Work/2025-CS3002/BrainTumorDataset.zip with your actual path where you copied the dataset in your drive.

In [4]:
!pwd

'pwd' is not recognized as an internal or external command,
operable program or batch file.


In [None]:
!cp /content/drive/MyDrive/Work/2025-CS3002/BrainTumorDataset.zip /content/

###Option 2: Upload the zip archive in the Files section of your virtual machine, under /content/

###After the data is loaded, let's start processing it.

In [None]:
# After you have copied the data locally, point the dataset_url to the local path
dataset_url = '/content/BrainTumorDataset.zip'

In [None]:
import zipfile
import os

# Path to your local zip file
zip_path = dataset_url

# Path where you want to extract
out_path = "/content/"

# Extract
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(out_path)

print("Extraction complete. Files are in:", out_path)


In [None]:
# Create a 'pathlib.Path' object for the downloaded archive
# Pathlib module offers classes representing filesystem paths with semantics
# appropriate for different operating systems.
# data_dir = pathlib.Path(archive).with_suffix('')

extract_dir = pathlib.Path(zip_path).with_suffix('')
# Now use Pathlib for further work
data_dir = pathlib.Path(extract_dir)

print("Data directory is:", data_dir)

In [None]:
# Count the number of images in a specific directory
image_count = len(list(data_dir.glob('./Training/glioma/*.jpg')))
print(image_count)

In [None]:
# BEGIN YOUR CODE HERE

# Brain tumour dataset is split into train, validation and test folders. Inside those folders you will
# find additional folders: glioma, pituitary, notumor. You can explore the folders using
# 'Files' tab from the right hand side.
# Tip: use the len() function

# 1a. How many images do we have in training for positive glaucoma? How many for negative glaucoma?


# 1b. How about in the testing set?


# 1c. How about in the validation set?


# Q. Is the dataset balanced or not?

# END YOUR CODE HERE

In [None]:
# Create a list of file paths for glaucoma images
positive_images = list(data_dir.glob('Training/glioma/*'))
# Open and display the first glaucoma image in the list
PIL.Image.open(str(positive_images[0]))

In [None]:
# BEGIN YOUR CODE HERE

# 2. Display a glioma image from the validation dataset
# Create a list of file paths for glioma images


# Open and display the first glioma image in the list



# END YOUR CODE HERE

In [None]:
# BEGIN YOUR CODE HERE

# 2. Display a normal image from the validation dataset

# Create a list of file paths for the no tumour images


# Open and display the first  image in the list


# END YOUR CODE HERE

# Define a deep learning model that will learn the differences between glioma, pituitary and normal brain images


In [None]:
# Define batch size and image dimensions for training

# BEGIN YOUR CODE HERE
# The batch size is the number of samples processed before the model is updated.
# Choose an appropriate batch size.

# batch_size = TODO


# What is the resolution of the images?
# Specify the size to resize images to after they are read from disk.
# Since the pipeline processes batches of images that must all have the same size, this must be provided.

# img_height = TODO
# img_width = TODO


# END YOUR CODE HERE

In [None]:
train_data_dir  = os.path.join(data_dir,'Training')
valid_data_dir = os.path.join(data_dir,'Validation')
test_data_dir = os.path.join(data_dir,'Testing')

In [None]:
# Create a TensorFlow image dataset from a directory
# BEGIN YOUR CODE HERE
# Use the function tf.keras.utils.image_dataset_from_directory to load
# the training dataset: https://www.tensorflow.org/api_docs/python/tf/keras/utils/image_dataset_from_directory
# 1. First argument is your training directory folder (train_data_dir),
# 2. Do NOT use validation_split since we only want a training set,
# 3. You can set a seed such that when you repeat experiments you get similar results, e.g. seed=123,
# 4. For image size, use the img_height and img_width variables you defined previously,
# 5. For batch size, use the batch_size variable you defined earlier in the code.
# tf.keras.utils.image_dataset_from_directory(
#     directory,
#     seed=123,
#     image_size=(height, width),
#     batch_size=-1,
#     shuffle=False
# )
# Example:



# END YOUR CODE HERE

In [None]:
# Create a layer to normalise pixel values to the [0, 1] range.
# By default, when you load an image, each pixel value will have a value between 0-255
# but, in neural networks, we need as input normalised values in [0,1] interval.
normalization_layer = tf.keras.layers.Rescaling(1./255)

In [None]:
normalized_ds = train_ds.map(lambda x, y: (normalization_layer(x), y))
image_batch, labels_batch = next(iter(normalized_ds))
first_image = image_batch[0]
# Notice the pixel values are now in `[0,1]`.
print(np.min(first_image), np.max(first_image), first_image[0][0])

In [None]:
# BEGIN YOUR CODE HERE
# Define the number of classes in the classification problem
# How many classes do we have in this dataset?
# num_classes = TODO


# END YOUR CODE HERE

In [None]:
# BEGIN YOUR CODE HERE
# Define a tensorflow model using the tf.keras.Sequential class: https://www.tensorflow.org/api_docs/python/tf/keras/Sequential
# The last layer should be a Dense layer with the number of output neurons num_classes
# Use as a starting point the tf.keras.Sequential model defined for the MNIST problem.
# See Lab-DeepLearning-ImageClassification.
# For the first Conv2D layer, you are not required to specify the input shape. If that
# parameter is not given, tensorflow library will infer the size of the input when
# you fit the model, so it will depend on the size of the dataset.
# Important: change the output of the last Dense layer to match the number of classes for this problem.
# If you don't use any of the Dropout layers what accuracy do you get?
# What accuracy do you get with the Dropout layers?





# END YOUR CODE HERE


model.compile(
  optimizer='adam',
  loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
  metrics=['accuracy']) # Monitor accuracy and F1 score during training



In [None]:
# Extract training labels
y_train = np.concatenate([y for x, y in train_ds], axis=0)

# Compute class weights
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train),
    y=y_train
)

class_weights = dict(enumerate(class_weights))
print("Class Weights:", class_weights)

# TIP: Search how you can use the class weights during training to improve your results

In [None]:
# BEGIN YOUR CODE HERE
# Train the model on the provided dataset for a specified number of epochs
# Modify the network architecture such that you maximise the accuracy.
# Tip: aim to get an accuracy of at least 70% on the training set.
# For this, you can use the function fit, as in model.fit(...)
# The first argument is the train_ds variable defined above.
# This variable contains both the x (data - glaucoma and normal images) and y
# (labels - glaucoma vs normal).
# Start training using 5 epochs. What is the accuracy you get?
# How about if you increase the number of epochs?
# Use as validation_data the val_ds variable you defined previously




# END YOUR CODE HERE

In [None]:
from scipy.ndimage import gaussian_filter1d  # for smoothing

# function to plot loss/accuracy with trend lines
def plot_loss_accuracy(history, smooth_sigma=2):
    # Extract loss and accuracy values
    loss_values = history.history['loss']
    acc_values = history.history['accuracy']

    # Validation (if available)
    val_loss = history.history.get('val_loss')
    val_acc = history.history.get('val_accuracy')

    epochs = np.arange(1, len(loss_values) + 1)

    # --- Plot Training & Validation Loss ---
    plt.figure(figsize=(14, 5))

    plt.subplot(1, 2, 1)
    plt.plot(epochs, loss_values, marker='o', linestyle='-', color='b', alpha=0.5, label='Training Loss')
    plt.plot(epochs, gaussian_filter1d(loss_values, sigma=smooth_sigma), color='b', linewidth=2, label='Trend (Train)')

    if val_loss is not None:
        plt.plot(epochs, val_loss, marker='x', linestyle='--', color='r', alpha=0.5, label='Validation Loss')
        plt.plot(epochs, gaussian_filter1d(val_loss, sigma=smooth_sigma), color='r', linewidth=2, label='Trend (Val)')

    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.title('Loss Over Epochs')
    plt.grid(True)
    plt.legend()

    # --- Plot Training & Validation Accuracy ---
    plt.subplot(1, 2, 2)
    plt.plot(epochs, acc_values, marker='o', linestyle='-', color='g', alpha=0.5, label='Training Accuracy')
    plt.plot(epochs, gaussian_filter1d(acc_values, sigma=smooth_sigma), color='g', linewidth=2, label='Trend (Train)')

    if val_acc is not None:
        plt.plot(epochs, val_acc, marker='x', linestyle='--', color='orange', alpha=0.5, label='Validation Accuracy')
        plt.plot(epochs, gaussian_filter1d(val_acc, sigma=smooth_sigma), color='orange', linewidth=2, label='Trend (Val)')

    plt.xlabel('Epochs')
    plt.ylabel('Accuracy')
    plt.title('Accuracy Over Epochs')
    plt.grid(True)
    plt.legend()

    plt.tight_layout()
    plt.show()

In [None]:
plot_loss_accuracy(history)

In [None]:
# Let's load the testing dataset
test_ds = tf.keras.utils.image_dataset_from_directory(
  test_data_dir,
  image_size=(img_height, img_width),
  batch_size=batch_size,
  shuffle=False   # ðŸ”‘ Ensure labels line up with predictions
)

In [None]:
# BEGIN YOUR CODE HERE
# What is the loss and accuracy on the Testing dataset?
# Compute and plot the confusion matrix on the test dataset
# Tip: instead of (x_test, y_test) we used in the lab last week, you can use
# directly test_ds which contains both data and labels
# https://www.tensorflow.org/api_docs/python/tf/keras/Model#evaluate
# When you print the output of the evaluate function is run, it will return both
# the loss and accuracy, maybe in a  format like [loss_value, accuracy_value]
# print("Model accuracy on the test set is:", model.evaluate(test_ds))
# Display the confusion matrix, precision, recall and f1-score




# END YOUR CODE HERE

In [None]:
# Try to improve the model such that it performs well on both training and testing datasets.