<a href="https://colab.research.google.com/github/sebastiantonn/phd/blob/main/chapter3/04_deploy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive in the Colab runtime so that the model checkpoint and the
# image folders under /content/drive can be reached by the cells below.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os
import re
import numpy as np
import pandas as pd
from PIL import Image
import cv2 as cv
import itertools
import tensorflow.keras as keras
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications import vgg16
from tensorflow.keras.applications.vgg16 import preprocess_input
from tensorflow.keras.applications.vgg16 import decode_predictions
import zipfile


In [None]:
# Disable PIL's built-in decompression-bomb (DOS) protection to accommodate the large input image size.
# Only do this for images from a trusted source.
Image.MAX_IMAGE_PIXELS = None

# Folder holding the saved model and the training checkpoints.
# Defined once so that the model file and the weights file always come from the same run.
checkpoint_dir = '/content/drive/MyDrive/PhD/TrypanBlue/240225-03_trypan-blue_VGG16_adamax_AP60-checkpoints'

# Load the trained neural network, then overwrite its weights with those of the selected checkpoint
TrypanBlueModel = keras.models.load_model(os.path.join(checkpoint_dir, 'TrypanBlueModel.keras'))
TrypanBlueModel.load_weights(os.path.join(checkpoint_dir, 'weights_19-0.08.hdf5'))

In [None]:
# Edge length (in pixels) of the square tiles that each leaf disc image is split into.
# According to the original notes, the model was trained on 400 x 400 px tiles that were rescaled to 224 x 224.
# Every tile is likewise rescaled to 224 x 224 here before prediction, so the tile size may differ from 400.
tile_size = 480 # currently 480 for the 2023 image set; 500 was also considered (TODO: confirm the final value)

# Grayscale threshold used to separate the leaf disc from the background.
# The largest object found after thresholding defines the bounding box whose center is the ROI center.
# Value used so far: 200
thresh_value = 200

# Radius (in pixels) of the circular region of interest (ROI), measured from the center of the disc's bounding box.
# A tile is analyzed only if its corner farthest from that center lies within this radius.
# Values noted for this project: 5800 for the old images; 6800 (or 7300) for the leaf disc images taken in 2023.
radius = 6800 # 2023 image set; use 5800 for the old images

In [None]:
# DEFINE THE INPUT FOLDER that contains the images of leaf discs

# IMPORTANT NOTE: The images must be saved with a .jpg (or .jpeg) extension, and the file names must
# consist of underscore-separated fields in exactly this order, because the analysis cell reads the
# fields by position:

#      "genotype_batch_pathogen_Xdpi_Y_?????.jpg"

#   where
#       'genotype' = The host variety name.
#                    If the variety name normally contains a space, replace the space with a dash ("-").

#       'batch' = the batch identifier (stored unchanged).

#       'pathogen' = pathogen information; spaces should be replaced by "-".

#        X = an integer representing the days after inoculation, follow this integer with 'dpi' - no spaces.

#        Y = an integer representing the replicate number.
#            If there are no replicates, the file name must still contain a "1" here.

#        _????? = Any further identifying information about the file, separated from Y by an
#                 underscore ("_") or a dash ("-"). Otherwise, special characters are not allowed.

# EXAMPLE (2023 data set): "bedford_a_24_2dpi_1_Stitching.jpg"

# NOTE: The older naming scheme "genotype_Xdpi_pathogen_Y-?????.jpg"
#       (e.g. "grand-rapids_3dpi_Bl33_1-stitch.jpg") has no batch field and a different field order,
#       so it does NOT match the positions that the analysis cell reads; rename such files first.

# The file name format (especially the order and the underscores) is important because the script
#     automatically deposits that information into a dictionary for later analysis.

# Indicate where the subdirectories are located.
base_dir = '/content/drive/MyDrive/PhD/TrypanBlue/240226_model-deploy/'

# Indicate the subdirectory that contains images of whole leaf discs.
# Derived from 'base_dir' so that the two locations cannot drift apart.
disc_img_dir = os.path.join(base_dir, 'trypanblue_2023set_JPG')


In [None]:
# Define, and if necessary create, output folders.
# os.path.join is used instead of string concatenation so that the paths are correct whether or not
# 'base_dir' ends with a path separator; os.makedirs(exist_ok=True) also creates missing parent folders
# and does nothing if the folder already exists.

# Directory to save model predictions per tile for each image
pred_csv_dir = os.path.join(base_dir, 'tile_prediction_csv')
os.makedirs(pred_csv_dir, exist_ok=True)

# Directory to save images with annotations indicating which tiles are predicted to contain hyphae
img_out_dir = os.path.join(base_dir, 'model_output_images')
os.makedirs(img_out_dir, exist_ok=True)

# Path of the summary .csv FILE (one row per leaf disc). Despite its name this is not a directory;
# the name is kept because the final cell of the notebook refers to it.
results_dir = os.path.join(base_dir, 'CNN-240225-03_test.csv')

In [None]:
# CREATE A NESTED DICTIONARY to store all the information for each leaf disc, run every tile inside the
# region of interest (ROI) through the trained model, and save the per-tile predictions (.csv) and an
# annotated copy of each leaf disc image.
# Note that since Python v3.7, dictionaries are ordered.

# The structure of the nested dictionary is:

# 'exp_dict'
#   KEY = 'Image_ID' - The name of the image file without its extension.
#   VALUE = 'disc_dict'
#      where 'disc_dict' =
#          'disc_filename' = leaf disc filename. IF THERE ARE DUPLICATE FILE NAMES, THEN ONLY THE LAST ONE THROUGH THE LOOP WILL BE RECORDED
#          'local_path' = full path of the image file below 'disc_img_dir'
#          'file_extension' = extension of the image file, including the dot (e.g. '.jpg')
#          'img_shape' = image dimensions in pixels (height, width, channels)
#          'variety' = host variety. Uppercase enforced, dashes replaced by spaces.
#          'pathogen' = pathogen id. Uppercase enforced, dashes replaced by spaces.
#          'dpi' = integer of days post inoculation
#          'batch' = batch identifier, taken unchanged from the file name
#          'rep_number' = the technical replicate as written in the file name (stored as a string)
#          'ROI_tile_count' = total number of tile_size x tile_size px tiles within the determined region of interest
#          'ROI_hyphae_count' = number of those tiles that are predicted by the model to contain hyphae
#          'infection_frequency' = the proportion of tiles within the region of interest that are predicted to contain hyphae.
#                                  Expressed as a float between 0 and 1 (NaN if no tile lies within the ROI).

# 'tile_dict' = a separate dictionary of tile information and the model output prediction for each leaf disc
#               This is kept separate for ease and cleanliness of data output to .csv later

#      KEY: str(('tile_pt1', 'tile_pt2')) = string of the two opposing corner points (x, y) that create a tile
#      VALUE: 'hyphae_prediction' = model prediction float that the tile contains hyphae


# Keys of the 'disc_dict' that are read from the file itself.
# 'ROI_tile_count', 'ROI_hyphae_count' and 'infection_frequency' are added after the tiles have been analyzed;
# the individual tile predictions go to a separate dictionary.
disc_keys = ['disc_filename', 'local_path', 'file_extension', 'img_shape', 'variety', 'pathogen', 'dpi', 'batch', 'rep_number']

# Create an empty dictionary. This is the top layer dictionary that will have Image_IDs as the keys.
exp_dict = {}

# Loop through the 'disc_img_dir' (including its subfolders) and process every JPEG image.
for subdir, dirs, files in os.walk(disc_img_dir):
    for file in files:
        # Accept .jpg / .jpeg regardless of letter case (e.g. '.JPG' as written by some cameras).
        if not file.lower().endswith(('.jpg', '.jpeg')):
            continue

        filepath = os.path.join(subdir, file)

        # ---------------------------------------------------------------------------------------
        # 1. EXTRACT THE TECHNICAL AND BIOLOGICAL INFORMATION
        # ---------------------------------------------------------------------------------------
        disc_filename = file
        local_path = filepath
        # splitext only removes the final extension, so dots inside the name are preserved.
        Image_ID, file_extension = os.path.splitext(file)

        # Expected name: "variety_batch_pathogen_Xdpi_rep_anything.jpg", e.g. "bedford_a_24_2dpi_1_Stitching.jpg".
        # The fields are taken from the name WITHOUT its extension so that a name ending right after
        # the replicate number does not leak '.jpg' into 'rep_number'.
        name_fields = Image_ID.split('_')
        if len(name_fields) < 5:
            print(f'Skipping {filepath}: expected at least 5 underscore-separated fields in the file name.')
            continue

        dpi_digits = ''.join(d for d in name_fields[3] if d.isdigit())
        if not dpi_digits:
            print(f'Skipping {filepath}: no days-post-inoculation number found in "{name_fields[3]}".')
            continue

        variety = name_fields[0].replace('-', ' ').upper()
        batch = name_fields[1]
        pathogen = name_fields[2].replace('-', ' ').upper()
        dpi = int(dpi_digits)
        rep_number = name_fields[4].split('-')[0]

        # cv.imread returns None (instead of raising) if the file cannot be read or decoded.
        img = cv.imread(filepath)
        if img is None:
            print(f'Skipping {filepath}: OpenCV could not read the image.')
            continue
        img_shape = img.shape

        # Add the values to the disc dictionary
        disc_values = [disc_filename, local_path, file_extension, img_shape, variety, pathogen, dpi, batch, rep_number]
        disc_dict = dict(zip(disc_keys, disc_values))

        # Total number of tiles within the ROI
        ROI_tile_count = 0
        # Number of tiles within the ROI that are predicted to have hyphae in them
        ROI_hyphae_count = 0

        # NEXT STEPS:
        #   2. FIND THE REGION OF INTEREST
        #   3. SLICE THE IMAGE ALONG A tile_size x tile_size GRID
        #   4. ONLY RUN THOSE TILES THAT FALL COMPLETELY WITHIN THE REGION OF INTEREST THROUGH THE TRAINED CNN
        #   5. STORE THE COORDINATES AND PREDICTION FOR EACH TILE IN 'tile_dict' AND WRITE THE OUTPUT FILES

        # Stores the tile coordinates and prediction values for all tiles within the ROI of this disc
        tile_dict = {}

        # ---------------------------------------------------------------------------------------
        # 2. FIND THE REGION OF INTEREST
        # ---------------------------------------------------------------------------------------
        # 'img_cv' is the numpy array (BGR channel order, as loaded by OpenCV) that is analyzed for
        # contours and that the annotations are drawn onto. The tiles for the model are cropped from a
        # separate PIL copy of the file, so the annotations never end up in the model input.
        img_cv = img

        # Convert the BGR image 'img_cv' to grayscale.
        gray = cv.cvtColor(img_cv, cv.COLOR_BGR2GRAY)

        # Inverted binary threshold: pixels darker than or equal to 'thresh_value' become foreground.
        retval, thresh_gray = cv.threshold(gray, thresh=thresh_value, maxval=255, type=cv.THRESH_BINARY_INV)

        # Use the threshold to generate a list of all the objects in 'img_cv'.
        contours, hierarchy = cv.findContours(thresh_gray, cv.RETR_LIST, cv.CHAIN_APPROX_SIMPLE)

        # Find the object with the biggest bounding box (i.e. the leaf disc).
        # 'mx' = (x, y, w, h): top-left corner, width and height of that box; all zeros if nothing was found.
        mx = max((cv.boundingRect(cont) for cont in contours),
                 key=lambda box: box[2] * box[3],
                 default=(0, 0, 0, 0))
        x, y, w, h = mx
        mx_area = w * h

        # Calculate x,y coordinate of the center of the bounding box; this is the center of the ROI
        centerCoord = (x + (w / 2), y + (h / 2))
        cX = int(centerCoord[0])
        cY = int(centerCoord[1])

        # ---------------------------------------------------------------------------------------
        # 3. SLICE THE IMAGE ALONG THE GRID
        # ---------------------------------------------------------------------------------------
        # Top-left corners (row, column) of all complete tiles inside the bounding box; partial tiles are ignored.
        # 'w' and 'h' are the SIZE of the box, so the grid has to end at x + w and y + h
        # (ending at w and h would drop tiles at the right/bottom whenever the box does not start at 0).
        grid = list(itertools.product(range(y, y + h - h % tile_size, tile_size),
                                      range(x, x + w - w % tile_size, tile_size)))

        # ---------------------------------------------------------------------------------------
        # 4. RUN THE TILES WITHIN THE ROI THROUGH THE TRAINED CNN
        # ---------------------------------------------------------------------------------------
        # 'img_PIL' is the image that the tiles are cropped from; the file is closed again after tiling.
        with Image.open(filepath) as img_PIL:
            for i, j in grid:  # i = top edge (row), j = left edge (column) of the tile

                # Tile box in PIL order: (left, upper, right, lower)
                tile = (j, i, j + tile_size, i + tile_size)

                # Determine how far away the farthest corner of the tile is from the center of the ROI
                dx = max(abs(cX - j), abs((j + tile_size) - cX))
                dy = max(abs(cY - i), abs((i + tile_size) - cY))

                # Use the Pythagorean theorem to determine whether the farthest corner of the tile
                # is within the circular ROI; tiles that reach outside of it are not analyzed.
                if (dx * dx) + (dy * dy) > radius * radius:
                    continue

                # Add 1 to the count of total ROI tiles
                ROI_tile_count += 1

                # Top-left and bottom-right corner (x, y) of the tile
                tile_pt1 = (j, i)
                tile_pt2 = (j + tile_size, i + tile_size)

                # Fetch that slice of the original disc image and rescale it to the model's input size.
                # resample = 1 is PIL's LANCZOS filter.
                tile_PIL = img_PIL.crop(tile)
                tile_resized = tile_PIL.resize((224, 224), resample = 1)

                # Perform preprocessing steps for the model
                tile_array = image.img_to_array(tile_resized)
                tile_array_batched = np.expand_dims(tile_array, axis = 0)
                tile_preprocessed = preprocess_input(tile_array_batched)

                # Run it through the trained neural network to get the model's prediction
                hyphae_prediction = TrypanBlueModel.predict(tile_preprocessed, verbose = 0)

                # Store the prediction in 'tile_dict'.
                # The key is the string of the tile's corner coordinates, the value is the model output.
                tile_coordinates = str((tile_pt1, tile_pt2))
                tile_dict[tile_coordinates] = hyphae_prediction[0][0]

                # Mark tiles that are predicted to contain hyphae with a colored rectangle on the leaf disc image
                if hyphae_prediction[0][0] >= 0.5:
                    cv.rectangle(img_cv, pt1 = tile_pt1, pt2 = tile_pt2, color = (150, 20, 255), thickness = 20)

                    # Add 1 to the count of ROI tiles that are predicted to contain hyphae
                    ROI_hyphae_count += 1

        # Draw the region of interest and the bounding box onto the whole leaf disc image.
        # This is done once per image (not once per tile) and after the tiles, so the outlines stay visible.
        cv.circle(img_cv, (cX, cY), radius, (0, 0, 0), 10)
        cv.rectangle(img_cv, (x, y), (x + w, y + h), (0, 0, 0), 10)

        # ---------------------------------------------------------------------------------------
        # 5. WRITE THE OUTPUT
        # ---------------------------------------------------------------------------------------
        # Write the tile_dict to a csv file with the filename equal to the Image_ID + .csv
        # Naming the column here (rather than in to_csv) also works when no tile was analyzed.
        tile_df = pd.DataFrame.from_dict(data = tile_dict, orient = 'index', columns = ['hyphae_prediction'])
        tile_csv_name = str(Image_ID) + '.csv'
        tile_df.to_csv(os.path.join(pred_csv_dir, tile_csv_name))

        # Add the tile counts to the 'disc_dict' dictionary
        disc_dict['ROI_tile_count'] = ROI_tile_count
        disc_dict['ROI_hyphae_count'] = ROI_hyphae_count

        # Calculate the frequency of tiles within the ROI that are predicted to contain hyphae.
        # Without any tile in the ROI the frequency is undefined: record NaN instead of dividing by zero.
        if ROI_tile_count > 0:
            infection_frequency = (ROI_hyphae_count / ROI_tile_count)
        else:
            print(f'Warning: no tiles within the region of interest for {filepath}; '
                  'check thresh_value, radius and tile_size.')
            infection_frequency = float('nan')
        perc_inf_freq = infection_frequency * 100 # Used in the image output

        # and store it in the dictionary
        disc_dict['infection_frequency'] = infection_frequency

        # set the 'disc_dict' as the value for the 'Image_ID' key in the 'exp_dict'
        exp_dict[Image_ID] = disc_dict

        # Include some biological information as text on the edited image.
        cv.putText(img_cv,('Image ID: ' + Image_ID), (x,400), cv.FONT_HERSHEY_PLAIN, 10, (0,0,0), 12)
        cv.putText(img_cv,(('Variety: ' + variety + '     B. lactucae race: ' + pathogen)), (x,600), cv.FONT_HERSHEY_PLAIN, 10, (0,0,0), 12)
        cv.putText(img_cv,('DPI: ' + str(dpi) + '     %Infected Tiles: ' + str(round(perc_inf_freq, 2))), (x,800), cv.FONT_HERSHEY_PLAIN, 10, (0,0,0), 12)

        # Save the image of the leaf disc that has bounding box, region of interest, and hyphae predictions drawn onto it.
        # NOTE: 'file_extension' already contains the dot, so the files are named '<Image_ID>_.jpg';
        #       the naming is left unchanged so that existing outputs and downstream steps still match.
        annotated_path = os.path.join(img_out_dir, f'{Image_ID}_{file_extension}')
        if not cv.imwrite(annotated_path, img_cv):
            print(f'Warning: could not write the annotated image {annotated_path}')

In [None]:
# Turn the per-disc records into a table (one row per Image_ID, one column per
# 'disc_dict' key) and save it as the summary .csv of the experiment.
exp_df = pd.DataFrame.from_dict(exp_dict, orient='index')
exp_df.to_csv(results_dir, header=True)