<a href="https://colab.research.google.com/github/richardgault/Automated-Ki-67-proliferation-scoring/blob/main/Generate_Mask_from_JSON.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Overview
This notebook will take JSON files in the format

[{"x": 123, "y":456, "label_id": 1},{"x":321, "y":654, "label_id": 2}, ... ] 

and create the corresponding masks (PNG) of size 256x256. It is assumed that the original image size is 1228x1228.

It is also assumed that the data has already been separated into train/test/validation groups.

For more information please see: https://github.com/richardgault/Automated-Ki-67-proliferation-scoring/blob/main/README.md

At the end, the number of actual Ki-67 positive cells in the JSON files along with the number of non-positive cells is saved to a CSV file for future reference.

In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive so that the
# dataset directory requested in the next cell can be reached from this notebook.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Ask for the root folder of the dataset. Surrounding whitespace (easy to pick
# up when pasting a path) is dropped.
dataset_directory = input('Specify the directory where your data is stored (ending with / e.g drive/MyDrive/myfiles/are/here/): ').strip()

# Every later path is built by plain string concatenation
# (e.g. dataset_directory + "train/"), so make sure a non-empty answer ends
# with exactly the separator those concatenations rely on. An empty answer is
# left untouched and keeps meaning "relative to the current directory".
if dataset_directory and not dataset_directory.endswith('/'):
    dataset_directory += '/'

Import Packages

In [3]:
from PIL import Image, ImageDraw, ImageFilter

import matplotlib.pyplot as plt
import torch
import torchvision.transforms as T
import numpy as np
import random
import itertools
import cv2
import os
import glob
import json
import imutils
import shutil

In [4]:
def setup_directories(directory_list):
    """Create every directory in ``directory_list`` that does not exist yet.

    Missing parent directories are created as well. Directories that already
    exist are left untouched, so the function can be called repeatedly.

    Args:
        directory_list: iterable of directory paths (strings).

    Raises:
        FileExistsError: if one of the paths exists but is not a directory.
    """
    for folder in directory_list:
        # exist_ok=True replaces the separate "exists?" check, which left a
        # window in which the directory could appear between check and create.
        os.makedirs(folder, exist_ok=True)

Note: the next function assumes that the input images are 1228x1228 pixels and writes the resized images and masks at 256x256 pixels. You may wish to modify these dimensions for your own particular needs.

In [5]:
def _write_png(path, image):
    """Write ``image`` to ``path`` with OpenCV, raising ``OSError`` on failure."""
    # cv2.imwrite signals a failed write through a False return value, which
    # is easy to miss; turn it into an explicit error instead.
    if not cv2.imwrite(path, image):
        raise OSError("Could not write image file: " + path)


def _build_cell_mask(gray, positive_points, image_size):
    """Segment cells in ``gray`` and paint them into a 3-channel mask.

    All detected contours are drawn filled with grey (128). Every contour
    whose area exceeds 120 and that strictly contains one of
    ``positive_points`` is then flood-filled with white (255) from that point.

    Args:
        gray: single-channel uint8 image to segment.
        positive_points: list of ``(x, y)`` coordinates labelled as Ki-67 positive.
        image_size: ``(width, height)`` of the mask to allocate.

    Returns:
        uint8 array of shape ``(height, width, 3)`` holding 0, 128 or 255.
    """
    width, height = image_size
    thresh_gaussian = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                            cv2.THRESH_BINARY_INV, 199, 5)

    # Find the contours (grab_contours hides the differing return signature
    # of cv2.findContours across OpenCV versions).
    contours = cv2.findContours(thresh_gaussian.copy(), cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
    contours = imutils.grab_contours(contours)

    mask = np.zeros((height, width, 3), dtype=np.uint8)
    mask = cv2.drawContours(mask, contours, -1, (128, 128, 128), thickness=-1)

    for contour in contours:
        # Ignore small blobs. An area above 120 also guarantees a non-zero
        # zeroth moment, so no separate moments check is needed.
        if cv2.contourArea(contour) <= 120:
            continue
        for x, y in positive_points:
            # +1 means strictly inside; points on the edge or outside are skipped.
            if cv2.pointPolygonTest(contour, (float(x), float(y)), False) == 1:
                # floodFill needs a fresh mask 2 pixels larger than the image
                # in each dimension.
                flood_mask = np.zeros((height + 2, width + 2), dtype=np.uint8)
                cv2.floodFill(mask, flood_mask, (int(x), int(y)), (255, 255, 255))
    return mask


def _quantise_mask(mask, output_size):
    """Resize ``mask`` to ``output_size`` and snap it to the levels 0/128/255.

    Resizing can introduce intermediate grey values; pixels with all channels
    in 241..255 become 255, pixels with all channels in 60..240 become 128 and
    everything else becomes 0.

    Returns:
        Single-channel uint8 array.
    """
    resized = np.array(Image.fromarray(mask).resize(output_size))
    lower_Healthy = np.array([60, 60, 60])
    upper_Healthy = np.array([240, 240, 240])
    lower_ki67 = np.array([241, 241, 241])
    upper_ki67 = np.array([255, 255, 255])
    mask_Healthy = cv2.inRange(resized, lower_Healthy, upper_Healthy)
    mask_ki67 = cv2.inRange(resized, lower_ki67, upper_ki67)

    # The two ranges are disjoint, so the assignment order does not matter.
    labels = np.zeros(resized.shape[:2], dtype=np.uint8)
    labels[mask_Healthy > 0] = 128
    labels[mask_ki67 > 0] = 255
    return labels


def create_dataset(read_images, read_labels, write_images, write_labels,
                   image_size=(1228, 1228), output_size=(256, 256)):
    """Create resized PNG images and three-level masks from JSON point labels.

    For every ``*.json`` file found in ``read_labels`` the image with the same
    base name and a ``.jpg`` extension is read from ``read_images``. Two files
    are written per label file:

    * ``<write_images>/<name>.png`` - the image resized to ``output_size``.
    * ``<write_labels>/<name>_mask.png`` - a single-channel mask where 0 is
      background, 128 is a segmented region and 255 is a segmented region
      containing a point whose ``label_id`` is 1.

    Args:
        read_images: directory holding the input ``.jpg`` images.
        read_labels: directory holding the ``.json`` label files; each file is
            a list of dicts with ``x``, ``y`` and ``label_id`` keys.
        write_images: existing directory for the resized PNG images.
        write_labels: existing directory for the PNG masks.
        image_size: ``(width, height)`` the input images are expected to have;
            the working mask is allocated at this size.
        output_size: ``(width, height)`` of the written images and masks.

    Raises:
        FileNotFoundError: if the image belonging to a label file is missing.
        ValueError: if an image exists but cannot be decoded.
        OSError: if an output file cannot be written.
    """
    width, height = image_size

    # Sorted so that the processing order (and the printed index) is the same
    # on every run.
    label_files = sorted(glob.glob(os.path.join(read_labels, "*.json")))
    for index, file in enumerate(label_files, start=1):
        print(index, file)

        with open(file, 'r') as jsonFile:
            label_dict = json.load(jsonFile)

        image_filename = os.path.splitext(os.path.basename(file))[0]
        image_to_read = os.path.join(read_images, image_filename + ".jpg")

        if not os.path.isfile(image_to_read):
            raise FileNotFoundError(
                "No image '{}' found for label file '{}'".format(image_to_read, file))
        # cv2.imread returns None instead of raising when decoding fails.
        img = cv2.imread(image_to_read)  # BGR
        if img is None:
            raise ValueError("Could not decode image '{}'".format(image_to_read))
        if img.shape[:2] != (height, width):
            print("Warning: {} is {}x{} but image_size is {}x{}; "
                  "the mask may not line up with the image".format(
                      image_to_read, img.shape[1], img.shape[0], width, height))
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

        # Resized copy of the input image. PIL delivers RGB, while cv2.imwrite
        # expects BGR, hence the channel swap before writing.
        with Image.open(image_to_read) as pil_image:
            resized_image = pil_image.convert("RGB").resize(output_size)
        resized_bgr = cv2.cvtColor(np.array(resized_image), cv2.COLOR_RGB2BGR)
        _write_png(os.path.join(write_images, image_filename + ".png"), resized_bgr)

        # Only points labelled 1 (Ki-67 positive) change the mask, so collect
        # them once instead of re-scanning every label for every contour.
        positive_points = [
            (entry['x'], entry['y'])
            for entry in label_dict
            if entry.get('label_id') == 1
        ]

        mask_contours = _build_cell_mask(gray, positive_points, image_size)
        final_img = _quantise_mask(mask_contours, output_size)
        _write_png(os.path.join(write_labels, image_filename + "_mask.png"), final_img)

# Data format

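It is assumed that, within the main data directory (entered previously), the datasets are arranged in the following directory structure. Each JSON label file must share its base name with its image. Note that `create_dataset` looks for the image with a `.jpg` extension (e.g. `train_labels/image1.json` is paired with `train/image1.jpg`), so if your images use `.jpeg` as shown in the listing below, rename them or change the extension in `create_dataset`.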

*Dataset directory*

*   train

    ->image1.jpeg

    ->image2.jpeg

    ->...
    
*   train_labels

    ->image1.json

    ->image2.json

    ->...
    
*   test

    ->imageT1.jpeg

    ->imageT2.jpeg

    ->...
    
*   test_labels

    ->imageT1.json

    ->imageT2.json

    ->...
    
*   validation

    ->imageV1.jpeg

    ->imageV2.jpeg

    ->...
    
*   validation_labels

    ->imageV1.json

    ->imageV2.json

    ->...

In [6]:
# Input folders: one folder of images and one folder of JSON labels per split,
# all located directly under the dataset directory.
input_train_images = f"{dataset_directory}train/"
input_train_labels = f"{dataset_directory}train_labels/"
input_validation_images = f"{dataset_directory}validation/"
input_validation_labels = f"{dataset_directory}validation_labels/"
input_test_images = f"{dataset_directory}test/"
input_test_labels = f"{dataset_directory}test_labels/"

In [7]:
# Output folders: everything generated by this notebook goes below
# "Output_data/" inside the dataset directory, one PNG folder and one mask
# folder per split.
output_directory = f"{dataset_directory}Output_data/"
output_training_images = f"{output_directory}train_png/"
output_training_labels = f"{output_directory}train_masks/"
output_validation_images = f"{output_directory}validation_png/"
output_validation_labels = f"{output_directory}validation_masks/"
output_test_images = f"{output_directory}test_png/"
output_test_labels = f"{output_directory}test_masks/"

In [8]:
# Folders this notebook reads from.
read_directory_list = [
    dataset_directory,
    input_train_images,
    input_train_labels,
    input_validation_images,
    input_validation_labels,
    input_test_images,
    input_test_labels,
]

# Folders this notebook writes to; these are created by setup_directories.
write_directory_list = [
    output_directory,
    output_training_images,
    output_training_labels,
    output_validation_images,
    output_validation_labels,
    output_test_images,
    output_test_labels,
]

In [None]:
# Make sure every output folder exists before anything is written.
setup_directories(write_directory_list)

# Generate the resized images and masks for each split: train, validation, test.
create_dataset(
    read_images=input_train_images,
    read_labels=input_train_labels,
    write_images=output_training_images,
    write_labels=output_training_labels,
)
create_dataset(
    read_images=input_validation_images,
    read_labels=input_validation_labels,
    write_images=output_validation_images,
    write_labels=output_validation_labels,
)
create_dataset(
    read_images=input_test_images,
    read_labels=input_test_labels,
    write_images=output_test_images,
    write_labels=output_test_labels,
)

Count the number of each class in the JSON files

In [10]:
def count_cells(output_name,read_labels):
    """Write a CSV with the per-file cell counts of all JSON label files.

    Each ``*.json`` file in ``read_labels`` is expected to hold a list of
    dicts with a ``label_id`` key. Entries whose ``label_id`` is 1 are counted
    as Ki67 cells; every other entry is counted as a normal cell. The CSV has
    a header row followed by one row per label file, ordered by file path.

    Relies on the ``csv`` module having been imported in the notebook before
    this function is called.

    Args:
        output_name: path of the CSV file to create (overwritten if present).
        read_labels: directory containing the ``.json`` label files.
    """
    with open(output_name, 'w', newline='') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(["Filename", "Normal cells", "Ki67 cells"])

        # Sorted so the row order is the same on every run; a distinct loop
        # name avoids shadowing the open CSV handle.
        for label_path in sorted(glob.glob(os.path.join(read_labels, "*.json"))):
            with open(label_path, 'r') as jsonFile:
                label_dict = json.load(jsonFile)

            image_filename = os.path.splitext(os.path.basename(label_path))[0]
            Ki67_count = sum(1 for entry in label_dict if entry['label_id'] == 1)
            normal_count = len(label_dict) - Ki67_count

            writer.writerow([image_filename, normal_count, Ki67_count])

In [11]:
import csv  # needed by count_cells, which looks the module up when it is called

# One CSV per split, written next to the data in the dataset directory.
train_output_name = f"{dataset_directory}training_cell_counts.csv"
test_output_name = f"{dataset_directory}test_cell_counts.csv"
val_output_name = f"{dataset_directory}val_cell_counts.csv"

count_cells(output_name=train_output_name, read_labels=input_train_labels)
count_cells(output_name=test_output_name, read_labels=input_test_labels)
count_cells(output_name=val_output_name, read_labels=input_validation_labels)