<img src='https://radiant-assets.s3-us-west-2.amazonaws.com/PrimaryRadiantMLHubLogo.png' alt='Radiant MLHub Logo' width='300'/>

# South Africa Field Boundary Detection Tutorial

## Data Preparation

In this tutorial, we will split the augmented training data into training, validation and test sets.

In [None]:
import os

# Make only GPU index 0 visible to CUDA; this is set ahead of the torch import further down.
os.environ.update({'CUDA_VISIBLE_DEVICES': '0'})

In [None]:
from pathlib import Path

# Absolute, resolved path of the current working directory
downloads_path = str(Path('.').resolve())
# The data folder sits directly underneath it
data_path = downloads_path + "/data"

### Importing necessary libraries

In [None]:
import torch
import os
import sys
import random

import cv2
import re
import warnings

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from PIL import Image

from tqdm import tqdm
from itertools import chain
import json

In [None]:
augmented_path = data_path+"/augmented/" #augmented data from previous notebook
FRAME_PATH = augmented_path+'/fields/' #for image without mask
MASK_PATH = augmented_path+'/masks/' #for image with mask
val_frame_path = f"{augmented_path}/val_frames" #validation data without mask
val_mask_path = f"{augmented_path}/val_masks" #validation data with mask
%cd "{data_path}/fields"
%rm -rf .ipynb_checkpoints #remove any jupyter checkpoints that may disrupt the data items
%cd "{data_path}/masks"
%rm -rf .ipynb_checkpoints

Next, we check for images with empty labels/masks.

These images contribute no information to training and are therefore not relevant.

After removing the empty images and masks, we split the remaining images (and their masks) into training, validation and test folders in a 70:20:10 ratio for model development.

Note that model evaluation will use the original fields and masks as test data, not the augmented test data created here. The augmented images are split into a test set here only to illustrate how the data can be divided into training, validation and test sets.

In [None]:
# Remove augmented samples whose mask contains no labelled object.
# A mask counts as empty when it holds at most one distinct pixel value: the
# lowest value is treated as background and every other value as an object id.
# Emptiness is tested explicitly instead of waiting for an exception while
# building a detection target, so an unrelated error can never delete data.
# (the original version of this cell took about 5 mins)
for filename in os.listdir(MASK_PATH):
    if not filename.endswith(".png"):
        continue

    mask_loc = os.path.join(MASK_PATH, filename)
    # Copy the pixels into an array, then release the file handle right away
    with Image.open(mask_loc) as mask_image:
        masked = np.array(mask_image)

    # np.unique returns sorted values; dropping the first one removes the background
    obj_ids = np.unique(masked)[1:]
    if obj_ids.size > 0:
        continue  # at least one object is present: keep the image/mask pair

    # Empty mask: delete both the field image and the mask, if they exist
    field_loc = os.path.join(FRAME_PATH, filename)
    for stale_path in (field_loc, mask_loc):
        if os.path.exists(stale_path):
            os.remove(stale_path)

In [None]:
# Output folders for the train, validation and test splits:
# "*_frames" hold images without mask (x), "*_masks" hold images with mask (y)
folders = [
    f"{split}_{kind}"
    for split in ('train', 'val', 'test')
    for kind in ('frames', 'masks')
]
for folder in folders:
    # No exist_ok: creation fails if a folder is already there
    os.makedirs(f"{augmented_path}/{folder}")

In [None]:
# Names of every entry currently in the field-image folder and in the mask folder
all_frames, all_masks = [
    os.listdir(source_dir) for source_dir in (FRAME_PATH, MASK_PATH)
]

In [None]:
# Generate train, val, and test sets for frames (70% / 20% / 10%).
# Each mask is later opened under the same file name as its frame, so only
# frames that have a same-named entry in the mask folder may enter the split;
# any other entry would abort the copy step part-way through.
random.shuffle(all_frames)
mask_names = set(all_masks)
paired_frames = [name for name in all_frames if name in mask_names]
if len(paired_frames) != len(all_frames):
    warnings.warn(
        f"Skipping {len(all_frames) - len(paired_frames)} frame(s) without a matching mask"
    )

train_split = int(0.7 * len(paired_frames))
val_split = int(0.9 * len(paired_frames))

# Frame and mask lists are the same names on purpose: a mask shares its frame's file name
train_frames = train_masks = paired_frames[:train_split]
val_frames = val_masks = paired_frames[train_split:val_split]
test_frames = test_masks = paired_frames[val_split:]

In [None]:
#Add train, val, test frames and masks to relevant folders


def add_frames(dir_name, image):
    """Write one field image into a split folder under ``augmented_path``.

    The image is opened from ``FRAME_PATH`` and re-saved with PIL under the
    same file name.

    Args:
        dir_name: Name of the destination folder inside ``augmented_path``
            (for example ``'train_frames'``).
        image: File name of the image inside ``FRAME_PATH``.
    """
    destination = os.path.join(augmented_path, dir_name, image)
    # The context manager closes the source file as soon as the copy is saved,
    # instead of leaving the open handle to the garbage collector.
    with Image.open(os.path.join(FRAME_PATH, image)) as img:
        img.save(destination)
  
  
  
def add_masks(dir_name, image):
    """Write one mask image into a split folder under ``augmented_path``.

    The mask is opened from ``MASK_PATH`` and re-saved with PIL under the
    same file name.

    Args:
        dir_name: Name of the destination folder inside ``augmented_path``
            (for example ``'train_masks'``).
        image: File name of the mask inside ``MASK_PATH``.
    """
    destination = os.path.join(augmented_path, dir_name, image)
    # The context manager closes the source file as soon as the copy is saved,
    # instead of leaving the open handle to the garbage collector.
    with Image.open(os.path.join(MASK_PATH, image)) as img:
        img.save(destination)


  
  
# Pair each split's list of file names with the folder it is written to
frame_folders = list(zip(
    (train_frames, val_frames, test_frames),
    ('train_frames', 'val_frames', 'test_frames'),
))

mask_folders = list(zip(
    (train_masks, val_masks, test_masks),
    ('train_masks', 'val_masks', 'test_masks'),
))

In [None]:
# This took approximately 23 mins
# Write every field image into the folder of the split it belongs to
for split_files, split_dir in frame_folders:
    for image_name in split_files:
        add_frames(split_dir, image_name)

# Then do the same for the masks
for split_files, split_dir in mask_folders:
    for image_name in split_files:
        add_masks(split_dir, image_name)