# Summary

This notebook is the first step of my work. It is responsible for re-organising, filtering and generating enhanced images in YOLO format for a YOLO yaml config.

This will generate 4 dataset folders, which are as follows

### Dataset is filtered/transformed and divided into four parts
- **"original-no-annot"** contains original, un-enhanced files of images which **do not have any annotations**
- **"original-annot"** contains original, un-enhanced files of images which **have annotations**
- **"enhanced-no-annot"** contains enhanced files of images which **do not have any annotations**
- **"enhanced-annot"** contains enhanced files of images which **have annotations**


> **NOTE 1 : images with no annotations will generate an empty {image_id}.txt file under the "/labels" folder**

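> **NOTE 2 : There are about 18.5k un-annotated images. So, we only copy/generate a random sample of about 1k of them (1,300 are sampled in the code below and split 80/20 into train and test) due to memory constraints**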


### Data Folder Structure
- dataset
    - original-no-annot
        - images
        - labels
    - original-annot
        - images
        - labels
    - enhanced-no-annot
        - images
        - labels
    - enhanced-annot
        - images
        - labels

### What is enhanced image?
   - An enhanced image has 2 filters applied to it:
       - [CLAHE](https://en.wikipedia.org/wiki/Adaptive_histogram_equalization#Contrast_Limited_AHE)
       - [Gamma Correction](https://en.wikipedia.org/wiki/Gamma_correction)

### What is label .txt?
A ".txt" file is generated under the corresponding "/labels" folder for each image. Each row of that text file has the format "class x_center y_center width height", with the four box values normalized to the 0-1 range.

### Imports

In [None]:
# Upgrade pip, then install tf_clahe (imported further down as `tf_clahe`).
!python -m pip install --upgrade pip
!pip install tf_clahe

In [None]:
%load_ext autoreload
%autoreload 2
from IPython.display import Image, display, Markdown
import cv2
import matplotlib.pyplot as plt
import matplotlib 
%matplotlib inline
import numpy as np
import pandas as pd
from PIL import Image
import os
import copy
import os.path as osp
import json5
import yaml
import shutil
from pathlib import Path
import ast
import sys
import tensorflow as tf
import tf_clahe

# Make numpy values easier to read.
np.set_printoptions(precision=3, suppress=True)
# Add the competition input directory to the module search path.
sys.path.append('../input/tensorflow-great-barrier-reef')
# Seed numpy's global random generator.
np.random.seed(0)
# Bare expression, so the notebook displays it as the cell output.
# NOTE(review): `tf.version` is a module; the version string itself is
# `tf.version.VERSION` / `tf.__version__` - confirm which one was intended.
tf.version

### Image Enhancement Helpers

In [None]:
def plot_images(img1, img2, title='', lbl1='', lbl2=''):
    """Show two images side by side in one matplotlib figure.

    Args:
        img1: Image drawn in the left panel.
        img2: Image drawn in the right panel.
        title: Figure-level title.
        lbl1: Title of the left panel.
        lbl2: Title of the right panel.
    """
    # Drop any figures left open by earlier calls before drawing a new one.
    plt.close('all')
    figure, panels = plt.subplots(1, 2, figsize=(23, 13))

    for panel, picture, caption in zip(panels, (img1, img2), (lbl1, lbl2)):
        panel.imshow(picture, cmap=plt.get_cmap(name='gray'))
        panel.set_axis_off()
        panel.set_title(caption, fontsize=18)

    figure.suptitle(title, fontsize=22, y=0.81)
    plt.tight_layout()
    plt.show()

def tf_enhance_image_helper(img_path, gpu_optimized = False):
    """Read a JPEG and return it together with its CLAHE + gamma corrected copy.

    Args:
        img_path: Path of the JPEG file to read.
        gpu_optimized: Forwarded unchanged to ``tf_clahe.clahe``.

    Returns:
        A tuple ``(enhanced, original)``: the image after CLAHE followed by a
        gamma adjustment of 1.2, and the decoded 3-channel image.
    """
    encoded = tf.io.read_file(img_path)
    # channels=3 decodes to RGB (channels=1 would decode to grayscale)
    original = tf.io.decode_jpeg(encoded, channels=3)

    equalized = tf_clahe.clahe(
        original,
        tile_grid_size=(32, 32),
        clip_limit=3.3,
        gpu_optimized=gpu_optimized,
    )
    enhanced = tf.image.adjust_gamma(equalized, 1.2)
    return enhanced, original

# GPU variant of the enhancement: the whole helper (file read, JPEG decode,
# CLAHE and gamma adjustment) is wrapped in a tf.function with XLA enabled.
# NOTE(review): file-reading/decoding ops may not be XLA-compilable - confirm
# this path actually runs on a GPU machine.
@tf.function(jit_compile=True)  # Enable XLA
def tf_enhance_image_gpu(img_path):
    """Call ``tf_enhance_image_helper`` with ``gpu_optimized=True``."""
    return tf_enhance_image_helper(img_path, True)

def tf_enhance_image(img_path):
    """Enhance one image, using the XLA/GPU variant when a GPU is visible.

    Args:
        img_path: Path of the JPEG file to enhance.

    Returns:
        The ``(enhanced, original)`` tuple produced by
        ``tf_enhance_image_helper``.
    """
    # Ask TensorFlow itself whether it can see a GPU. The enhancement runs in
    # TensorFlow, so PyTorch's view of CUDA is not the relevant one, and
    # importing torch here would fail on machines where it is not installed.
    if tf.config.list_physical_devices('GPU'):
        return tf_enhance_image_gpu(img_path)
    return tf_enhance_image_helper(img_path, False)

def tf_save_image(img, fname):
    """Encode ``img`` as JPEG and write the bytes to ``fname``."""
    destination = tf.constant(fname)
    jpeg_bytes = tf.image.encode_jpeg(img)
    tf.io.write_file(destination, jpeg_bytes)
    
def tf_save_label(annot_list, fname):
    """Write the label rows to ``fname``, joined by newlines.

    An empty or ``None`` ``annot_list`` writes an empty string.
    """
    contents = '\n'.join(annot_list) if annot_list else ''
    tf.io.write_file(tf.constant(fname), contents)

### What "Enhanced Image" looks like

In [None]:
def sample_enhanced_img():
    """Display two sample frames before and after enhancement."""
    first_path = "../input/tensorflow-great-barrier-reef/train_images/video_2/5752.jpg"
    second_path = "../input/tensorflow-great-barrier-reef/train_images/video_2/5774.jpg"

    enhanced_a, original_a = tf_enhance_image(first_path)
    enhanced_b, original_b = tf_enhance_image(second_path)

    # First sample: side-by-side matplotlib comparison.
    plot_images(
        original_a.numpy(),
        enhanced_a,
        'Image Comparison',
        'Original #2_5752',
        'CLAHE + Gamma Corrected #2_5752',
    )

    # Second sample: each version rendered inline under its own heading.
    # `Image` is PIL's here: the file's PIL import comes after, and therefore
    # shadows, the IPython.display one.
    inline_views = (
        ('## Original Image #2_5774', original_b),
        ('## CLAHE + GAMMA Corrected Image #2_5774', enhanced_b),
    )
    for heading, tensor in inline_views:
        display(Markdown(heading))
        display(Image.fromarray(tensor.numpy()))

# Render the sample comparison when this cell runs.
sample_enhanced_img()
# Optional timing (sample_enhanced_img takes no arguments):
# %timeit -n10 -r3 sample_enhanced_img()

### Data Related Helpers

In [None]:
# helper to generate yolo label file
def generateYoloLabelFile(folder_path, file_name, label_data_list):
    """Write a YOLO label file and return its path.

    Args:
        folder_path: Target folder. A relative folder is resolved against the
            current working directory; an absolute folder is used as given.
            The folder is created when it does not exist yet.
        file_name: File name without extension; ``.txt`` is appended.
        label_data_list: Iterable of label rows, or ``None`` to create an
            empty file. Falsy and whitespace-only entries are skipped.

    Returns:
        The path of the written ``.txt`` file.
    """
    # os.path.join keeps an absolute folder_path intact; plain string
    # concatenation would have nested it under the working directory.
    folder = os.path.join(os.getcwd(), str(folder_path))
    os.makedirs(folder, exist_ok=True)
    pth = os.path.join(folder, f'{file_name}.txt')

    stripped = (label.strip() for label in (label_data_list or ()) if label)
    with open(pth, 'w', encoding='utf-8') as f:
        # one label per line; nothing is written for an empty/None list
        f.writelines(f'{row}\n' for row in stripped if row)
    return pth

# helper to generate images
def generateImageFile(folder_path, file_name, image):
    """Write ``image`` as a JPEG file and return its path.

    Args:
        folder_path: Target folder. A relative folder is resolved against the
            current working directory; an absolute folder is used as given.
            The folder is created when it does not exist yet.
        file_name: File name without extension; ``.jpg`` is appended.
        image: Image array handed to ``cv2.imwrite``.

    Returns:
        The path of the written ``.jpg`` file.

    Raises:
        OSError: If ``cv2.imwrite`` reports that the file was not written.
    """
    # os.path.join keeps an absolute folder_path intact; plain string
    # concatenation would have nested it under the working directory.
    folder = os.path.join(os.getcwd(), str(folder_path))
    os.makedirs(folder, exist_ok=True)
    new_dst = os.path.join(folder, f'{file_name}.jpg')

    # cv2.imwrite signals failure through its boolean return value, so check
    # it instead of returning the path of a file that does not exist.
    if not cv2.imwrite(new_dst, image):
        raise OSError(f'could not write image to {new_dst}')
    return new_dst
    
# Memory saving function credit to https://www.kaggle.com/gemartin/load-data-reduce-memory-usage
def reduce_mem_usage(df):
    """Downcast the numeric columns of ``df`` in place to smaller dtypes.

    Signed and unsigned integer columns are cast to the narrowest integer type
    of the same signedness that holds their min/max. Float columns are cast to
    float16, float32 or float64 depending on their value range. Every other
    column (bool, object/string, datetime, categorical, pandas extension
    dtypes) is left untouched.

    NOTE: the float downcast is chosen by range only, so float16/float32 can
    lose precision.

    Args:
        df: DataFrame to optimise; it is modified in place.

    Returns:
        The same DataFrame, for chaining.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_candidates = (np.int8, np.int16, np.int32, np.int64)
    unsigned_candidates = (np.uint8, np.uint16, np.uint32, np.uint64)

    for col in df.columns:
        col_type = df[col].dtype
        # Only plain numpy int/uint/float columns are downcast. Anything else
        # (notably bool, which used to be turned into float16) is skipped.
        if not isinstance(col_type, np.dtype) or col_type.kind not in ('i', 'u', 'f'):
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind == 'f':
            if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                target = np.float16
            elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                target = np.float32
            else:
                target = np.float64
        else:
            # An empty column has NaN min/max: nothing to decide on.
            if pd.isna(c_min) or pd.isna(c_max):
                continue
            low, high = int(c_min), int(c_max)
            candidates = signed_candidates if col_type.kind == 'i' else unsigned_candidates
            # Inclusive bounds: a value equal to the type limit still fits.
            target = next(
                (
                    candidate
                    for candidate in candidates
                    if np.iinfo(candidate).min <= low and high <= np.iinfo(candidate).max
                ),
                None,
            )
            if target is None:
                continue

        df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard the percentage against a zero-byte starting size.
    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(decrease))

    return df

def get_path(row):
    """Add an ``image_path`` entry to ``row`` and return the row.

    The path is built from the row's ``video_id`` and ``video_frame`` values.
    """
    image_path = f'/kaggle/input/tensorflow-great-barrier-reef/train_images/video_{row.video_id}/{row.video_frame}.jpg'
    row['image_path'] = image_path
    return row

def load_labels():
    """Load the competition ``train.csv`` and prepare it for dataset creation.

    Steps: drop duplicate rows, drop the ``sequence`` and ``sequence_frame``
    columns, downcast numeric columns with ``reduce_mem_usage`` and append an
    ``image_path`` column as the last column.

    Returns:
        The prepared DataFrame.
    """
    print('fetching original csv')
    labels = pd.read_csv("/kaggle/input/tensorflow-great-barrier-reef/train.csv", skipinitialspace=True)
    print('dropping duplicates')
    labels = labels.drop_duplicates()
    print('dropping columns : sequence, sequence_frame')
    labels = labels.drop(columns=['sequence', 'sequence_frame'])
    print('reducing mem usage')
    labels = reduce_mem_usage(labels)
    print('adding image paths')
    # Vectorised string concatenation instead of a row-wise apply: it builds
    # the same paths with far fewer Python calls and leaves the downcast
    # integer columns untouched.
    labels['image_path'] = (
        '/kaggle/input/tensorflow-great-barrier-reef/train_images/video_'
        + labels['video_id'].astype(str)
        + '/'
        + labels['video_frame'].astype(str)
        + '.jpg'
    )
    print('Total rows :', len(labels))
    return labels

def create_dataset_csv():
    """Split the labels into train/test sets and write them as CSV files.

    Annotated rows are split 80/20. From the un-annotated rows (annotations
    equal to ``'[]'``) a random sample of at most 1300 rows is taken and split
    80/20 as well. The combined sets are written to
    ``/kaggle/working/train_new.csv`` and ``/kaggle/working/test_new.csv``.
    """
    from sklearn.model_selection import train_test_split

    labels = load_labels()
    is_unannotated = labels['annotations'] == '[]'
    unannot_labels = labels[is_unannotated]
    annot_labels = labels[~is_unannotated]
    print(f'Total annotated {len(annot_labels)}, Total un-annotated {len(unannot_labels)}\n')

    print('splitting data')
    an_train, an_test = train_test_split(annot_labels, test_size=0.2)
    # Never ask for more rows than exist; sample() raises otherwise.
    unannot_sample = unannot_labels.sample(min(1300, len(unannot_labels)))
    un_train, un_test = train_test_split(unannot_sample, test_size=0.2)
    print(f'Train annotated {len(an_train)}, Test annotated {len(an_test)}')
    print(f'Train un-annotated {len(un_train)}, Test un-annotated {len(un_test)}\n combining data')

    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    train_data = pd.concat([an_train, un_train])
    print(f'Total Train (annot + un-annot) {len(train_data)}')
    test_data = pd.concat([an_test, un_test])
    print(f'Total Test (annot + un-annot) {len(test_data)}')

    print('writing train_new.csv')
    train_data.to_csv('/kaggle/working/train_new.csv', encoding='utf-8', index=False)
    print('writing test_new.csv')
    test_data.to_csv('/kaggle/working/test_new.csv', encoding='utf-8', index=False)

def load_train_data():
    """Read ``train_new.csv`` and return it after ``reduce_mem_usage``."""
    return reduce_mem_usage(
        pd.read_csv("/kaggle/working/train_new.csv", skipinitialspace=True)
    )

def load_test_data():
    """Read ``test_new.csv`` and return it after ``reduce_mem_usage``."""
    return reduce_mem_usage(
        pd.read_csv("/kaggle/working/test_new.csv", skipinitialspace=True)
    )

In [None]:
# generate train and test csv
# (writes /kaggle/working/train_new.csv and /kaggle/working/test_new.csv)
create_dataset_csv()

### Dataset Creation

In [None]:
# helper to convert bbox value to yolo format
def convertToYoloFormat(x_min, y_min, bb_w, bb_h, im_w, im_h):
    """Convert a pixel bounding box to a normalized YOLO box string.

    The box is first clipped to the image, so a box that extends past the
    frame yields values that still describe a region inside ``[0, 1]``. Boxes
    fully inside the image produce the same output as before.

    Args:
        x_min: Left edge of the box in pixels.
        y_min: Top edge of the box in pixels.
        bb_w: Box width in pixels.
        bb_h: Box height in pixels.
        im_w: Image width in pixels.
        im_h: Image height in pixels.

    Returns:
        ``"x_center y_center width height"``, each value normalized by the
        image size and rounded to 4 decimals.
    """
    # clip the box to the image bounds
    x_max = min(x_min + bb_w, im_w)
    y_max = min(y_min + bb_h, im_h)
    x_min = min(max(x_min, 0), im_w)
    y_min = min(max(y_min, 0), im_h)
    bb_w = max(x_max - x_min, 0)
    bb_h = max(y_max - y_min, 0)

    # find x_center / y_center
    x_center = x_min + (bb_w / 2.0)
    y_center = y_min + (bb_h / 2.0)

    # normalize values to 0-1
    n_x_center = x_center / im_w
    n_y_center = y_center / im_h
    n_bb_w = bb_w / im_w
    n_bb_h = bb_h / im_h
    return f'{round(n_x_center, 4)} {round(n_y_center, 4)} {round(n_bb_w, 4)} {round(n_bb_h, 4)}'

def process_annot(ann_data, im_w=1280, im_h=720):
    """Turn one row's annotation data into a list of YOLO label rows.

    Args:
        ann_data: Annotation data for one image. Normally the string stored in
            the CSV (a JSON5-parsable list of boxes with ``x``, ``y``,
            ``width`` and ``height``); an already parsed list is accepted too.
            ``None``, NaN, an empty string or ``'[]'`` mean "no annotations".
        im_w: Image width in pixels used for normalization.
        im_h: Image height in pixels used for normalization.

    Returns:
        A list of ``"0 x_center y_center width height"`` strings (class 0),
        empty when there are no annotations.
    """
    if isinstance(ann_data, str):
        ann_text = ann_data.strip()
        ann_splits = json5.loads(ann_text) if ann_text and ann_text != '[]' else []
    elif isinstance(ann_data, (list, tuple)):
        ann_splits = ann_data
    else:
        # None / NaN, e.g. an empty CSV cell
        ann_splits = []

    yolo_labels = []
    for ann_split in ann_splits:
        # Parsing the outer list already yields dicts; only decode again when
        # an element is itself still a string.
        a_data = json5.loads(ann_split) if isinstance(ann_split, str) else ann_split
        yolo_format_bb = convertToYoloFormat(
            int(a_data["x"]),
            int(a_data["y"]),
            int(a_data["width"]),
            int(a_data["height"]),
            im_w,
            im_h,
        )
        yolo_labels.append(f'0 {yolo_format_bb}')
    return yolo_labels
    
def process_row(val, og_fp, en_fp):
    """Write the original and enhanced image/label files for one data row.

    Args:
        val: One row of the dataset as a positional sequence. Index 2 is used
            as the image id, index 3 as the annotation data and index 4 as the
            source image path (assumes the column order video_id, video_frame,
            image_id, annotations, image_path - verify against the CSV).
        og_fp: Root folder of the original dataset split; must contain
            ``images`` and ``labels`` sub-folders.
        en_fp: Root folder of the enhanced dataset split; must contain
            ``images`` and ``labels`` sub-folders.

    Rows whose source image file does not exist are skipped silently. Any
    error while writing is reported and re-raised.
    """
    image_id = str(val[2])
    img_path = val[4]
    if not os.path.isfile(img_path):
        return

    try:
        # process annotation data
        yolo_labels = process_annot(val[3])

        # original - label file, then a plain copy of the source image
        tf_save_label(yolo_labels, os.path.join(og_fp, 'labels', f'{image_id}.txt'))
        shutil.copy(img_path, os.path.join(og_fp, 'images', f'{image_id}.jpg'))

        # enhanced - same labels, CLAHE + gamma corrected image
        tf_save_label(yolo_labels, os.path.join(en_fp, 'labels', f'{image_id}.txt'))
        correct_img, _ = tf_enhance_image(img_path)
        tf_save_image(correct_img, os.path.join(en_fp, 'images', f'{image_id}.jpg'))
    except Exception:
        # Report and propagate. KeyboardInterrupt is not an Exception
        # subclass, so it passes through without the message.
        print('corrupt file')
        raise
            
def generate_dataset(data, is_train):
    """Generate the original and enhanced dataset files for one split.

    Every row of ``data`` is handed to ``process_row``, in parallel across one
    worker process per CPU core.

    Args:
        data: DataFrame whose rows (``data.values``) are passed to
            ``process_row``.
        is_train: ``True`` to write into the ``train`` folders, ``False`` for
            ``test``.

    Side effects:
        Changes the process working directory to ``/kaggle/working/dataset``.
    """
    # Only the modules this function actually uses are imported; the unused
    # torch/cv2/pandas/numpy imports were dropped so a missing optional
    # package cannot break dataset generation.
    import gc
    import multiprocessing
    import os

    from joblib import Parallel, delayed
    from tqdm import tqdm

    os.chdir('/kaggle/working/dataset')
    base_path = os.getcwd()
    work_folder = 'train' if is_train else 'test'
    og_fp = os.path.join(base_path, 'original', work_folder)
    en_fp = os.path.join(base_path, 'enhanced', work_folder)

    Parallel(n_jobs=multiprocessing.cpu_count(), prefer='processes')(
        delayed(process_row)(val, og_fp, en_fp) for val in tqdm(data.values)
    )
    gc.collect()

### Run create dataset

In [None]:
# always re-create
# Start from a clean output tree:
# dataset/{original,enhanced}/{train,test}/{images,labels}
%cd /kaggle/working
!rm -rf dataset
!mkdir dataset
!mkdir dataset/original
!mkdir dataset/original/train
!mkdir dataset/original/train/images
!mkdir dataset/original/train/labels
!mkdir dataset/original/test
!mkdir dataset/original/test/images
!mkdir dataset/original/test/labels
!mkdir dataset/enhanced
!mkdir dataset/enhanced/train
!mkdir dataset/enhanced/train/images
!mkdir dataset/enhanced/train/labels
!mkdir dataset/enhanced/test
!mkdir dataset/enhanced/test/images
!mkdir dataset/enhanced/test/labels
%cd /kaggle/working

try:
    # generate train data
    train_data = load_train_data()
    generate_dataset(train_data, True)
    
    # generate test data
    test_data = load_test_data()
    generate_dataset(test_data, False)
except KeyboardInterrupt:
    # A manual interrupt ends generation quietly; nothing is cleaned up here.
    pass

In [None]:
def check_file_counts():
    """Print the source frame count and the image/label counts of each split."""

    def entries(folder):
        # number of directory entries in `folder`
        return len(os.listdir(folder))

    source_folders = (
        '/kaggle/input/tensorflow-great-barrier-reef/train_images/video_0',
        '/kaggle/input/tensorflow-great-barrier-reef/train_images/video_1',
        '/kaggle/input/tensorflow-great-barrier-reef/train_images/video_2',
    )
    total_image_count = sum(entries(folder) for folder in source_folders)
    print('Total image count', total_image_count)

    # original / train
    im_len = entries('/kaggle/working/dataset/original/train/images')
    lab_len = entries('/kaggle/working/dataset/original/train/labels')
    print(f'OG Train IM count {im_len}, Label Count {lab_len}')

    # enhanced / train
    im_len = entries('/kaggle/working/dataset/enhanced/train/images')
    lab_len = entries('/kaggle/working/dataset/enhanced/train/labels')
    print(f'EN Train IM count {im_len}, Label Count {lab_len}')

    # original / test
    im_len = entries('/kaggle/working/dataset/original/test/images')
    lab_len = entries('/kaggle/working/dataset/original/test/labels')
    print(f'OG Test  IM count {im_len}, Label Count {lab_len}')

    # enhanced / test
    im_len = entries('/kaggle/working/dataset/enhanced/test/images')
    lab_len = entries('/kaggle/working/dataset/enhanced/test/labels')
    print(f'EN Test  IM count {im_len}, Label Count {lab_len}')

# Print the source frame count and the generated image/label counts.
check_file_counts()

### References
- Improvements learned from [Learning to Sea: Underwater img Enhancement](https://www.kaggle.com/soumya9977/learning-to-sea-underwater-img-enhancement-eda#%F0%9F%8E%AF-Main-Working-Code)
- Improved white balance implementation by [gist](https://gist.github.com/DavidYKay/9dad6c4ab0d8d7dbf3dc#gistcomment-3025656)
- [Image preprocessing tips](https://machinelearningmastery.com/how-to-configure-image-data-augmentation-when-training-deep-learning-neural-networks/)
- [Why "White Balance Correction" is necessary](https://openaccess.thecvf.com/content_ICCV_2019/papers/Afifi_What_Else_Can_Fool_Deep_Learning_Addressing_Color_Constancy_Errors_ICCV_2019_paper.pdf)