# Data Visualization

## Compare the original data with 4 types of cropped data

### [Dataset Details](https://www.kaggle.com/c/happy-whale-and-dolphin/discussion/310779)

In [None]:
import re
import os
import numpy as np
import pandas as pd
import random
import math
import tensorflow as tf
import matplotlib.pyplot as plt
import pickle
import json

from kaggle_datasets import KaggleDatasets

In [None]:
class config:
    """Notebook-wide settings read by the input pipeline and the plotting cells."""

    # Height and width (in pixels) every decoded image is resized to.
    IMAGE_SIZE = 256
    # Number of examples grouped into each batch by the dataset builders.
    BATCH_SIZE = 40

In [None]:
# Shared autotune sentinel: passed below as num_parallel_reads,
# num_parallel_calls and the prefetch buffer size so tf.data picks the values.
AUTO = tf.data.experimental.AUTOTUNE

def arcface_format(posting_id, image, label_group, matches):
    """Repack one parsed example so the model inputs travel in a single dict.

    Returns a 4-tuple ``(posting_id, inputs, label_group, matches)`` where
    ``inputs`` is ``{'inp1': image, 'inp2': label_group}``; the remaining
    values are passed through unchanged.
    """
    model_inputs = {'inp1': image, 'inp2': label_group}
    return posting_id, model_inputs, label_group, matches

def arcface_inference_format(posting_id, image, label_group, matches):
    """Reduce one parsed example to the pair ``(image, posting_id)``.

    ``label_group`` and ``matches`` are accepted so the function can be mapped
    over the 4-tuples produced by the record parser, but both are discarded.
    """
    image_and_name = (image, posting_id)
    return image_and_name

# Turn the JPEG bytes stored in a record into a float image tensor.
# NOTE(review): an earlier comment mentioned crops, but no cropping happens
# here; the image is only decoded, resized and rescaled.
def decode_image(image_data):
    """Decode JPEG bytes into a square, 3-channel float32 image.

    Args:
        image_data: Scalar string tensor holding JPEG-encoded bytes.

    Returns:
        The image resized to ``config.IMAGE_SIZE`` x ``config.IMAGE_SIZE``,
        cast to float32 and divided by 255.0.
    """
    target_size = [config.IMAGE_SIZE,config.IMAGE_SIZE]
    pixels = tf.image.decode_jpeg(image_data, channels = 3)
    resized = tf.image.resize(pixels, target_size)
    return tf.cast(resized, tf.float32) / 255.0

# Parse a single serialized record into (name, image, target, matches).
def read_labeled_tfrecord(example):
    """Parse one serialized example and decode its image.

    Args:
        example: Serialized example holding the scalar features
            ``image_name`` (string), ``image`` (string) and ``target`` (int64).

    Returns:
        A 4-tuple ``(posting_id, image, label_group, matches)``: the raw
        ``image_name`` value, the image produced by ``decode_image``, the
        target cast to int32, and the constant ``1``.
    """
    feature_spec = {
        "image_name": tf.io.FixedLenFeature([], tf.string),
        "image": tf.io.FixedLenFeature([], tf.string),
        "target": tf.io.FixedLenFeature([], tf.int64),
    }
    parsed = tf.io.parse_single_example(example, feature_spec)

    return (
        parsed['image_name'],
        decode_image(parsed['image']),
        tf.cast(parsed['target'], tf.int32),
        1,  # `matches` is a fixed placeholder value
    )

# Read TFRecord files and parse every record into tensors.
def load_dataset(filenames, ordered = False):
    """Build a dataset of parsed examples from TFRecord files.

    Args:
        filenames: TFRecord file path(s) handed to ``tf.data.TFRecordDataset``.
        ordered: When False, deterministic ordering is switched off in the
            dataset options; when True the default options are kept.

    Returns:
        A dataset whose elements are the tuples returned by
        ``read_labeled_tfrecord``.
    """
    options = tf.data.Options()
    if not ordered:
        # Trade element order for throughput when the caller does not need it.
        options.experimental_deterministic = False

    records = tf.data.TFRecordDataset(filenames, num_parallel_reads = AUTO)
    records = records.with_options(options)
    return records.map(read_labeled_tfrecord, num_parallel_calls = AUTO)

# Build the batched training dataset of (inputs, label) pairs.
def get_training_dataset(filenames):
    """Return a batched, prefetched dataset of ``(inputs, label_group)``.

    ``inputs`` is the ``{'inp1': image, 'inp2': label_group}`` dict built by
    ``arcface_format``. Records are loaded with ``ordered = True`` and batched
    with ``config.BATCH_SIZE``.
    """
    def _keep_inputs_and_label(posting_id, inputs, label_group, matches):
        # After arcface_format the second element is the input dict, not a bare image.
        return (inputs, label_group)

    return (
        load_dataset(filenames, ordered = True)
        .map(arcface_format, num_parallel_calls = AUTO)
        .map(_keep_inputs_and_label)
        .batch(config.BATCH_SIZE)
        .prefetch(AUTO)
    )

# Build the batched inference dataset of images, optionally with their names.
def get_test_dataset(filenames, get_names = True):
    """Return a batched, prefetched dataset for inference or inspection.

    Args:
        filenames: TFRecord file path(s) to read, in order.
        get_names: When True each element is ``(image, posting_id)``; when
            False only the image is kept.

    Returns:
        The dataset batched with ``config.BATCH_SIZE`` and prefetched.
    """
    def _image_only(image, posting_id):
        return image

    records = load_dataset(filenames, ordered = True)
    records = records.map(arcface_inference_format, num_parallel_calls = AUTO)
    if not get_names:
        records = records.map(_image_only)
    return records.batch(config.BATCH_SIZE).prefetch(AUTO)

In [None]:
# Kaggle dataset slugs, one per image variant. The list order determines the
# order of train_ds / test_ds and therefore the column order in the plots.
input_files = [
    'happywhale-tfrecords-v1',
    'happywhale-tfrecords-detic-512',
    'happywhale-tfrecords-yolov5-512-005',
    'happywhale-tfrecords-yolov5-512-010',
    'happywhale-tfrecords-tokencut-384',
]

# One (image, name) dataset per variant, for the train and test splits.
train_ds = []
test_ds = []
for input_file in input_files:
    GCS_DS_PATH = KaggleDatasets().get_gcs_path(input_file)

    # Sort the shard paths so every variant is read in the same file order.
    train_file = np.sort(tf.io.gfile.glob(GCS_DS_PATH + '/happywhale-2022-train*.tfrec'))
    test_file = np.sort(tf.io.gfile.glob(GCS_DS_PATH + '/happywhale-2022-test*.tfrec'))

    # The train split also goes through get_test_dataset: the plots need
    # (image, name) pairs rather than training inputs.
    train_ds.append(get_test_dataset(train_file))
    test_ds.append(get_test_dataset(test_file))

### Train Images

#### From left to right: original, detic, yolov5(0.05), yolov5(0.10), tokencut

In [None]:
# Grid layout: one row per image, one column per dataset variant.
row = 30
col = 5
# Only one batch per dataset is drawn, so the batch size caps the row count.
row = min(row, config.BATCH_SIZE)

plt.figure(figsize=(25, int(25 * row / col)))
for k, ds in enumerate(train_ds):
    # Display just the first batch of each dataset.
    for img, label in ds.take(1):
        # The batch may hold fewer than `row` images if the dataset is small.
        for j in range(min(row, int(img.shape[0]))):
            # Select the subplot BEFORE setting the title; otherwise the title
            # is applied to whichever axes was active previously and every
            # image ends up captioned with a neighbour's name.
            plt.subplot(row, col, j * col + k + 1)
            name = label[j].numpy()
            if isinstance(name, bytes):
                # String tensors come back as bytes; decode to avoid "b'...'" titles.
                name = name.decode('utf-8', errors='replace')
            plt.title(name)
            plt.axis('off')
            plt.imshow(img[j,])
plt.show()

### Test Images

#### From left to right: original, detic, yolov5(0.05), yolov5(0.10), tokencut

In [None]:
# Grid layout: one row per image, one column per dataset variant.
row = 30
col = 5
# Only one batch per dataset is drawn, so the batch size caps the row count.
row = min(row, config.BATCH_SIZE)

plt.figure(figsize=(25, int(25 * row / col)))
for k, ds in enumerate(test_ds):
    # Display just the first batch of each dataset.
    for img, label in ds.take(1):
        # The batch may hold fewer than `row` images if the dataset is small.
        for j in range(min(row, int(img.shape[0]))):
            # Select the subplot BEFORE setting the title; otherwise the title
            # is applied to whichever axes was active previously and every
            # image ends up captioned with a neighbour's name.
            plt.subplot(row, col, j * col + k + 1)
            name = label[j].numpy()
            if isinstance(name, bytes):
                # String tensors come back as bytes; decode to avoid "b'...'" titles.
                name = name.decode('utf-8', errors='replace')
            plt.title(name)
            plt.axis('off')
            plt.imshow(img[j,])
plt.show()