In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Traverse the read-only input tree. The per-file listing is switched off,
# so every file is visited without producing any output.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        # Re-enable the line below to print the full path of every input file.
        # print(os.path.join(dirname, filename))
        pass

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
def resize_with_aspect_ratio(image, longest_edge):

    """
    Resize image while preserving its aspect ratio

    The target size is computed with exact integer arithmetic: the longest
    edge of the result is always exactly `longest_edge` pixels and the other
    edge is the ceiling of its proportionally scaled length. (A floating-point
    `ceil(dim * (longest_edge / max_dim))` can overshoot by one pixel when the
    product lands a rounding error above an exact integer.)

    Parameters
    ----------
    image: numpy.ndarray of shape (height, width, 3)
        Image array

    longest_edge: int
        Desired number of pixels on the longest edge

    Returns
    -------
    image: numpy.ndarray of shape (resized_height, resized_width, 3)
        Resized image array

    Raises
    ------
    ValueError
        If longest_edge is smaller than 1
    """

    longest_edge = int(longest_edge)
    if longest_edge < 1:
        raise ValueError(f'longest_edge must be a positive integer, got {longest_edge}')

    height, width = (int(dimension) for dimension in image.shape[:2])
    longest_side = max(height, width)

    # Integer ceiling division: -(-a // b) == ceil(a / b) without any float rounding,
    # so the longest side maps to exactly longest_edge
    resized_width = -(-width * longest_edge // longest_side)
    resized_height = -(-height * longest_edge // longest_side)

    # cv2.resize takes dsize as (width, height), the reverse of ndarray.shape
    image = cv2.resize(image, dsize=(resized_width, resized_height), interpolation=cv2.INTER_AREA)

    return image


In [3]:
## Compress the competition images into JPEGs small enough to work with.
## Adapted from https://www.kaggle.com/code/jack11111111/ubc-ocean-jpeg-dataset-pipeline/edit

import os
from pathlib import Path
# Raise OpenCV's pixel-count limit so the very large slide images can be read;
# set before cv2 is imported so the library sees the value
os.environ['OPENCV_IO_MAX_IMAGE_PIXELS'] = str(pow(2, 40))

competition_dataset_directory = Path('/kaggle/input/UBC-OCEAN')


from tqdm.notebook import tqdm
import numpy as np
import pandas as pd
import cv2
import matplotlib.pyplot as plt

df_train = pd.read_csv(competition_dataset_directory / 'train.csv')
df_test = pd.read_csv(competition_dataset_directory / 'test.csv')


MAX_SIZE = 20000
JPEG_QUALITY = 80


def _compress_images(df, raw_image_directory, compressed_image_directory, report_image_type=False):

    """
    Write a (possibly downscaled) JPEG copy of every image listed in a dataframe

    Parameters
    ----------
    df: pandas.DataFrame
        Dataframe with an image_id column (and an is_tma column when report_image_type is True)

    raw_image_directory: pathlib.Path
        Directory that contains the source {image_id}.png files

    compressed_image_directory: pathlib.Path
        Directory that receives the {image_id}.jpg files, created if missing

    report_image_type: bool
        Whether to include the TMA/WSI type (derived from is_tma) in the progress message

    Raises
    ------
    OSError
        If an image cannot be read by cv2.imread or written by cv2.imwrite
    """

    compressed_image_directory.mkdir(exist_ok=True, parents=True)

    # itertuples keeps each column's dtype, so image_id is formatted exactly as stored
    for row in tqdm(df.itertuples(index=False), total=df.shape[0]):

        image_id = row.image_id
        raw_image_path = str(raw_image_directory / f'{image_id}.png')
        compressed_image_path = str(compressed_image_directory / f'{image_id}.jpg')

        # cv2.imread signals failure by returning None instead of raising
        image = cv2.imread(raw_image_path)
        if image is None:
            raise OSError(f'Could not read image: {raw_image_path}')

        raw_image_shape = image.shape[:2]
        if max(raw_image_shape) > MAX_SIZE:
            image = resize_with_aspect_ratio(image=image, longest_edge=MAX_SIZE)
        resized_image_shape = image.shape[:2]

        # cv2.imwrite signals failure by returning False instead of raising
        if not cv2.imwrite(compressed_image_path, image, [int(cv2.IMWRITE_JPEG_QUALITY), JPEG_QUALITY]):
            raise OSError(f'Could not write image: {compressed_image_path}')

        # File sizes in MiB (1 << 20 bytes)
        raw_image_size = os.path.getsize(raw_image_path) / (1 << 20)
        compressed_image_size = os.path.getsize(compressed_image_path) / (1 << 20)

        type_field = ''
        if report_image_type:
            image_type = 'TMA' if row.is_tma else 'WSI'
            type_field = f' Type: {image_type}'

        print(f'Image ID: {image_id}{type_field} Shape: {raw_image_shape[0]}x{raw_image_shape[1]} -> {resized_image_shape[0]}x{resized_image_shape[1]} Size: {raw_image_size:.2f} -> {compressed_image_size:.2f} MBs')


train_compressed_image_directory = Path('./train_compressed_images')
_compress_images(
    df=df_train,
    raw_image_directory=competition_dataset_directory / 'train_images',
    compressed_image_directory=train_compressed_image_directory,
    report_image_type=True,
)

test_compressed_image_directory = Path('./test_compressed_images')
_compress_images(
    df=df_test,
    raw_image_directory=competition_dataset_directory / 'test_images',
    compressed_image_directory=test_compressed_image_directory,
)


  0%|          | 0/538 [00:00<?, ?it/s]

Image ID: 4 Type: WSI Shape: 20008x23785 -> 16825x20000 Size: 576.34 -> 51.95 MBs
Image ID: 66 Type: WSI Shape: 48195x48871 -> 19724x20000 Size: 2712.86 -> 62.03 MBs
Image ID: 91 Type: TMA Shape: 3388x3388 -> 3388x3388 Size: 17.48 -> 1.72 MBs
Image ID: 281 Type: WSI Shape: 15545x42309 -> 7349x20000 Size: 363.37 -> 12.31 MBs
Image ID: 286 Type: WSI Shape: 30020x37204 -> 16139x20000 Size: 996.41 -> 38.37 MBs
Image ID: 431 Type: WSI Shape: 40943x39991 -> 20000x19535 Size: 1649.99 -> 53.34 MBs
Image ID: 706 Type: WSI Shape: 25965x75606 -> 6869x20000 Size: 1261.31 -> 11.97 MBs
Image ID: 970 Type: WSI Shape: 18935x32131 -> 11787x20000 Size: 739.61 -> 40.89 MBs
Image ID: 1020 Type: WSI Shape: 33751x36585 -> 18451x20000 Size: 1221.68 -> 50.10 MBs
Image ID: 1080 Type: WSI Shape: 23200x31336 -> 14808x20000 Size: 747.34 -> 42.80 MBs
Image ID: 1101 Type: WSI Shape: 18403x26306 -> 13992x20000 Size: 548.08 -> 43.89 MBs
Image ID: 1252 Type: WSI Shape: 27480x60420 -> 9097x20000 Size: 589.41 -> 10.35 M

  0%|          | 0/1 [00:00<?, ?it/s]

Image ID: 41 Shape: 16987x28469 -> 11934x20000 Size: 620.29 -> 44.62 MBs
