In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tifffile import tifffile
from PIL import Image
import PIL
from skimage import color, data
from skimage.color import rgb2gray
from itertools import product
import os
from datasets import Dataset
import torch
from torch.utils.data import Dataset
from tqdm import tqdm

In [2]:
# Directories used by the notebook (source tifs, RGB tiles, grayscale tiles).
# Every path keeps its trailing slash.
dir_tif, dir_rgb, dir_gray = (
    "data/doq_data/",
    "./data/doq_data_256/",
    "./data/doq_data_256_gray/",
)

In [3]:
# Names of the source tif files, bound to file1 .. file7 in order.
file1, file2, file3, file4, file5, file6, file7 = [
    "C3311604.SES.100201791.tif",
    "C4712134.NES.100228298.tif",
    "O3712206.SWS.53377.tif",
    "O3712208.SWS.53164.tif",
    "O3712223.SES.53274.tif",
    "O3712232.NWS.53174.tif",
    "O3910434.NES.1137827.tif",
]

In [4]:
# Splitting Tif images to tiles:
def split_images_to_tiles(filename, dir_in, dir_out, d):
    """Cut one image into non-overlapping ``d`` x ``d`` tiles and save each tile.

    Only complete tiles are written: a right or bottom strip narrower than
    ``d`` pixels is dropped. Each tile is saved as ``{name}_{i}_{j}{ext}``,
    where ``i`` is the tile's top pixel row, ``j`` its left pixel column, and
    ``name``/``ext`` come from ``filename``.

    Args:
        filename: File name (with extension) of the image inside ``dir_in``.
        dir_in: Directory containing the source image.
        dir_out: Directory that receives the tiles; created if it is missing.
        d: Tile edge length in pixels.

    Returns:
        None
    """
    name, ext = os.path.splitext(filename)

    # Saving a tile fails if the target directory is absent, so create it up front.
    if dir_out:
        os.makedirs(dir_out, exist_ok=True)

    # The context manager closes the source file handle even if a save fails.
    with Image.open(os.path.join(dir_in, filename)) as img:
        w, h = img.size

        # h - h % d and w - w % d are the largest multiples of d that fit in the image.
        for i, j in product(range(0, h - h % d, d), range(0, w - w % d, d)):
            box = (j, i, j + d, i + d)  # (left, upper, right, lower)
            out = os.path.join(dir_out, f'{name}_{i}_{j}{ext}')
            img.crop(box).save(out)

    return None

## creates an array of filenames in dir_in 
def create_filename_array(dir_in):
    """Return the paths of all entries found directly inside ``dir_in``.

    Args:
        dir_in: Directory to list; accepted with or without a trailing
            path separator.

    Returns:
        list[str]: ``dir_in`` joined with each entry name. Entries are sorted
        by name because ``os.listdir`` returns them in arbitrary order, which
        would make downstream results differ between runs.
    """
    return [os.path.join(dir_in, entry) for entry in sorted(os.listdir(dir_in))]

## Reads the RGB tiles listed in filename_array, converts them to grayscale, and saves them as png images in dir_out.
def convert_tiles_to_grayscale(filename_array, dir_in, dir_out):
    """Convert RGB tif tiles to grayscale and save them as PNG files.

    Each output file keeps the base name of its source tile with the
    extension replaced by ``.png``.

    Args:
        filename_array: Iterable of paths to the RGB tiles to convert.
        dir_in: Unused; kept for backward compatibility. Every entry of
            ``filename_array`` already carries its own directory.
        dir_out: Directory that receives the PNG files (with or without a
            trailing separator); created if it is missing.

    Returns:
        None
    """
    if dir_out:
        os.makedirs(dir_out, exist_ok=True)

    for file in filename_array:
        root, _ = os.path.splitext(file)      # drop the extension
        filename = os.path.basename(root)     # keep the bare file name only
        tile_rgb = tifffile.imread(file)      # read the tile through tifffile
        tile_gray = rgb2gray(tile_rgb)
        # NOTE(review): rgb2gray is expected to yield floats in [0, 1], hence the
        # scaling by 255 before the 8-bit ('L') conversion -- confirm for the inputs used.
        out_path = os.path.join(dir_out, filename + ".png")
        Image.fromarray(tile_gray * 255).convert('L').save(out_path)

    return None


# Converts grayscale tif image tiles to png
def convert_tif_tiles_to_png(filename_array, dir_out):
    """Re-save tif tiles as 8-bit grayscale PNG files.

    Each output file keeps the base name of its source tile with the
    extension replaced by ``.png``.

    Args:
        filename_array: Iterable of paths to the tif tiles to convert.
        dir_out: Directory that receives the PNG files (with or without a
            trailing separator); created if it is missing.

    Returns:
        None
    """
    if dir_out:
        os.makedirs(dir_out, exist_ok=True)

    for file in filename_array:
        root, _ = os.path.splitext(file)      # drop the extension
        filename = os.path.basename(root)     # keep the bare file name only
        tile_tif = tifffile.imread(file)      # read the tile through tifffile
        out_path = os.path.join(dir_out, filename + ".png")
        Image.fromarray(tile_tif).convert('L').save(out_path)

    return None

## Reads every image file found in dir_out and returns the images as a list of numpy arrays.
def create_gray_image_tile_array(dir_out):
    """Load every file in ``dir_out`` as an image and return the images as a list.

    Args:
        dir_out: Directory holding the tiles to load. Despite the name it is
            only read from here. Accepted with or without a trailing path
            separator.

    Returns:
        list: One entry per file, each the value returned by ``plt.imread``,
        ordered by sorted file name so the result is reproducible
        (``os.listdir`` order is arbitrary).
    """
    gray_file_names = [
        os.path.join(dir_out, entry) for entry in sorted(os.listdir(dir_out))
    ]

    ## Read the gray files and collect them in a plain Python list.
    directory_array = [plt.imread(path) for path in gray_file_names]

    # torch expects type
    # datasets.arrow_dataset.Dataset
    return directory_array

In [5]:
# 1. splitting rgb images to RGB tiles:
# split_images_to_tiles(filename= file2, dir_in= "data/doq_data/", dir_out="data/doq_data_256/", d=256)
# split_images_to_tiles(filename= file7, dir_in= "data/doq_data/gray/", dir_out = "data/doq_data/gray_256_tif/", d=256)

In [6]:
# 2. creating filename array:
# filename_array_rgb = create_filename_array(dir_in = "data/doq_data_256")
# print(len(filename_array_rgb)) # 4785 from 7 tiffiles

# filename_array_gray = create_filename_array("data/doq_data/gray_256_tif")
# print(filename_array_gray)

In [7]:

# 3. converting all images to Grayscale and saving them to a separate folder:
# convert_tiles_to_grayscale(filename_array, dir_in=dir_rgb, dir_out="./data/doq_data_gray_256/")

# 3. save gray tif files to png:
# convert_tif_tiles_to_png(filename_array_gray, dir_out="./data/doq_data_gray_256/")


In [8]:
# file_array = create_gray_image_tile_array(dir_out="./data/doq_data_gray_256/")

In [9]:
#np.save("gray_image_tile_array_256", file_array)

In [18]:
# Load the saved tile array from disk (presumably the 64-pixel tiles, going by the file name).
tile_array_path = "./data/gray_image_tile_array_64.npy"
arr = np.load(tile_array_path)

In [21]:
# Sanity check: length along the first axis, then the overall maximum and minimum value.
len(arr), arr.max(), arr.min()

(79052, 1.0, 0.0)

## Uploading the dataset to the Hugging Face Hub

In [7]:
# NOTE(review): these imports sit mid-notebook; consider moving them to the import cell at the top.
from datasets import load_dataset

# Upload guide: https://huggingface.co/docs/datasets/upload_dataset

from huggingface_hub import notebook_login

# Displays the Hugging Face login widget (see the cell output); log in here before pushing to the Hub.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [2]:
# Build a dataset from the local folder "data/doq_data_gray_64"
# (the cell output shows 79052 data files being resolved into a train split).
dataset = load_dataset("data/doq_data_gray_64")

Resolving data files:   0%|          | 0/79052 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [8]:
# Publish the loaded dataset to the Hub repository "ReginaFoley/doq_data_64".
dataset.push_to_hub("ReginaFoley/doq_data_64")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/79052 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/791 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/ReginaFoley/doq_data_64/commit/8e5a2e5b8a172cb4ecd70ceecc79e14469a0e960', commit_message='Upload dataset', commit_description='', oid='8e5a2e5b8a172cb4ecd70ceecc79e14469a0e960', pr_url=None, pr_revision=None, pr_num=None)