In [None]:
#Basic Python and Machine learning libraries
import os, sys, random, time, cv2
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import skimage.io
from PIL import Image
from scipy import stats
from IPython.display import display
from tqdm.notebook import tqdm
import zipfile

import albumentations

In [None]:
# Preview a single training slide: load entry 1 of the multi-image TIFF and plot it.
# NOTE(review): entry 1 is presumably a reduced-resolution level of the slide — confirm.
img_path = '../input/prostate-cancer-grade-assessment/train_images/0ab626bb7e1eb8c7b1eb6ac3fb129349.tiff'
image = skimage.io.MultiImage(img_path)[1]
plt.figure(figsize=(10,10))
# img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
plt.imshow(image)
# Report the Python type and the array shape of the loaded entry.
print(type(image), image.shape)

In [None]:
# Install the Kaggle API token so the `kaggle` CLI invoked later in this notebook can find it.
# Earlier variant, kept for reference (token under /root/.kaggle, located via KAGGLE_CONFIG_DIR):
# !mkdir  -p /root/.kaggle/
# !cp ../input/kaggle-api/kaggle.json /root/.kaggle
# os.environ['KAGGLE_CONFIG_DIR'] = "/root/.kaggle"
# !chmod 600 /root/.kaggle/kaggle.json


# Copy the token from ../input/tokken-api into ~/.kaggle.
!mkdir -p ~/.kaggle
!cp ../input/tokken-api/kaggle.json ~/.kaggle/kaggle.json
# Restrict the token file to owner read/write only.
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# #create ds folder:
# !mkdir /kaggle/working/proc-prostate-cancer-grade-assessment
# #init ds
# !kaggle datasets metadata -p /kaggle/working/proc-prostate-cancer-grade-assessment feascr/proc-prostate-cancer-grade-assessment


# #create ds


# # with open('/kaggle/working/proc-prostate-cancer-grade-assessment/somefile.txt', 'w') as f:
# #     f.write('Hello\n')
# # !kaggle datasets create -p /kaggle/working/proc-prostate-cancer-grade-assessment

In [None]:
# Create the staging directory that will hold the dataset files, then list /tmp to confirm.
!mkdir -p /tmp/panda_dataset
!ls /tmp/

In [None]:
# Kaggle dataset descriptor (title, id, licence). It is written into the staging
# directory that `kaggle datasets create -p /tmp/panda_dataset` is pointed at later.
dataset_metadata = '''{
  "title": "PANDA dataset medium 256_64_64",
  "id": "blablamc/panda-dataset-medium-256-64-64",
  "licenses": [
    {
      "name": "CC0-1.0"
    }
  ]
}
'''
# Create the staging directory here too, so this cell does not fail when the
# separate `!mkdir` cell has not been run.
os.makedirs("/tmp/panda_dataset", exist_ok=True)
# Write-only mode is enough: the file is never read back in this cell, and the
# character count returned by write() is not needed.
with open("/tmp/panda_dataset/dataset-metadata.json", 'w') as text_file:
    text_file.write(dataset_metadata)

In [None]:
class Config:
    """Static settings for the tiling run: input locations, output archives, tile geometry and seed."""

    # --- input locations ---
    pwd = '/kaggle/working/'
    data_dir = '../input/prostate-cancer-grade-assessment/'
    train_images_dir = '../input/prostate-cancer-grade-assessment/train_images/'
    train_masks_dir = '../input/prostate-cancer-grade-assessment/train_label_masks/'

    # --- output archives (inside the dataset staging directory) ---
    out_train = '/tmp/panda_dataset/train_images.zip'
    out_masks = '/tmp/panda_dataset/train_label_masks.zip'

    # --- tile geometry ---
    # tile_size, n_tiles and tile_mode are the values handed to get_tiles();
    # image_size is not read by any of the visible cells.
    tile_size = 64
    n_tiles = 256
    tile_mode = 0
    image_size = 64

    # --- reproducibility ---
    seed = 2020

In [None]:
def seed_everything(seed):
    """Seed Python's `random` and NumPy's global generator, and export PYTHONHASHSEED.

    seed: integer seed; it is passed to both generators and stored, as a string,
        in the PYTHONHASHSEED environment variable.
    """
    # NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so
    # setting it here likely only influences child processes — confirm.
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

# Apply the configured seed (Config.seed) to the random generators.
seed_everything(Config.seed)

In [None]:
# Load train.csv from the competition data directory, preview its first rows,
# and show the number of rows as the cell output.
train_df = pd.read_csv(os.path.join(Config.data_dir, 'train.csv'))
display(train_df.head())
train_df.shape[0]

In [None]:
def get_tiles(img, mask, tile_size, n_tiles, mode=0):
    """Cut an image (and optionally its mask) into square tiles and keep the darkest ones.

    The image is padded with 255 so that both spatial sides become multiples of
    ``tile_size``, split into a non-overlapping grid, and the ``n_tiles`` tiles with
    the smallest pixel sum are returned in ascending order of that sum. When a mask
    is given it is padded with 0, cut on the same grid and selected with the same
    tile indices as the image.

    Args:
        img: array of shape (height, width, channels); any channel count is accepted.
        mask: array of shape (height, width, channels), or None. It is expected to
            have the same height and width as ``img`` because the image's padding
            and tile indices are reused for it.
        tile_size: side length of each square tile, in pixels.
        n_tiles: number of tiles to return. If the grid holds fewer tiles, tiles
            filled with 255 (image) / 0 (mask) are appended before selection.
        mode: adds ``(tile_size * mode) // 2`` pixels of padding per axis, which
            shifts the grid. The padded size must remain a multiple of
            ``tile_size`` (true for even ``mode``); otherwise the reshape raises
            ValueError.

    Returns:
        A list with one dict per selected tile: ``{'img': tile, 'idx': rank}``, plus
        ``'mask': mask_tile`` when a mask was passed. ``rank`` is the tile's position
        in the sorted selection (0 = smallest pixel sum).
    """
    h, w, _ = img.shape
    extra = (tile_size * mode) // 2
    pad_h = (tile_size - h % tile_size) % tile_size + extra
    pad_w = (tile_size - w % tile_size) % tile_size + extra
    # Split each axis' padding between both sides; the channel axis is not padded.
    spatial_pad = [
        [pad_h // 2, pad_h - pad_h // 2],
        [pad_w // 2, pad_w - pad_w // 2],
        [0, 0],
    ]

    def _to_tiles(arr, fill):
        # Pad `arr`, then regroup it into (n, tile_size, tile_size, channels).
        # The channel count comes from the array itself rather than a fixed 3.
        channels = arr.shape[-1]
        arr = np.pad(arr, spatial_pad, constant_values=fill)
        arr = arr.reshape(
            arr.shape[0] // tile_size,
            tile_size,
            arr.shape[1] // tile_size,
            tile_size,
            channels,
        )
        return arr.transpose(0, 2, 1, 3, 4).reshape(-1, tile_size, tile_size, channels)

    def _top_up(tiles, fill):
        # Append constant tiles so that at least `n_tiles` are available.
        missing = n_tiles - len(tiles)
        if missing <= 0:
            return tiles
        return np.pad(tiles, [[0, missing], [0, 0], [0, 0], [0, 0]], constant_values=fill)

    img = _top_up(_to_tiles(img, 255), 255)
    if mask is not None:
        mask = _top_up(_to_tiles(mask, 0), 0)

    # Smallest pixel sum first: with 255 as the padding value, padded/blank
    # tiles have the largest sums and are selected last.
    idxs = np.argsort(img.reshape(img.shape[0], -1).sum(-1))[:n_tiles]
    img = img[idxs]
    if mask is not None:
        mask = mask[idxs]

    result = []
    for i in range(len(img)):
        if mask is not None:
            result.append({'img': img[i], 'mask': mask[i], 'idx': i})
        else:
            result.append({'img': img[i], 'idx': i})
    return result

In [None]:
# Tile every slide listed in train_df and stream the PNG-encoded tiles into two zip
# archives: image tiles into Config.out_train, mask tiles into Config.out_masks.
with zipfile.ZipFile(Config.out_train, 'w') as img_out,\
        zipfile.ZipFile(Config.out_masks, 'w') as mask_out:
    for image_id in tqdm(train_df['image_id'].values):
        img_path = os.path.join(Config.train_images_dir, image_id) + '.tiff'
        mask_path = os.path.join(Config.train_masks_dir, image_id) + '_mask.tiff'
        slide = skimage.io.MultiImage(img_path)[1]
        # A mask file may be missing for a slide. Test for the file explicitly
        # instead of relying on the truthiness of an empty MultiImage.
        has_mask = os.path.exists(mask_path)
        slide_mask = skimage.io.MultiImage(mask_path)[1] if has_mask else None
        tiles = get_tiles(slide, slide_mask, Config.tile_size, Config.n_tiles, Config.tile_mode)
        for t in tiles:
            idx = t['idx']
            # cv2.imencode treats 3-channel input as BGR, so convert first.
            # NOTE(review): assumes the slide was loaded in RGB order — confirm.
            ok, img_png = cv2.imencode('.png', cv2.cvtColor(t['img'], cv2.COLOR_RGB2BGR))
            if not ok:
                raise RuntimeError(f"PNG encoding failed for image tile {image_id}_{idx}")
            # Hand zipfile plain bytes rather than the ndarray buffer.
            img_out.writestr(f"{image_id}_{idx}.png", img_png.tobytes())
            if has_mask:
                # TODO: save images to a folder via cv2 or PIL + add Kaggle dataset support
                # Only channel 0 of the mask tile is stored.
                ok, mask_png = cv2.imencode('.png', t['mask'][:, :, 0])
                if not ok:
                    raise RuntimeError(f"PNG encoding failed for mask tile {image_id}_mask_{idx}")
                mask_out.writestr(f"{image_id}_mask_{idx}.png", mask_png.tobytes())

In [None]:
# Extract the image-tile archive to /tmp/train_images so its contents can be inspected (-q: quiet).
!unzip -q /tmp/panda_dataset/train_images.zip -d /tmp/train_images

In [None]:
# Names of all entries in the extraction directory.
train_files = os.listdir("/tmp/train_images")

In [None]:
# Total number of extracted files.
len(train_files)

In [None]:
# Distinct prefixes of the extracted file names: everything before the first underscore,
# which for names of the form "<image_id>_<idx>.png" is the image id.
files = {tile_name.split("_")[0] for tile_name in train_files}

In [None]:
# Number of distinct image ids found among the extracted files.
len(files)

In [None]:
import shutil

# Place a train.csv next to the archives. If the number of distinct ids among the
# extracted files differs from the number of rows in train_df, keep only the rows
# whose image_id has extracted files; otherwise copy the original CSV unchanged.
if len(files) != train_df.shape[0]:
    print('YES')
    # Fixed: the row filter referenced an undefined name `df`, which raised
    # NameError whenever this branch ran.
    train_df = train_df[train_df.image_id.isin(files)]
    train_df.to_csv("/tmp/panda_dataset/train.csv", index = False)
else:
    shutil.copy2("../input/prostate-cancer-grade-assessment/train.csv", "/tmp/panda_dataset/train.csv")

In [None]:
# List the staging directory to check which files are about to be uploaded.
!ls -l /tmp/panda_dataset

In [None]:
# Create the Kaggle dataset from the staging directory (uses dataset-metadata.json found there).
# NOTE(review): -u should be the CLI's "public" switch — confirm this dataset is meant to be public.
!kaggle datasets create -p /tmp/panda_dataset -u

In [None]:
# Delete the extracted copy of the tiles; the archives under /tmp/panda_dataset are not touched.
!rm -rf /tmp/train_images

In [None]:
# FOR DOWNLOADING FILES FROM DRIVE
# import torchvision
# torchvision.datasets.utils.download_file_from_google_drive('https://drive.google.com/file/d/1Ex-kv2F9yF15DREprw1fVLTqrb2xa1v6/view?usp=sharing', '/kaggle/working', filename='kaggle.json', md5=None)

In [None]:
# # !mkdir  -p /root/.kaggle/
# !mkdir ~/.kaggle
# !cp /kaggle/working/kaggle.json ~/.kaggle/kaggle.json
# !chmod 600 ~/.kaggle/kaggle.json

In [None]:
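# MIGHT HELP: write the Kaggle API token file by hand.
# SECURITY: never commit a real username/key; the key that used to be pasted here should be revoked and regenerated.
# !echo '{"username":"<KAGGLE_USERNAME>","key":"<KAGGLE_API_KEY>"}' > ~/.kaggle/kaggle.json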