# Create NumPy Arrays and Compare with Tito Mask Dataset

---

In [None]:
import numpy as np # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm.notebook import tqdm; tqdm.pandas()
import matplotlib.pyplot as plt
import os
import ast

In [None]:
# Destination folders for the converted masks; both are created up front.
TRAIN_OUTPUT_DIR = "/kaggle/working/hpa_cell_mask/train"
TEST_OUTPUT_DIR = "/kaggle/working/hpa_cell_mask/test"
os.makedirs(TRAIN_OUTPUT_DIR, exist_ok=True)
os.makedirs(TEST_OUTPUT_DIR, exist_ok=True)

# Input CSVs: training metadata and the sample-submission metadata.
TRAIN_CSV = os.path.join("/kaggle/input", "hpa-train-data-with-additional-metadata", "updated_train.csv")
SS_CSV = os.path.join("/kaggle/input","hpa-sample-submission-with-extra-metadata", "updated_sample_submission.csv")

train_df = pd.read_csv(TRAIN_CSV)
ss_df = pd.read_csv(SS_CSV)

# Each `mask_rles` cell is parsed with ast.literal_eval so it becomes a
# Python object instead of the raw text read from the CSV.
train_df["mask_rles"] = train_df["mask_rles"].progress_apply(ast.literal_eval)
ss_df["mask_rles"] = ss_df["mask_rles"].progress_apply(ast.literal_eval)

display(train_df)
display(ss_df)

In [None]:
def rle_to_mask(rle_string, height, width, output_val=1):
    """Convert an RLE string into a 2-D mask array.

    The RLE is a whitespace-separated sequence of ``start length`` pairs.
    ``start`` is a 1-based index into the image flattened column by column
    (the flat buffer is reshaped to ``(width, height)`` and transposed).

    Args:
        rle_string (str): Run length encoding containing segmentation mask
            information. An empty / whitespace-only string yields an
            all-zero mask.
        height (int): Height of the original image the mask comes from
        width (int): Width of the original image the mask comes from
        output_val (int): Value written into every pixel covered by a run.
            Defaults to 1, which gives a binary mask.

    Returns:
        Numpy array of shape ``(height, width)`` holding ``output_val`` on
        the encoded pixels and 0 elsewhere. The dtype is ``uint8`` unless
        ``output_val`` exceeds 255, in which case the smallest dtype able
        to hold it is used.

    Raises:
        ValueError: If the RLE does not contain an even number of integers.
    """
    # split() without an argument tolerates empty strings and repeated or
    # leading/trailing whitespace, which split(' ') turned into int('').
    rle_numbers = [int(num_string) for num_string in rle_string.split()]
    if len(rle_numbers) % 2:
        raise ValueError(
            "RLE must contain (start, length) pairs; got an odd number of values"
        )

    # Keep the historical uint8 output, but widen when the requested value
    # would not fit (it would otherwise wrap around or raise).
    dtype = np.uint8
    if output_val > np.iinfo(np.uint8).max:
        dtype = np.min_scalar_type(output_val)

    img = np.zeros(height * width, dtype=dtype)
    for start, length in zip(rle_numbers[0::2], rle_numbers[1::2]):
        start -= 1  # RLE positions are 1-based
        img[start:start + length] = output_val

    # Runs are laid out column-major, so reshape to (width, height) first
    # and transpose to get a (height, width) image.
    return img.reshape(width, height).T

def flatten_list_of_lists(l_o_l):
    """Flatten one level of nesting: return the items of every sub-iterable, in order."""
    flattened = []
    for inner in l_o_l:
        flattened.extend(inner)
    return flattened

In [None]:
def save_masks_to_numpy(df, output_dir):
    """Build one labelled cell mask per row of ``df`` and save it as ``.npz``.

    For every row, each RLE string in ``mask_rles`` is decoded with
    ``rle_to_mask`` and its pixels are incremented by the label
    ``position + 1`` (1 for the first RLE, 2 for the second, ...). Pixels not
    covered by any RLE stay 0; pixels covered by several RLEs hold the sum of
    their labels. The result is written with ``np.savez_compressed`` to
    ``<output_dir>/<ID>.npz`` under the default key ``arr_0``.

    Args:
        df: DataFrame providing the ``ID``, ``ImageWidth`` and ``mask_rles``
            columns, where ``mask_rles`` holds a sequence of RLE strings.
        output_dir (str): Directory the ``.npz`` files are written to.
    """
    mask_rles = df.mask_rles.values
    # NOTE(review): ImageWidth is used for both height and width, so images
    # are assumed to be square -- confirm against the dataset.
    sizes = df.ImageWidth.values
    ids = df.ID.values

    for _id, size, mask_list in tqdm(zip(ids, sizes, mask_rles), total=len(df)):
        # Smallest unsigned dtype that can hold the largest label: uint8 for
        # up to 255 cells (as before), wider when there are more, so labels
        # no longer overflow.
        label_dtype = np.min_scalar_type(len(mask_list))

        # Start from an explicit background so a row without any RLE still
        # saves a full-size zero mask (sum([]) used to produce the scalar 0),
        # and accumulate in place instead of holding every per-cell mask in
        # memory at once.
        cell_mask = np.zeros((size, size), dtype=label_dtype)
        for label, rle_string in enumerate(mask_list, start=1):
            covered = rle_to_mask(rle_string, size, size).astype(bool)
            cell_mask[covered] += label

        np.savez_compressed(os.path.join(output_dir, _id), cell_mask)
        
# Convert both splits, printing a banner before each conversion starts.
for _banner, _frame, _out_dir in (
    ("\n... TRAINING CONVERSION ...\n", train_df, TRAIN_OUTPUT_DIR),
    ("\n... TESTING CONVERSION ...\n", ss_df, TEST_OUTPUT_DIR),
):
    print(_banner)
    save_masks_to_numpy(_frame, _out_dir)

In [None]:
def compare_tito_and_darien(img_id=None, tito_root="/kaggle/input/hpa-mask/hpa_cell_mask", darien_root="/kaggle/working/hpa_cell_mask/train"):
    """Plot the saved mask of one image from both mask sets side by side.

    Args:
        img_id (str, optional): ID of the image to compare. When None, one ID
            is sampled at random from ``train_df.ID``.
        tito_root (str): Directory holding the reference ``<img_id>.npz`` files.
        darien_root (str): Directory holding the ``<img_id>.npz`` files to
            compare against the reference.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # Get random if None is passed
    if img_id is None:
        img_id = train_df.ID.sample(1).values[0]

    def _load_mask(root):
        # np.load on an .npz returns an archive object that keeps the file
        # open; the context manager closes it once the array has been read.
        with np.load(os.path.join(root, img_id + ".npz")) as archive:
            return archive["arr_0"]

    # Load
    tito_img = _load_mask(tito_root)
    darien_img = _load_mask(darien_root)

    plt.figure(figsize=(18,19))

    # SETUP SUBPLOT 1
    plt.subplot(1,2,1)
    plt.title(f"TITOS – {img_id}")
    plt.axis(False)
    plt.imshow(tito_img)

    # SETUP SUBPLOT 2
    plt.subplot(1,2,2)
    plt.title(f"DARIENS – {img_id}")
    plt.axis(False)
    plt.imshow(darien_img)

    plt.show()
    

# Display a batch of randomly sampled comparisons (no img_id is passed, so
# each call picks its own image).
NUM_RANDOM_COMPARISONS = 25
for _ in range(NUM_RANDOM_COMPARISONS):
    compare_tito_and_darien()