In [2]:
import json
import time
import torch
import numpy as np
import pandas as pd
import dask.dataframe as dd
from PIL import Image
from dask.distributed import Client
from cerebro.dask_backend import DaskBackend
from cerebro.dataset_info import DatasetInfo
from cerebro.params import Params
from cerebro.etl import etl
import cerebro.constants as constants
from torchvision import transforms
from sentence_transformers import SentenceTransformer, util

In [3]:
def prepare_data(
    annotations_path="/data/cerebro_data_storage/coco/annotations/captions_val2014.json",
    output_path="/data/cerebro_data_storage/coco/annotations/captions_val2014_modified.csv",
):
    """Flatten a COCO-style captions JSON file into a one-row-per-image CSV.

    The JSON document is expected to have an ``"images"`` list (each entry with
    ``id``, ``file_name``, ``height``, ``width`` and ``date_captured``) and an
    ``"annotations"`` list (each entry with ``image_id`` and ``caption``).
    All captions belonging to the same image are collected into one list.

    Args:
        annotations_path: Path of the JSON annotation file to read.
        output_path: Path of the CSV file to write (written without the index).

    Returns:
        None. The result is written to ``output_path``.
    """
    # JSON text is UTF-8; do not depend on the locale's default encoding.
    with open(annotations_path, encoding="utf-8") as f:
        data = json.load(f)

    # Group captions by the image they describe.
    captions_by_image = {}
    for annotation in data['annotations']:
        captions_by_image.setdefault(annotation["image_id"], []).append(annotation["caption"])

    columns = ['id', 'file_name', 'height', 'width', 'captions', 'date_captured']
    records = [
        {
            'id': image['id'],
            'file_name': image['file_name'],
            'height': image['height'],
            'width': image['width'],
            # An image without any annotation gets an empty caption list
            # instead of aborting the whole export with a KeyError.
            'captions': captions_by_image.get(image['id'], []),
            'date_captured': image['date_captured'],
        }
        for image in data['images']
    ]

    # Passing `columns` keeps the header stable even when there are no images.
    dataset = pd.DataFrame(records, columns=columns)
    dataset.to_csv(output_path, index=False)

In [4]:
def row_preprocessing_routine(row, to_root_path, kwargs):
    """Decode one image and encode its caption field, reporting timings only.

    Args:
        row: Mapping-like metadata record; ``row["file_name"]`` and
            ``row["captions"]`` are read.
        to_root_path: Prefix joined to the file name by plain string
            concatenation, so it must already end with a path separator.
        kwargs: Dict supplying ``"nlp_model"`` (an object exposing
            ``encode(list, convert_to_tensor=True)``) and ``"io_time"``.

    Returns:
        list: ``[kwargs["io_time"], elapsed]`` where ``elapsed`` is the
        wall-clock time in seconds spent inside this function.
    """
    t1 = time.time()
    input_image_path = to_root_path + str(row["file_name"])
    output_caption = row["captions"]

    # Use the image as a context manager so the underlying file handle is
    # released deterministically instead of whenever the object is collected.
    with Image.open(input_image_path) as img:
        # The conversion runs inside the block because it needs the pixel data.
        transforms.PILToTensor()(img)

    # The tensors are not returned; this routine only reports how long the
    # work took.
    kwargs['nlp_model'].encode([output_caption], convert_to_tensor=True)

    t2 = time.time()
    return [kwargs["io_time"], t2 - t1]

In [5]:
def main():
    """Run the COCO captions ETL pipeline and return the preprocessing result.

    Steps, in order: create the Dask backend for the scheduler at
    ``0.0.0.0:8786``, regenerate the flattened metadata CSV via
    ``prepare_data()``, describe the dataset columns, build the sentence
    encoder, then call ``load_data(frac=0.01)``, ``shuffle_shard_data()`` and
    ``preprocess_data(...)`` on the ETL object.

    Returns:
        Whatever ``etl.preprocess_data`` returns; later cells call
        ``.compute()`` and read ``.npartitions`` on it.
    """
    backend = DaskBackend("0.0.0.0:8786")

    prepare_data()

    # One entry per metadata column: (name, dtype, is_feature_download flag).
    schema = [
        ("id", int, False),
        ("file_name", str, True),
        ("height", int, False),
        ("width", int, False),
        ("captions", list, False),
        ("date_captured", str, False),
    ]
    feature_names = [name for name, _, _ in schema]
    dtypes = tuple(dtype for _, dtype, _ in schema)
    is_feature_download = [flag for _, _, flag in schema]
    # The same name list is passed for both of the first two arguments.
    data_info = DatasetInfo(feature_names, feature_names, [], dtypes, is_feature_download)

    encoder = SentenceTransformer('all-MiniLM-L6-v2')

    # Params takes its arguments positionally: metadata CSV, source root,
    # destination root, output path, requirements path, download type.
    params = Params(
        "/data/cerebro_data_storage/coco/annotations/captions_val2014_modified.csv",
        "/data/cerebro_data_storage/coco/val2014/",
        "/data/cerebro_data_storage_worker/coco/val2014/",
        "",
        "",
        constants.DOWNLOAD_FROM_SERVER,
    )

    pipeline = etl(backend, params, row_preprocessing_routine, data_info)
    pipeline.load_data(frac=0.01)
    pipeline.shuffle_shard_data()
    return pipeline.preprocess_data(nlp_model=encoder)
    
#     for i in range(result.npartitions):
#         out1 = result.partitions[i].compute().to_frame()
#         out1[["io", "cpu"]] = list(out1["transformed_data"])
#         out1 = out1.drop("transformed_data", axis=1)
#         io_sum = out1["io"].sum()
#         cpu_sum = out1["cpu"].sum()
#         print("i:" + str(i) + " io time:"+ str(io_sum))
#         print("i:" + str(i) + " cpu time:"+ str(cpu_sum))



In [6]:
# Run the whole pipeline once; the following cells call `.compute()` on
# `result` and read its `.npartitions`.
result = main()

Client dashboard:  http://0.0.0.0:8787/status
Number of workers: 8


In [7]:
# Materialise the pipeline output in this process.
out = result.compute()

# Each "transformed_data" entry is a two-item [io, cpu] sequence; spread it
# over two named columns and drop the packed column afterwards.
out1 = out.to_frame()
out1[["io", "cpu"]] = out1["transformed_data"].tolist()
out1 = out1.drop(columns="transformed_data")

# Despite the `_sum` names, these are the column totals divided by the
# number of partitions.
io_sum, cpu_sum = (out1[col].sum() / result.npartitions for col in ("io", "cpu"))

In [8]:
# Report the per-partition timing figures computed in the previous cell.
print(f"io time:{io_sum}")
print(f"cpu time:{cpu_sum}")

io time:2.8381438553333282
cpu time:2.5297029316425323
