In [2]:
import boto3
import json
import pandas as pd
import numpy as np

from dask.distributed import Client, progress
from dask.diagnostics import ProgressBar
import dask.bag as db

# S3 bucket plus the object key / key prefixes used by this notebook.
bucket = 'sagemaker-project-p-o3c1kiruwcnf'
val_ground_truth_label = 'bdd100k_images/labels/object_detection_labels/det_val.json'
val_image_path = 'bdd100k_images/val/'
prediction_path = 'bdd100k_images/predictions/'

# Class names for each model. List order matters: parse_one_file indexes these
# lists with the position of each score in a model's prediction vector.
model_labels = dict(
    weather=["rainy", "snowy", "clear", "overcast", "partly cloudy", "undefined"],
    timeofday=["daytime", "night", "dawn/dusk", "undefined"],
    scene=["residential", "city street", "highway", "undefined"],
)

# Download the validation ground-truth file from S3 and parse it as JSON.
# `s3` stays a module-level name because parse_one_file reads it later.
s3 = boto3.resource('s3')
content_object = s3.Object(bucket, val_ground_truth_label)
raw_bytes = content_object.get()['Body'].read()
file_content = raw_bytes.decode('utf-8')
ground_truth_labels = json.loads(file_content)


# Make sure your instance has the CPU for this!
# 15 workers x 4 threads each, all inside this process (processes=False).
client = Client(
    n_workers=15,
    threads_per_worker=4,
    processes=False,
    memory_limit='2GB',
)
client

0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://169.255.255.2:8787/status,

0,1
Dashboard: http://169.255.255.2:8787/status,Workers: 15
Total threads: 60,Total memory: 27.94 GiB
Status: running,Using processes: False

0,1
Comm: inproc://169.255.255.2/200/1,Workers: 15
Dashboard: http://169.255.255.2:8787/status,Total threads: 60
Started: Just now,Total memory: 27.94 GiB

0,1
Comm: inproc://169.255.255.2/200/18,Total threads: 4
Dashboard: http://169.255.255.2:45119/status,Memory: 1.86 GiB
Nanny: None,
Local directory: /root/weather-model/Spring 2022/bdd100k/dask-worker-space/worker-0uzcl6r9,Local directory: /root/weather-model/Spring 2022/bdd100k/dask-worker-space/worker-0uzcl6r9

0,1
Comm: inproc://169.255.255.2/200/14,Total threads: 4
Dashboard: http://169.255.255.2:33033/status,Memory: 1.86 GiB
Nanny: None,
Local directory: /root/weather-model/Spring 2022/bdd100k/dask-worker-space/worker-q7fsfufi,Local directory: /root/weather-model/Spring 2022/bdd100k/dask-worker-space/worker-q7fsfufi

0,1
Comm: inproc://169.255.255.2/200/15,Total threads: 4
Dashboard: http://169.255.255.2:46215/status,Memory: 1.86 GiB
Nanny: None,
Local directory: /root/weather-model/Spring 2022/bdd100k/dask-worker-space/worker-hmwjoi19,Local directory: /root/weather-model/Spring 2022/bdd100k/dask-worker-space/worker-hmwjoi19

0,1
Comm: inproc://169.255.255.2/200/10,Total threads: 4
Dashboard: http://169.255.255.2:39553/status,Memory: 1.86 GiB
Nanny: None,
Local directory: /root/weather-model/Spring 2022/bdd100k/dask-worker-space/worker-zqj7rxb8,Local directory: /root/weather-model/Spring 2022/bdd100k/dask-worker-space/worker-zqj7rxb8

0,1
Comm: inproc://169.255.255.2/200/11,Total threads: 4
Dashboard: http://169.255.255.2:38829/status,Memory: 1.86 GiB
Nanny: None,
Local directory: /root/weather-model/Spring 2022/bdd100k/dask-worker-space/worker-xdo2i8rm,Local directory: /root/weather-model/Spring 2022/bdd100k/dask-worker-space/worker-xdo2i8rm

0,1
Comm: inproc://169.255.255.2/200/4,Total threads: 4
Dashboard: http://169.255.255.2:35479/status,Memory: 1.86 GiB
Nanny: None,
Local directory: /root/weather-model/Spring 2022/bdd100k/dask-worker-space/worker-jocvfnlx,Local directory: /root/weather-model/Spring 2022/bdd100k/dask-worker-space/worker-jocvfnlx

0,1
Comm: inproc://169.255.255.2/200/5,Total threads: 4
Dashboard: http://169.255.255.2:43029/status,Memory: 1.86 GiB
Nanny: None,
Local directory: /root/weather-model/Spring 2022/bdd100k/dask-worker-space/worker-4pc6g0x0,Local directory: /root/weather-model/Spring 2022/bdd100k/dask-worker-space/worker-4pc6g0x0

0,1
Comm: inproc://169.255.255.2/200/8,Total threads: 4
Dashboard: http://169.255.255.2:44879/status,Memory: 1.86 GiB
Nanny: None,
Local directory: /root/weather-model/Spring 2022/bdd100k/dask-worker-space/worker-qubuxlk8,Local directory: /root/weather-model/Spring 2022/bdd100k/dask-worker-space/worker-qubuxlk8

0,1
Comm: inproc://169.255.255.2/200/6,Total threads: 4
Dashboard: http://169.255.255.2:41355/status,Memory: 1.86 GiB
Nanny: None,
Local directory: /root/weather-model/Spring 2022/bdd100k/dask-worker-space/worker-cm4xaa1n,Local directory: /root/weather-model/Spring 2022/bdd100k/dask-worker-space/worker-cm4xaa1n

0,1
Comm: inproc://169.255.255.2/200/12,Total threads: 4
Dashboard: http://169.255.255.2:34171/status,Memory: 1.86 GiB
Nanny: None,
Local directory: /root/weather-model/Spring 2022/bdd100k/dask-worker-space/worker-vntwmxqe,Local directory: /root/weather-model/Spring 2022/bdd100k/dask-worker-space/worker-vntwmxqe

0,1
Comm: inproc://169.255.255.2/200/9,Total threads: 4
Dashboard: http://169.255.255.2:35489/status,Memory: 1.86 GiB
Nanny: None,
Local directory: /root/weather-model/Spring 2022/bdd100k/dask-worker-space/worker-s5dva86o,Local directory: /root/weather-model/Spring 2022/bdd100k/dask-worker-space/worker-s5dva86o

0,1
Comm: inproc://169.255.255.2/200/13,Total threads: 4
Dashboard: http://169.255.255.2:38587/status,Memory: 1.86 GiB
Nanny: None,
Local directory: /root/weather-model/Spring 2022/bdd100k/dask-worker-space/worker-_vn_kbkn,Local directory: /root/weather-model/Spring 2022/bdd100k/dask-worker-space/worker-_vn_kbkn

0,1
Comm: inproc://169.255.255.2/200/16,Total threads: 4
Dashboard: http://169.255.255.2:37439/status,Memory: 1.86 GiB
Nanny: None,
Local directory: /root/weather-model/Spring 2022/bdd100k/dask-worker-space/worker-6jk9j3vo,Local directory: /root/weather-model/Spring 2022/bdd100k/dask-worker-space/worker-6jk9j3vo

0,1
Comm: inproc://169.255.255.2/200/7,Total threads: 4
Dashboard: http://169.255.255.2:38169/status,Memory: 1.86 GiB
Nanny: None,
Local directory: /root/weather-model/Spring 2022/bdd100k/dask-worker-space/worker-gez1re8o,Local directory: /root/weather-model/Spring 2022/bdd100k/dask-worker-space/worker-gez1re8o

0,1
Comm: inproc://169.255.255.2/200/17,Total threads: 4
Dashboard: http://169.255.255.2:42055/status,Memory: 1.86 GiB
Nanny: None,
Local directory: /root/weather-model/Spring 2022/bdd100k/dask-worker-space/worker-ald1493o,Local directory: /root/weather-model/Spring 2022/bdd100k/dask-worker-space/worker-ald1493o


In [3]:
def parse_one_file(x):
    """Build one result row per model for a single ground-truth record.

    For each of the 'scene', 'timeofday' and 'weather' models, the model's
    prediction file for this image is read from S3 and combined with the
    ground-truth attribute found in ``x``.

    Parameters
    ----------
    x : dict
        One ground-truth entry. ``x['name']`` is used to build the prediction
        object key and ``x['attributes'][model_name]`` supplies the true label.

    Returns
    -------
    tuple of dict
        ``(scene_row, timeofday_row, weather_row)``. Each row holds ``name``,
        ``model``, ``truth``, ``prediction``, ``prediction_confidence``,
        ``correct_answer_confidence`` and one entry per label of that model
        with its score.

    Raises
    ------
    ValueError
        If a prediction file does not contain exactly one score per label.
    """
    # This function is mapped over a dask bag and therefore runs on many
    # worker threads at once. boto3 documents resource objects as not thread
    # safe, so read through the resource's low-level client instead of
    # creating `s3.Object(...)` sub-resources from the shared resource.
    s3_client = s3.meta.client

    rows = []
    for model_name in ['scene', 'timeofday', 'weather']:
        labels = model_labels[model_name]

        # Ground-truth values outside the model's label set are folded into
        # the catch-all 'undefined' class.
        truth = x['attributes'][model_name]
        if truth not in labels:
            truth = 'undefined'

        response = s3_client.get_object(
            Bucket=bucket,
            Key=f"{prediction_path}{model_name}-v2/{x['name']}.out",
        )
        preds = json.loads(response['Body'].read().decode('utf-8'))['prediction']

        # zip() below would silently truncate on a length mismatch and the
        # index lookups could pick the wrong score, so fail loudly instead.
        if len(preds) != len(labels):
            raise ValueError(
                f"{model_name} prediction for {x['name']} has {len(preds)} scores, "
                f"expected {len(labels)}"
            )

        best = int(np.argmax(preds))  # computed once, reused for label and score
        row = {
            'name': x['name'],
            'model': model_name,
            'truth': truth,
            'prediction': labels[best],
            'prediction_confidence': preds[best],
            'correct_answer_confidence': preds[labels.index(truth)],
        }
        # Per-label score columns; a plain mapping update avoids pushing label
        # names such as "partly cloudy" through keyword arguments.
        row.update(zip(labels, preds))
        rows.append(row)

    return tuple(rows)

In [4]:
# Fan the per-image parsing out over the dask cluster: one bag element per
# ground-truth record, each mapped to a (scene, timeofday, weather) row triple.
b = db.from_sequence(ground_truth_labels).map(parse_one_file)
result = b.compute()

# Transpose the list of triples into three per-model row sequences, then turn
# each sequence into its own DataFrame.
scene, timeofday, weather = zip(*result)
scene_df, timeofday_df, weather_df = (
    pd.DataFrame(rows) for rows in (scene, timeofday, weather)
)

scene_df.head()

Unnamed: 0,name,model,truth,prediction,prediction_confidence,correct_answer_confidence,residential,city street,highway,undefined
0,b1c66a42-6f7d68ca.jpg,scene,city street,city street,0.999843,0.999843,5.858832e-05,0.9998435,8.7e-05,1.067028e-05
1,b1c81faa-3df17267.jpg,scene,highway,highway,0.999956,0.999956,3.538883e-08,4.266537e-05,0.999956,7.784259e-07
2,b1c81faa-c80764c5.jpg,scene,highway,highway,1.0,1.0,3.658841e-10,1.626245e-07,1.0,6.316718e-09
3,b1c9c847-3bda4659.jpg,scene,city street,residential,0.473957,0.408414,0.4739566,0.4084144,0.043401,0.07422841
4,b1ca2e5d-84cf9134.jpg,scene,city street,city street,0.999996,0.999996,1.206679e-10,0.9999958,4e-06,8.699438e-09


In [5]:
# Persist each per-model frame as a parquet object under the combined/ prefix.
combined_outputs = [
    (scene_df, f"s3://{bucket}/{prediction_path}combined/scene-v3.parquet"),
    (timeofday_df, f"s3://{bucket}/{prediction_path}combined/timeofday-v3.parquet"),
    (weather_df, f"s3://{bucket}/{prediction_path}combined/weather-v3.parquet"),
]
for frame, destination in combined_outputs:
    frame.to_parquet(destination)