# ADAPTING PROCESSED DATA TO FIT INTO MODEL

In [1]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# Import packages
# Standard library
import datetime
import json
import random

# Third-party
import ndjson
import numpy as np
import pandas as pd
from google.cloud import storage
from sklearn.preprocessing import OneHotEncoder

# from utils import upload_blobS

In [3]:
from pictionary_ai.params import *
from pictionary_ai.utils import *

In [4]:
# Parameters
# Number of rows every drawing's 'list_deltas' is truncated or padded to
max_length = 150
# Bucket the processed per-class ndjson blobs are listed and downloaded from
from_bucket_name = 'quickdraw-simplified-processed'
# Bucket the padded / one-hot-encoded train and test splits are uploaded to
to_bucket_name = 'quickdraw-simplified-modelready'
# Fraction of each class assigned to the test split (the remainder is train)
test_split = 0.3


In [5]:
# Create list of blobs in bucket (functions from utils package)
# list_blobs is not defined in this notebook; presumably it comes from the
# star-import of pictionary_ai.utils - verify.
blob_list = list_blobs(from_bucket_name)
# NOTE: the bare expression below displays the whole list of blob names
blob_list

['processed_The Eiffel Tower.ndjson',
 'processed_The Great Wall of China.ndjson',
 'processed_The Mona Lisa.ndjson',
 'processed_aircraft carrier.ndjson',
 'processed_airplane.ndjson',
 'processed_alarm clock.ndjson',
 'processed_ambulance.ndjson',
 'processed_angel.ndjson',
 'processed_animal migration.ndjson',
 'processed_ant.ndjson',
 'processed_anvil.ndjson',
 'processed_apple.ndjson',
 'processed_arm.ndjson',
 'processed_asparagus.ndjson',
 'processed_axe.ndjson',
 'processed_backpack.ndjson',
 'processed_banana.ndjson',
 'processed_bandage.ndjson',
 'processed_barn.ndjson',
 'processed_baseball bat.ndjson',
 'processed_baseball.ndjson',
 'processed_basket.ndjson',
 'processed_basketball.ndjson',
 'processed_bat.ndjson',
 'processed_bathtub.ndjson',
 'processed_beach.ndjson',
 'processed_bear.ndjson',
 'processed_beard.ndjson',
 'processed_bed.ndjson',
 'processed_bee.ndjson',
 'processed_belt.ndjson',
 'processed_bench.ndjson',
 'processed_bicycle.ndjson',
 'processed_binoculars

In [6]:
# Download a blob from a bucket and store it in memory
def download_blob_to_memory(bucket_name, source_blob_name) -> list:
    '''
    Fetch an ndjson blob from a bucket and parse it entirely in memory.

    Returns a list of dictionaries - one dictionary per drawing.
    '''
    # Client -> bucket -> blob handle, resolved in a single chain
    source_blob = storage.Client().bucket(bucket_name).blob(source_blob_name)

    # The download yields bytes; decode them as UTF-8 text
    raw_text = str(source_blob.download_as_string(), encoding='utf-8')

    # The processed files contain single quotes, so every single quote is
    # swapped for a double quote before parsing.
    # Not needed once the processing code has been updated.
    json_text = raw_text.replace("'", '"')

    # One parsed dictionary per ndjson line
    return ndjson.loads(json_text)

In [7]:
# Download the first blob in the list and display its first drawing
data = download_blob_to_memory(from_bucket_name, blob_list[0])
data[0]

{'key_id': '5027286841556992',
 'class': 'The Eiffel Tower',
 'length': 35,
 'list_deltas': [[0.0863, 0.0086, 0.0],
  [0.0588, 0.0302, 0.0],
  [0.1059, 0.0043, 0.0],
  [0.749, -0.0733, 1.0],
  [-0.702, 0.0388, 0.0],
  [0.0745, -0.3534, 0.0],
  [0.1569, -0.4612, 0.0],
  [0.0235, -0.1336, 0.0],
  [0.0353, 0.2716, 0.0],
  [0.0353, 0.069, 0.0],
  [0.0275, 0.1638, 0.0],
  [0.0549, 0.1422, 0.0],
  [0.0235, 0.1767, 0.0],
  [0.0588, 0.1422, 1.0],
  [-0.4196, -0.0517, 0.0],
  [0.0392, -0.194, 0.0],
  [0.0275, -0.0776, 0.0],
  [0.0314, -0.2974, 0.0],
  [0.0314, -0.0905, 0.0],
  [0.0549, -0.0776, 0.0],
  [0.0078, -0.0474, 0.0],
  [-0.0039, 0.4483, 0.0],
  [0.149, 0.2931, 0.0],
  [0.0431, 0.1293, 1.0],
  [-0.3216, -0.4741, 0.0],
  [0.0706, -0.0086, 0.0],
  [0.0392, -0.069, 0.0],
  [0.0392, -0.0302, 0.0],
  [0.0118, 0.0086, 0.0],
  [0.0471, 0.1078, 0.0],
  [0.0392, 0.0172, 0.0],
  [0.051, 0.0, 1.0],
  [-0.4314, 0.1466, 0.0],
  [0.2157, -0.0129, 0.0],
  [0.1098, -0.0388, 1.0]]}

In [8]:
# Applying padding to a drawing array
def add_padding(array_drawing: "list | np.ndarray", max_length: int, pad_value: float = 99) -> list:
    '''
    Truncate or pad a drawing so that it has exactly ``max_length`` rows.

    Parameters
    ----------
    array_drawing : list of 3-element rows (a numpy array is also accepted
        and is converted to a nested list first).
    max_length : number of rows the result must have.
    pad_value : value used to fill the three columns of every padding row.
        Defaults to 99, i.e. padding rows are ``[99, 99, 99]``.

    Returns
    -------
    A new list: the first ``max_length`` rows when the drawing is long
    enough, otherwise the drawing followed by padding rows. The input is
    not modified.
    '''
    # With an ndarray, `+` would add element-wise instead of concatenating,
    # so normalise to a plain nested list up front.
    if isinstance(array_drawing, np.ndarray):
        array_drawing = array_drawing.tolist()

    n_rows = len(array_drawing)

    # Long enough already: slice off the remainder
    if n_rows >= max_length:
        return array_drawing[0:max_length]

    # Too short: append padding rows. Each row is built separately so the
    # rows do not alias one shared inner list.
    padding = [[pad_value, pad_value, pad_value] for _ in range(max_length - n_rows)]
    return array_drawing + padding

In [9]:
# Create a mapping dictionary
# Class names are recovered from the blob names.
# ASSUMPTION: every blob is named 'processed_<class name>.ndjson'
classes = [
    blob_name.replace('processed_', '').replace('.ndjson', '')
    for blob_name in blob_list
]

# Map every class name to its position in `classes`
dict_class = {class_name: index for index, class_name in enumerate(classes)}

# Dictionary of classes - with a key for OHC
# Should I include dict_class in params?
dict_class

{'The Eiffel Tower': 0,
 'The Great Wall of China': 1,
 'The Mona Lisa': 2,
 'aircraft carrier': 3,
 'airplane': 4,
 'alarm clock': 5,
 'ambulance': 6,
 'angel': 7,
 'animal migration': 8,
 'ant': 9,
 'anvil': 10,
 'apple': 11,
 'arm': 12,
 'asparagus': 13,
 'axe': 14,
 'backpack': 15,
 'banana': 16,
 'bandage': 17,
 'barn': 18,
 'baseball bat': 19,
 'baseball': 20,
 'basket': 21,
 'basketball': 22,
 'bat': 23,
 'bathtub': 24,
 'beach': 25,
 'bear': 26,
 'beard': 27,
 'bed': 28,
 'bee': 29,
 'belt': 30,
 'bench': 31,
 'bicycle': 32,
 'binoculars': 33,
 'bird': 34,
 'birthday cake': 35,
 'blackberry': 36,
 'blueberry': 37,
 'book': 38,
 'boomerang': 39,
 'bottlecap': 40,
 'bowtie': 41,
 'bracelet': 42,
 'brain': 43,
 'bread': 44,
 'bridge': 45,
 'broccoli': 46,
 'broom': 47,
 'bucket': 48,
 'bulldozer': 49,
 'bus': 50,
 'bush': 51,
 'butterfly': 52,
 'cactus': 53,
 'cake': 54,
 'calculator': 55,
 'calendar': 56,
 'camel': 57,
 'camera': 58,
 'camouflage': 59,
 'campfire': 60,
 'candle':

In [19]:
pwd

'/home/raj/code/rs-uk/pictionary-ai/notebooks'

In [23]:
# Save the class name -> index mapping as JSON.
# The path is relative to the notebook folder (the `pwd` output above shows
# the kernel runs in <repo>/notebooks) instead of an absolute path inside one
# user's home directory, so the cell also works on other checkouts.
with open('../shared_data/OHC_mapping.json', 'w') as file:
    json.dump(dict_class, file)

In [10]:
# OHC class names for modelling
# Doing this manually rather than using transformer as we don't have easy access to all class names right now

def OHC_class_name(class_name: str, class_mapping: "dict | None" = None) -> list:
    '''
    One-hot encode a class name.

    Parameters
    ----------
    class_name : name of the class to encode; must be a key of the mapping.
    class_mapping : optional dict of class name -> column index. When omitted,
        the notebook-level ``dict_class`` is used.

    Returns
    -------
    A nested list with one row and one column per class, holding 1.0 in the
    column of ``class_name`` and 0.0 everywhere else.

    Raises
    ------
    KeyError : if ``class_name`` is not a key of the mapping.
    '''
    if class_mapping is None:
        class_mapping = dict_class

    OHC_output = np.zeros((1, len(class_mapping)))
    OHC_output[0, class_mapping[class_name]] = 1

    # Need to convert the np.ndarray into a list so it can be parsed into JSON
    return OHC_output.tolist()

In [11]:
# Example OHC output for the first entry of `classes`
OHC_class_name(classes[0])

[[1.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,

In [12]:
# Pad and OHC an entire class - loop through a blob and pad each
def pad_and_OHC_class(class_list: list) -> None:
    '''
    Pad and one-hot encode every drawing of a class, in place.

    For each dictionary in ``class_list``:
    - 'list_deltas' is replaced by its padded / truncated version
      (``add_padding`` with the notebook-level ``max_length``);
    - a new key 'OHC_class' receives the one-hot encoding of the 'class' value.

    The dictionaries are modified in place and nothing is returned.
    '''
    for drawing in class_list:
        drawing['list_deltas'] = add_padding(drawing['list_deltas'], max_length)
        drawing['OHC_class'] = OHC_class_name(drawing['class'])


In [14]:
# Pad and one-hot encode every drawing in `data` (modifies `data` in place)
pad_and_OHC_class(data)

In [17]:
# Inspect the one-hot label attached to the first drawing
data[0]['OHC_class']

[[1.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,

In [14]:
# Shuffle class list and split into train and test
def shuffle_and_split_class(class_list: list, test_split: float,
                            sample_step: int = 10, seed=None) -> tuple:
    '''
    Shuffle the drawings of a class and split them into train and test samples.

    Parameters
    ----------
    class_list : list of drawings; it is NOT modified (a shallow copy is shuffled).
    test_split : fraction of the drawings assigned to the test part, in [0, 1].
    sample_step : keep every ``sample_step``-th drawing of each part. The
        default of 10 keeps a 10% sample to reduce size; use 1 to keep everything.
    seed : optional seed for a reproducible shuffle. When None the
        module-level ``random`` generator is used.

    Returns
    -------
    (data_train, data_test) : tuple of two lists.

    Raises
    ------
    ValueError : if ``test_split`` is outside [0, 1].
    '''
    if not 0 <= test_split <= 1:
        raise ValueError(f"test_split must be between 0 and 1, got {test_split}")

    # Shuffle a shallow copy so the caller's list keeps its order
    shuffled = list(class_list)

    # Index where the train part ends and the test part begins
    n_split = int(len(shuffled) * (1 - test_split))

    # A dedicated generator is only created when a seed is requested
    rng = random if seed is None else random.Random(seed)
    rng.shuffle(shuffled)

    # Sub-sample each part by taking every `sample_step`-th drawing
    data_train = shuffled[:n_split:sample_step]
    data_test = shuffled[n_split::sample_step]

    return data_train, data_test

In [15]:
# data_train, data_test = shuffle_and_split_class(data, test_split)

In [16]:
# print(len(data_train))
# print(len(data_test))
# print(len(data))

In [17]:
# data_train[0]

In [18]:
# Upload data to GCloud bucket
#def upload_blob(source_path, source_file_name, bucket_name, destination_blob_name=None) -> None:

def upload_blob_from_memory(bucket_name: str, source_data: list, destination_blob_name: str) -> None:
    '''
    Serialise in-memory data to JSON and upload it to a blob in the bucket.

    The whole of ``source_data`` is written as one JSON document, and a
    confirmation line is printed once the upload call returns.
    '''
    # Client -> bucket -> destination blob handle
    target_blob = storage.Client().bucket(bucket_name).blob(destination_blob_name)

    # Serialise, then push the text to the blob as JSON
    payload = json.dumps(source_data)
    target_blob.upload_from_string(data=payload, content_type='application/json')

    print(f"JSON file '{destination_blob_name}' uploaded to bucket '{bucket_name}'")


In [19]:
# # Upload a file to a bucket
# upload_blob_from_memory(to_bucket_name, data_train, 'test_data_train')

In [20]:
#### LOOP THROUGH ALL DATA AND POSTPROCESS

# Index of the first blob to process. The recorded output below starts at 36
# (presumably resuming an earlier, interrupted run - confirm); set to 0 to
# process every class from scratch.
START_INDEX = 36

# Prefix carried by the processed blob names, removed for the output names
PROCESSED_PREFIX = 'processed_'


def print_current_time() -> None:
    '''Print the current local timestamp as a progress marker.'''
    print("current time:-", datetime.datetime.now())


def strip_processed_prefix(blob_name: str) -> str:
    '''Return ``blob_name`` without the leading 'processed_' prefix, if present.'''
    if blob_name.startswith(PROCESSED_PREFIX):
        return blob_name[len(PROCESSED_PREFIX):]
    return blob_name


print_current_time()

for i in range(START_INDEX, len(blob_list)):
    blob = blob_list[i]
    class_file_name = strip_processed_prefix(blob)

    print(i)
    data = download_blob_to_memory(from_bucket_name, blob)
    print(f"Downloaded {blob}")
    print_current_time()

    # Mutates the drawings in `data` in place
    pad_and_OHC_class(data)
    print(f"Padded and OHC {blob}")
    print_current_time()

    data_train, data_test = shuffle_and_split_class(data, test_split)
    print(f"Shuffled and split {blob}")
    print_current_time()

    upload_blob_from_memory(to_bucket_name, data_train, "train_10pc_" + class_file_name)
    print(f"Uploaded train split of {blob}")
    print_current_time()

    upload_blob_from_memory(to_bucket_name, data_test, "test_10pc_" + class_file_name)
    print(f"Uploaded test split of {blob}")
    print_current_time()


current time:- 2024-03-08 07:22:07.320801
36


Downloaded processed_blackberry.ndjson
current time:- 2024-03-08 07:23:07.423441
Padded and OHC processed_blackberry.ndjson
current time:- 2024-03-08 07:23:08.843410
Shuffled and split processed_blackberry.ndjson
current time:- 2024-03-08 07:23:08.931183
JSON file 'train_10pc_blackberry.ndjson' uploaded to bucket 'quickdraw-simplified-modelready'
Uploaded train split of processed_blackberry.ndjson
current time:- 2024-03-08 07:23:30.054067
JSON file 'test_10pc_blackberry.ndjson' uploaded to bucket 'quickdraw-simplified-modelready'
Uploaded test split of processed_blackberry.ndjson
current time:- 2024-03-08 07:23:40.529967
37
Downloaded processed_blueberry.ndjson
current time:- 2024-03-08 07:24:14.355207
Padded and OHC processed_blueberry.ndjson
current time:- 2024-03-08 07:24:17.505430
Shuffled and split processed_blueberry.ndjson
current time:- 2024-03-08 07:24:18.018151
JSON file 'train_10pc_blueberry.ndjson' uploaded to bucket 'quickdraw-simplified-modelready'
Uploaded train split of

Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x7fa742a004f0>>
Traceback (most recent call last):
  File "/home/raj/.pyenv/versions/3.10.6/envs/pictionary-ai/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 770, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(
KeyboardInterrupt: 
Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x7fa742a004f0>>
Traceback (most recent call last):
  File "/home/raj/.pyenv/versions/3.10.6/envs/pictionary-ai/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 770, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(
KeyboardInterrupt: 
