# Looping through model-ready ndjson files and concatenating them

In [4]:
%load_ext autoreload
%autoreload 2

In [5]:
import json
import os
import random

import numpy as np
from google.cloud import storage

In [6]:
# Parameters
# NOTE(review): max_length is not referenced in the cells shown here - presumably a padded sequence length; confirm
max_length = 150
# Bucket whose blobs are listed below (and read by the GCS download helper)
from_bucket_name = 'quickdraw-simplified-modelready'
# NOTE(review): same value as from_bucket_name and not referenced in the cells shown here
to_bucket_name = 'quickdraw-simplified-modelready'
# NOTE(review): not referenced in the cells shown here - confirm whether it is still needed
test_split = 0.3

## Create a list of the files we will be concatenating

In [7]:
def list_blobs(bucket_name: str) -> list:
    '''
    Returns the name of every blob stored in the given bucket.
    '''
    # A client is created per call; the bucket handle is resolved from it
    client = storage.Client()
    target_bucket = client.bucket(bucket_name)

    # Walk the listing and keep only the blob names
    names = []
    for item in target_bucket.list_blobs():
        names.append(item.name)

    return names

In [9]:
# List every blob name in the source bucket and report how many there are
blob_list = list_blobs(from_bucket_name)
print(len(blob_list))

1382


In [10]:
# Blobs whose name contains the 'test_10pc' marker
test_10_pc = [name for name in blob_list if 'test_10pc' in name]
len(test_10_pc)

345

In [11]:
# Blobs whose name contains the 'train_10pc' marker
train_10_pc = [name for name in blob_list if 'train_10pc' in name]
len(train_10_pc)

345

In [12]:
# Full test files: the name contains 'test_' but not the 'test_10pc' marker
test = [name for name in blob_list if 'test_' in name and 'test_10pc' not in name]
len(test)

345

In [13]:
# Full train files: the name contains 'train_' but not the 'train_10pc' marker
train = [blob for blob in blob_list if 'train_' in blob and 'train_10pc' not in blob]
# Display the size of the list built in this cell (it previously displayed len(test))
len(train)

345

In [29]:
# Persist the four blob-name lists as local JSON files.
# NOTE(review): save_json_to_local is defined in a later cell of this notebook,
# so that cell has to be run before this one on a fresh kernel.
for names, target_file in (
    (test_10_pc, 'test_10_pc.json'),
    (train_10_pc, 'train_10_pc.json'),
    (test, 'test.json'),
    (train, 'train.json'),
):
    save_json_to_local(names, '/home/jupyter/data/', target_file)

Saved data to /home/jupyter/data/test_10_pc.json
Saved data to /home/jupyter/data/train_10_pc.json
Saved data to /home/jupyter/data/test.json
Saved data to /home/jupyter/data/train.json


## Download locally and concatenate

In [14]:
# Download a blob from a bucket and store it in memory
def download_blob_to_memory(bucket_name: str, source_blob_name: str) -> list:
    '''
    Downloads a blob from the bucket and parses its whole content as one JSON document.

    Args:
        bucket_name: name of the bucket that holds the blob
        source_blob_name: name of the blob to download

    Returns:
        The parsed JSON document. Callers in this notebook treat it as a list
        of dictionaries, one dictionary per drawing.

    Raises:
        json.JSONDecodeError: if the blob content is not a single valid JSON document
    '''
    # Initialize a client
    storage_client = storage.Client()

    # Get the bucket
    bucket = storage_client.bucket(bucket_name)

    # Get the blob
    blob = bucket.blob(source_blob_name)

    # Download the blob content as bytes
    # (download_as_string is a deprecated alias of download_as_bytes)
    blob_content = blob.download_as_bytes()

    # Need to replace all single quotes with double quotes for processed
    # Not needed once processing code has been updated
    # blob_content = str(blob_content, encoding='utf-8').replace("'",'"')

    # json.loads accepts bytes directly, so no explicit decode is required
    json_data = json.loads(blob_content)

    return json_data

In [15]:
# # Function to save the drawings in the list to an json file locally
# def save_drawings_to_json_local(list_drawings: list, output_file: str) -> None:
#     '''
#         - list_drawings: contains a dictionary for each drawing
#         - output_file: the complete filepath to the target file to save/create (.json)
#     '''
#     with open(output_file, 'w') as json_file:
#         # Write each drawing's dict to the file as a new line
#         for dict_drawing in list_drawings:
#             json.dump(dict_drawing, json_file)

In [10]:
def load_json_from_local(folder_path: str, file_name: str) -> list:
    '''
    Loads a JSON file from the local file system.

    Args:
        folder_path: folder containing the file, with or without a trailing separator
        file_name: name of the file inside folder_path

    Returns:
        The parsed JSON content (lists for the files used in this notebook)
    '''
    # os.path.join adds the separator only when it is missing, so a folder given
    # without a trailing slash no longer resolves to a wrong path
    file_path = os.path.join(folder_path, file_name)
    with open(file_path, 'r') as file:
        json_data = json.load(file)
    print(f'Loaded {file_name} from {folder_path}')
    return json_data

In [2]:
def save_json_to_local(data: list, folder_path: str, file_name: str) -> None:
    '''
    Writes data to a local file as JSON and prints the path that was written.

    Args:
        data: JSON-serialisable object; annotated as list, but this notebook
            also passes a dict (the class-to-index mapping)
        folder_path: existing target folder, with or without a trailing separator
        file_name: name of the file to create or overwrite inside folder_path
    '''
    # os.path.join adds the separator only when it is missing; plain string
    # concatenation silently wrote next to the folder when the slash was absent
    file_path = os.path.join(folder_path, file_name)
    with open(file_path, 'w') as file:
        json.dump(data, file)
    print(f'Saved data to {file_path}')

# Create 50 class data set

In [14]:
# Draw 50 distinct positions into test_10_pc; the fixed seed keeps the draw repeatable
random.seed(42)
random_50 = sorted(random.sample(range(len(test_10_pc)), 50))
print(random_50)

[3, 12, 13, 15, 16, 22, 40, 44, 47, 49, 52, 57, 63, 71, 79, 81, 101, 110, 111, 112, 114, 119, 125, 135, 140, 142, 150, 172, 174, 176, 183, 193, 194, 214, 216, 229, 235, 258, 274, 279, 282, 287, 301, 302, 308, 309, 316, 321, 327, 332]


In [27]:
# Store the sampled indices on disk; the append helpers read random_50.json
# back instead of re-sampling
folder_path = '/home/jupyter/data/'
file_name = 'random_50.json'

save_json_to_local(random_50, folder_path, file_name)

Saved data to /home/jupyter/data/random_50.json


In [30]:
def append_random_jsons_from_gcb(blob_list: list, sample_n: int) -> list:
    '''
    Downloads the pre-selected blobs from the source bucket and concatenates
    their drawings into a single list.

    Args:
        blob_list: blob names; the stored indices select entries from this list
        sample_n: currently unused - the selection comes from the stored
            random_50.json index list instead of being re-sampled here

    Returns:
        One list holding the drawings of every selected blob, in the order of
        the stored indices
    '''
    appended_json = []

    # random.seed(42) # To keep random sample repeatable
    # random_list = random.sample(range(0,len(blob_list)),sample_n)

    # Use the pre-stored random number list
    random_list = load_json_from_local('/home/jupyter/data/', 'random_50.json')

    # count is a 1-based progress counter printed for each blob
    for count, i in enumerate(random_list, start=1):
        print(count)

        blob = blob_list[i]
        json_data = download_blob_to_memory(from_bucket_name, blob)
        print(f'{blob} downloaded: {len(json_data)} drawings')

        # extend grows the list in place; rebuilding it with `+` copied every
        # drawing collected so far on each iteration
        appended_json.extend(json_data)
        print(f'{blob} appended: {len(appended_json)} drawing in total')

    return appended_json

In [32]:
def append_random_jsons_from_local(folder_path: str, blob_list: list, sample_n: int) -> list:
    '''
    Loads the pre-selected files from a local folder and concatenates their
    drawings into a single list.

    Args:
        folder_path: local folder that contains the files named in blob_list
        blob_list: file names; the stored indices select entries from this list
        sample_n: currently unused - the selection comes from the stored
            random_50.json index list instead of being re-sampled here

    Returns:
        One list holding the drawings of every selected file, in the order of
        the stored indices
    '''
    appended_json = []

    # random.seed(42) # To keep random sample repeatable
    # random_list = random.sample(range(0,len(blob_list)),sample_n)

    # Use the pre-stored random number list
    random_list = load_json_from_local('/home/jupyter/data/', 'random_50.json')

    # count is a 1-based progress counter printed for each file
    for count, i in enumerate(random_list, start=1):
        print(count)

        blob = blob_list[i]
        json_data = load_json_from_local(folder_path, blob)
        print(f'{blob} downloaded: {len(json_data)} drawings')

        # extend grows the list in place; rebuilding it with `+` copied every
        # drawing collected so far on each iteration
        appended_json.extend(json_data)
        print(f'{blob} appended: {len(appended_json)} drawing in total')

    return appended_json

In [33]:
# Local folder the test_10pc files are read from
folder_path = '/home/jupyter/data/test_data/'

# Concatenate the selected test_10pc files; the final argument (50) is not
# used by the helper, which reads the stored index list instead
appended_json = append_random_jsons_from_local(folder_path, test_10_pc, 50)

Loaded random_50.json from /home/jupyter/data/
1
Loaded test_10pc_aircraft carrier.ndjson from /home/jupyter/data/test_data/
test_10pc_aircraft carrier.ndjson downloaded: 3496 drawings
test_10pc_aircraft carrier.ndjson appended: 3496 drawing in total
2
Loaded test_10pc_arm.ndjson from /home/jupyter/data/test_data/
test_10pc_arm.ndjson downloaded: 3629 drawings
test_10pc_arm.ndjson appended: 7125 drawing in total
3
Loaded test_10pc_asparagus.ndjson from /home/jupyter/data/test_data/
test_10pc_asparagus.ndjson downloaded: 5044 drawings
test_10pc_asparagus.ndjson appended: 12169 drawing in total
4
Loaded test_10pc_backpack.ndjson from /home/jupyter/data/test_data/
test_10pc_backpack.ndjson downloaded: 3775 drawings
test_10pc_backpack.ndjson appended: 15944 drawing in total
5
Loaded test_10pc_banana.ndjson from /home/jupyter/data/test_data/
test_10pc_banana.ndjson downloaded: 9239 drawings
test_10pc_banana.ndjson appended: 25183 drawing in total
6
Loaded test_10pc_basketball.ndjson from /h

In [34]:
# Save the concatenated test drawings before they are shuffled
save_json_to_local(appended_json, '/home/jupyter/data/all_classes/', 'test_50_classes.json')

Saved data to /home/jupyter/data/all_classes/test_50_classes.json


In [None]:
# folder_path = '/home/jupyter/data/test_data/'

# appended_json = append_random_jsons_from_local(folder_path, train_10_pc, 50)

# save_json_to_local(appended_json, '/home/jupyter/data/all_classes/', 'test_50_classes.json')

# SHUFFLE JSON

In [36]:
# Shuffle the 50 class dataset
# random.shuffle reorders the list in place, so re-running this cell shuffles
# the already shuffled list again
random.shuffle(appended_json)

In [39]:
# Save the shuffled test drawings under a separate file name
save_json_to_local(appended_json, '/home/jupyter/data/all_classes/', 'test_50_classes_shuffled.json')

Saved data to /home/jupyter/data/all_classes/test_50_classes_shuffled.json


In [40]:
# Total number of drawings in the concatenated test list
len(appended_json)

239312

In [47]:
# Check list is shuffled: print the class of the first 50 drawings.
# Slicing before the comprehension builds 50 labels instead of one per drawing.
print([drawing['class'] for drawing in appended_json[0:50]])

['eyeglasses', 'potato', 'grapes', 'compass', 'mouse', 'basketball', 'hand', 'trumpet', 'fish', 'fan', 'butterfly', 'hat', 'marker', 'eyeglasses', 'fence', 'lighthouse', 'telephone', 'asparagus', 'drums', 'mouse', 'potato', 'arm', 'chair', 'sword', 'compass', 'camel', 'fence', 'face', 'line', 'bulldozer', 'eyeglasses', 'fish', 'camel', 'lighthouse', 'bread', 'broom', 'chair', 'bread', 'trumpet', 'steak', 'chair', 'violin', 'mouse', 'sword', 'telephone', 'spider', 'bread', 'basketball', 'sword', 'sea turtle']


In [51]:
# Build the X_test records: key_id, class and list_deltas of every drawing
x_fields = ('key_id', 'class', 'list_deltas')
X_test_50_classes = [{field: record[field] for field in x_fields} for record in appended_json]

In [54]:
# Check: display the first two X_test records
X_test_50_classes[0:2]

[{'key_id': '4817705540517888',
  'class': 'eyeglasses',
  'list_deltas': [[0.5961, -0.0538, 1.0],
   [-0.5961, 0.043, 0.0],
   [0.0314, 0.4301, 0.0],
   [0.0314, 0.0968, 0.0],
   [0.0588, 0.0645, 0.0],
   [0.0745, -0.0323, 0.0],
   [0.0471, -0.1075, 0.0],
   [0.0353, -0.2043, 0.0],
   [-0.0078, -0.2151, 1.0],
   [-0.0118, 0.0108, 0.0],
   [0.051, 0.4301, 0.0],
   [0.0471, 0.086, 0.0],
   [0.0549, 0.0215, 0.0],
   [0.0588, -0.0753, 0.0],
   [0.0431, -0.1398, 0.0],
   [0.051, -0.2366, 0.0],
   [0.0078, -0.1828, 1.0],
   [-0.5451, 0.1398, 0.0],
   [-0.0706, -0.1505, 0.0],
   [-0.1216, -0.129, 1.0],
   [0.7647, 0.1398, 0.0],
   [0.1333, -0.2903, 1.0],
   [-0.9255, 0.1935, 0.0],
   [0.2471, 0.3011, 1.0],
   [0.5529, -0.0968, 0.0],
   [0.2, -0.4624, 1.0],
   [99, 99, 99],
   [99, 99, 99],
   [99, 99, 99],
   [99, 99, 99],
   [99, 99, 99],
   [99, 99, 99],
   [99, 99, 99],
   [99, 99, 99],
   [99, 99, 99],
   [99, 99, 99],
   [99, 99, 99],
   [99, 99, 99],
   [99, 99, 99],
   [99, 99, 99],
 

In [55]:
# Write the X_test records to disk
save_json_to_local(X_test_50_classes, '/home/jupyter/data/all_classes/', 'X_test_50_classes.json')

Saved data to /home/jupyter/data/all_classes/X_test_50_classes.json


In [74]:
# Extract the y_test one-hot-encoded (OHC) class labels
# However, the one-hot encoding first has to be rebuilt to reflect the subset of 50 classes!

In [15]:
# Rebuild the class -> index mapping for the 50 class subset
class_list = [test_10_pc[idx] for idx in random_50]

# Strip the file prefix and extension to recover the bare class name
class_names = []
for blob_name in class_list:
    class_names.append(blob_name.replace('test_10pc_', '').replace('.ndjson', ''))
#print(class_names)

# The index of a class is its position in class_names
dict_class_10pc = {name: position for position, name in enumerate(class_names)}

print(dict_class_10pc)

{'aircraft carrier': 0, 'arm': 1, 'asparagus': 2, 'backpack': 3, 'banana': 4, 'basketball': 5, 'bottlecap': 6, 'bread': 7, 'broom': 8, 'bulldozer': 9, 'butterfly': 10, 'camel': 11, 'canoe': 12, 'chair': 13, 'compass': 14, 'cookie': 15, 'drums': 16, 'eyeglasses': 17, 'face': 18, 'fan': 19, 'fence': 20, 'fish': 21, 'flying saucer': 22, 'grapes': 23, 'hand': 24, 'hat': 25, 'horse': 26, 'light bulb': 27, 'lighthouse': 28, 'line': 29, 'marker': 30, 'mountain': 31, 'mouse': 32, 'parachute': 33, 'passport': 34, 'pliers': 35, 'potato': 36, 'sea turtle': 37, 'snowflake': 38, 'spider': 39, 'square': 40, 'steak': 41, 'swing set': 42, 'sword': 43, 'telephone': 44, 'television': 45, 'tooth': 46, 'traffic light': 47, 'trumpet': 48, 'violin': 49}


In [16]:
# Persist the class -> index mapping (a dict) so the encoding can be reused elsewhere
save_json_to_local(dict_class_10pc, '/home/jupyter/data/mappings/', 'dict_50_class_subset.json')

Saved data to /home/jupyter/data/mappings/dict_50_class_subset.json


In [76]:
def OHC_class_name(class_name: str, class_to_index: dict = None) -> list:
    '''
    One-hot encodes a class name.

    Args:
        class_name: name of the class to encode
        class_to_index: mapping from class name to column index; defaults to
            the notebook-level dict_class_10pc when omitted

    Returns:
        A nested list of shape 1 x number-of-classes holding 1.0 at the index
        of class_name and 0.0 everywhere else

    Raises:
        KeyError: if class_name is not a key of the mapping
    '''
    # Fall back to the notebook-level mapping so existing one-argument calls keep working
    if class_to_index is None:
        class_to_index = dict_class_10pc

    OHC_output = np.zeros((1, len(class_to_index)))
    OHC_output[0, class_to_index[class_name]] = 1

    # Need to convert the np.ndarray into a list so it can be serialised to JSON
    return OHC_output.tolist()

In [77]:
# One y_test record per drawing: key_id, class and the one-hot class vector
y_test_50_classes = [
    {
        'key_id': record['key_id'],
        'class': record['class'],
        'OHC_class': OHC_class_name(record['class']),
    }
    for record in appended_json
]


In [86]:
# Check X and y are ordered the same way: print the first 20 records of each,
# X first and y directly below it
for idx in range(20):
    x_record = X_test_50_classes[idx]
    y_record = y_test_50_classes[idx]
    print(f"key id:{x_record['key_id']}, class:{x_record['class']}")
    print(f"key id:{y_record['key_id']}, class:{y_record['class']}")

key id:4817705540517888, class:eyeglasses
key id:4817705540517888, class:eyeglasses
key id:5932336841490432, class:potato
key id:5932336841490432, class:potato
key id:5156375304863744, class:grapes
key id:5156375304863744, class:grapes
key id:6515054788739072, class:compass
key id:6515054788739072, class:compass
key id:6066733288783872, class:mouse
key id:6066733288783872, class:mouse
key id:6111767363059712, class:basketball
key id:6111767363059712, class:basketball
key id:4507594724999168, class:hand
key id:4507594724999168, class:hand
key id:6363290877820928, class:trumpet
key id:6363290877820928, class:trumpet
key id:4633025168015360, class:fish
key id:4633025168015360, class:fish
key id:4969540603609088, class:fan
key id:4969540603609088, class:fan
key id:5939668493271040, class:butterfly
key id:5939668493271040, class:butterfly
key id:5692177541758976, class:hat
key id:5692177541758976, class:hat
key id:5264579842015232, class:marker
key id:5264579842015232, class:marker
key id:4

In [87]:
# The printed X and y records above line up pairwise, so the ordering looks good
# Save the y_test records to json
save_json_to_local(y_test_50_classes, '/home/jupyter/data/all_classes/', 'y_test_50_classes.json')

Saved data to /home/jupyter/data/all_classes/y_test_50_classes.json


In [56]:
# Check test and train classes are the same:
# both lists are indexed with the same stored positions, so the selected
# classes only match if the two lists are ordered identically
print([test_10_pc[i] for i in random_50])
print([train_10_pc[i] for i in random_50])

['test_10pc_aircraft carrier.ndjson', 'test_10pc_arm.ndjson', 'test_10pc_asparagus.ndjson', 'test_10pc_backpack.ndjson', 'test_10pc_banana.ndjson', 'test_10pc_basketball.ndjson', 'test_10pc_bottlecap.ndjson', 'test_10pc_bread.ndjson', 'test_10pc_broom.ndjson', 'test_10pc_bulldozer.ndjson', 'test_10pc_butterfly.ndjson', 'test_10pc_camel.ndjson', 'test_10pc_canoe.ndjson', 'test_10pc_chair.ndjson', 'test_10pc_compass.ndjson', 'test_10pc_cookie.ndjson', 'test_10pc_drums.ndjson', 'test_10pc_eyeglasses.ndjson', 'test_10pc_face.ndjson', 'test_10pc_fan.ndjson', 'test_10pc_fence.ndjson', 'test_10pc_fish.ndjson', 'test_10pc_flying saucer.ndjson', 'test_10pc_grapes.ndjson', 'test_10pc_hand.ndjson', 'test_10pc_hat.ndjson', 'test_10pc_horse.ndjson', 'test_10pc_light bulb.ndjson', 'test_10pc_lighthouse.ndjson', 'test_10pc_line.ndjson', 'test_10pc_marker.ndjson', 'test_10pc_mountain.ndjson', 'test_10pc_mouse.ndjson', 'test_10pc_parachute.ndjson', 'test_10pc_passport.ndjson', 'test_10pc_pliers.ndjson'

# Rinse and repeat for train data

In [90]:
# Local folder the train_10pc files are read from
folder_path = '/home/jupyter/data/train_data/'

# Rebind the name to an empty list before loading; appended_json held the test
# drawings until here (presumably done to release them first - confirm)
appended_json = []

# Concatenate the selected train_10pc files; the final argument (50) is not
# used by the helper, which reads the stored index list instead
appended_json = append_random_jsons_from_local(folder_path, train_10_pc, 50)

# Save the concatenated train drawings before they are shuffled
save_json_to_local(appended_json, '/home/jupyter/data/all_classes/', 'train_50_classes.json')

Loaded random_50.json from /home/jupyter/data/
1
Loaded train_10pc_aircraft carrier.ndjson from /home/jupyter/data/train_data/
train_10pc_aircraft carrier.ndjson downloaded: 8156 drawings
train_10pc_aircraft carrier.ndjson appended: 8156 drawing in total
2
Loaded train_10pc_arm.ndjson from /home/jupyter/data/train_data/
train_10pc_arm.ndjson downloaded: 8467 drawings
train_10pc_arm.ndjson appended: 16623 drawing in total
3
Loaded train_10pc_asparagus.ndjson from /home/jupyter/data/train_data/
train_10pc_asparagus.ndjson downloaded: 11768 drawings
train_10pc_asparagus.ndjson appended: 28391 drawing in total
4
Loaded train_10pc_backpack.ndjson from /home/jupyter/data/train_data/
train_10pc_backpack.ndjson downloaded: 8806 drawings
train_10pc_backpack.ndjson appended: 37197 drawing in total
5
Loaded train_10pc_banana.ndjson from /home/jupyter/data/train_data/
train_10pc_banana.ndjson downloaded: 21556 drawings
train_10pc_banana.ndjson appended: 58753 drawing in total
6
Loaded train_10pc_b

In [91]:
# Shuffle the train drawings in place (re-running this cell shuffles them again)
random.shuffle(appended_json)

# Save the shuffled train drawings under a separate file name
save_json_to_local(appended_json, '/home/jupyter/data/all_classes/', 'train_50_classes_shuffled.json')

Saved data to /home/jupyter/data/all_classes/train_50_classes_shuffled.json


In [92]:
# Total number of drawings in the concatenated train list
len(appended_json)

558353

In [93]:
# Check list is shuffled: print the class of the first 50 train drawings
print([drawing['class'] for drawing in appended_json[0:50]])

['potato', 'potato', 'light bulb', 'spider', 'snowflake', 'banana', 'square', 'flying saucer', 'hand', 'hat', 'basketball', 'mouse', 'arm', 'tooth', 'pliers', 'cookie', 'face', 'camel', 'tooth', 'sword', 'mountain', 'fish', 'parachute', 'light bulb', 'bread', 'lighthouse', 'fence', 'violin', 'marker', 'line', 'canoe', 'light bulb', 'trumpet', 'television', 'bottlecap', 'lighthouse', 'asparagus', 'eyeglasses', 'snowflake', 'sword', 'hat', 'hand', 'potato', 'backpack', 'marker', 'face', 'chair', 'eyeglasses', 'asparagus', 'camel']


In [94]:
# Build the X_train records: key_id, class and list_deltas of every drawing
X_train_50_classes = []
for record in appended_json:
    X_train_50_classes.append({
        'key_id': record['key_id'],
        'class': record['class'],
        'list_deltas': record['list_deltas'],
    })

save_json_to_local(X_train_50_classes, '/home/jupyter/data/all_classes/', 'X_train_50_classes.json')

# Build the y_train records with the class re-encoded for the 50 class subset
y_train_50_classes = []
for record in appended_json:
    y_train_50_classes.append({
        'key_id': record['key_id'],
        'class': record['class'],
        'OHC_class': OHC_class_name(record['class']),
    })

save_json_to_local(y_train_50_classes, '/home/jupyter/data/all_classes/', 'y_train_50_classes.json')

Saved data to /home/jupyter/data/all_classes/X_train_50_classes.json
Saved data to /home/jupyter/data/all_classes/y_train_50_classes.json


In [98]:
# Inspect the first y_train record, including its one-hot class vector
y_train_50_classes[0]

{'key_id': '6467442496765952',
 'class': 'potato',
 'OHC_class': [[0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   1.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0]]}

# Upload to Google Cloud

In [96]:
# Upload a local file to a bucket
def upload_blob_from_local_file(source_path: str, source_file_name:str, bucket_name: str, destination_blob_name:str = None) -> None:
    '''
    Uploads a local file to the bucket.

    Args:
        source_path: local folder containing the file, with or without a trailing separator
        source_file_name: name of the file inside source_path
        bucket_name: name of the destination bucket
        destination_blob_name: name of the blob to create; the file name is
            used when this is None
    '''
    # Initialize a client
    storage_client = storage.Client()

    # Get the bucket
    bucket = storage_client.bucket(bucket_name)

    # Use the file name for the blob name if a blob name is not provided
    if destination_blob_name is None:
        destination_blob_name = source_file_name

    # Create a blob
    blob = bucket.blob(destination_blob_name)

    # Define the source file path. os.path.join avoids the doubled separator that
    # '/'.join produced for a folder ending in '/', and keeps an empty folder
    # relative instead of pointing at the file system root
    source_file_path = os.path.join(source_path, source_file_name)

    # Upload the file to the blob
    blob.upload_from_filename(source_file_path)

In [97]:
# Destination bucket and the local folder holding the four files to upload
bucket_name_2 = 'quickdraw-simplified-traintest'
source_path = '/home/jupyter/data/all_classes/'
source_file_name1 = 'X_train_50_classes.json'
source_file_name2 = 'y_train_50_classes.json'
source_file_name3 = 'X_test_50_classes.json'
source_file_name4 = 'y_test_50_classes.json'

# Upload each file under its own name and confirm it once the call returns
for uploaded_name in (source_file_name1, source_file_name2, source_file_name3, source_file_name4):
    upload_blob_from_local_file(source_path, uploaded_name, bucket_name_2)
    print(f'{uploaded_name} uploaded to {bucket_name_2}')

X_train_50_classes.json uploaded to quickdraw-simplified-traintest
y_train_50_classes.json uploaded to quickdraw-simplified-traintest
X_test_50_classes.json uploaded to quickdraw-simplified-traintest
y_test_50_classes.json uploaded to quickdraw-simplified-traintest
