In [3]:
import numpy as np
import pandas as pd
import os
import json
import matplotlib.pyplot as plt
import linecache
from google.cloud import storage

In [4]:
# Folder that contains the raw ndjson input file.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run on another machine without editing it.
path_data = '/Users/gregorytaylor/code/pictionary-ai/raw_data'
# Name of the raw ndjson file, joined with `path_data` to build the input path.
file_name = 'full_simplified_face.ndjson'

3. Processing the simplified file

In [5]:
##Define the function to process the Simplified file and return an array and class name (the drawing to guess)
def parse_line(json_drawing: dict) -> tuple:
    """Convert one parsed ndjson drawing record into a normalized ink array.

    Args:
        json_drawing: Mapping with a "word" entry (the class name) and a
            "drawing" entry: a list of strokes in which ``stroke[0]`` holds
            the x values and ``stroke[1]`` the y values.

    Returns:
        A ``(np_ink, class_name)`` tuple. ``np_ink`` is a float32 array with
        three columns: delta-x, delta-y and an end-of-stroke flag (1 on the
        last point of each stroke, 0 elsewhere), rounded to 5 decimals. The
        coordinates are scaled per axis to [0, 1] before the deltas are
        taken, and the first point is dropped, so the array has one row fewer
        than the drawing has points. A drawing without any point yields an
        empty array of shape (0, 3) instead of raising.
    """
    class_name = json_drawing["word"]
    # Strokes without points contribute nothing to the array.
    strokes = [stroke for stroke in json_drawing["drawing"] if len(stroke[0]) > 0]
    total_points = sum(len(stroke[0]) for stroke in strokes)
    np_ink = np.zeros((total_points, 3), dtype=np.float32)
    if total_points == 0:
        # np.min / np.max raise on zero-size arrays, so stop here.
        return np_ink, class_name

    offset = 0
    for stroke in strokes:
        end = offset + len(stroke[0])
        np_ink[offset:end, 0] = stroke[0]
        np_ink[offset:end, 1] = stroke[1]
        np_ink[end - 1, 2] = 1  # last point of the stroke
        offset = end

    # 1. Size normalization: scale each axis to the [0, 1] range.
    lower = np.min(np_ink[:, 0:2], axis=0)
    upper = np.max(np_ink[:, 0:2], axis=0)
    scale = upper - lower
    scale[scale == 0] = 1  # flat axis: avoid dividing by zero
    np_ink[:, 0:2] = (np_ink[:, 0:2] - lower) / scale

    # 2. Replace positions by deltas between consecutive points. The
    #    right-hand side is fully evaluated before the assignment, so the
    #    overlapping slices cannot interfere with each other.
    np_ink[1:, 0:2] = np_ink[1:, 0:2] - np_ink[:-1, 0:2]
    np_ink = np_ink[1:, :]
    return np.round(np_ink, 5), class_name

In [6]:
##Define the function to process the Simplified file and return an array and class name (the drawing to guess)
def get_drawing_data(json_drawing: dict) -> tuple:
    """Extract the class name and a normalized ink array from a drawing record.

    Args:
        json_drawing: Mapping with a 'word' entry (the class name) and a
            'drawing' entry: a list of strokes in which ``stroke[0]`` holds
            the x values and ``stroke[1]`` the y values. The record is only
            read, never modified.

    Returns:
        A ``(np_ink, class_name)`` tuple. ``np_ink`` is a float32 array with
        three columns: delta-x, delta-y and a stroke delimiter (0 along the
        stroke, 1 on its last point), rounded to 5 decimals. Coordinates are
        scaled per axis to [0, 1] before the deltas are taken and the first
        point is dropped. A drawing without any point yields an empty array
        of shape (0, 3).
    """
    class_name = json_drawing['word']
    lst_strokes = json_drawing['drawing']

    x_values = []
    y_values = []
    stroke_delimiter = []

    for stroke in lst_strokes:
        nb_points = len(stroke[0])
        if nb_points == 0:
            continue  # nothing to add for an empty stroke
        x_values.extend(stroke[0])
        y_values.extend(stroke[1])
        # Third 'column' passed to the model: 0 all along and a 1 at the end
        # of the stroke. Built separately so the input strokes stay untouched.
        stroke_delimiter.extend([0] * (nb_points - 1) + [1])

    total_points = len(x_values)
    np_ink = np.zeros((total_points, 3), dtype=np.float32)
    if total_points == 0:
        # np.min / np.max raise on zero-size arrays, so stop here.
        return np_ink, class_name
    np_ink[:, 0] = x_values
    np_ink[:, 1] = y_values
    np_ink[:, 2] = stroke_delimiter

    # 1. Size normalization: scale each axis to the [0, 1] range.
    lower = np.min(np_ink[:, 0:2], axis=0)
    upper = np.max(np_ink[:, 0:2], axis=0)
    scale = upper - lower
    scale[scale == 0] = 1  # flat axis: avoid dividing by zero
    np_ink[:, 0:2] = (np_ink[:, 0:2] - lower) / scale

    # 2. Replace positions by deltas between consecutive points.
    np_ink[1:, 0:2] = np_ink[1:, 0:2] - np_ink[:-1, 0:2]
    np_ink = np_ink[1:, :]
    return np.round(np_ink, 5), class_name

**4. Define the function to process the full file of one CLASS (one of the words to guess)**

In [8]:
# Full path of the raw ndjson file: data folder plus file name.
ndjson_path = f"{path_data}/{file_name}"

def drawing_extraction(ndjson_path, nb_drawings_to_load):
    """Load and preprocess the first lines of an ndjson drawing file.

    Args:
        ndjson_path: Path of the ndjson file (one JSON drawing per line).
        nb_drawings_to_load: Maximum number of lines to read from the start
            of the file. ``None`` reads the whole file; 0 or a negative value
            reads nothing.

    Returns:
        A dict keyed by each drawing's "key_id". Every value is a dict with
        "class" (class name), "drawing_nb" (zero-based line index in the
        file), "length_np_ink" (number of rows of the ink array) and
        "np_ink" (the ink array as nested lists, so it can be dumped to
        JSON).
    """
    dict_drawings = {}
    with open(ndjson_path, 'r', encoding='utf-8') as f:
        for i, line in enumerate(f):
            # Check the limit before processing, so that a limit of 0 loads
            # nothing instead of the whole file.
            if nb_drawings_to_load is not None and i >= nb_drawings_to_load:
                break
            if not line.strip():
                continue  # blank line (e.g. trailing newline): not a drawing
            json_line = json.loads(line)
            np_ink, class_name = parse_line(json_line)
            dict_drawings[json_line["key_id"]] = {
                "class": class_name,
                "drawing_nb": i,
                "length_np_ink": len(np_ink),
                # Lists, not arrays: arrays are not JSON serializable.
                "np_ink": np_ink.tolist(),
            }
    return dict_drawings

In [9]:
# Alternative loader based on linecache (the timings below show it is not faster than drawing_extraction)

def drawing_extraction2(ndjson_path, nb_drawings_to_load):
    """Load and preprocess the first lines of an ndjson file via linecache.

    Args:
        ndjson_path: Path of the ndjson file (one JSON drawing per line).
        nb_drawings_to_load: Maximum number of lines to read from the start
            of the file. Reading stops early, without error, when the file
            has fewer lines.

    Returns:
        A dict keyed by each drawing's "key_id". Every value is a dict with
        "class" (class name), "drawing_nb" (one-based line number in the
        file), "length_np_ink" (number of rows of the ink array) and
        "np_ink" (the ink array as nested lists, so it can be dumped to
        JSON).

    Raises:
        FileNotFoundError: If ``ndjson_path`` is not an existing file.
            linecache itself stays silent in that case and only returns
            empty lines.
    """
    if not os.path.isfile(ndjson_path):
        raise FileNotFoundError(ndjson_path)
    # Drop cached content if the file changed on disk since it was last read.
    linecache.checkcache(ndjson_path)

    dict_drawings = {}  # Initialize an empty dictionary

    for i in range(1, nb_drawings_to_load + 1):
        line = linecache.getline(ndjson_path, i)
        if line == '':
            break  # past the last line of the file
        if not line.strip():
            continue  # blank line: not a drawing
        json_drawing = json.loads(line)
        np_ink, class_name = parse_line(json_drawing)
        dict_drawings[json_drawing["key_id"]] = {
            "class": class_name,
            "drawing_nb": i,
            "length_np_ink": len(np_ink),
            # Lists, not arrays: arrays are not JSON serializable.
            "np_ink": np_ink.tolist(),
        }

    return dict_drawings

In [7]:
ndjson_path

'/Users/gregorytaylor/code/pictionary-ai/raw_data/full_simplified_face.ndjson'

In [8]:
%%time
dict_test = drawing_extraction(ndjson_path, 50000)

CPU times: user 9.66 s, sys: 454 ms, total: 10.1 s
Wall time: 11.2 s


In [9]:
%%time
dict_test = drawing_extraction2(ndjson_path, 50000)

CPU times: user 10.3 s, sys: 823 ms, total: 11.1 s
Wall time: 11.6 s


In [9]:
len(dict_test)

50000

In [11]:
# Destination path of the JSON file that will hold the processed drawings
output_file = f"{path_data}/processed_full_simplified_face.json"

In [10]:
# Function to save the dictionary file for the processed images
def save_images_dict_to_json(images_dict, output_file):
    """Serialize ``images_dict`` as JSON into the file at ``output_file``.

    The file is opened in write mode, so existing content is replaced.
    """
    with open(output_file, mode='w') as out_handle:
        json.dump(obj=images_dict, fp=out_handle)
In [12]:
save_images_dict_to_json(dict_test,output_file)

In [13]:
len(dict_test)

50000

In [12]:
def upload_blob(bucket_name, file_path, file_name, timeout=300, chunk_size=5 * 1024 * 1024):
    """Upload a local file to a Google Cloud Storage bucket.

    Args:
        bucket_name: Name of the destination bucket.
        file_path: Local folder that contains the file.
        file_name: Name of the local file; also used as the blob name.
        timeout: Seconds to wait for each request sent to the server. The
            library default proved too short for this notebook's upload,
            which failed with "The write operation timed out".
        chunk_size: Size in bytes of each upload chunk; must be a multiple
            of 256 KiB. Small chunks keep every request short enough to
            finish within ``timeout`` on a slow connection. Pass ``None``
            to use the library default.
    """
    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(file_name, chunk_size=chunk_size)
    blob.upload_from_filename(os.path.join(file_path, file_name), timeout=timeout)

In [13]:
!echo $GOOGLE_APPLICATION_CREDENTIALS

/Users/gregorytaylor/code/gcp/pictionary-key.json


In [17]:
!gsutil ls gs://quickdraw-simplified-processed/

gs://quickdraw-simplified-processed/processed_full_simplified_face.json


In [18]:
# Upload the processed JSON file from the local data folder to the bucket.
bucket_name = 'quickdraw-simplified-processed'
file_path = path_data
# NOTE(review): this rebinds the notebook-level `file_name`, which earlier held
# the raw ndjson file name. Any cell that builds a path from `file_name` will
# pick up the processed name if it is re-run after this cell.
file_name = 'processed_full_simplified_face.json'

upload_blob(bucket_name, file_path, file_name)

ConnectionError: ('Connection aborted.', TimeoutError('The write operation timed out'))