In [5]:
import json
import linecache
import os
import re
import subprocess

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from google.cloud import storage
from tqdm.auto import tqdm

# Params

In [6]:
# Local directory holding the raw/processed NDJSON files.
# Override it with the QUICKDRAW_DATA_DIR environment variable; the fallback is the
# original developer path so existing setups keep working unchanged.
path_data = os.environ.get('QUICKDRAW_DATA_DIR', '/Users/loicregne/code/rs-uk/raw_data')
# Bucket listed to discover the simplified class files
bucket_drawings_simplified = 'quickdraw-simplified'
# Bucket targeted when uploading the processed class files
bucket_drawings_simplified_processed = 'quickdraw-simplified-processed'

# Utils

In [15]:
# Return the names of all the blobs stored in a given bucket, as a list
def list_blobs(bucket_name: str) -> list:
    '''
    Collects the name of every blob found in the bucket `bucket_name`.
    Returns the names as a list of strings, in the order the bucket lists them.
    '''
    # One client is enough to reach the bucket and enumerate its content
    client = storage.Client()

    names = []
    for entry in client.bucket(bucket_name).list_blobs():
        names.append(entry.name)

    return names

In [16]:
# Copy all the content of one bucket to another bucket
def copy_bucket(source_bucket_name: str, destination_bucket_name: str) -> None:
    '''
    Copies all blobs from the source bucket to the destination bucket.
    Each blob keeps its name in the destination bucket.

        - source_bucket_name: name of the bucket to read the blobs from
        - destination_bucket_name: name of the bucket receiving the copies
    '''
    # A single client can address both buckets
    storage_client = storage.Client()
    source_bucket = storage_client.bucket(source_bucket_name)
    destination_bucket = storage_client.bucket(destination_bucket_name)

    # Blob objects have no `copy` method: the copy is requested from the
    # source bucket through `Bucket.copy_blob`
    for blob in source_bucket.list_blobs():
        source_bucket.copy_blob(blob, destination_bucket, blob.name)

In [17]:
# Fetch one blob from a bucket and write it to a local file
def download_blob(bucket_name, source_blob_name, destination_path, destination_file_name=None) -> None:
    '''
    Downloads the blob `source_blob_name` of the bucket `bucket_name` into the
    local directory `destination_path`.
    The local file is named `destination_file_name`, or after the blob when no
    file name is given.
    '''
    # Reach the blob through a fresh client and its bucket
    remote_blob = storage.Client().bucket(bucket_name).blob(source_blob_name)

    # Fall back on the blob name when the caller gave no local file name
    local_name = remote_blob.name if destination_file_name is None else destination_file_name

    # Write the blob content to <destination_path>/<local_name>
    remote_blob.download_to_filename('/'.join([destination_path, local_name]))

In [18]:
# Send one local file to a bucket
def upload_blob(source_path, source_file_name, bucket_name, destination_blob_name=None) -> None:
    '''
    Uploads the local file `source_path`/`source_file_name` to the bucket
    `bucket_name`.
    The blob is named `destination_blob_name`, or after the local file when no
    blob name is given.
    '''
    target_bucket = storage.Client().bucket(bucket_name)

    # Fall back on the file name when the caller gave no blob name
    blob_name = source_file_name if destination_blob_name is None else destination_blob_name
    target_blob = target_bucket.blob(blob_name)

    # Read the content from <source_path>/<source_file_name>
    local_file = '/'.join([source_path, source_file_name])
    target_blob.upload_from_filename(local_file)

# Copying the Google data into our own bucket

In [None]:
# Done already by Raj

# Downloading a simplified class from the bucket

In [None]:
# Already done

In [None]:
# Example class file: the single-class cells derive the local input path and the
# processed output name from it, so this cell must run before them
file_name_example = 'full_simplified_face.ndjson'

# Processing a simplified drawing

In [7]:
## Convert one simplified drawing (parsed NDJSON line) into an array of normalized point deltas
def parse_line(json_drawing: dict) -> np.ndarray:
    """
    Parse one drawing (a parsed NDJSON line) and return its ink as an np.ndarray.

    json_drawing["drawing"] is a list of strokes, each stroke holding the list of
    x coordinates followed by the list of y coordinates.

    Returns a float32 array with one row (d_x, d_y, end_of_stroke) per point
    except the first one:
        - x and y are first rescaled to [0, 1] over the whole drawing,
        - then replaced by the difference with the previous point,
        - end_of_stroke is 1 on the last point of each stroke, 0 elsewhere.
    The values are rounded to 4 decimals. A drawing without any point gives an
    empty array of shape (0, 3). The class name is not returned: callers read it
    from the 'word' key of the drawing.
    """
    inkarray = json_drawing["drawing"]
    stroke_lengths = [len(stroke[0]) for stroke in inkarray]
    total_points = sum(stroke_lengths)
    np_ink = np.zeros((total_points, 3), dtype=np.float32)

    # Nothing to normalize: min/max over zero points would raise
    if total_points == 0:
        return np_ink

    current_t = 0
    for stroke, stroke_length in zip(inkarray, stroke_lengths):
        if stroke_length == 0:
            continue  # an empty stroke contributes no point and no stroke end
        np_ink[current_t:current_t + stroke_length, 0] = stroke[0]
        np_ink[current_t:current_t + stroke_length, 1] = stroke[1]
        current_t += stroke_length
        np_ink[current_t - 1, 2] = 1  # stroke_end

    # Preprocessing.
    # 1. Size normalization.
    lower = np.min(np_ink[:, 0:2], axis=0)
    upper = np.max(np_ink[:, 0:2], axis=0)
    scale = upper - lower
    scale[scale == 0] = 1  # flat drawings (zero width or height): avoid dividing by 0
    np_ink[:, 0:2] = (np_ink[:, 0:2] - lower) / scale

    # 2. Compute deltas (the first point has no predecessor and is dropped).
    np_ink[1:, 0:2] -= np_ink[0:-1, 0:2]
    np_ink = np_ink[1:, :]
    return np.round(np_ink, decimals=4)

In [8]:
# Extracts the drawing data as an np.ndarray of the deltas between points
def process_drawing_data(json_drawing: dict) -> np.ndarray:
    '''
    Extracts the drawing data (strokes list) from a drawing JSON object.
    Transforms the strokes from coordinates to deltas.
    Returns a float64 np.ndarray of deltas (d_x, d_y, end_of_stroke), with one
    row per point except the first one, rounded to 4 decimals:
        - x and y are rescaled to [0, 1] over the whole drawing before the
          deltas are computed
        - end_of_stroke is 1 on the last point of each stroke, 0 elsewhere
    A drawing without any point gives an empty array of shape (0, 3).
    '''
    # --- Data extraction ---
    list_strokes = json_drawing['drawing']

    x = []
    y = []
    stroke_delimiter = []

    for stroke in list_strokes:
        nb_points = len(stroke[0])
        if nb_points == 0:
            continue  # an empty stroke has no last point to flag
        x.extend(stroke[0])
        y.extend(stroke[1])
        # 0 all along the stroke and a 1 on its last point
        stroke_delimiter.extend([0.] * (nb_points - 1) + [1.])

    # Force a float dtype: with integer coordinates and only single-point strokes
    # the array would otherwise be built as integers and the normalized
    # coordinates would be truncated when written back
    np_points = np.array([x, y, stroke_delimiter], dtype=np.float64).T

    # Nothing to normalize: min/max over zero points would raise
    if len(np_points) == 0:
        return np_points

    # --- Processing ---
    # 1. Size normalization
    lower = np.min(np_points[:, 0:2], axis=0) # returns (x_min, y_min)
    upper = np.max(np_points[:, 0:2], axis=0) # returns (x_max, y_max)
    scale = upper - lower # returns (width, height)
    scale[scale == 0] = 1 # flat drawings: avoid dividing by 0
    np_points[:, 0:2] = (np_points[:, 0:2] - lower) / scale

    # 2. Compute deltas (the first point has no predecessor and is dropped)
    np_points[1:, 0:2] -= np_points[0:-1, 0:2]
    np_points = np_points[1:, :]

    return np.round(np_points, decimals=4)

# Processing a simplified class

In [9]:
# Local path of the example class file; needs path_data and file_name_example to be defined first
ndjson_filepath = '/'.join((path_data, file_name_example))

NameError: name 'file_name_example' is not defined

In [10]:
def drawings_extraction(ndjson_filepath, nb_drawings_to_load):
    '''
    Reads an NDJSON file line by line and processes each drawing with parse_line.

        - ndjson_filepath: path to the NDJSON file (one JSON drawing per line)
        - nb_drawings_to_load: maximum number of drawings to load, in file order;
          0 or a negative number loads nothing, None loads the whole file

    Returns a list with one dictionary per drawing:
        - key_id: the 'key_id' of the drawing
        - class: the 'word' of the drawing
        - length_np_ink: number of rows of the processed ink
        - np_ink: the processed ink, as a list of lists
    '''
    list_drawings = []  # Initialize an empty list

    # Asking for no drawing must not fall through to reading the whole file
    if isinstance(nb_drawings_to_load, int) and nb_drawings_to_load <= 0:
        return list_drawings

    with open(ndjson_filepath, 'r') as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank lines (e.g. a trailing one)
            json_line = json.loads(line)
            np_ink = parse_line(json_line)
            list_drawings.append({'key_id': json_line['key_id'],
                                  'class': json_line['word'],
                                  'length_np_ink': len(np_ink),
                                  'np_ink': np_ink.tolist()  # lists can be dumped as JSON later
                                 })
            if len(list_drawings) == nb_drawings_to_load:
                break
    return list_drawings

In [11]:
# Load and process the drawings of a class file, with a progress bar

def process_class(ndjson_filepath: str, nb_drawings_to_load: 'int | str' = 'all') -> list:
    '''
    Extract drawing(s) information from a list of JSON drawings (as NDJSON),
    as a list of dictionaries. We specify the number of drawings to load (in
    order of the NDJSON) as a number (integer or numeric string) or 'all'.
    Any other value loads nothing; asking for more drawings than the file
    holds loads the whole file. Blank lines are skipped.
    Each dictionary contains:
        - key_id, as string
        - class, as string
        - length, as integer
        - list_deltas, as list
    '''
    # --- Resolve how many lines to read ---
    if nb_drawings_to_load == 'all':
        # Count the lines in Python: portable, and an unterminated last line is counted too
        with open(ndjson_filepath, 'rb') as ndjson_file:
            nb_to_load = sum(1 for _ in ndjson_file)
    elif isinstance(nb_drawings_to_load, int) or (
            isinstance(nb_drawings_to_load, str) and nb_drawings_to_load.isnumeric()):
        # A number entered as an integer or as a string is accepted
        nb_to_load = max(int(nb_drawings_to_load), 0)
    else:
        nb_to_load = 0

    list_drawings = []  # Initialize the list to return
    if nb_to_load == 0:
        return list_drawings

    l_bar='{percentage:3.0f}%|'
    bar = '{bar}'
    r_bar='| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}{postfix}]'
    bar_format = l_bar + bar + r_bar

    # Stream the file instead of caching it entirely; zip stops at whichever comes
    # first between the requested number of lines and the end of the file
    with open(ndjson_filepath, 'r', encoding='utf-8') as ndjson_file:
        numbered_lines = zip(range(nb_to_load), ndjson_file)
        for _, line in tqdm(numbered_lines, total=nb_to_load, bar_format=bar_format):
            if not line.strip():
                continue
            json_drawing = json.loads(line)
            np_deltas = process_drawing_data(json_drawing)
            dict_drawing = {'key_id': json_drawing['key_id'],
                            'class': json_drawing['word'],
                            'length': len(np_deltas),
                            'list_deltas': np_deltas.tolist()  # need to be transformed to list to dump as Json file later
                           }
            list_drawings.append(dict_drawing)

    return list_drawings

In [None]:
%%time
# Load the first 1000 drawings of the example file with drawings_extraction (timed by the cell magic)
list_drawings = drawings_extraction(ndjson_filepath, 1000)

In [None]:
%%time
# Same load with process_class, to compare its timing with the previous cell
list_drawings = process_class(ndjson_filepath, 1000)

In [None]:
# Inspect the first processed drawing
list_drawings[0]

In [None]:
# Display the path of the class file being processed
ndjson_filepath

In [None]:
# Local path of the output file: same directory as the input, file name prefixed with 'processed_'
output_filepath = '/'.join((path_data, 'processed_' + file_name_example))
output_filepath

In [12]:
# Dump the processed drawings into a single JSON document on disk
def save_images_dict_to_json(list_drawings, output_file):
    '''
        - list_drawings: the processed drawings (any JSON-serializable object)
        - output_file: the complete filepath to the target file to save/create
    The target file is created, or overwritten when it already exists.
    '''
    with open(output_file, 'w') as target:
        target.write(json.dumps(list_drawings))

In [13]:
# Function to save the drawings in the list to an ndjson file locally
def save_drawings_to_ndjson_local(list_drawings: list, output_file: str) -> None:
    '''
    Writes one JSON object per line (NDJSON), so that each line can be read
    back with json.loads.
        - list_drawings: contains a dictionary for each drawing
        - output_file: the complete filepath to the target file to save/create (.ndjson)
    The target file is created, or overwritten when it already exists.
    '''
    with open(output_file, 'w') as ndjson_file:
        # Serialize with json.dumps: str() would write the Python repr of the
        # dictionary (single quotes), which is not valid JSON
        for dict_drawing in list_drawings:
            ndjson_file.write(json.dumps(dict_drawing) + '\n')

In [None]:
%%time
# drawings_extraction_v2 is not defined in this notebook; process_class is the second loader
list_drawings = process_class(ndjson_filepath, 1000)

In [None]:
%%time
# The target path is held by output_filepath (output_file is only a parameter name)
save_drawings_to_ndjson_local(list_drawings, output_filepath)

In [None]:
# upload_blob expects (source_path, source_file_name, bucket_name); the blob is named after the file by default
upload_blob(path_data, 'processed_' + file_name_example, bucket_drawings_simplified_processed)

# Go through all the blobs in a bucket

In [None]:
# TODO:
# Push the new file to gcp bucket
# Run for the list of ndjson files in a loop
# Include the resampling of the raw data 

In [19]:
# Names of all the blobs of the simplified drawings bucket; the processing loop iterates over them
list_classes = list_blobs(bucket_drawings_simplified)

In [None]:
# %pip installs into the environment of the running kernel (a `!pip` shell call may target another one)
%pip install alive-progress

In [None]:
# %pip installs into the environment of the running kernel (a `!pip` shell call may target another one)
%pip install tqdm

In [20]:
from alive_progress import alive_bar
import time
from tqdm.auto import tqdm


In [22]:
nb_classes = len(list_classes)

# Outer progress bar: one step per class file; the description shows the current class
l_bar='{percentage:3.0f}%|'
bar = '{bar}'
r_bar='| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}{postfix}] {desc}'
bar_format = l_bar + bar + r_bar
processing_bar = tqdm(list_classes, bar_format=bar_format)

for blob_name in processing_bar:
    # Announce the class before the work starts, so the bar never lags one class behind
    processing_bar.set_description("Processing %s" % blob_name)
    # Define the blob files locally
    blob_filepath = '/'.join((path_data, blob_name))
    blob_processed_filepath = '/'.join((path_data, 'test_' + blob_name))
    # Download that blob from the cloud (disabled: the file is expected to be present locally)
    # download_blob(bucket_drawings_simplified, blob_name, path_data)
    # Process that blob (class)
    list_drawings = process_class(blob_filepath, 'all')
    # Save the processed drawings locally
    save_drawings_to_ndjson_local(list_drawings, blob_processed_filepath)
    # Upload the processed blobs to the cloud (disabled)
    # upload_blob(path_data, 'test_' + blob_name, bucket_drawings_simplified_processed)

  0%|          | 0/345 [00:00<?, ?it/s] 

  0%|          | 0/134801 [00:00<?, ?it/s]

  0%|          | 0/193015 [00:00<?, ?it/s]

  0%|          | 0/121383 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [1]:
# Count the lines of the face class file from the shell
!wc -l '/Users/loicregne/code/rs-uk/raw_data/face.ndjson'  

  161666 /Users/loicregne/code/rs-uk/raw_data/face.ndjson


In [3]:
# Same count through subprocess: the result is a bytes string holding the count followed by the path
subprocess.check_output(['wc', '-l', '/Users/loicregne/code/rs-uk/raw_data/face.ndjson'])

b'  161666 /Users/loicregne/code/rs-uk/raw_data/face.ndjson\n'