# Move MSRB Intraday Trade Files into Folders Grouped By Date
Last updated by Developer on 2024-02-20.

This notebook was used to move every JSON file in the Google Cloud bucket `msrb_intraday_real_time_trade_files` into a folder named after the date on which the file was created. This was done because the bucket held more than 900k JSON files at the time of this cleanup, which made it impossible to locate or search for files in the bucket: Google's filtering does not work once a bucket contains a large number of files (the exact threshold is unknown). 

Perhaps this notebook can be used in the future for other Google Cloud bucket cleanup tasks.

In [None]:
import multiprocess as mp    # using `multiprocess` instead of `multiprocessing` because function to be called in `map` is in the same file as the function which is calling it: https://stackoverflow.com/questions/41385708/multiprocessing-example-giving-attributeerror

from google.cloud import storage

In [None]:
import os
# Set the environment variable that holds the path of the service-account key file used for this cleanup.
os.environ.update({'GOOGLE_APPLICATION_CREDENTIALS': '/home/user/ficc/mitas_creds.json'})

In [None]:
def rename_file(bucket_name, old_file_name, new_file_name):
    '''Rename a blob inside a Google Cloud Storage bucket.

    The rename is carried out as a copy of the blob to `new_file_name` followed by a delete of
    the blob stored under `old_file_name`.

    Args:
        bucket_name: name of the bucket that holds the blob.
        old_file_name: current name (full path inside the bucket) of the blob.
        new_file_name: name (full path inside the bucket) the blob should have afterwards.
    '''
    if old_file_name == new_file_name:
        return    # nothing to rename; copying a blob onto itself and then deleting the "original" would delete the only copy
    client = storage.Client()    # initialize a client
    bucket = client.get_bucket(bucket_name)    # get bucket reference
    blob = bucket.blob(old_file_name)    # get blob (file) reference
    bucket.copy_blob(blob, bucket, new_file_name)    # copy the blob to the new destination with the new name
    blob.delete()    # delete the original blob
    # deliberately no per-file print statement: too many print statements eventually cause the notebook to crash

In [None]:
client = storage.Client()    # initialize a client
bucket = client.get_bucket('msrb_intraday_real_time_trade_files')    # get bucket reference

# Only files at the top level of the bucket still need to be moved. Passing `delimiter='/'` asks
# Cloud Storage to leave out every object whose name contains a '/', so the listing does not have
# to page through files that already sit inside a folder. The `'/' not in blob.name` check is kept
# as a safety net, and the comprehension turns the iterator into a list (needed for `len` and for
# iterating over the blobs more than once).
blobs = [blob for blob in bucket.list_blobs(delimiter='/') if '/' not in blob.name]

preview_count = 10
for blob in blobs[:preview_count]:    # print the names of the first `preview_count` blobs as a sanity check
    print(blob.name)

In [None]:
# use for monitoring: number of blobs collected in `blobs`
num_blobs = len(blobs)
print('Total blobs:', num_blobs)

In [None]:
def date_from_filename(filename, suffix):
    '''Return the `YYYY-MM-DD` date embedded in an MSRB real-time trade filename.

    The expected layout is `real_time_msrb_file_YYYY-MM-DD_HH:MM:SS<suffix>`: the fixed prefix,
    10 characters of date, 9 characters of `_HH:MM:SS`, and finally `suffix`.

    Args:
        filename: blob name to parse.
        suffix: expected ending of the filename, e.g. `'.json'`. May be the empty string.

    Returns:
        The 10 characters that directly follow the prefix. Only the position and the overall
        length are checked; the characters are not validated as a calendar date.

    Raises:
        AssertionError: if the prefix, the suffix, or the overall length does not match. The error
            is raised explicitly instead of via `assert` so that the checks are still performed
            when Python runs with `-O`, while callers that catch `AssertionError` keep working.
    '''
    prefix = 'real_time_msrb_file_'
    num_characters_in_date = 10    # YYYY-MM-DD
    num_characters_in_time = 9    # _HH:MM:SS
    if not filename.startswith(prefix):
        raise AssertionError(f'filename: {filename} does not have the correct prefix')
    if not filename.endswith(suffix):    # unlike the slice `filename[-0:]`, `endswith` also works for an empty suffix
        raise AssertionError(f'filename: {filename} does not have the correct suffix')
    if len(filename) != len(prefix) + len(suffix) + num_characters_in_date + num_characters_in_time:
        raise AssertionError(f'filename: {filename} does not have the right format')
    return filename[len(prefix) : len(prefix) + num_characters_in_date]

In [None]:
def create_date_to_blobs_dict(suffix: str = '.json') -> dict:
    '''Group the module-level `blobs` by the date embedded in their names.

    Every blob whose name is rejected by `date_from_filename` for the given `suffix` is reported
    with `print` and left out. Returns a dictionary that maps each date string to the list of
    blobs carrying that date, in the order in which they appear in `blobs`.'''
    blobs_by_date = {}
    for candidate in blobs:
        try:
            date_key = date_from_filename(candidate.name, suffix)
        except AssertionError as error:
            print(error)    # report the name that did not match and move on to the next blob
        else:
            blobs_by_date.setdefault(date_key, []).append(candidate)
    return blobs_by_date

In [None]:
def blobs_to_filenames(date_to_filenames_dict: dict) -> dict:
    '''Replace, in place, every list of blobs in `date_to_filenames_dict` with the list of the
    blobs' names, print the total number of blobs, and return the same dictionary.

    This has to happen before the dictionary is handed to `multiprocessing`: blobs are not
    picklable, whereas plain strings (the filenames) are.'''
    blob_count = sum(len(blob_list) for blob_list in date_to_filenames_dict.values())    # use for monitoring
    for date in date_to_filenames_dict:    # only values of existing keys are reassigned, so iterating the dictionary here is safe
        date_to_filenames_dict[date] = [blob.name for blob in date_to_filenames_dict[date]]
    print('Total blobs:', blob_count)
    return date_to_filenames_dict

### Handle all files with format: `real_time_msrb_file_YYYY-MM-DD_HH:MM:SS.json`

In [None]:
date_to_filenames = create_date_to_blobs_dict()    # date -> list of blobs, for the default `.json` suffix
date_to_filenames = blobs_to_filenames(date_to_filenames)    # date -> list of filenames (strings can be pickled, blobs cannot)
date_to_filenames    # preview the dictionary

In [None]:
def rename_file_with_date_as_folder(date, filename_list: list, bucket_name: str = 'msrb_intraday_real_time_trade_files'):
    '''Move every file in `filename_list` into the folder named `date`.

    Args:
        date: folder name to put in front of each filename, e.g. `'2021-05-04'`.
        filename_list: names of the blobs to move; each one is renamed to `{date}/{filename}`.
        bucket_name: bucket that holds the blobs. Defaults to the MSRB intraday bucket so that
            existing two-argument calls (including the `starmap` call) behave as before.
    '''
    print('date:', date)    # use for monitoring
    for filename in filename_list:
        rename_file(bucket_name, filename, f'{date}/{filename}')

Without multiprocessing.

In [None]:
def rename_files(date_to_filenames: dict):
    '''Move the files of every date into that date's folder, one date after the other.

    `date_to_filenames` maps a date string to the list of filenames that belong to it; each
    (date, filenames) pair is handed to `rename_file_with_date_as_folder` in dictionary order.'''
    for date_and_filenames in date_to_filenames.items():
        rename_file_with_date_as_folder(*date_and_filenames)

In [None]:
# Moves the files one date at a time in this process. `rename_file` deletes each original blob
# after copying it, so for a given dictionary run either this cell or the multiprocessing cell
# below, not both.
rename_files(date_to_filenames)

Use multiprocessing for the outer `for` loop.

In [None]:
def rename_files_with_multiprocessing(date_to_filenames_dict: dict):
    '''Move the files of every date into that date's folder, spreading the dates over a process pool.

    `date_to_filenames_dict` maps a date string to the list of filenames that belong to it. The
    values must be filenames (strings) rather than blobs because the arguments are pickled.'''
    date_and_filenames_pairs = list(date_to_filenames_dict.items())    # one (date, filename_list) task per date
    with mp.Pool() as worker_pool:    # using template from https://docs.python.org/3/library/multiprocessing.html
        # `starmap` unpacks each pair into the two arguments of `rename_file_with_date_as_folder`: https://stackoverflow.com/questions/5442910/how-to-use-multiprocessing-pool-map-with-multiple-arguments
        worker_pool.starmap(rename_file_with_date_as_folder, date_and_filenames_pairs)

In [None]:
# Same move as `rename_files(date_to_filenames)`, but with the dates distributed over a process pool.
# Run this instead of the sequential cell above, not in addition to it: the originals are deleted
# once they have been copied.
rename_files_with_multiprocessing(date_to_filenames)

### Handle all files with format: `real_time_msrb_file_YYYY-MM-DD_HH:MM:SS_from_fast_redis_update.json`

In [None]:
date_to_filenames_from_fast_redis_update = create_date_to_blobs_dict(suffix='_from_fast_redis_update.json')    # date -> list of blobs
date_to_filenames_from_fast_redis_update = blobs_to_filenames(date_to_filenames_from_fast_redis_update)    # date -> list of filenames (strings can be pickled, blobs cannot)
date_to_filenames_from_fast_redis_update    # preview the dictionary

Without multiprocessing.

In [None]:
# Moves the `_from_fast_redis_update.json` files one date at a time in this process. Run either this
# cell or the multiprocessing cell below, not both: the originals are deleted once they have been copied.
rename_files(date_to_filenames_from_fast_redis_update)

Use multiprocessing for the outer `for` loop.

In [None]:
# Same move as the sequential cell above, but with the dates distributed over a process pool.
# Run this instead of the sequential cell, not in addition to it.
rename_files_with_multiprocessing(date_to_filenames_from_fast_redis_update)

### Handling incorrectly renamed blobs (blobs that ended up with a wrong name during the first pass of this code)

In [None]:
client = storage.Client()    # initialize a client
bucket = client.get_bucket('msrb_intraday_real_time_trade_files')    # get bucket reference

# list all blobs (files) in the bucket and keep only those whose names contain the doubled-up path of the incorrectly renamed blobs
damaged_path_fragment = '2021-05-04/msrb_intraday_real_time_trade_files/2021-05-04/'
damaged_blobs = [candidate for candidate in bucket.list_blobs() if damaged_path_fragment in candidate.name]

for damaged_blob in damaged_blobs[:10]:    # print the names of the first 10 damaged blobs
    print(damaged_blob.name)

In [None]:
len(damaged_blobs)    # use for monitoring: number of blobs in `damaged_blobs` that still have to be repaired

In [None]:
date = '2021-05-04'
for damaged_blob in damaged_blobs:
    # keep only the part after the last '/', i.e. strip every directory from the damaged name
    # (a name without any '/' is kept whole)
    original_filename = damaged_blob.name.rsplit('/', 1)[-1]
    rename_file('msrb_intraday_real_time_trade_files', damaged_blob.name, f'{date}/{original_filename}')