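Notebook that rewrites the SDOML inventory stored in the GCP bucket so that every file path points to the new NPZ file instead of the old MM file. Note that the inventory file we write out is compressed as a GZIP file, whereas the inventory we read in is an uncompressed pickle.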

In [None]:
# Before running this notebook, define the following environment variable:
#   export GOOGLE_APPLICATION_CREDENTIALS=~/expanding-sdo-capabilities/config/space_weather_sdo.json
# Then run the following command to allow GCP storage access:
#   gcloud auth application-default login

In [1]:
from io import BytesIO

from google.cloud import storage
from google.cloud.storage.blob import Blob
import pandas as pd

In [2]:
# Storage client; the setup notes for this notebook expect GCP credentials to be
# configured in the environment before this cell runs.
client = storage.Client()

# Bucket that holds the SDOML data, and the object key of the existing
# inventory pickle whose paths still point at the MM files.
bucket_name = 'fdl-sdo-data'
inventory_in_path = 'SDOMLmm/inventory.pkl'

# Handle to the bucket, reused below for both the download and the upload.
bucket = client.get_bucket(bucket_name)

In [3]:
# Pull the raw bytes of the existing inventory pickle out of the bucket.
# NOTE(review): newer google-cloud-storage releases document download_as_string()
# as a deprecated alias of download_as_bytes() -- confirm the installed client
# version before switching.
inventory_blob = bucket.blob(inventory_in_path)
data = inventory_blob.download_as_string()

In [4]:
# SECURITY: unpickling can execute arbitrary code, so only load an inventory
# that comes from a bucket you trust.
# The downloaded bytes are wrapped in an in-memory buffer and read as an
# uncompressed pickle (compression=None).
inventory_buffer = BytesIO(data)
df = pd.read_pickle(inventory_buffer, compression=None)

In [5]:
# Preview the inventory as loaded; the `file` column still holds the old MM paths.
df

Unnamed: 0_level_0,year,month,day,hour,min,channel,file
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1076941452,2010,5,1,0,12,by,/fdl_sdo_data/SDOMLmm/2010/05/01/HMI20100501_0...
1076941452,2010,5,1,0,12,bx,/fdl_sdo_data/SDOMLmm/2010/05/01/HMI20100501_0...
1076941452,2010,5,1,0,12,bz,/fdl_sdo_data/SDOMLmm/2010/05/01/HMI20100501_0...
1076941464,2010,5,1,0,24,by,/fdl_sdo_data/SDOMLmm/2010/05/01/HMI20100501_0...
1076941464,2010,5,1,0,24,bx,/fdl_sdo_data/SDOMLmm/2010/05/01/HMI20100501_0...
...,...,...,...,...,...,...,...
1081552314,2018,12,9,23,54,1600,/fdl_sdo_data/SDOMLmm/2018/12/09/AIA20181209_2...
1081552314,2018,12,9,23,54,1700,/fdl_sdo_data/SDOMLmm/2018/12/09/AIA20181209_2...
1081552314,2018,12,9,23,54,0094,/fdl_sdo_data/SDOMLmm/2018/12/09/AIA20181209_2...
1081552314,2018,12,9,23,54,0193,/fdl_sdo_data/SDOMLmm/2018/12/09/AIA20181209_2...


In [12]:
# Rewrite the inventory paths from the old MM layout to the NPZ layout.
#
# Only the `file` column is touched, so string values in other columns (for
# example `channel`) can never be altered by accident.
#  * The bucket prefix is replaced as a literal string (regex=False).
#  * The extension is matched with an escaped dot and anchored to the end of
#    the path. An unescaped '.mm' regex would match ANY character followed by
#    'mm' anywhere in the string, not just the trailing '.mm' extension.
# The result is assigned back to df['file'] so later cells keep using `df`.
df['file'] = (
    df['file']
    .str.replace('/fdl_sdo_data/SDOMLmm/', 'SDOMLnpz/', regex=False)
    .str.replace(r'\.mm$', '.npz', regex=True)
)


In [13]:
# Preview the inventory again to check that the `file` column now holds SDOMLnpz/... .npz paths.
df

Unnamed: 0_level_0,year,month,day,hour,min,channel,file
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1076941452,2010,5,1,0,12,by,SDOMLnpz/2010/05/01/HMI20100501_0012_by.npz
1076941452,2010,5,1,0,12,bx,SDOMLnpz/2010/05/01/HMI20100501_0012_bx.npz
1076941452,2010,5,1,0,12,bz,SDOMLnpz/2010/05/01/HMI20100501_0012_bz.npz
1076941464,2010,5,1,0,24,by,SDOMLnpz/2010/05/01/HMI20100501_0024_by.npz
1076941464,2010,5,1,0,24,bx,SDOMLnpz/2010/05/01/HMI20100501_0024_bx.npz
...,...,...,...,...,...,...,...
1081552314,2018,12,9,23,54,1600,SDOMLnpz/2018/12/09/AIA20181209_2354_1600.npz
1081552314,2018,12,9,23,54,1700,SDOMLnpz/2018/12/09/AIA20181209_2354_1700.npz
1081552314,2018,12,9,23,54,0094,SDOMLnpz/2018/12/09/AIA20181209_2354_0094.npz
1081552314,2018,12,9,23,54,0193,SDOMLnpz/2018/12/09/AIA20181209_2354_0193.npz


In [14]:
# Stage the updated inventory on local disk before uploading it.
# The pickle is written gzip-compressed even though the filename has no .gz
# suffix, so the compression has to be given explicitly here.
local_path = '/tmp/inventory.pkl'
df.to_pickle(
    local_path,
    compression='gzip',
)

In [15]:
# Upload the locally staged inventory to its new location in the bucket.
#
# Note: the upload fails with permission errors if SDOMLnpz/inventory.pkl
# already exists on GCP. If that happens, navigate to the existing file in the
# GCP browser and delete it, then re-run this cell to upload the new version.
inventory_out_path = 'SDOMLnpz/inventory.pkl'
blob = Blob(name=inventory_out_path, bucket=bucket)
blob.upload_from_filename(filename=local_path)