# This notebook registers aerosol datasets to an Azure Machine Learning workspace

The datasets are originally stored in an Azure Blob Storage container.

### Connect to Workspace, read configs

In [None]:
from azureml.core import Workspace

# Connect to the Azure ML workspace using the locally stored workspace configuration.
ws = Workspace.from_config()

In [None]:
# Read the notebook configuration (blob storage settings and data file names)
# from config.yml into the `cfg` dictionary used by the cells below.
import yaml
import os

with open("config.yml", "r") as ymlfile:
    # safe_load only constructs plain YAML types (dicts, lists, strings,
    # numbers), which is all a configuration file needs, and avoids the
    # object-construction features of the fuller loaders.
    cfg = yaml.safe_load(ymlfile)

### Register blob storage container as Datastore

In [None]:
from azureml.core import Datastore

# Register the blob container named in the config as the 'aerosol_data'
# datastore of the workspace. An existing registration with the same name
# is overwritten; authentication uses the account key from the config.
storage_cfg = cfg["datastorage"]

aerosol_ds = Datastore.register_azure_blob_container(
    workspace=ws,
    datastore_name='aerosol_data',
    container_name=storage_cfg["container_name"],
    account_name=storage_cfg["account_name"],
    sas_token=None,
    account_key=storage_cfg["account_key"],
    protocol=None,
    endpoint=None,
    overwrite=True,
    create_if_not_exists=False,
    skip_validation=False,
    blob_cache_timeout=None,
    grant_workspace_access=False,
    subscription_id=None,
    resource_group=None,
)

### Create versioned datasets

1. Get and register concentration data


In [None]:
from azureml.core import Dataset

# Concentration data: point a tabular dataset at the CSV file in the datastore,
# then register it under the configured name, creating a new version.
datafiles_cfg = cfg['datafiles']
csv_measurement_path = [(aerosol_ds, datafiles_cfg['conc_filename'])]
aerosol_dataset = Dataset.Tabular.from_delimited_files(path=csv_measurement_path)
aerosol_dataset.register(
    workspace=ws,
    name=datafiles_cfg['conc_datasetname'],
    create_new_version=True,
);

2. Get, clean and register classification data

In [None]:
import pandas as pd
import os

# Load the raw classification file from the datastore into a DataFrame.
csv_classification_path = [(aerosol_ds, cfg['datafiles']['class_filename'])]
classification_dataset = Dataset.Tabular.from_delimited_files(path=csv_classification_path)
df = classification_dataset.to_pandas_dataframe()

# All content is taken from the column named '# Hyytiälä SMEAR II data'.
raw_lines = df['# Hyytiälä SMEAR II data']

# The comment rows (starting with '#') at the beginning of the file contain the
# column names: skip the first three of them and strip the first six characters
# from each remaining row.
comment_lines = raw_lines[raw_lines.str.startswith('#')]
new_column_names = [line[6:] for line in comment_lines.iloc[3:].to_list()]

# Data rows are those starting with '7'; split each on single spaces so that
# every value gets its own column, then apply the extracted column names.
data_lines = raw_lines[raw_lines.str.startswith('7')]
df = data_lines.str.split(' ', expand=True)
df.columns = new_column_names

# Convert the Matlab datenum to a datetime index. 719529 is the Matlab datenum
# of 1970-01-01, so the difference is the number of days since the Unix epoch.
days_since_epoch = df['Matlab datenum'].astype(int) - 719529
df.index = pd.to_datetime(days_since_epoch, unit='D')
df.index.name = 'date'
df = df.drop(columns=['Matlab datenum'])

# Write the cleaned data to a temporary CSV file, because uploading to the
# datastore works from files on disk.
os.makedirs('../tempdata', exist_ok=True)
temp_csv_path = '../tempdata/' + cfg['datafiles']['class_tempdatafile']
df.to_csv(temp_csv_path)



In [None]:
# Save the cleaned classification data back to the datastore.
# The cleaning cell writes the CSV into '../tempdata', so the upload has to read
# from that same directory; 'tempdata' would resolve to a different folder
# relative to the working directory.
aerosol_ds.upload(src_dir='../tempdata',
                  target_path=cfg['datafiles']['class_path'],
                  overwrite=True)

# Point a tabular dataset at the cleaned file in the datastore and register it
# under the configured name, creating a new version.
csv_classification_clean_path = [(aerosol_ds,
                                  cfg['datafiles']['class_cleanfilename'])]

classification_dataset = Dataset.Tabular.from_delimited_files(path=csv_classification_clean_path)

classification_dataset.register(workspace=ws,
                                name=cfg['datafiles']['class_datasetname'],
                                create_new_version=True);
