In [2]:
from azureml.core import Workspace, Datastore, Dataset

# Access datastore 

Data is vital part of the machine learning workflow. In classical software engineering the source coude is version controlled. In machine learning engineering we additionally need to version control the data as well. There are two related concept about the data in azure ml. 
- `Datasores` are the places were data is stored in the cloud. When we create a workspace it creates some default datastore to store the data and artifact. We can also setup additional datastore to the workspace. 
- `Datasets` are versioned data registered in the azure ml workspace. 


In [3]:
ws = Workspace.from_config()

In [4]:
# We can list the all available datasotre in the workspace. 
for ds_name in ws.datastores:
    print(ds_name)

workspaceworkingdirectory
workspaceblobstore
workspacefilestore
workspaceartifactstore


In [5]:
# Accessing the data store by its name. 
Datastore.get(ws, datastore_name='workspaceartifactstore')

{
  "name": "workspaceartifactstore",
  "container_name": "azureml",
  "account_name": "amlmstutstorageaac522ec0",
  "protocol": "https",
  "endpoint": "core.windows.net"
}

In [6]:
# workspaceblobstore is the default datastore in the workspace. 
ws.get_default_datastore()

{
  "name": "workspaceblobstore",
  "container_name": "azureml-blobstore-2c139677-d5ec-4bce-96b5-37848a4e6382",
  "account_name": "amlmstutstorageaac522ec0",
  "protocol": "https",
  "endpoint": "core.windows.net"
}

In [7]:
# We can change the default datastore by the following code. 
ws.set_default_datastore('workspaceblobstore')

```python 
# To register a new azure storage blob container 
# We can make this created datastore as default datastore as well. 
blob_ds = Datastore.register_azure_blob_container(workspace=ws, 
                                                 datastore_name='new_blob_data', 
                                                 container_name='rk_data_container', 
                                                 account_name='name', 
                                                 account_key='key')
```

# Register data 

We register the data for the data versioning and reproducibility. Here we first upload the data from local machine to the `Datastore` and register the data from the `Datastore` to the `Datasets`. 

In [8]:
datastore = ws.get_default_datastore()

In [10]:
# We can register the single csv file 
datastore.upload(src_dir='Data', target_path='data', overwrite=True)

data_path = [(datastore, 'data/iris.csv')]
dataset = Dataset.Tabular.from_delimited_files(path=data_path)
dataset.register(workspace=ws, name='Irish Data')

In [None]:
import pandas as pd

pd.read_csv('Data/iris.csv').sample(5).to_csv('Data/sample.csv')

In [None]:
# We can also register multiple csv files. 
# A second csv file is created above for this demo. 

datastore.upload(src_dir='Data', target_path='data', overwrite=True)

data_path = [(datastore, 'data/iris.csv'), 
             (datastore, 'data/sample.csv')]
dataset = Dataset.Tabular.from_delimited_files(path=data_path)
dataset.register(workspace=ws, name='Two Irish Data')

# Retriving registered data 

We can retrive the registered data as a pandas dataframe in the following different ways. 

In [11]:
ws.datasets['Irish Data'].to_pandas_dataframe().head()



Unnamed: 0,Column1,PL,PW,SL,SW,y
0,84,5.4,3.0,4.5,1.5,1
1,127,6.1,3.0,4.9,1.8,2
2,85,6.0,3.4,4.5,1.6,1
3,146,6.3,2.5,5.0,1.9,2
4,25,5.0,3.0,1.6,0.2,0


In [12]:
Dataset.get_by_name(ws, 'Irish Data').to_pandas_dataframe().head()



Unnamed: 0,Column1,PL,PW,SL,SW,y
0,84,5.4,3.0,4.5,1.5,1
1,127,6.1,3.0,4.9,1.8,2
2,85,6.0,3.4,4.5,1.6,1
3,146,6.3,2.5,5.0,1.9,2
4,25,5.0,3.0,1.6,0.2,0


# Data versioning 

We can turn on and off the functionality to version the data during registration. If not versioned the registration overwrites the existing data. 

In [None]:
# Single csv file 
datastore.upload(src_dir='Data', target_path='data', overwrite=True)

data_path = [(datastore, 'data/sample.csv')]
dataset = Dataset.Tabular.from_delimited_files(path=data_path)

dataset.register(workspace=ws, name='Irish Data', create_new_version=True)

# Retriving vesrioned and Combined data

We can retrive a specific version of the versioned data. 

In [13]:
# Versioned data 
Dataset.get_by_name(ws, 'Irish Data', version=2).to_pandas_dataframe().head()



Unnamed: 0,Column1,PL,PW,SL,SW,y
0,84,5.4,3.0,4.5,1.5,1
1,127,6.1,3.0,4.9,1.8,2
2,85,6.0,3.4,4.5,1.6,1
3,146,6.3,2.5,5.0,1.9,2
4,25,5.0,3.0,1.6,0.2,0


In [29]:
# Splitted data (How to do it correctly?)
Dataset.get_by_name(ws, 'Two Irish Data').to_pandas_dataframe()

Unnamed: 0,PL,PW,SL,SW,y,Column6
0,5.1,3.5,1.4,0.2,0.0,
1,4.9,3.0,1.4,0.2,0.0,
2,4.7,3.2,1.3,0.2,0.0,
3,4.6,3.1,1.5,0.2,0.0,
4,5.0,3.6,1.4,0.2,0.0,
...,...,...,...,...,...,...
150,84.0,5.4,3.0,4.5,1.5,1.0
151,127.0,6.1,3.0,4.9,1.8,2.0
152,85.0,6.0,3.4,4.5,1.6,1.0
153,146.0,6.3,2.5,5.0,1.9,2.0
