In [1]:
import mltable

# glob the parquet file paths for years 2015-19, all months.
paths = [
    {
        "pattern": "wasbs://nyctlc@azureopendatastorage.blob.core.windows.net/green/puYear=2015/puMonth=*/*.parquet"
    },
    {
        "pattern": "wasbs://nyctlc@azureopendatastorage.blob.core.windows.net/green/puYear=2016/puMonth=*/*.parquet"
    },
    {
        "pattern": "wasbs://nyctlc@azureopendatastorage.blob.core.windows.net/green/puYear=2017/puMonth=*/*.parquet"
    },
    {
        "pattern": "wasbs://nyctlc@azureopendatastorage.blob.core.windows.net/green/puYear=2018/puMonth=*/*.parquet"
    },
    {
        "pattern": "wasbs://nyctlc@azureopendatastorage.blob.core.windows.net/green/puYear=2019/puMonth=*/*.parquet"
    },
]

# create a table from the parquet paths
tbl = mltable.from_parquet_files(paths)

# table a random sample
tbl = tbl.take_random_sample(probability=0.001, seed=735)

# filter trips with a distance > 0
tbl = tbl.filter("col('tripDistance') > 0")

# Drop columns
tbl = tbl.drop_columns(["puLocationId", "doLocationId", "storeAndFwdFlag"])

# Create two new columns - year and month - where the values are taken from the path
tbl = tbl.extract_columns_from_partition_format("/puYear={year}/puMonth={month}")

# print the first 5 records of the table as a check
tbl.show(5)

Unnamed: 0,vendorID,lpepPickupDatetime,lpepDropoffDatetime,passengerCount,tripDistance,pickupLongitude,pickupLatitude,dropoffLongitude,dropoffLatitude,rateCodeID,...,extra,mtaTax,improvementSurcharge,tipAmount,tollsAmount,ehailFee,totalAmount,tripType,year,month
0,2,2015-01-01 21:12:07,2015-01-01 21:15:41,3,0.71,-73.925369,40.761669,-73.923599,40.754658,1,...,0.5,0.5,0.3,1.0,0.0,,6.8,1,2015,1
1,2,2015-01-01 03:46:04,2015-01-01 04:01:51,1,1.06,-73.964684,40.682896,-73.961937,40.678196,1,...,0.5,0.5,0.3,0.0,0.0,,7.8,1,2015,1
2,2,2015-01-01 03:55:56,2015-01-01 04:10:34,1,4.9,-73.989822,40.691109,-73.974564,40.646412,1,...,0.5,0.5,0.3,4.38,0.0,,22.68,1,2015,1
3,2,2015-01-01 03:54:46,2015-01-01 04:14:51,1,3.57,-73.932167,40.707943,-73.96389,40.692127,1,...,0.5,0.5,0.3,0.0,0.0,,16.8,1,2015,1
4,2,2015-01-01 04:01:51,2015-01-01 04:09:30,5,1.52,-73.889145,40.747181,-73.895729,40.731693,1,...,0.5,0.5,0.3,0.0,0.0,,8.8,1,2015,1


In [2]:
# You can load the table into a pandas dataframe
# NOTE: The data is in East US region and the data is large, so this will take several minutes (~7mins)
# to load if you are in a different region.

# df = tbl.to_pandas_dataframe()

In [5]:
# serialize the above data loading steps into an MLTable file
tbl.save("./nyc_taxi")

In [6]:
with open("./nyc_taxi/MLTable", "r") as f:
    print(f.read())

paths:
- pattern: wasbs://nyctlc@azureopendatastorage.blob.core.windows.net/green/puYear=2015/puMonth=*/*.parquet
- pattern: wasbs://nyctlc@azureopendatastorage.blob.core.windows.net/green/puYear=2016/puMonth=*/*.parquet
- pattern: wasbs://nyctlc@azureopendatastorage.blob.core.windows.net/green/puYear=2017/puMonth=*/*.parquet
- pattern: wasbs://nyctlc@azureopendatastorage.blob.core.windows.net/green/puYear=2018/puMonth=*/*.parquet
- pattern: wasbs://nyctlc@azureopendatastorage.blob.core.windows.net/green/puYear=2019/puMonth=*/*.parquet
transformations:
- extract_columns_from_partition_format:
    ignore_errors: false
    partition_format: /puYear={year}/puMonth={month}
    path_column: Path
- read_parquet:
    include_path_column: false
    path_column: Path
- take_random_sample:
    probability: 0.001
    seed: 735
- filter: col('tripDistance') > 0
- drop_columns:
  - puLocationId
  - doLocationId
  - storeAndFwdFlag
type: mltable



## ♻️ Reproduce data loading steps

Now that the data loading steps have been serialized into a file, you can reproduce them at any point in time using the `load()` method. This means you do not need to redefine your data loading steps in code and makes it easier to share with others.

In [7]:
import mltable

# load the previously saved MLTable file
tbl = mltable.load("./nyc_taxi/")

# You can load the table into a pandas dataframe
# NOTE: The data is in East US region and the data is large, so this will take several minutes (~7mins)
# to load if you are in a different region.

# load the table into pandas
# df = tbl.to_pandas_dataframe()

# print the head of the data frame
# df.head()
# print the shape and column types of the data frame
# print(f"Shape: {df.shape}")
# print(f"Columns:\n{df.dtypes}")

### 🤝 Create a data asset to aid sharing and reproducibility

Your `MLTable` file is currently saved on disk, making it hard to share with Team members. By creating a *data asset* in AzureML, your MLTable will be uploaded to cloud storage and "bookmarked", meaning your Team members can access the MLTable using a friendly name. Also, the data asset is *versioned*.

In [9]:
subscription_id = "910ebf13-1058-405d-b6cf-eda03e5288d1"
resource_group = "aml-rg"
workspace = "aml-ws"

In [11]:
import time
from azure.ai.ml import MLClient
from azure.ai.ml.entities import Data
from azure.ai.ml.constants import AssetTypes
from azure.identity import DefaultAzureCredential

# set the version number of the data asset to the current UTC time
VERSION = time.strftime("%Y.%m.%d.%H%M%S", time.gmtime())

# connect to the AzureML workspace
ml_client = MLClient(
    DefaultAzureCredential(), subscription_id, resource_group, workspace
)

my_data = Data(
    path="./nyc_taxi",
    type=AssetTypes.MLTABLE,
    description="A random sample of NYC Green Taxi Data between 2015-19.",
    name="green-quickstart",
    version=VERSION,
)

ml_client.data.create_or_update(my_data)

[32mUploading nyc_taxi (0.0 MBs): 100%|██████████| 946/946 [00:00<00:00, 11668.63it/s]
[39m



Data({'path': 'azureml://subscriptions/910ebf13-1058-405d-b6cf-eda03e5288d1/resourcegroups/aml-rg/workspaces/aml-ws/datastores/workspaceblobstore/paths/LocalUpload/221cad589ef5805f871fdbb2ed8b9f0cb59d685428ef25c7f4e7244a86aed899/nyc_taxi/', 'skip_validation': False, 'mltable_schema_url': None, 'referenced_uris': ['wasbs://nyctlc@azureopendatastorage.blob.core.windows.net/green/puYear=2015/puMonth=*/*.parquet', 'wasbs://nyctlc@azureopendatastorage.blob.core.windows.net/green/puYear=2016/puMonth=*/*.parquet', 'wasbs://nyctlc@azureopendatastorage.blob.core.windows.net/green/puYear=2017/puMonth=*/*.parquet', 'wasbs://nyctlc@azureopendatastorage.blob.core.windows.net/green/puYear=2018/puMonth=*/*.parquet', 'wasbs://nyctlc@azureopendatastorage.blob.core.windows.net/green/puYear=2019/puMonth=*/*.parquet'], 'type': 'mltable', 'is_anonymous': False, 'auto_increment_version': False, 'auto_delete_setting': None, 'name': 'green-quickstart', 'description': 'A random sample of NYC Green Taxi Data be


### 📖 Read the data asset in an interactive session

Now you have your MLTable stored in the cloud, you and Team members can access it using a friendly name in an interactive session (for example, a notebook).

In [21]:
import mltable
from azure.ai.ml import MLClient
from azure.identity import DefaultAzureCredential

# connect to the AzureML workspace
ml_client = MLClient(
    DefaultAzureCredential(), subscription_id, resource_group, workspace
)

# get the latest version of the data asset
# Note: The version was set in the previous code cell.
data_asset = ml_client.data.get(name="green-quickstart", version=VERSION)

data_asset

Overriding of current TracerProvider is not allowed
Overriding of current LoggerProvider is not allowed
Overriding of current MeterProvider is not allowed
Attempting to instrument while already instrumented
Attempting to instrument while already instrumented
Attempting to instrument while already instrumented


Data({'path': 'azureml://subscriptions/910ebf13-1058-405d-b6cf-eda03e5288d1/resourcegroups/aml-rg/workspaces/aml-ws/datastores/workspaceblobstore/paths/LocalUpload/221cad589ef5805f871fdbb2ed8b9f0cb59d685428ef25c7f4e7244a86aed899/nyc_taxi/', 'skip_validation': False, 'mltable_schema_url': None, 'referenced_uris': ['wasbs://nyctlc@azureopendatastorage.blob.core.windows.net/green/puYear=2015/puMonth=*/*.parquet', 'wasbs://nyctlc@azureopendatastorage.blob.core.windows.net/green/puYear=2016/puMonth=*/*.parquet', 'wasbs://nyctlc@azureopendatastorage.blob.core.windows.net/green/puYear=2017/puMonth=*/*.parquet', 'wasbs://nyctlc@azureopendatastorage.blob.core.windows.net/green/puYear=2018/puMonth=*/*.parquet', 'wasbs://nyctlc@azureopendatastorage.blob.core.windows.net/green/puYear=2019/puMonth=*/*.parquet'], 'type': 'mltable', 'is_anonymous': False, 'auto_increment_version': False, 'auto_delete_setting': None, 'name': 'green-quickstart', 'description': 'A random sample of NYC Green Taxi Data be

In [None]:


# create a table
tbl = mltable.load(f"azureml:/{data_asset.id}")

tbl.show(5)

# load into pandas
# NOTE: The data is in East US region and the data is large, so this will take several minutes (~7mins) to load if you are in a different region.
# df = tbl.to_pandas_dataframe()

Overriding of current TracerProvider is not allowed
Overriding of current LoggerProvider is not allowed
Overriding of current MeterProvider is not allowed
Attempting to instrument while already instrumented
Attempting to instrument while already instrumented
Attempting to instrument while already instrumented
Overriding of current TracerProvider is not allowed
Overriding of current LoggerProvider is not allowed
Overriding of current MeterProvider is not allowed
Attempting to instrument while already instrumented
Attempting to instrument while already instrumented
Attempting to instrument while already instrumented
Overriding of current TracerProvider is not allowed
Overriding of current LoggerProvider is not allowed
Overriding of current MeterProvider is not allowed
Attempting to instrument while already instrumented
Attempting to instrument while already instrumented
Attempting to instrument while already instrumented
Overriding of current TracerProvider is not allowed
Overriding of c

Unnamed: 0,vendorID,lpepPickupDatetime,lpepDropoffDatetime,passengerCount,tripDistance,pickupLongitude,pickupLatitude,dropoffLongitude,dropoffLatitude,rateCodeID,...,extra,mtaTax,improvementSurcharge,tipAmount,tollsAmount,ehailFee,totalAmount,tripType,year,month
0,2,2015-01-01 21:12:07,2015-01-01 21:15:41,3,0.71,-73.925369,40.761669,-73.923599,40.754658,1,...,0.5,0.5,0.3,1.0,0.0,,6.8,1,2015,1
1,2,2015-01-01 03:46:04,2015-01-01 04:01:51,1,1.06,-73.964684,40.682896,-73.961937,40.678196,1,...,0.5,0.5,0.3,0.0,0.0,,7.8,1,2015,1
2,2,2015-01-01 03:55:56,2015-01-01 04:10:34,1,4.9,-73.989822,40.691109,-73.974564,40.646412,1,...,0.5,0.5,0.3,4.38,0.0,,22.68,1,2015,1
3,2,2015-01-01 03:54:46,2015-01-01 04:14:51,1,3.57,-73.932167,40.707943,-73.96389,40.692127,1,...,0.5,0.5,0.3,0.0,0.0,,16.8,1,2015,1
4,2,2015-01-01 04:01:51,2015-01-01 04:09:30,5,1.52,-73.889145,40.747181,-73.895729,40.731693,1,...,0.5,0.5,0.3,0.0,0.0,,8.8,1,2015,1


### 📖 Read the data asset in a job

You can also access your Table in a job, using:

In [19]:
from azure.ai.ml import MLClient, command, Input
from azure.ai.ml.entities import Environment
from azure.identity import DefaultAzureCredential

# connect to the AzureML workspace
ml_client = MLClient(
    DefaultAzureCredential(), subscription_id, resource_group, workspace
)

# get the latest version of the data asset
# Note: the VERSION was set in a previous cell.
data_asset = ml_client.data.get(name="green-quickstart", version=VERSION)

job = command(
    command="python train.py --input ${{inputs.green}}",
    inputs={"green": Input(type="mltable", path=data_asset.id)},
    compute="cpu-cluster",
    environment=Environment(
        image="mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04",
        conda_file="./job-env/conda_dependencies.yml",
    ),
    code="./src",
)

ml_client.jobs.create_or_update(job)

Overriding of current TracerProvider is not allowed
Overriding of current LoggerProvider is not allowed
Overriding of current MeterProvider is not allowed
Attempting to instrument while already instrumented
Attempting to instrument while already instrumented
Attempting to instrument while already instrumented
Class AutoDeleteSettingSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class AutoDeleteConditionSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class BaseAutoDeleteSettingSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class IntellectualPropertySchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class ProtectionLevelSchema: This is an experime

MlException: {
  "result": "Failed",
  "errors": [
    {
      "message": "Can't find directory or file in resolved absolute path: /root/repos/fabric-data-engineering-ws/scripts/aml/azureml:./src.; Not a valid URL.; In order to specify a git path, please provide the correct path prefixed with 'git+\n; In order to specify an existing codes, please provide the correct registry path prefixed with 'azureml://':\n; In order to specify an existing codes, please provide the correct registry path prefixed with 'azureml://':\n; Could not parse ./src. If providing an ARM id, it should start with a '/'.",
      "path": "component.code",
      "value": "azureml:./src"
    }
  ]
}