In [None]:
# User-editable settings: adjust the values below before running the notebook.
from datetime import datetime

# Start and end of the date range of data to fetch
data_start_dt = datetime(year=2017, month=1, day=1)
data_end_dt = datetime(year=2018, month=12, day=31)

# Folder on the local machine that receives the downloaded files
temp_local_dir = "tmp"

# Azure Data Lake Storage (ADLS) target
storage_account_name = "amldbxadls"       # ADLS account name
container_name = "data"                   # container in that account; it must already exist
path_to_raw_data = "nyctlcraw"            # path inside the container for the raw files
path_to_cleaned_date = "nyctlccleaned"    # path inside the container for the cleaned files

# Name of the Azure ML Datastore that represents the ADLS container
datastore_name = "adlsgen2store"


In [None]:
# Fetch the NYC TLC "green" files into the local folder named by `temp_local_dir`.
# WARNING: an existing folder of that name is deleted first -- back up anything valuable.

# Do not change beyond this point
import os
import shutil

from azureml.opendatasets import NycTlcGreen

# Always start from a freshly created, empty download folder
if os.path.exists(temp_local_dir):
    shutil.rmtree(temp_local_dir)
os.makedirs(temp_local_dir)

green_file_dataset = NycTlcGreen.get_file_dataset(data_start_dt, data_end_dt, False)
green_file_dataset.download(f'./{temp_local_dir}')

# The code expects the download to land in a folder tree named after the URL-encoded
# source address: lift the `nyctlc` folder up into `temp_local_dir`, then remove
# what is left of that tree.
url_encoded_root = f'{temp_local_dir}/https%3A'
shutil.move(f'{temp_local_dir}/https%3A/%2Fazureopendatastorage.azurefd.net/nyctlc', temp_local_dir)
shutil.rmtree(url_encoded_root)

In [None]:
# Install the Azure Data Lake Storage client package; the upload cell imports
# `azure.storage.filedatalake` from it. Keep the magic on its own line.
%pip install azure-storage-file-datalake

In [None]:

# Upload the downloaded raw files to ADLS Gen2 and make sure the folder for the
# cleaned data exists. The service-principal credentials are read from the
# workspace's default Key Vault, which must hold the secrets `client-id`,
# `tenant-id` and `client-secret`.

# Do not change
import os

from azureml.core import Workspace
from azure.storage.filedatalake import DataLakeFileClient, FileSystemClient
from azure.identity import ClientSecretCredential
from azure.core.exceptions import ResourceExistsError

ws = Workspace.from_config()

# Resolve the Key Vault once and reuse it for all three secrets
keyvault = ws.get_default_keyvault()
client_id = keyvault.get_secret('client-id')
tenant_id = keyvault.get_secret('tenant-id')
client_secret = keyvault.get_secret('client-secret')
endpoint = f"https://{storage_account_name}.dfs.core.windows.net"


# Set the source directory path and the destination directory path in ADLS Gen2
source_directory_path = f"{temp_local_dir}/nyctlc/green"
destination_directory_path = path_to_raw_data

# os.walk() silently yields nothing for a missing folder, which would let this cell
# report success without uploading anything -- fail loudly instead.
if not os.path.isdir(source_directory_path):
    raise FileNotFoundError(
        f"Nothing to upload: '{source_directory_path}' does not exist. Run the download cell first."
    )

# Create a credential object using your Azure AD application credentials
credential = ClientSecretCredential(tenant_id, client_id, client_secret)

# Create clients for the container and for the destination directory in ADLS Gen2
file_system_client = FileSystemClient(endpoint, file_system_name=container_name, credential=credential)
destination_directory_client = file_system_client.get_directory_client(destination_directory_path)

# Upload the local files to ADLS Gen2, mirroring the local folder layout
for root, directories, files in os.walk(source_directory_path):
    for file_name in files:
        local_file_path = os.path.join(root, file_name)
        # relpath() uses the OS separator; ADLS paths use '/', so normalise it to
        # keep backslashes out of the remote path names when running on Windows.
        destination_file_path = os.path.relpath(local_file_path, source_directory_path).replace(os.sep, "/")
        destination_file_client = destination_directory_client.get_file_client(destination_file_path)
        with open(local_file_path, "rb") as f:
            # Pass the open stream rather than f.read() so a large file is not
            # loaded into memory in one piece.
            destination_file_client.upload_data(f, overwrite=True)
        print(f"UPLOADED: {local_file_path} ")

print("All raw files have been uploaded to ADLS Gen2.")

# Only create the cleaned-data directory when it is missing, so that re-running
# this cell never re-issues the create call against a directory that is already there.
cleaned_directory_client = file_system_client.get_directory_client(path_to_cleaned_date)
if cleaned_directory_client.exists():
    print("Directory for cleaned data already exists in ADLS Gen2; left untouched.")
else:
    cleaned_directory_client.create_directory()
    print("Created an empty directory for cleaned data in ADLS Gen2.")

In [None]:
# Register the ADLS Gen2 container with the workspace as an Azure ML Datastore
from azureml.core import Datastore

# Everything the registration call needs besides the workspace, gathered in one place
adls_datastore_settings = dict(
    datastore_name=datastore_name,
    filesystem=container_name,
    account_name=storage_account_name,
    tenant_id=tenant_id,
    client_id=client_id,
    client_secret=client_secret,
    grant_workspace_access=True,
    subscription_id=ws.subscription_id,
    resource_group=ws.resource_group,
)
ds = Datastore.register_azure_data_lake_gen2(ws, **adls_datastore_settings)


In [None]:
# Build a tabular dataset over the raw parquet files in the datastore and register
# it in the workspace under the name `nyctlcraw` (as a new version each time).
from azureml.core import Dataset

d = Dataset.Tabular.from_parquet_files(path=(ds, path_to_raw_data))
raw_description = f'This dataset contains the raw New York Taxi data from {data_start_dt.date()} to {data_end_dt.date()}. It was downloaded using `azureml.opendatasets`. '
d.register(
    workspace=ws,
    name='nyctlcraw',
    description=raw_description,
    tags={},
    create_new_version=True,
)

In [None]:
# Register a tabular dataset that points at the cleaned-data path. Validation is
# switched off when the dataset is built from that path.
d = Dataset.Tabular.from_parquet_files(path=(ds, path_to_cleaned_date), validate=False)
cleaned_description = f'This dataset contains the cleaned New York Taxi data based on `nyctlcraw` from {data_start_dt.date()} to {data_end_dt.date()}. It was downloaded using `azureml.opendatasets`. '
d.register(
    workspace=ws,
    name='nyctlccleaned',
    description=cleaned_description,
    tags={},
    create_new_version=True,
)