This notebook guides you through the steps needed to prepare a time-series dataset of JSON files to be fed into the Metrics Advisor workspace. Each JSON file contains one day of data: the count of COVID-positive cases broken down by age group and death status.

First, let's import the required libraries and namespaces.

In [1]:
import pandas as pd
import numpy as np
import datetime
import os
import math
import timeit
from io import StringIO
import re
import urllib.request, json

# Report the pandas / numpy versions this notebook is running against
lib_versions = (pd.__version__, np.__version__)
print("pandas version: {} numpy version: {}".format(*lib_versions))

import os
import azureml.core
from azureml.core import Workspace, Datastore, Dataset
# Report the version of the installed Azure ML core SDK
sdk_version = azureml.core.VERSION
print("azureml SDK version:", sdk_version)

pandas version: 1.1.5 numpy version: 1.18.5
azureml SDK version: 1.20.0


In [None]:
# Azure settings for this lab. Replace every <PLACEHOLDER> below with the value
# from your own environment before running the following cells.

# Subscription ID of your existing Azure subscription
subscription_id = "<SUBSCRIPTION>"

# Name of the existing Resource Group
resource_group = "<RESOURCEGROUP>"

# Name and Azure region of the Azure Machine Learning workspace
workspace_name = "<MACHINELEARNING_WORKSPACE>"
workspace_region = "<MACHINELEARNING_WORKSPACE_REGION>"

# Access key and account name of the existing blob storage account.
# The name uses its own placeholder so it is not mistaken for the key when filling in values.
blob_account_key = "<BLOBSTORAGE_ACCOUNT_KEY>"
blob_account_name = "<BLOBSTORAGE_ACCOUNT_NAME>"

In [None]:


# Datastore registration settings. When one of the BLOB_* environment variables
# is set it takes precedence over the value entered in the notebook.
blob_datastore_name = 'covid_datastore'  # name of the datastore inside the workspace
container_name = os.environ.get("BLOB_CONTAINER", "jsonmetrics")  # Azure blob container name
account_name = os.environ.get("BLOB_ACCOUNTNAME", blob_account_name)  # storage account name
account_key = os.environ.get("BLOB_ACCOUNT_KEY", blob_account_key)  # storage account access key


In [None]:
# Location of the case-surveillance CSV, assembled from host, container and blob path
url = "".join([
    "https://quickstartsws9073123377.blob.core.windows.net/",
    "azureml-blobstore-0d1c4218-a5f9-418b-bf55-902b65277b85/",
    "COVID19_Case_Surveillance_Data/COVID-19_Case_Surveillance_Public_Use_Data.csv",
])

In [None]:
# Connect to the existing Azure Machine Learning workspace
ws = Workspace(
    subscription_id=subscription_id,
    resource_group=resource_group,
    workspace_name=workspace_name,
)

# Register the blob container that will hold the generated Metrics Advisor
# data feed files as a datastore of that workspace
blob_datastore = Datastore.register_azure_blob_container(
    workspace=ws,
    datastore_name=blob_datastore_name,
    container_name=container_name,
    account_name=account_name,
    account_key=account_key,
)

In [None]:
# Load the whole case-surveillance CSV into a DataFrame and show how many rows it has
df = pd.read_csv(url)
df.shape[0]

In [None]:
# List the column labels of the loaded frame; later cells rely on
# 'cdc_report_dt', 'age_group' and 'death_yn' being present
df.columns

In [None]:
# Build the timestamp column for the Metrics Advisor ingestion process:
# parse the report date, then render it as YYYY-MM-DD text
report_dates = pd.to_datetime(df['cdc_report_dt'])
df['datekey'] = report_dates.dt.strftime('%Y-%m-%d')

In [None]:
# Count the case records for every (day, age group, death flag) combination
grouping_keys = ['datekey', 'age_group', 'death_yn']
dfgroup = df.groupby(grouping_keys).size().to_frame()
dfgroup.head(n=10)


In [None]:
# Move the group keys from the index back into regular columns; the group
# size stays in the column labelled 0
dfflat = dfgroup.reset_index()
# Preview the first 10 flattened rows
dfflat.head(10)

In [None]:

# Count the case records per day.
# NOTE: this rebinds dfgroup (previously the three-key grouping) to a frame
# indexed by datekey only; the export loop further down iterates over this index.
daily_counts = df.groupby(['datekey']).size()
dfgroup = daily_counts.to_frame()
dfgroup.head(n=10)

In [None]:
# json_normalize is not used by the visible cells; the name is kept in scope but
# imported from the top-level pandas namespace, because the pandas.io.json
# location is deprecated.
from pandas import json_normalize

# reset_index() left the group sizes in a column labelled 0. Rename it to
# 'count' and store the values as strings. Unlike add-then-drop, both steps are
# harmless when the cell is executed again: rename ignores a missing label and
# converting strings to strings changes nothing.
dfflat = dfflat.rename(columns={0: 'count'}).astype({'count': str})


In [None]:
# Create the daily JSON files for data ingestion in Metrics Advisor.
# One file named <datekey>.json is written per day; it holds that day's rows of
# dfflat as a JSON array of records.
output_dir = 'covid_age_death'
# open(..., 'w') does not create missing folders, so make sure the target exists
os.makedirs(output_dir, exist_ok=True)

# Expects dfgroup to be the per-day frame indexed by datekey, so every index
# label is one day
for row in dfgroup.axes[0]:
    print(row)
    is_date = dfflat['datekey'] == row
    df_date = dfflat[is_date]
    # to_json documents only 'epoch' and 'iso' for date_format. datekey is
    # already YYYY-MM-DD text, so this setting does not alter it.
    resultJSON = df_date.to_json(orient='records', date_format='iso')
    filename_processed_json = f'{output_dir}/{row}.json'
    with open(filename_processed_json, 'w') as f:
        f.write(resultJSON)

In [None]:
# Upload the folder with the generated JSON files to the registered blob
# container; target_path is left empty and existing blobs are overwritten
blob_datastore.upload(
    './covid_age_death',
    target_path='',
    overwrite=True,
    show_progress=True,
)

Optional steps: create the Azure Blob data feed from code. (In this lab, this step is performed manually using the web-based workspace portal.)

In [None]:
pip install azure-ai-metricsadvisor --pre

In [None]:
# Azure portal -> Metrics Advisor resource -> "Keys and Endpoint" page (left menu)
subscription_key = "<MADV-SUBSCRIPTIONKEY>"

# Metrics Advisor portal -> "API keys" page (left menu)
api_key = "<MADV-APIKEY>"

# Azure portal -> Metrics Advisor resource -> "Keys and Endpoint" page
service_endpoint = "https://<MADV-ACCOUNT>.cognitiveservices.azure.com/"

# Azure portal -> blob storage account -> "Access keys"
storage_accountkey = "<BLOBSTORAGEACCOUNTKEY>"


In [None]:
from azure.ai.metricsadvisor import MetricsAdvisorKeyCredential, MetricsAdvisorAdministrationClient
from azure.ai.metricsadvisor.models import (
        AzureBlobDataFeed,
        DataFeedSchema,
        DataFeedMetric,
        DataFeedDimension,
        DataFeedOptions,
        DataFeedRollupSettings,
        DataFeedMissingDataPointFillSettings
    )

# Point the data feed at the storage account and container the JSON files were
# uploaded to (account_name / container_name come from the datastore settings
# cell) instead of a hard-coded account that the key would not match.
blobstorage_connection_string = (
    "DefaultEndpointsProtocol=https;"
    f"AccountName={account_name};"
    f"AccountKey={storage_accountkey};"
    "EndpointSuffix=core.windows.net"
)
blob_container = container_name
# One blob per day, named like the files written by the export loop (<YYYY-MM-DD>.json)
blobtemplate = "%Y-%m-%d.json"

# Administration client authenticated with the subscription key and the API key
client = MetricsAdvisorAdministrationClient(
    service_endpoint,
    MetricsAdvisorKeyCredential(subscription_key, api_key),
)

data_feed = client.create_data_feed(
    name="covid-dailybyage-feed-fromcode",
    source=AzureBlobDataFeed(
        connection_string=blobstorage_connection_string,
        container=blob_container,
        blob_template=blobtemplate,
        # NOTE: jsonFormatVersion = v2 cannot be set here (the property is not
        # exposed in the constructor)
    ),
    granularity="Daily",
    schema=DataFeedSchema(
        metrics=[
            DataFeedMetric(name="count", display_name="Count")
        ],
        dimensions=[
            DataFeedDimension(name="age_group", display_name="Age Group"),
            DataFeedDimension(name="death_yn", display_name="Death")
        ],
        # Must name the timestamp field exactly as it appears in the JSON
        # records, which is the lowercase 'datekey' column
        timestamp_column="datekey"
    ),
    # Start ingesting from 2020-01-01
    ingestion_settings=datetime.datetime(2020, 1, 1),
    options=DataFeedOptions(
        data_feed_description="cases by age data feed",
        rollup_settings=DataFeedRollupSettings(
            rollup_type="AutoRollup",
            rollup_method="Sum",
            rollup_identification_value="__CUSTOM_SUM__"
        ),
        missing_data_point_fill_settings=DataFeedMissingDataPointFillSettings(
            fill_type="SmartFilling"
        )
    )
)