# Connect to Azure ML workspace

In [1]:
import azureml.core
from azureml.core import Workspace, Datastore, Dataset

# Coordinates of the Azure ML workspace this notebook works against.
workspace_name = 'sd-ml'
subscription_id = 'c374c749-c070-4b3b-9fb4-40a657b1d4a5'  # subscription id of ADLS account
resource_group = 'rs-sd-learn-sth-new'  # resource group of ADLS account

# Fetch a handle to the existing workspace; later cells pass `workspace`
# to every datastore, dataset, experiment and deployment call.
workspace = Workspace.get(
    name=workspace_name,
    subscription_id=subscription_id,
    resource_group=resource_group,
)

print(f"workspace to be used: {workspace.name}")


workspace to be used: sd-ml


# Get or set up datastore

In [2]:

import os

datastore_name = 'learnsthnew_datastore'
filesystem = 'learnsthnew'

subscription_id = 'c374c749-c070-4b3b-9fb4-40a657b1d4a5' # subscription id of ADLS account
resource_group = 'rs-sd-learn-sth-new' # resource group of ADLS account

account_name = 'sdsalearnsthnew' # ADLS Gen2 account name
tenant_id = '680b5d20-b41e-46f8-a077-f482d0c64dbb' # tenant id of service principal
client_id = '6df9c689-4854-45ec-a9c2-55194c54c511' # client id of service principal

# The service principal secret must not live in the notebook: anything typed
# here ends up in version control and in every shared copy of the file.
# It is read from the environment instead (export AZURE_CLIENT_SECRET before
# starting the kernel). NOTE(review): the secret that used to be hard-coded
# in this cell should be treated as leaked and rotated.
client_secret = os.environ.get('AZURE_CLIENT_SECRET') # the secret of service principal

try:
    # Reuse the datastore when it is already registered in the workspace.
    datastore = Datastore.get(
        workspace = workspace,
        datastore_name = datastore_name
    )
except Exception as exc:
    # The lookup failed, so register the ADLS Gen2 filesystem as a new
    # datastore. The secret is only needed on this path, so it is only
    # validated here.
    if not client_secret:
        raise RuntimeError(
            "Datastore '" + datastore_name + "' could not be retrieved and the "
            "AZURE_CLIENT_SECRET environment variable is not set, so it cannot "
            "be registered."
        ) from exc
    datastore = Datastore.register_azure_data_lake_gen2(
        workspace = workspace,
        subscription_id = subscription_id,
        resource_group = resource_group,
        datastore_name = datastore_name,
        account_name = account_name, # ADLS Gen2 account name
        filesystem = filesystem, # ADLS Gen2 filesystem
        tenant_id = tenant_id, # tenant id of service principal
        client_id = client_id, # client id of service principal
        client_secret = client_secret # the secret of service principal
    )

print ("datastore to be used: " + datastore.name)


datastore to be used: learnsthnew_datastore


# Get or set up dataset

In [3]:
from azureml.core import Workspace, Datastore, Dataset

datastore_name = 'learnsthnew_datastore'
dataset_name = 'playlist_statistics'
dataset_description = 'radio song playlist statistics'

try:
    # Prefer the dataset that is already registered under this name.
    dataset = Dataset.get_by_name(workspace=workspace, name=dataset_name)
except Exception:
    # Lookup failed: build a tabular dataset from the parquet part files in
    # the datastore and register it under the same name.
    datastore_paths = [(datastore, '/analytics/playlist_statistics.parquet/*.parquet')]
    dataset = Dataset.Tabular.from_parquet_files(
        path=datastore_paths,
        validate=False,
    ).register(
        workspace=workspace,
        name=dataset_name,
        description=dataset_description,
    )

print(f"dataset to be used: {dataset.name}")

dataset to be used: playlist_statistics


# Create prediction model using DecisionTreeClassifier as an experiment, validate model and register in Azure ML 

* **experiment = Experiment()** - we will create an Experiment using AzureML SDK, that will allow us to keep track of the model in Azure ML workspace
* **run = experiment.start_logging()** - starts a run that logs the script execution and makes the logs available in the Azure ML workspace
* **AUC** - area under the ROC curve; it is logged together with accuracy as a metric for validating the model performance
* **run.register_model** - will allow us to register the model in the AzureML workspace
* **run.upload_file** - is uploading the model content to the Azure ML workspace

In [7]:
from azureml.core import Experiment
from azureml.core import Model
from azureml.core import Run

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score, roc_curve, accuracy_score, confusion_matrix
import numpy as np
import pandas as pd
import joblib

import json
import logging
import os

# logging.basicConfig opens the log file immediately and cannot create a
# missing directory, so make sure the target folder exists first.
os.makedirs('./outputs', exist_ok=True)
logging.basicConfig(filename='./outputs/log.txt',
                    filemode='a',
                    format='%(asctime)s,%(msecs)d %(name)s %(levelname)s %(message)s',
                    datefmt='%H:%M:%S',
                    level=logging.DEBUG)

# Create an Azure ML experiment in your workspace
experiment = Experiment(workspace=workspace, name="try-sth-new-songs")

# Start an interactive run; metrics and files logged below are attached to it.
run = experiment.start_logging()
logging.info("Start experiment")
print("Starting experiment:", experiment.name)


print("Loading Data...")
df = dataset.to_pandas_dataframe()

print("Separating features and labels...")
features = ["radio_name", "month_name", "artist_and_title"]
label = 'played'
X, y = df[features].values, df[label].values

print("Spliting data 70%-30% into training set and test set...")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)
print ('Training cases: %d\nTest cases: %d' % (X_train.shape[0], X_test.shape[0]))


# Columns are addressed by position because X is a plain array:
# 0 = radio_name, 1 = month_name, 2 = artist_and_title (order of `features`).
# NOTE(review): the message below mentions an "Age" column that does not exist
# in `features`; it looks like a leftover from another notebook.
print ("Defining preprocessing for categorical features (encode the Age column...")
categorical_features = [0,1]
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])
categorical_features_song = [2]
categorical_transformer_song = Pipeline(steps=[
    ('labelenc', OrdinalEncoder())])


print ("Combining preprocessing steps...")
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_features),
        ('catsong', categorical_transformer_song, categorical_features_song)
    ])

print ("Creating preprocessing and training pipeline...")
# The step keeps its original name 'logregressor' although the estimator is a
# decision tree. random_state is fixed so that repeated runs build the same
# tree, matching the fixed seed already used for the train/test split.
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('logregressor', DecisionTreeClassifier(random_state=0))])

print ("Fitting the pipeline to train a DecisionTreeClassifier model on the training set...")
model = pipeline.fit(X_train, (y_train))

print("Calculating AUC...")
y_scores = model.predict_proba(X_test)
# Column 1 of predict_proba holds the probability of the second class.
auc = roc_auc_score(y_test,y_scores[:,1])
print('AUC: ' + str(auc))
# The np.float alias was removed from NumPy; the builtin float is equivalent.
run.log('AUC', float(auc))

print("Calculating Accuracy score...")
predictions = model.predict(X_test)
# Stored under its own name so the imported accuracy_score function is not
# overwritten by its result.
accuracy = accuracy_score(y_test, predictions)
print('Accuracy: ', accuracy)
run.log('Accuracy', float(accuracy))

conf_matrix = confusion_matrix(y_test, predictions)
# Serialise with json.dumps instead of string concatenation; tolist() turns
# the NumPy integers into plain Python ints that json can encode.
conf_matrix_json = json.dumps({
    "schema_type": "confusion_matrix",
    "schema_version": "1.0.0",
    "data": {
        "class_labels": ["0", "1"],
        "matrix": conf_matrix.tolist()
    }
})
run.log_confusion_matrix(name="confusion matrix", value=conf_matrix_json)
print (conf_matrix)

print ("Saving the model...")
model_file = 'song_prd_model.pkl'
joblib.dump(value=model, filename=model_file)
# Upload the pickle into the run so register_model can reference it below.
run.upload_file(name = 'models/' + model_file, path_or_stream = './' + model_file)

print ("Completing the run...")
logging.info("Complete run")
run.complete()

# Register the model
# The properties reuse the values computed above rather than reading them back
# from the service with run.get_metrics().
run.register_model(model_path='models/' + model_file, model_name='song_prd_model',
                   tags={'Training context':'Inline Training'},
                   properties={'AUC': float(auc), 'Accuracy': float(accuracy)})

print('Model trained and registered.')


Starting experiment: try-sth-new-songs
Loading Data...
Separating features and labels...
Spliting data 70%-30% into training set and test set...
Training cases: 19237748
Test cases: 8244750
Defining preprocessing for categorical features (encode the Age column...
Combining preprocessing steps...
Creating preprocessing and training pipeline...
Fitting the pipeline to train a DecisionTreeClassifier model on the training set...
Calculating AUC...
AUC: 0.9223391352372876
Calculating Accuracy score...
Accuracy:  0.9571007004457383
[[7884085    5635]
 [ 348059    6971]]
Saving the model...
Completing the run...
Model trained and registered.


# Retrieve model from workspace

In [8]:
# Look the registered model up by name in the workspace. This rebinds `model`,
# which until now referred to the fitted pipeline, to the registry entry.
model = workspace.models['song_prd_model']
print(f"{model.name} version {model.version}")

song_prd_model version 4


# Create webservice to use the model using the API

In [13]:
import os

# Name and relative path of the folder that holds the web service files.
folder_name = 'song_service'
experiment_folder = f"./{folder_name}"

# Create the folder; an already existing one is left untouched.
os.makedirs(experiment_folder, exist_ok=True)
print(folder_name, 'folder created.')

# Location of the scoring script that a later cell writes with %%writefile.
script_file = os.path.join(experiment_folder, "predict_song.py")

song_service folder created.


# Create webservice script

In [14]:
%%writefile $script_file
import json
import joblib
import numpy as np
from azureml.core.model import Model

# Called when the service is loaded
def init():
    """Load the registered 'song_prd_model' into the module-level `model`.

    The path of the model file is resolved with Model.get_model_path and the
    file is deserialised with joblib.
    """
    global model
    model = joblib.load(Model.get_model_path('song_prd_model'))

# Called when a request is received
def run(raw_data):
    """Score one request and return the predicted classes as a JSON string.

    raw_data is a JSON document whose 'data' entry holds the rows to score.
    Each prediction of the module-level `model` is used as an index into the
    class names '0' and '1'; the resulting list is returned JSON-encoded.
    """
    class_names = ['0', '1']
    # Decode the request body and hand the rows to the model as a numpy array.
    rows = np.array(json.loads(raw_data)['data'])
    # Translate every prediction into its class name.
    labels = [class_names[outcome] for outcome in model.predict(rows)]
    return json.dumps(labels)

Overwriting ./song_service/predict_song.py


# The webservice runs in a container, so we need to create a file listing the dependencies to be installed

In [15]:
from azureml.core.conda_dependencies import CondaDependencies 

# Start from the default dependency set and add what the model needs.
# NOTE(review): scikit-learn is added without a version pin, so the container
# may resolve a different release than the one the model was trained with.
myenv = CondaDependencies()
myenv.add_conda_package('scikit-learn')

# Write the environment definition next to the scoring script as a .yml file.
env_file = os.path.join(experiment_folder, "song_env.yml")
with open(env_file, "w") as env_out:
    env_out.write(myenv.serialize_to_string())
print(f"Saved dependency info in {env_file}")

# Show the generated .yml file.
with open(env_file, "r") as env_in:
    print(env_in.read())

Saved dependency info in ./song_service/song_env.yml
# Conda environment specification. The dependencies defined in this file will
# be automatically provisioned for runs with userManagedDependencies=False.

# Details about the Conda environment file format:
# https://conda.io/docs/user-guide/tasks/manage-environments.html#create-env-file-manually

name: project_environment
dependencies:
  # The python interpreter version.
  # Currently Azure ML only supports 3.5.2 and later.
- python=3.6.2

- pip:
    # Required packages for AzureML execution, history, and data preparation.
  - azureml-defaults

- scikit-learn
channels:
- anaconda
- conda-forge



# Deploy model and create the webservice

In [16]:
from azureml.core.webservice import AciWebservice
from azureml.core.model import InferenceConfig



# Scoring environment: the entry script and the conda file written earlier.
inference_config = InferenceConfig(
    runtime="python",
    entry_script=script_file,
    conda_file=env_file,
)

# Container size requested for the ACI deployment.
deployment_config = AciWebservice.deploy_configuration(cpu_cores=1, memory_gb=1)

service_name = "song-service"

# Deploy the registered model; overwrite=True is passed so that a service
# with the same name is replaced.
service = Model.deploy(
    workspace=workspace,
    name=service_name,
    models=[model],
    inference_config=inference_config,
    deployment_config=deployment_config,
    overwrite=True,
)

# Wait for the deployment to finish and show its progress output.
service.wait_for_deployment(show_output=True)
print(service.state)

Tips: You can try get_logs(): https://aka.ms/debugimage#dockerlog or local deployment: https://aka.ms/debugimage#debug-locally to debug if deployment takes longer than 10 minutes.
Running
2021-05-09 21:26:12+00:00 Creating Container Registry if not exists.
2021-05-09 21:26:13+00:00 Registering the environment.
2021-05-09 21:26:14+00:00 Use the existing image.
2021-05-09 21:26:14+00:00 Generating deployment configuration.
2021-05-09 21:26:15+00:00 Submitting deployment to compute..
2021-05-09 21:26:21+00:00 Checking the status of deployment song-service..
2021-05-09 21:37:24+00:00 Checking the status of inference endpoint song-service.
Succeeded
ACI service creation operation finished, operation "Succeeded"
Healthy


# Retrieve the webservice endpoint URL

In [17]:
# Read the scoring URI of the deployed service; the request cell further down
# posts its JSON payload to this `endpoint`.
endpoint = service.scoring_uri
print(endpoint)

http://18a690c1-d1e2-404a-9cf6-32ac6bd76b8e.westeurope.azurecontainer.io/score


# Use the webservice to predict the value

In [18]:
import requests
import json

# Rows to score, in the feature order the model was trained on:
# radio name, month name, "artist - title".
x_new = [
    ["RMFFM", "December", "Wham! - Last Christmas"],
    ["RMFFM", "June"    , "Wham! - Last Christmas"],
    ["RMFFM", "December", "Rotary - Na Jednej Z Dzikich Plaż"],
    ["RMFFM", "June", "Rotary - Na Jednej Z Dzikich Plaż"]
]

# Convert the array to a serializable list in a JSON document
input_json = json.dumps({"data": x_new})

# Set the content type
headers = { 'Content-Type':'application/json' }

# A timeout keeps the cell from hanging on an unresponsive endpoint, and
# raise_for_status surfaces HTTP errors before the body is decoded.
response = requests.post(endpoint, data=input_json, headers=headers, timeout=60)
response.raise_for_status()

# The scoring script returns a JSON-encoded string, so the decoded response
# body is itself a string that has to be parsed once more.
predicted_classes = json.loads(response.json())

if len(predicted_classes) != len(x_new):
    raise ValueError(
        "Expected %d predictions but received %d" % (len(x_new), len(predicted_classes))
    )

for sample, predicted_class in zip(x_new, predicted_classes):
    print ("The song in radio in given month: {}".format(sample), predicted_class )

The song in radio in given month: ['RMFFM', 'December', 'Wham! - Last Christmas'] 1
The song in radio in given month: ['RMFFM', 'June', 'Wham! - Last Christmas'] 0
The song in radio in given month: ['RMFFM', 'December', 'Rotary - Na Jednej Z Dzikich Plaż'] 0
The song in radio in given month: ['RMFFM', 'June', 'Rotary - Na Jednej Z Dzikich Plaż'] 1
