# Train a classifier to determine product seasonality


First, check that XGBoost is properly installed in the Spark environment (it should be version 1.0.2).


In [None]:
# pip.get_installed_distributions() was removed from pip's public API in pip 10,
# so list the installed distributions through supported metadata APIs instead.
try:
    from importlib.metadata import distributions  # available on Python 3.8+

    installed_packages = sorted(
        (f"{dist.metadata['Name']}=={dist.version}" for dist in distributions()),
        key=str.lower,
    )
except ImportError:
    # Older interpreters: fall back to setuptools' pkg_resources.
    import pkg_resources

    installed_packages = sorted(
        (f"{dist.project_name}=={dist.version}" for dist in pkg_resources.working_set),
        key=str.lower,
    )

# Last expression of the cell: rendered as the cell output ("name==version" entries).
installed_packages

Import all necessary libraries.


In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, Normalizer
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from xgboost import XGBClassifier

from onnxmltools.convert import convert_xgboost
from onnxmltools.convert.common.data_types import FloatTensorType

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Exploratory data analysis (basic stats)

Create Spark temporary views for sales and products.

**IMPORTANT!** Make sure the name of the SQL pool (`#SQL_POOL_NAME#` below) matches the name of your SQL pool.


In [None]:
%%spark
// Read the wwi.SaleSmall table from the SQL pool and register it as the temporary view "sale".
val saleDf = spark.read.sqlanalytics("#SQL_POOL_NAME#.wwi.SaleSmall")
saleDf.createOrReplaceTempView("sale")

// Read the wwi.Product table, register it as the temporary view "product", then preview it.
val productDf = spark.read.sqlanalytics("#SQL_POOL_NAME#.wwi.Product")
productDf.createOrReplaceTempView("product")
display(productDf)

Load daily product sales from the SQL pool.


In [None]:
# One row per (product, seasonality, transaction date) with the number of sale rows on that date.
# The WHERE clause drops 29 February of 2012 and 2016
# (presumably so every year contributes the same set of calendar days - confirm).
sqlQuery = """
SELECT
    P.ProductId
    ,P.Seasonality
    ,S.TransactionDateId
    ,COUNT(*) as TransactionItemsCount
FROM
    sale S
    JOIN product P ON
        S.ProductId = P.ProductId
WHERE
    S.TransactionDateId NOT IN (20120229, 20160229)
GROUP BY
    P.ProductId
    ,P.Seasonality
    ,S.TransactionDateId
"""

# Run the query against the "sale" / "product" temp views and mark the result for caching.
prod_df = spark.sql(sqlQuery).cache()
# Trailing expression so the cell still renders the data frame's repr.
prod_df

Check the number of records in the data frame (should be around 13 million rows).

In [None]:
# Count the rows of the daily product sales frame; the result is rendered as the cell output.
prod_df.count()

Display some statistics about the data frame.


In [None]:
# describe() computes summary statistics for the frame's columns; display() renders them as a table.
display(prod_df.describe())

Pivot the data frame so that each day's count of sold items becomes its own column.


In [None]:
# One row per (ProductId, Seasonality); one column per TransactionDateId holding the summed
# TransactionItemsCount. The pivoted result is collected to the driver as a pandas DataFrame.
prod_prep_df = (
    prod_df
    .groupBy(['ProductId', 'Seasonality'])
    .pivot('TransactionDateId')
    .sum('TransactionItemsCount')
    .toPandas()
)

Clean up the nulls and take a look at the result.


In [None]:
# Replace the nulls left by the pivot with 0 so every cell holds a numeric count.
prod_prep_df = prod_prep_df.fillna(0)
# Preview the first ten products.
prod_prep_df.head(10)

Isolate features and prediction classes.

Standardize features by removing the mean and scaling to unit variance.


In [None]:
# Everything from the third column onward is used as the feature matrix
# (the per-day counts, assuming ProductId and Seasonality are the first two columns - confirm).
feature_frame = prod_prep_df.iloc[:, 2:]
X = feature_frame.values
# Class labels to predict.
y = prod_prep_df['Seasonality'].values

# Zero mean / unit variance per feature column.
scaler = StandardScaler()
X_scale = scaler.fit_transform(X)

## Use PCA for dimensionality reduction

Perform dimensionality reduction using Principal Components Analysis and two target components.


In [None]:
# Project the standardized features onto the first two principal components.
# random_state pins the outcome when PCA selects its randomized SVD solver, so the
# projection (and the scatter plot with hand-placed separator lines built from it)
# is reproducible across runs.
pca = PCA(n_components=2, random_state=123)
principal_components = pca.fit_transform(X_scale)
# Rescale each component to the [0, 1] range.
principal_components = MinMaxScaler().fit_transform(principal_components)

# Two-column frame of components, with the Seasonality label attached for colouring the plot.
pca_df = pd.DataFrame(data=principal_components, columns=['pc1', 'pc2'])
pca_df = pd.concat([pca_df, prod_prep_df[['Seasonality']]], axis=1)

Display the products data frame in two dimensions (mapped to the two principal components).

Note the clear separation of clusters.


In [None]:
# Single square figure showing the products in the plane of the two principal components.
fig, ax = plt.subplots(figsize=(6, 6))
ax.set_title('2 component PCA', fontsize=20)
ax.set_xlabel('Principal Component 1', fontsize=15)
ax.set_ylabel('Principal Component 2', fontsize=15)

# (Seasonality value, marker colour, legend label) for each class drawn on the plot.
season_styles = [
    (1, 'r', 'All Season Products'),
    (2, 'g', 'Summer Products'),
    (3, 'b', 'Winter Products'),
]
for season, color, label in season_styles:
    in_season = pca_df['Seasonality'] == season
    ax.scatter(
        pca_df.loc[in_season, 'pc1'],
        pca_df.loc[in_season, 'pc2'],
        c=color,
        s=1,
        label=label,
    )
# Build the legend before the separator lines are added so it lists only the three classes.
ax.legend()

# Dotted guide lines drawn between the clusters (hand-picked end points).
for left_height, right_height in [(0.77, 1.0), (0.37, 0.6)]:
    ax.plot([-0.05, 1.05], [left_height, right_height], linestyle=':', linewidth=1, color='y')
ax.grid()

plt.show()
plt.close()

Redo the Principal Components Analysis, this time with twenty dimensions.


In [None]:
def col_name(x):
    """Return the feature column name for index ``x``: 'f' followed by ``x`` zero-padded to two digits."""
    return 'f{:02}'.format(x)

# Project the standardized features onto twenty principal components.
# random_state pins the outcome when PCA selects its randomized SVD solver, so the
# components that get saved and used for training are reproducible across runs.
pca = PCA(n_components=20, random_state=123)
principal_components = pca.fit_transform(X_scale)
# Rescale each component to the [0, 1] range.
principal_components = MinMaxScaler().fit_transform(principal_components)

# Feature columns are named f00 .. f19.
X = pd.DataFrame(data=principal_components, columns=[col_name(i) for i in range(20)])
# Components keyed by ProductId (all rows).
pca_df = pd.concat([X, prod_prep_df[['ProductId']]], axis=1)
# Components plus the Seasonality label, used as AutoML training data.
pca_automl_df = pd.concat([X, prod_prep_df[['Seasonality']]], axis=1)

# Only the first 4500 rows are used for model training
# (presumably so the remaining products stay unseen by the models - confirm).
X = X[:4500]
y = prod_prep_df['Seasonality'][:4500]
pca_automl_df = pca_automl_df[:4500]

Save the PCA components to the SQL pool.


In [None]:
# Convert the pandas frame of PCA components to a Spark data frame and register it as the
# temporary view "productpca" so it can be queried by name.
pca_sdf = spark.createDataFrame(pca_df)
pca_sdf.createOrReplaceTempView("productpca")

In [None]:
%%spark
// Make sure the name of the SQL pool (#SQL_POOL_NAME# below) matches the name of your SQL pool.
// Read the "productpca" temporary view and write it to the wwi_ml.ProductPCA table of the SQL pool.
val productPcaDf = spark.sql("select * from productpca")
productPcaDf.write.sqlanalytics("#SQL_POOL_NAME#.wwi_ml.ProductPCA", Constants.INTERNAL)

## Train ensemble of trees classifier (using XGBoost)

Split into test and training data sets.


In [None]:
# Hold out 25% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=123)

Train the ensemble classifier using XGBoost.


In [None]:
# XGBoost classifier created with the library's default hyperparameters.
model = XGBClassifier()
# Train on the training split; as the cell's last expression, the return value of fit() is rendered.
model.fit(X_train, y_train)

Perform predictions with the newly trained model.


In [None]:
# Predicted Seasonality class for every row of the held-out test split.
y_pred = model.predict(X_test)

Calculate the accuracy of the model using test data.


In [None]:
# Compare the predictions with the true test labels.
accuracy = accuracy_score(y_test, y_pred)
# Report the score as a percentage with two decimals.
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Convert trained model to ONNX format.


In [None]:
# The converted model takes a single float tensor named 'input' with shape [1, 20]
# (20 matches the number of PCA feature columns the classifier was trained on).
initial_types = [('input', FloatTensorType([1, 20]))]

# Convert the trained XGBoost classifier to an ONNX model.
onnx_model = convert_xgboost(
    model,
    initial_types=initial_types,
)

In [None]:

#model.get_booster().get_dump(with_stats=True, dump_format='json')
#model.get_booster().feature_names
#onnx_model

## Train classifier using Auto ML


In [None]:
from azureml.core.experiment import Experiment
from azureml.core.workspace import Workspace
from azureml.train.automl.run import AutoMLRun
from azureml.train.automl import AutoMLConfig
from azureml.automl.runtime.onnx_convert import OnnxConverter

In [None]:
# Preview the AutoML training frame (PCA feature columns plus the Seasonality label).
pca_automl_df.head(10)

Configure the connection to the Azure Machine Learning workspace. The Azure portal provides all the values below.


In [None]:
# Connection details of the Azure Machine Learning workspace; replace the placeholders.
subscription_id = '#SUBSCRIPTION_ID#'        # ensure it matches your Azure subscription id
resource_group = '#RESOURCE_GROUP_NAME#'     # ensure it matches your resource group name
workspace_name = '#AML_WORKSPACE_NAME#'      # ensure it matches your Azure Machine Learning workspace name

ws = Workspace(
    subscription_id=subscription_id,
    resource_group=resource_group,
    workspace_name=workspace_name,
)
# Write the workspace configuration out, then load the workspace back from that configuration.
ws.write_config()
ws = Workspace.from_config()

# Experiment under which the AutoML runs are submitted.
experiment = Experiment(ws, "Product_Seasonality")

Configure the Automated Machine Learning experiment and start it (will run on local compute resources).


In [None]:
# Settings of the AutoML classification experiment, collected in one place.
automl_settings = {
    'task': 'classification',
    # 'experiment_exit_score': 0.995,
    'experiment_timeout_minutes': 15,
    'enable_onnx_compatible_models': True,
    'training_data': pca_automl_df,
    'label_column_name': 'Seasonality',
    'n_cross_validations': 5,
    'enable_voting_ensemble': False,
    'enable_stack_ensemble': False,
}
automl_classifier_config = AutoMLConfig(**automl_settings)

# Submit the experiment and stream its progress into the cell output.
local_run = experiment.submit(automl_classifier_config, show_output=True)

Retrieve the best model directly in ONNX format and take a look at it.


In [None]:
# Retrieve the best run of the AutoML experiment together with its model in ONNX format.
best_run, onnx_model2 = local_run.get_output(return_onnx_model=True)
# Disabled: saving the ONNX model to a local file.
# NOTE(review): the commented-out call refers to `onnx_mdl`, while the variable above is `onnx_model2`.
#onnx_fl_path = "./best_model.onnx"
#OnnxConverter.save_onnx_model(onnx_mdl, onnx_fl_path)

In [None]:
# Render the ONNX model object as the cell output (the output may be long).
onnx_model2

Replace the placeholders below with the name of the primary data lake account and one of its security keys.

In [None]:
from azure.storage.blob import BlockBlobService

# Client for the primary data lake account; replace the placeholders before running
# and do not save a real account key into a shared copy of the notebook.
block_blob_service = BlockBlobService(
    account_name='#DATA_LAKE_ACCOUNT_NAME#', account_key='#DATA_LAKE_ACCOUNT_KEY#')

# SerializeToString() returns the serialized model as raw bytes, so upload it with the
# bytes API rather than create_blob_from_text (which is meant for str content).
block_blob_service.create_blob_from_bytes(
    'wwi-02',
    '/ml/onnx/product_seasonality_classifier.onnx',
    onnx_model2.SerializeToString())