# Install Synapse ML in the Apache Spark session

In [None]:
%%configure -f
{
    "name": "synapseml",
    "conf": {
        "spark.jars.packages": "com.microsoft.azure:synapseml_2.12:0.9.4",
        "spark.jars.repositories": "https://mmlspark.azureedge.net/maven",
        "spark.jars.excludes": "org.scala-lang:scala-reflect,org.apache.spark:spark-tags_2.12,org.scalactic:scalactic_2.12,org.scalatest:scalatest_2.12",
        "spark.yarn.user.classpath.first": "true"
    }
}

# Part 1 - Entity detection with Cognitive Services

Detect entities in text using the Cognitive Services entity detector transformer from Synapse ML.

Retrieve the Cognitive Services credentials and create the test dataset.

In [None]:
# Read the Cognitive Services key from Key Vault rather than embedding it in the notebook.
# NOTE(review): the '#...#' tokens look like template placeholders to be replaced
# with real resource names before running — confirm.
key = mssparkutils.credentials.getSecret('#KEY_VAULT_NAME#', '#COGNITIVE_SERVICES_SECRET_NAME#')
location = '#COGNITIVE_SERVICES_ACCOUNT_LOCATION#'

# Two sample passages, each tagged with an integer id, used as the test dataset.
sample_rows = [
    [1, "Muad'Dib learned rapidly because his first training was in how to learn. And the first lesson of all was the basic trust that he could learn. It's shocking to find how many people do not believe they can learn, and how many more believe learning to be difficult. Muad'Dib knew that every experience carries its lesson."],
    [2, "It's the ship that made the Kessel run in less than twelve parsecs. I've outrun Imperial starships. Not the local bulk cruisers, mind you. I'm talking about the big Corellian ships, now. She's fast enough for you, old man."],
]
df = spark.createDataFrame(data=sample_rows, schema=["id", "text"])

Define the transformer to detect the entities mentioned in text.

In [None]:
# Import only the transformer this cell uses. A wildcard import hides where
# names come from and can silently shadow variables defined in other cells.
from synapse.ml.cognitive import EntityDetector

# Configure the entity detector: authenticate with the key/location retrieved
# earlier, analyse the text as English, and write results to the "entities"
# column and any per-row failure to the "error" column.
entity = (EntityDetector()
      .setSubscriptionKey(key)
      .setLocation(location)
      .setLanguage("en")
      .setOutputCol("entities")
      .setErrorCol("error"))

# Apply the transformer to the sample DataFrame.
df_entities = entity.transform(df)

Check out the entities identified from the first phrase.

In [None]:
# Fetch the first row once and reuse it. Each head() call is a separate Spark
# action, so (unless df_entities is cached) calling it per print would
# re-evaluate the transform, including the Cognitive Services request.
first_entity = df_entities.head(1)[0].entities[0].entities[0]
print(first_entity.id)
print(first_entity.url)

Check out the entities identified from the second phrase.

In [None]:
# Fetch the last row once and reuse it. Each tail() call is a separate Spark
# action, so (unless df_entities is cached) calling it per print would
# re-evaluate the transform, including the Cognitive Services request.
last_entity = df_entities.tail(1)[0].entities[0].entities[0]
print(last_entity.id)
print(last_entity.url)

# Part 2 - Train a customer recommendation model

Use the LightGBM Synapse ML algorithm to train a model for retail product recommendation.

## Configure

Reference the required libraries and check the Synapse ML version (should be 0.9.4).

In [None]:
# Standard library. The py4j logger is restricted to ERROR and above before
# the remaining libraries are imported.
import logging
py4j_logger = logging.getLogger("py4j")
py4j_logger.setLevel(logging.ERROR)

# Data handling and plotting.
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt

# Spark and SynapseML.
from pyspark.ml.feature import VectorAssembler
from pyspark.version import __version__ as pyspark_version
from synapse.ml.core import __spark_package_version__
from synapse.ml.lightgbm import LightGBMClassifier
from synapse.ml.train import ComputeModelStatistics

# Allow pandas to display up to 50 columns.
pd.set_option('display.max_columns', 50)

# Report the library versions available in this session.
print(f"PySpark version: {pyspark_version}")
print(f"SynapseML version: {__spark_package_version__}")

Set the data preparation and model training parameters. 

Check the [Synapse ML LightGBM documentation](https://microsoft.github.io/SynapseML/docs/features/lightgbm/LightGBM%20-%20Overview/) for more details on setting the parameters of the ML model.

In [None]:
# ---------------------------------------------------------------------------
# Data source
# ---------------------------------------------------------------------------
# Original blob: "https://recodatasets.z20.web.core.windows.net/random-dataset/PersonalizedData.csv"
url = "wasbs://files@synapsemlpublic.blob.core.windows.net/PersonalizedData.csv"

# ---------------------------------------------------------------------------
# Data parameters
# ---------------------------------------------------------------------------
LABEL_COL = "Rating"        # column the classifier predicts
FEATURE_COL = "features"    # name of the assembled feature-vector column
RATIO = 0.8                 # weight of the training split (test gets 1 - RATIO)
SEED = 42                   # seed for the random split and for LightGBM bagging

# ---------------------------------------------------------------------------
# Model parameters (passed to LightGBMClassifier)
# ---------------------------------------------------------------------------
OBJECTIVE = "binary"            # -> objective
BOOSTING = "gbdt"               # -> boostingType
NUM_LEAVES = 32                 # -> numLeaves
NUM_ITERATIONS = 100            # -> numIterations
LEARNING_RATE = 0.1             # -> learningRate
FEATURE_FRACTION = 0.8          # -> featureFraction
EARLY_STOPPING_ROUND = 10       # -> earlyStoppingRound
MODEL_NAME = "lgb-quickstart"   # path the trained model is saved under


## Prepare and analyze data


Load the data from the source and observe the schema.

In [None]:
# Read the CSV from the location configured in `url`: the first line supplies
# the column names and the column types are inferred from the data.
raw_data = spark.read.csv(url, header=True, inferSchema=True)

# Print the inferred schema under its heading, so the "Schema: " line is
# actually followed by the schema.
print("Schema: ")
raw_data.printSchema()

# Collect the Spark DataFrame into pandas for the local exploration cells
# that follow, and report its (rows, columns) shape.
df = raw_data.toPandas()
print("Shape: ", df.shape)


Take a look at some of the items in the dataset. Notice the two-class ratings (0 vs. 1) provided by customers to products.
The goal of this exercise is to build a Machine Learning classification model capable of predicting the rating based on Cost, Size, Price, PrimaryBrandId, GenderId, MaritalStatus, LowerIncomeBound, and UpperIncomeBound. To achieve the goal, you will use the LightGBM classifier from Synapse ML.

In [None]:
# Show the first ten rows of the pandas DataFrame.
display(df.head(10))

Check out the statistical properties of the dataset.

In [None]:
# Summary statistics of the pandas DataFrame; as the cell's last expression
# the result is rendered as the cell output.
df.describe()


Calculate and display the dataset features correlation matrix.

In [None]:
# Pairwise correlation between the columns of the pandas DataFrame.
corr = df.corr()

# Heatmap options: label both axes with the column names, use a diverging
# palette fixed to the [-1, 1] range, and print each coefficient with two
# decimals inside its cell.
heatmap_options = {
    "xticklabels": corr.columns,
    "yticklabels": corr.columns,
    "cmap": 'RdBu',
    "vmin": -1,
    "vmax": 1,
    "annot": True,
    "fmt": '.2f',
    "annot_kws": {'size': 10},
}

# Draw on an explicit 10x10 inch figure.
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(corr, ax=ax, **heatmap_options)
plt.show()

Display the pairwise feature correlations as scatterplots.

In [None]:
# Apply seaborn's default styling, then draw the grid of pairwise plots for
# the pandas DataFrame (each facet 2.5 inches high).
sns.set()
pair_grid = sns.pairplot(data=df, height=2.5)
plt.show()

## Train the ML model

Split the dataset into train and test subsets.

In [None]:
# Randomly split the Spark DataFrame using the RATIO / (1 - RATIO) weights,
# seeded with SEED.
split_weights = [RATIO, 1 - RATIO]
raw_train, raw_test = raw_data.randomSplit(split_weights, seed=SEED)

# Report the size of each subset as (rows, columns).
train_shape = (raw_train.count(), len(raw_train.columns))
test_shape = (raw_test.count(), len(raw_test.columns))
print(f"Train: (rows, columns) = {train_shape}")
print(f"Test: (rows, columns) = {test_shape}")

Perform feature engineering - transform the original data feature columns into feature vectors.

In [None]:
# Every column from the fourth onward becomes a model input.
# NOTE(review): assumes the first three columns are identifiers and/or the
# label — confirm against the printed schema.
columns = raw_data.columns[3:]

# Assemble the input columns into a single vector column named FEATURE_COL.
featurizer = VectorAssembler(inputCols=columns, outputCol=FEATURE_COL)

# Keep only the label and the assembled feature vector for each subset.
train = featurizer.transform(raw_train).select(LABEL_COL, FEATURE_COL)
test = featurizer.transform(raw_test).select(LABEL_COL, FEATURE_COL)

Check if data is unbalanced.

In [None]:
# Count the training rows per label value to reveal any class imbalance.
label_counts = train.groupBy(LABEL_COL).count()
display(label_counts)


Define the LGBM model.


In [None]:
# Collect the classifier settings in one place; the values come from the
# constants defined in the parameters cell.
# NOTE(review): earlyStoppingRound is set but no validation column is
# configured here — confirm whether early stopping actually takes effect.
lgbm_params = {
    "labelCol": LABEL_COL,
    "featuresCol": FEATURE_COL,
    "objective": OBJECTIVE,
    "isUnbalance": False,
    "boostingType": BOOSTING,
    "boostFromAverage": True,
    "baggingSeed": SEED,
    "numLeaves": NUM_LEAVES,
    "numIterations": NUM_ITERATIONS,
    "learningRate": LEARNING_RATE,
    "featureFraction": FEATURE_FRACTION,
    "earlyStoppingRound": EARLY_STOPPING_ROUND,
}

# Build the (not yet trained) LightGBM classifier.
lgbm = LightGBMClassifier(**lgbm_params)


Train the LGBM model.

In [None]:
# Fit the configured LightGBM classifier on the training subset; `model` is
# the fitted result used by the following cells.
model = lgbm.fit(train)

Display the relative feature importance as it results from the training process.

In [None]:
# Pair each importance score with its feature name and order the series from
# the least to the most important feature.
feature_importances = model.getFeatureImportances()
fi = pd.Series(feature_importances, index=columns).sort_values(ascending=True)
f_index, f_values = fi.index, fi.values

# print feature importances
print('f_index:', f_index)
print('f_values:', f_values)

# plot: one horizontal bar per feature, placed at evenly spaced positions
# in [0, 1) and labelled with the feature name
n_features = len(fi)
x_index = [position / n_features for position in range(n_features)]
plt.rcParams['figure.figsize'] = (10, 10)
plt.barh(
    x_index,
    f_values,
    height=0.028,
    align="center",
    color='tan',
    tick_label=f_index,
)
plt.xlabel('importances')
plt.ylabel('features')
plt.show()

## Perform predictions with the model

In [None]:
# Score the held-out test subset with the trained model.
predictions = model.transform(test)


In [None]:
# Show a ten-row sample of the scored test data.
predictions_preview = predictions.limit(10)
display(predictions_preview)

Evaluate the performance of the model.

In [None]:
# Configure the statistics transformer: compare the "prediction" column
# against the true label and compute the classification metric set.
evaluator = ComputeModelStatistics()
evaluator.setScoredLabelsCol("prediction")
evaluator.setLabelCol(LABEL_COL)
evaluator.setEvaluationMetric("classification")

# Compute the metrics from the scored test data.
metrics = evaluator.transform(predictions)

Observe the main performance metrics of a classification model:

- Confusion matrix
- Accuracy
- Precision
- Recall
- AUC (Area Under the Curve)

In [None]:
# Render the computed evaluation metrics.
display(metrics)

## Save the model

Save the model to storage.

In [None]:
# Show the target name, then persist the trained model under it, replacing
# any copy saved there previously.
print(MODEL_NAME)
model_writer = model.write().overwrite()
model_writer.save(MODEL_NAME)


##