## Install Synapse ML into the Apache Spark session

In [1]:
%%configure -f
{
  "name": "synapseml",
  "conf": {
      "spark.jars.packages": "com.microsoft.azure:synapseml_2.12:0.9.4",
      "spark.jars.repositories": "https://mmlspark.azureedge.net/maven",
      "spark.jars.excludes": "org.scala-lang:scala-reflect,org.apache.spark:spark-tags_2.12,org.scalactic:scalactic_2.12,org.scalatest:scalatest_2.12",
      "spark.yarn.user.classpath.first": "true"
  }
}

StatementMeta(, 11, -1, Finished, Available)

## Detect entities in text using the Cognitive Services entity detector transformer from Synapse ML

Retrieve the Cognitive Services credentials and create the test dataset.

In [3]:
key = mssparkutils.credentials.getSecret('asakeyvault524101', 'CognitiveService')
location = 'northeurope'

df = spark.createDataFrame(data=[
        [1, "Muad'Dib learned rapidly because his first training was in how to learn. And the first lesson of all was the basic trust that he could learn. It's shocking to find how many people do not believe they can learn, and how many more believe learning to be difficult. Muad'Dib knew that every experience carries its lesson."],
        [2, "It’s the ship that made the Kessel run in less than twelve parsecs. I’ve outrun Imperial starships. Not the local bulk cruisers, mind you. I’m talking about the big Corellian ships, now. She’s fast enough for you, old man."]
    ], 
    schema=["id","text"])

StatementMeta(SparkPool02, 11, 2, Finished, Available)

Run the transformer and detect the entities mentioned in text.

In [4]:
from synapse.ml.cognitive import *

entity = (EntityDetector()
      .setSubscriptionKey(key)
      .setLocation(location)
      .setLanguage("en")
      .setOutputCol("entities")
      .setErrorCol("error"))

df_entities = entity.transform(df)

StatementMeta(SparkPool02, 11, 3, Finished, Available)

Check out the entities identified from the first phrase.

In [5]:
print(df_entities.head(1)[0].entities[0].entities[0].id)
print(df_entities.head(1)[0].entities[0].entities[0].url)

StatementMeta(SparkPool02, 11, 4, Finished, Available)

TypeError: 'NoneType' object is not subscriptable

Check out the entities identified from the second phrase.

In [83]:
print(df_entities.tail(1)[0].entities[0].entities[0].id)
print(df_entities.tail(1)[0].entities[0].entities[0].url)

StatementMeta(SparkPool02, 0, 82, Finished, Available)

Millennium Falcon
https://en.wikipedia.org/wiki/Millennium_Falcon

## Train a customer recommendation model


This notebook uses sample data to train a LightGBM model for retail product recommendation. The data is randomly generated.

In [1]:
%%configure -f
{
  "name": "synapseml",
  "conf": {
      "spark.jars.packages": "com.microsoft.azure:synapseml_2.12:0.9.4",
      "spark.jars.repositories": "https://mmlspark.azureedge.net/maven",
      "spark.jars.excludes": "org.scala-lang:scala-reflect,org.apache.spark:spark-tags_2.12,org.scalactic:scalactic_2.12,org.scalatest:scalatest_2.12",
      "spark.yarn.user.classpath.first": "true"
  }
}

In [2]:
import logging
logging.getLogger("py4j").setLevel(logging.ERROR)

import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt

from pyspark.version import __version__ as pyspark_version

from synapse.ml.core import __spark_package_version__
from synapse.ml.train import ComputeModelStatistics
from synapse.ml.lightgbm import LightGBMClassifier
from pyspark.ml.feature import VectorAssembler

pd.set_option('display.max_columns', 50)

print(f"PySpark version: {pyspark_version}")
print(f"SynapseML version: {__spark_package_version__}")

## Parameters


Note: If you're using a Managed VNet enabled workspace, please download the dataset from the 
[url](https://synapsemlpublic.blob.core.windows.net/files/PersonalizedData.csv) and then upload it to your own storage account in order to access it.

In [3]:
# Blob url
# Original blob: "https://recodatasets.z20.web.core.windows.net/random-dataset/PersonalizedData.csv"
url = "wasbs://files@synapsemlpublic.blob.core.windows.net/PersonalizedData.csv"

# Data parameters
LABEL_COL = "Rating"
FEATURE_COL = "features"
RATIO = 0.8
SEED = 42

# Model parameters
OBJECTIVE = "binary"
BOOSTING = "gbdt"
NUM_LEAVES = 32
NUM_ITERATIONS = 100
LEARNING_RATE = 0.1
FEATURE_FRACTION = 0.8
EARLY_STOPPING_ROUND = 10
MODEL_NAME = "lgb-quickstart"


## Read the data from Blob

In [None]:
# Added the file to linked ADLSv2
raw_data = spark.read.csv(url, header=True, inferSchema=True)
print("Schema: ")
# raw_data.printSchema()

df = raw_data.toPandas()
print("Shape: ", df.shape)

In [5]:
display(df.iloc[:10, :])

## Data visualization

In [6]:
df.describe()


In [7]:
# calculate the correlation matrix
corr = df.corr()

# plot the correlation heatmap
fig, ax = plt.subplots(figsize=(10,10))         # Sample figsize in inches

sns.heatmap(corr, 
            xticklabels=corr.columns, 
            yticklabels=corr.columns, 
            cmap='RdBu', 
            vmin=-1, 
            vmax=1, 
            ax=ax, 
            annot=True,
            fmt='.2f', 
            annot_kws={'size': 10})
plt.show()

In [8]:
#scatterplot
sns.set()
sns.pairplot(df, height=2.5)
plt.show()

## Split the data into train, test



In [6]:
raw_train, raw_test = raw_data.randomSplit([RATIO, 1 - RATIO], seed=SEED)
print("Train: (rows, columns) = {}".format((raw_train.count(), len(raw_train.columns))))
print("Test: (rows, columns) = {}".format((raw_test.count(), len(raw_test.columns))))

## Feature engineering 
Transform the original data feature columns into feature vectors

In [17]:
columns = raw_data.columns[3:]
featurizer = VectorAssembler(inputCols=columns, outputCol=FEATURE_COL)
train = featurizer.transform(raw_train)[LABEL_COL, FEATURE_COL]
test = featurizer.transform(raw_test)[LABEL_COL, FEATURE_COL]

In [18]:
# Check if data is unbalanced
display(train.groupBy(LABEL_COL).count())


## Model Training


In [19]:
lgbm = LightGBMClassifier(
    labelCol=LABEL_COL,
    featuresCol=FEATURE_COL,
    objective=OBJECTIVE,
    isUnbalance=False,
    boostingType=BOOSTING,
    boostFromAverage=True,
    baggingSeed=SEED,
    numLeaves=NUM_LEAVES,
    numIterations=NUM_ITERATIONS,
    learningRate=LEARNING_RATE,
    featureFraction=FEATURE_FRACTION,
    earlyStoppingRound=EARLY_STOPPING_ROUND
)


In [20]:
model = lgbm.fit(train)

## Feature Importances

In [21]:
feature_importances = model.getFeatureImportances()
fi = pd.Series(feature_importances,index = columns)
fi = fi.sort_values(ascending = True)
f_index = fi.index
f_values = fi.values
 
# print feature importances 
print ('f_index:',f_index)
print ('f_values:',f_values)

# plot
x_index = list(range(len(fi)))
x_index = [x/len(fi) for x in x_index]
plt.rcParams['figure.figsize'] = (10,10)
plt.barh(x_index,f_values,height = 0.028 ,align="center",color = 'tan',tick_label=f_index)
plt.xlabel('importances')
plt.ylabel('features')
plt.show()

## Model Prediction

In [22]:
predictions = model.transform(test)


In [23]:
display(predictions.limit(10))

## Evaluation

In [24]:
evaluator = (
    ComputeModelStatistics()
    .setScoredLabelsCol("prediction")
    .setLabelCol(LABEL_COL)
    .setEvaluationMetric("classification")
)

metrics = evaluator.transform(predictions)

In [25]:
display(metrics)

## Save the model

Save the model to linked ADLS

In [26]:
print(MODEL_NAME)
(model
 .write()
 .overwrite()
 .save(MODEL_NAME))


##