# Cuisine Type Prediction Sample

## Data Source

The sample data was taken from *kaggle*'s [*What's Cooking*](https://www.kaggle.com/c/whats-cooking/overview) competition.

## Read Training Data

We read some JSON training data using [*pandas*](https://pandas.pydata.org/). It contains ingrediences of sample recipies and corresponding cuisine types (e.g. *greek*, *german*).

In [None]:
import pandas as pd
train_json = pd.read_json("./train.json")
train_json.head()

In [None]:
train_json['ingredients']
df = train_json['ingredients']
# df[df.apply(lambda s: len(s) == 1 and s[0] == 'water')].index
train_json['id'][df[df.apply(lambda s: len(s) == 1 and s[0] == 'water')].index]

In [None]:
import nltk
nltk.download('wordnet')

from nltk import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

actual = lemmatizer.lemmatize('bats')
assert actual == 'bat', f'Lemmatizing did not work, received "{actual}"'

## Clean Data
For our data to be usable we clean it by stripping off unnecessary words and characters, 
as well as lemmatizing to get the ingredients' stems. All that is done in our preprocess function.

In [None]:
import re
import unidecode

def preprocess(ingredients):
    ingredients_text = ' '.join(ingredients)
    ingredients_text = ingredients_text.lower() #Lower - Casing
    ingredients_text = ingredients_text.replace('-', ' ') # Removing Hyphen
    words = []
    for word in ingredients_text.split():
        word = re.sub("[0-9]"," ",word) # removing numbers,punctuations and special characters
        word = re.sub((r'\b(oz|ounc|ounce|pound|lb|inch|inches|kg|to)\b'), ' ', word) # Removing Units
        if len(word) <= 2: continue # Removing words with less than two characters
        word = unidecode.unidecode(word) # Removing accents
        word = lemmatizer.lemmatize(word) # Lemmatize
        if len(word) > 0: words.append(word)
    return ' '.join(words)

actual = preprocess(['Half and Half 15 ounce of Grains'])
assert actual == 'half and half grain', f'Preprocessing did not work, received "{actual}"'

Getting our data ready.

In [None]:
X_train = train_json['ingredients'].apply(preprocess)
Y_train = train_json['cuisine']
print(X_train.size)
print(Y_train.size)
X_train.head()

## Buid and train models

### Multinomial Naive Bayes

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

vectorizer = CountVectorizer(min_df=0.05, max_df=0.90)

# different document frequency
vectorizer2 = CountVectorizer()
X_train_vec = vectorizer2.fit_transform(X_train)

mnb = MultinomialNB()
mnb.fit(X_train_vec, Y_train)

X_test = vectorizer2.transform([preprocess([
        "pork stew meat",
        "salt",
        "tomatoes",
        "tomatillos",
        "chile pepper",
        "pepper",
        "garlic"
        ]), preprocess([
        "pork stew meat",
        "salt",
        "tomatoes",
        "tomatillos",
        "chile pepper",
        "pepper",
        "garlic"
    ])])
mnb.predict(X_test)



In [None]:
from joblib import dump, load
dump(mnb, 'cook.model')


In [None]:
mnb_loaded = load('cook.model')
X_test = vectorizer2.transform([preprocess([
      "sugar",
      "large egg yolks",
      "grated lemon peel",
      "rhubarb",
      "cream",
      "salt",
      "ground cinnamon",
      "golden brown sugar",
      "all-purpose flour",
      "sliced almonds",
      "unsalted butter"
    ])])
mnb.predict(X_test)

Predict results using our model based on naive bayes algorithm and write to file.

In [107]:
test_json = pd.read_json('./test.json')
test_json.head()

test = test_json['ingredients'].apply(preprocess)
test.head()
testfinal = vectorizer2.transform(test)
result = mnb.predict(testfinal)
print(result)
print(len(result))
result_transformed = pd.DataFrame(result)
result_with_ids = pd.concat([test_json['id'], result_transformed], join = 'outer', axis = 1)
print(result_with_ids) 
result_with_ids.to_csv('result_vectorizer2.csv', index = False)

['british' 'southern_us' 'italian' ... 'italian' 'cajun_creole' 'mexican']
9944
         id             0
0     18009       british
1     28583   southern_us
2     41580       italian
3     29752  cajun_creole
4     35687       italian
...     ...           ...
9939  30246        french
9940  36028   southern_us
9941  22339       italian
9942  42525  cajun_creole
9943   1443       mexican

[9944 rows x 2 columns]


## Logistic Regression
Train a model based on logistic regression and write the prediction's results to file.

In [None]:
from sklearn.linear_model import LogisticRegression
# clf = LogisticRegression(max_iter=1000, class_weight="balanced").fit(X_train_vec, Y_train)
clf = LogisticRegression(max_iter=1000).fit(X_train_vec, Y_train)
result = clf.predict(testfinal)
print(result)
print(len(result))
result_transformed = pd.DataFrame(result)
result_with_ids = pd.concat([test_json['id'], result_transformed], join = 'outer', axis = 1)
print(result_with_ids) 
result_with_ids.to_csv('result_logistic_regression.csv', index = False)

## XGB Classifier
Train a model based on an xgb classifier.

In [None]:
import xgboost
print(xgboost.__version__)
from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_score
from numpy import loadtxt
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
model = XGBClassifier()
model.fit(X_train_vec, Y_train)

Get prediction using the xgb classifier model and write results to file.

In [None]:
result = model.predict(testfinal)
print(result)
print(len(result))
result_transformed = pd.DataFrame(result)
result_with_ids = pd.concat([test_json['id'], result_transformed], join = 'outer', axis = 1)
print(result_with_ids) 
result_with_ids.to_csv('result_xgbclassifier.csv', index = False)

### Connect a Data Lake:

In [None]:
from azureml.core import Workspace, Datastore
ADLSGEN2_DATASTORE = 'stdatasciencelab'
ADLSGEN2_ACCOUNT = 'stdatasciencelab'
TENANT_ID = '022e4faf-c745-475a-be06-06b1e1c9e39d'

ws = Workspace.from_config()

kv = ws.get_default_keyvault()
CLIENT_ID = kv.get_secret('azureml-dls-appid')
CLIENT_SECRET = kv.get_secret('azureml-dls-secret')

adlsgen2_datastore = Datastore.register_azure_data_lake_gen2(
    workspace=ws, 
    datastore_name=ADLSGEN2_DATASTORE, 
    account_name=ADLSGEN2_ACCOUNT,
    filesystem='data',
    tenant_id=TENANT_ID,
    client_id=CLIENT_ID,
    client_secret=CLIENT_SECRET)

In [None]:
import dns.resolver

ADLS_URL = 'stdatasciencelab.blob.core.windows.net'
answers = dns.resolver.resolve(ADLS_URL)

for entry in answers:
    print('Server {srv} has IP address {ip}'.format(srv=ADLS_URL, ip=entry))

Convert json to json-line format 

In [None]:
train_json.to_json("train.jl", orient="records", lines=True)

Read in json line data for processing.

In [5]:
from azureml.core import Workspace, Dataset, Datastore
from azureml.data.datapath import DataPath

ws = Workspace.from_config()
ds = ws.datastores['stdatasciencelab']

df = Dataset.Tabular.from_json_lines_files(path = [(ds, '/train.jl')]).to_pandas_dataframe()
df.head()

# X_train = df['ingredients'].apply(preprocess)
# Y_train = df['cuisine']
# print(X_train.size)
# print(Y_train.size)
# X_train.head()

Unnamed: 0,id,cuisine,ingredients
0,10259,greek,"[romaine lettuce, black olives, grape tomatoes..."
1,25693,southern_us,"[plain flour, ground pepper, salt, tomatoes, g..."
2,20130,filipino,"[eggs, pepper, salt, mayonaise, cooking oil, g..."
3,22213,indian,"[water, vegetable oil, wheat, salt]"
4,13162,indian,"[black pepper, shallots, cornflour, cayenne pe..."


In [None]:
# just some test code. can be deleted when not needed anymore

datastore = Datastore.get(ws, datastore_name='stdatasciencelab')
print(datastore)
#dataset = Dataset.get_by_id(ws, id='stdatasciencelab')

ds = Datastore.get(ws, datastore_name='stdatasciencelab')
print(ds)

# get the input dataset by ID
#dataset = Dataset.get_by_id(ws, id='stdatasciencelab')

# load the TabularDataset to pandas DataFrame
#df = dataset.to_pandas_dataframe()

df = Dataset.Tabular.from_json_lines_files(path = [(ds, '/train.jl')]).to_pandas_dataframe()
X_train = df['ingredients'].apply(preprocess)
Y_train = df['cuisine']

vectorizer2 = CountVectorizer()
X_train_vec = vectorizer2.fit_transform(X_train)
mnb = MultinomialNB()
mnb.fit(X_train_vec, Y_train)
X_test = vectorizer2.transform(X_train)
mnb.predict(X_test)

my_model = load('cooking.pkl')

my_model.predict(X_test)

### Glossary of Azure ML

#### Azure ML - General:

- Azure ML Designer: Drag-and-Drop Interface ("Designer" in the sidebar to the left)
- Azure ML Ops: comparable to Azure DevOps
- Azure ML Pipelines: comparable to Azure Pipelines ("Pipelines" in the sidebar to the left)

#### Data:
In this sample project we retrieve our training data from a Data Lake, 
but of course there are many, many different options for Data according to your needs.
In the sidebar to the left ("Datastores") you can inspect your current choice.

#### Ways to run ML experiments:
Your experiments as well as your finished models can be found in the sidebar to the left ("Experiments", and "Models" respectively)
- Automated ML ("Automated ML" in the sidebar to the left): 
    - Either no-code or code-first. 
    - Easy to use, but limited capabilities - only pre-defined algorithms can be used.
- Script Configuration: Provide your own Python scripts for ML experiments.

#### Targets for model training:
Those can be found in the sidebar to the left ("Compute")
- Azure ML Compute Instance: For small-scale experiments
- Azure ML Compute Cluster: For larger-scale experiments
- You can also plug-in Non-ML targets of Azure:
    - Azure Data Bricks
    - Azure Data Lake Analytics
    - Azure HD Insights
    - Azure Batch

#### Targets for model deployment:
Those can be found in the sidebar to the left ("Endpoints" as well as "Compute")
- Azure Container Instance: For small-scale experiments
- Azure ML Compute Cluster: For larger-scale experiments
- Azure Kubernetes Service: For largest-scale experiments

## Automated ML
Let Azure ML take care of the experiment's details - just choose your preferred configuration and you're good to go.

In [None]:
import logging

automl_settings = {
    "iteration_timeout_minutes": 10,
    "experiment_timeout_hours": 0.3,
    "enable_early_stopping": True,
    "primary_metric": 'spearman_correlation',
    "featurization": 'auto',
    "verbosity": logging.INFO,
    "n_cross_validations": 5
}

from azureml.train.automl import AutoMLConfig

automl_config = AutoMLConfig(task='regression',
                             debug_log='automated_ml_errors.log',
                             training_data=X_train_vec,
                             label_column_name="ingredients",
                             **automl_settings)

from azureml.core.experiment import Experiment
experiment = Experiment(ws, "cooking")
local_run = experiment.submit(automl_config, show_output=True)

## Run Experiments using your own scripts for training & deployment:

In [135]:
from azureml.core import Experiment
experiment_name = 'cooking-naive-bayes'

exp = Experiment(workspace=ws, name=experiment_name)

We take advantage of an Azure ML Compute Cluster for running trainings of our ML model

In [137]:
from azureml.core.compute import AmlCompute
from azureml.core.compute import ComputeTarget
import os

# choose a name for your cluster
compute_name = os.environ.get("AML_COMPUTE_CLUSTER_NAME", "cpu-cluster")
compute_min_nodes = os.environ.get("AML_COMPUTE_CLUSTER_MIN_NODES", 0)
compute_max_nodes = os.environ.get("AML_COMPUTE_CLUSTER_MAX_NODES", 4)

# This example uses CPU VM. For using GPU VM, set SKU to STANDARD_NC6
vm_size = os.environ.get("AML_COMPUTE_CLUSTER_SKU", "STANDARD_D2_V2")


if compute_name in ws.compute_targets:
    compute_target = ws.compute_targets[compute_name]
    if compute_target and type(compute_target) is AmlCompute:
        print('found compute target. just use it. ' + compute_name)
else:
    print('creating a new compute target...')
    provisioning_config = AmlCompute.provisioning_configuration(vm_size=vm_size,
                                                                min_nodes=compute_min_nodes,
                                                                max_nodes=compute_max_nodes)

    # create the cluster
    compute_target = ComputeTarget.create(
        ws, compute_name, provisioning_config)

    # can poll for a minimum number of nodes and for a specific timeout.
    # if no min node count is provided it will use the scale settings for the cluster
    compute_target.wait_for_completion(
        show_output=True, min_node_count=None, timeout_in_minutes=20)

    # For a more detailed view of current AmlCompute status, use get_status()
    print(compute_target.get_status().serialize())

found compute target. just use it. cpu-cluster


Configure the directory of your experiment's scripts

In [136]:
import os
script_folder = os.path.join(os.getcwd(), "sklearn-cooking")
os.makedirs(script_folder, exist_ok=True)

Set up your remote cluster's environment by installing required python packages.

In [138]:
from azureml.core.environment import Environment
from azureml.core.conda_dependencies import CondaDependencies

# to install required packages
env = Environment('cooking-test-env')
cd = CondaDependencies.create(pip_packages=['azureml-dataset-runtime[pandas,fuse]', 'azureml-defaults', 'unidecode', 'nltk'], conda_packages=['scikit-learn==0.22.1'])

env.python.conda_dependencies = cd

# Register environment to re-use later
env.register(workspace=ws)

{
    "databricks": {
        "eggLibraries": [],
        "jarLibraries": [],
        "mavenLibraries": [],
        "pypiLibraries": [],
        "rcranLibraries": []
    },
    "docker": {
        "arguments": [],
        "baseDockerfile": null,
        "baseImage": "mcr.microsoft.com/azureml/intelmpi2018.3-ubuntu16.04:20210104.v1",
        "baseImageRegistry": {
            "address": null,
            "password": null,
            "registryIdentity": null,
            "username": null
        },
        "enabled": false,
        "platform": {
            "architecture": "amd64",
            "os": "Linux"
        },
        "sharedVolumes": true,
        "shmSize": null
    },
    "environmentVariables": {
        "EXAMPLE_ENV_VAR": "EXAMPLE_VALUE"
    },
    "inferencingStackVersion": null,
    "name": "cooking-test-env",
    "python": {
        "baseCondaEnvironment": null,
        "condaDependencies": {
            "channels": [
                "anaconda",
                "conda-fo

This is where the action happens - the train.py script gets executed in a training session. 
The top line writes the script's content in the folder we specified earlier. 

In [139]:
%%writefile $script_folder/train.py

import argparse
import os
import numpy as np
import glob
import joblib
from azureml.core import Workspace, Dataset, Datastore, Run
from azureml.data.datapath import DataPath
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from utils import preprocess
import nltk
nltk.download('wordnet')

run = Run.get_context()
ws = run.experiment.workspace

ds = Datastore.get(ws, datastore_name='stdatasciencelab')
print(ds)

df = Dataset.Tabular.from_json_lines_files(path = [(ds, '/train.jl')]).to_pandas_dataframe()
X_train = df['ingredients'].apply(preprocess)
Y_train = df['cuisine']

vectorizer2 = CountVectorizer()
X_train_vec = vectorizer2.fit_transform(X_train)

mnb = MultinomialNB()
mnb.fit(X_train_vec, Y_train)

X_test = vectorizer2.transform(X_train)
mnb.predict(X_test)
os.makedirs('outputs', exist_ok=True)
joblib.dump(value=mnb, filename='outputs/cooking.pkl')

Overwriting /mnt/batch/tasks/shared/LS_root/mounts/clusters/stefan/code/Users/rainer/sklearn-cooking/train.py


For our model to work we need a preprocess-function, that's inside a utils.py file. We also copy it to the according folder.

In [140]:
import shutil
shutil.copy('utils.py', script_folder)

'/mnt/batch/tasks/shared/LS_root/mounts/clusters/stefan/code/Users/rainer/sklearn-cooking/utils.py'

In [None]:
The following code creates a scriptRunConfiguration.

In [141]:
from azureml.core import ScriptRunConfig

#args = ['--data-folder', mnist_file_dataset.as_mount(), '--regularization', 0.5]

src = ScriptRunConfig(source_directory=script_folder,
                      script='train.py', 
                      #arguments=args,
                      compute_target=compute_target,
                      environment=env)

Starts a run. After this code is executed, we can watch the cluster start-up and train our model. ("Experiments" in the sidebar to the left.)

In [142]:
run = exp.submit(config=src)
run

Experiment,Id,Type,Status,Details Page,Docs Page
cooking-naive-bayes,cooking-naive-bayes_1614287371_39111bb7,azureml.scriptrun,Starting,Link to Azure Machine Learning studio,Link to Documentation


As an alternative to the GUI-based "Experiments" I mentioned we can also have a widget provide the information regarding the current experiment training run.

In [143]:
from azureml.widgets import RunDetails
RunDetails(run).show()

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…

On Completion we can get metadata of the run.

In [144]:
run.wait_for_completion(show_output=False)
print(run.get_metrics())

{}


We can access the run's outputs folder:

In [124]:
print(run.get_file_names())

['azureml-logs/55_azureml-execution-tvmps_90564ec88d84d5aaefebf9748d8268e12f853c10134e3482a4681949f5161cf7_d.txt', 'azureml-logs/65_job_prep-tvmps_90564ec88d84d5aaefebf9748d8268e12f853c10134e3482a4681949f5161cf7_d.txt', 'azureml-logs/70_driver_log.txt', 'azureml-logs/75_job_post-tvmps_90564ec88d84d5aaefebf9748d8268e12f853c10134e3482a4681949f5161cf7_d.txt', 'azureml-logs/process_info.json', 'azureml-logs/process_status.json', 'logs/azureml/104_azureml.log', 'logs/azureml/dataprep/backgroundProcess.log', 'logs/azureml/dataprep/backgroundProcess_Telemetry.log', 'logs/azureml/dataprep/engine_spans_l_0a55da96-2682-4450-ba75-88752d173b1f.jsonl', 'logs/azureml/dataprep/python_span_l_0a55da96-2682-4450-ba75-88752d173b1f.jsonl', 'logs/azureml/job_prep_azureml.log', 'logs/azureml/job_release_azureml.log', 'outputs/cooking.pkl']


And we can also register our finished model so others can use it, too. Finally we deploy it to an Azure Container Instance. 
As we use sci-kit learn, Azure ML automatically takes care of creating the container.

In [132]:
from azureml.core import Model
from azureml.core.resource_configuration import ResourceConfiguration
from datetime import datetime

dt = datetime.now()
# register model
model = run.register_model(model_name='cooking_naivebayes_' + dt.strftime('%d%m%Y_%H%M%S'),
                           model_path='outputs/cooking.pkl',
                           model_framework=Model.Framework.SCIKITLEARN,
                           model_framework_version='0.19.1',
                           resource_configuration=ResourceConfiguration(cpu=1, memory_in_gb=0.5))

# deploy model
web_service = Model.deploy(ws, "scikit-learn-service", [model])

print(model.name, model.id, model.version, sep='\t')

cooking_naivebayes_24022021_135814	cooking_naivebayes_24022021_135814:1	1


For manual deployment, we need to provide an entry script (entry.py, also listed here in our notebook) with two functions init and run.

In [145]:
import json
import numpy as np
import os
from sklearn.externals import joblib


def init():
    global model
    model_path = os.path.join(os.getenv('AZUREML_MODEL_DIR'), 'cooking.pkl')
    model = joblib.load(model_path)

def run(data):
    try:
        data = np.array(json.loads(data))
        result = model.predict(data)
        # You can return any data type, as long as it is JSON serializable.
        return result.tolist()
    except Exception as e:
        error = str(e)
        return error

We also need an inference config:

In [149]:
from azureml.core.environment import Environment
from azureml.core.model import InferenceConfig


env = Environment.get(ws, "AzureML-Minimal").clone(env)

for pip_package in ["scikit-learn"]:
    env.python.conda_dependencies.add_pip_package(pip_package)

inference_config = InferenceConfig(entry_script='entry.py',
                                    environment=env)

Pick web service target and start deployment.

In [None]:
from azureml.core.webservice import LocalWebservice, Webservice, AciWebservice

#deployment_config = LocalWebservice.deploy_configuration(port=8890)
deployment_config = AciWebservice.deploy_configuration(cpu_cores = 1, memory_gb = 1)
service = Model.deploy(ws, "myservice", [model], inference_config, deployment_config)
service.wait_for_deployment(show_output = True)
print(service.state)