<i>Copyright (c) Microsoft Corporation. All rights reserved.</i>

<i>Licensed under the MIT License.</i>

# Train LGBM Recommendation Model on MovieLens
## Using Azure Machine Learning service (Python, CPU)

In [1]:
import os
import shutil
import sys
import lightgbm as lgb
from tempfile import TemporaryDirectory

from ipywidgets import interact
import json
import pandas as pd
import requests

import azureml
from azureml.core import Experiment, Run, Workspace
from azureml.core.compute import AksCompute, AmlCompute, ComputeTarget
from azureml.core.conda_dependencies import CondaDependencies
from azureml.core.image import Image
from azureml.core.image.container import ContainerImage
from azureml.core.model import Model
from azureml.core.webservice import AksWebservice, Webservice
from azureml.train.estimator import Estimator
from azureml.widgets import RunDetails
print("azureml.core version: {}".format(azureml.core.VERSION))

sys.path.append('../..')
import reco_utils
from reco_utils.dataset.movielens import load_pandas_df
from reco_utils.dataset.movielens import GENRES
print("reco_utils version: {}".format(reco_utils.VERSION))

azureml.core version: 1.0.18
reco_utils version: 2019.05


In [2]:
# Point to the path of the config file from Azure portal
ws = Workspace.from_config(path='~/config.json')

Found the config file in: C:\Users\T-DARZHA\Downloads\config.json


In [3]:
# General variables
COL_USER = 'UserID'
COL_ITEM = 'ItemID'
COL_RATING = 'Rating'
COL_TIMESTAMP = 'Timestamp'
COL_TITLE = 'Title'
COL_GENRE = 'Genre'
COL_YEAR = 'Year'

HEADER = (COL_USER, COL_ITEM, COL_RATING, COL_TIMESTAMP)

TOP_K = 10
DATA_SIZE = '1m'

# AML Experiment config
EXPERIMENT_NAME = 'movielens-lgbm'
PIP_PACKAGES = ['azureml-sdk', 'pandas', 'sklearn', 'tqdm', 'lightgbm']

# AML Compute config
CLUSTER_NAME = 'recocluster'
VM_SIZE = 'STANDARD_D2_V2'
MIN_NODES = 0
MAX_NODES = 1

# AML Image config
IMAGE_NAME = 'lgbmnew{}'.format(DATA_SIZE)

# AML Model config
MODEL_NAME = 'lgbm_model.model'
MODEL_PATH = 'outputs/{}'.format(MODEL_NAME)

# AKS config
AKS_NAME = 'akscompute'
AKS_SERVICE = 'aksrecolgbmnew{}'.format(DATA_SIZE)

CURRENT_DIR = os.path.abspath('.')

In [4]:
os.chdir('~/recommenders/notebooks/00_quick_start')
TEMP_DIR = TemporaryDirectory()
print(TEMP_DIR)
def make_temp(name):
    return os.path.join(TEMP_DIR.name, name)

# copy reco_utils dependency to temp dir
shutil.copytree(os.path.join('..', '..', 'reco_utils'), make_temp('reco_utils'))

# it's necessary to move to this directory for the image to be built properly
os.chdir(TEMP_DIR.name)

TRAIN_FILE = make_temp('train.py')
ENTRY_SCRIPT = make_temp('entry.py')
CONDA_FILE = make_temp('conda.yml')

<TemporaryDirectory 'C:\\Users\\T-DARZHA\\AppData\\Local\\Temp\\tmp8h9uewwh'>


In [5]:
try:
    compute_target = ComputeTarget(workspace=ws, name=CLUSTER_NAME)
    print("Found compute target")
except:
    print("Creating compute target")
    # Specify the configuration for the new cluster
    compute_config = AmlCompute.provisioning_configuration(
        vm_size=VM_SIZE,
        min_nodes=MIN_NODES,
        max_nodes=MAX_NODES
    )
    
    # Create the cluster with the specified name and configuration
    compute_target = ComputeTarget.create(ws, CLUSTER_NAME, compute_config)
    
    # Wait for the cluster to complete, show the output log
    compute_target.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=20)

Found compute target


In [6]:
train_file = """

import sys, os
import lightgbm as lgb
import numpy as np
import pandas as pd

from reco_utils.dataset.movielens import load_pandas_df
from reco_utils.dataset.movielens import GENRES
from reco_utils.dataset.python_splitters import python_random_split
from sklearn.metrics import mean_squared_error
from sklearn.externals import joblib

df = load_pandas_df(genres_col="genre")

genre_cols = []
for genre in GENRES:
    genre_col = 'genre_' + genre
    df[genre_col] = df['genre'].apply(lambda x: genre in x).astype(int)
    genre_cols.append(genre_col)

# normalize genres for each user
users = df.drop(['itemID', 'rating', 'timestamp', 'genre'], axis=1).groupby('userID').sum()
users = users.div(users.sum(axis=1), axis=0)

all_data = df[['userID','itemID', 'rating', 'genre']].set_index('userID').join(users)
all_data['itemID'] = all_data['itemID'].astype('category')

movie_genre_cols = []
for genre in GENRES:
    genre_col = 'movie_genre_' + genre
    all_data[genre_col] = all_data['genre'].apply(lambda x: genre in x).astype(int)
    movie_genre_cols.append(genre_col)
    
MAX_LEAF = 64
MIN_DATA = 20
NUM_OF_TREES = 100
TREE_LEARNING_RATE = 0.15
EARLY_STOPPING_ROUNDS = 20
METRIC = 'rmse'
SIZE = "sample"
    
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': "regression",
    'metric': METRIC,
    'num_leaves': MAX_LEAF,
    'min_data': MIN_DATA,
    'boost_from_average': True,
    'num_threads': 20,
    'feature_fraction': 0.8,
    'learning_rate': TREE_LEARNING_RATE,
}

# split data to 3 sets
train, test = python_random_split(all_data, ratio=[.7, .3])
test, validate = python_random_split(test, ratio=[.7, .3])

cols = genre_cols + movie_genre_cols + ["itemID"]

train_x = train[cols]
train_y = train['rating']
test_x = test[cols]
test_y = test['rating']
validate_x = validate[cols]
validate_y = validate['rating']

lgb_train = lgb.Dataset(train_x, train_y, params=params)
lgb_test = lgb.Dataset(test_x, test_y, reference=lgb_train)
lgb_validate = lgb.Dataset(validate_x, validate_y, reference=lgb_train)

lgb_model = lgb.train(params,
                      lgb_train,
                      num_boost_round=NUM_OF_TREES,
                      early_stopping_rounds=EARLY_STOPPING_ROUNDS,
                      valid_sets=lgb_validate)

item_rows = all_data[movie_genre_cols + ["itemID"]].drop_duplicates().reset_index(drop=True)
item_rows['key'] = 0

joblib.dump(lgb_model, filename='outputs/lgbm_model.model')

"""

with open(TRAIN_FILE, 'w') as f:
    f.writelines(train_file)
    

In [7]:
est = Estimator(source_directory=TEMP_DIR.name,
                compute_target=compute_target,
                entry_script=os.path.basename(TRAIN_FILE),
                pip_packages=PIP_PACKAGES)

# create experiment
exp = Experiment(workspace=ws, name=EXPERIMENT_NAME)
run = exp.submit(config=est)

In [8]:
run

Experiment,Id,Type,Status,Details Page,Docs Page
movielens-lgbm,movielens-lgbm_1566223971_2161c775,azureml.scriptrun,Starting,Link to Azure Portal,Link to Documentation


In [9]:
RunDetails(run).show()

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…

In [10]:
# Get metrics
metrics = run.get_metrics()
print(metrics)

{}


In [11]:
# Register the model
model = run.register_model(model_name=MODEL_NAME, model_path=MODEL_PATH)
print(model.name, model.id, model.version, sep = '\t')

lgbm_model.model	lgbm_model.model:13	13


# Deploy SAR Recommendation Webservice
## Using Azure Machine Learning service (Local, AKS)

In [13]:
entry_file = """

import json
import pandas as pd
from sklearn.externals import joblib
from reco_utils.dataset.movielens import load_pandas_df
from reco_utils.dataset.movielens import GENRES
import lightgbm as lgb
from azureml.core.model import Model

TOP_K = 10


def init():
    global model
    model_path = Model.get_model_path('{MODEL_NAME}')
    model = joblib.load(model_path)
    
    global items_df
    df = load_pandas_df(size='1m', 
                        header=({HEADER}), 
                        title_col='Title', 
                        genres_col='Genre', 
                        year_col='Year')
    items_df = (df[['ItemID', 'Title', 'Genre', 'Year']]
                .dropna()
                .drop_duplicates()
                .set_index('ItemID'))
    genre_cols = []
    for genre in GENRES:
        genre_col = 'genre_' + genre
        df[genre_col] = df['Genre'].apply(lambda x: genre in x).astype(int)
        genre_cols.append(genre_col)
    users = df.drop(['ItemID', 'Rating', 'Timestamp', 'Genre'], axis=1).groupby('UserID').sum()
    users = users.div(users.sum(axis=1), axis=0)
    all_data = df[['UserID', 'ItemID', 'Rating', 'Genre']].set_index('UserID').join(users)
    all_data['ItemID'] = all_data['ItemID'].astype('category')
    movie_genre_cols = []
    for genre in GENRES:
        genre_col = 'movie_genre_' + genre
        all_data[genre_col] = all_data['Genre'].apply(lambda x: genre in x).astype(int)
        movie_genre_cols.append(genre_col)
    global item_rows
    item_rows = all_data[movie_genre_cols + ["ItemID"]].drop_duplicates().reset_index(drop=True)
    item_rows['key'] = 0

def run(data):
    try:
        df = pd.read_json(data, typ='series')
        test_df = pd.DataFrame(data=df.to_dict(), index=[0])
        test_df = pd.merge(test_df, item_rows, on='key')
        test_df['prediction'] = model.predict(test_df)
        top_result = test_df.sort_values(by='prediction', ascending=False).head(10)
        top_result.drop(top_result.iloc[:,1:40], inplace = True, axis = 1)
        return top_result.join(items_df, on='ItemID').to_dict()
    except Exception as e:
        return str(e)
        
""".format(TOP_K=TOP_K, 
           MODEL_NAME=MODEL_NAME, 
           DATA_SIZE=DATA_SIZE, 
           HEADER=HEADER, 
           COL_USER=COL_USER, 
           COL_ITEM=COL_ITEM, 
           COL_TITLE=COL_TITLE, 
           COL_GENRE=COL_GENRE, 
           COL_YEAR=COL_YEAR,
           MODEL_PATH=MODEL_PATH)

with open(ENTRY_SCRIPT, 'w') as f:
    f.writelines(entry_file)
    
with open(CONDA_FILE, "w") as f:
    f.write(CondaDependencies.create(pip_packages=PIP_PACKAGES).serialize_to_string())

In [14]:
model = Model(workspace=ws, name=MODEL_NAME)

In [15]:
image_config = ContainerImage.image_configuration(runtime="python",
                                                  execution_script=os.path.basename(ENTRY_SCRIPT),
                                                  conda_file=os.path.basename(CONDA_FILE),
                                                  dependencies=['reco_utils'])

try:
    image = Image(workspace=ws, name=IMAGE_NAME)
    print("Found Image")
except:
    print("Creating Container Image")
    # create the image
    image = Image.create(workspace=ws, 
                         name=IMAGE_NAME, 
                         models=[model], 
                         image_config=image_config)

    # wait for image creation to finish
    image.wait_for_creation(show_output=True)

Creating Container Image
Creating image
Running..............................
SucceededImage creation operation finished for image lgbmnew1m:2, operation "Succeeded"


In [16]:
try:
    aks_target = ComputeTarget(workspace=ws, name=AKS_NAME)
    print("Found AKS compute target")
except:
    print("Creating AKS compute target")

    # Use the default configuration for now
    prov_config = AksCompute.provisioning_configuration()

    # Create the cluster
    aks_target = ComputeTarget.create(workspace=ws,
                                      name=AKS_NAME,
                                      provisioning_configuration=prov_config)

    # Wait for the create process to complete
    aks_target.wait_for_completion(show_output=True)

Found AKS compute target


In [17]:
local = False
if local:
    # Test locally
    deployment_config = LocalWebservice.deploy_configuration(port=8889)
    service = Webservice.deploy_local_from_model(workspace=ws,
                                                 name='localservice',
                                                 models=[model],
                                                 image_config=image_config,
                                                 deployment_config=deployment_config)
else:
    # Deploy to AKS
    try:
        service = AksWebservice(workspace=ws, name=AKS_SERVICE)
        print('Found AKS service')
    except:
        print('Creating AKS service')
        deployment_config = AksWebservice.deploy_configuration()
        service = Webservice.deploy_from_image(workspace=ws,
                                               name=AKS_SERVICE,
                                               image=image,
                                               deployment_config=deployment_config,
                                               deployment_target=aks_target)

service.wait_for_deployment(show_output = True)
print(service.state)

Creating AKS service
Creating service
Running.........
SucceededAKS service creation operation finished, operation "Succeeded"
Healthy


# Test LightGBM Recommendations

In [18]:
df = load_pandas_df(size=DATA_SIZE, header=HEADER, title_col=COL_TITLE, genres_col=COL_GENRE, year_col=COL_YEAR)
df = df[[COL_ITEM, COL_TITLE, COL_GENRE, COL_YEAR]].dropna().drop_duplicates()

5.92MB [00:02, 4.03MB/s]                                                                                               


In [19]:
genres = set()
for genre_list in df[COL_GENRE].unique():
    for genre in genre_list.split('|'):
        genres.add(genre)
genres = ['All'] + sorted(genres)

years = ['All'] + sorted(df[COL_YEAR].unique(), reverse=True)

def view(title, genre, year):
    tmp_df = df[df[COL_TITLE].str.contains(title)].set_index(COL_ITEM)
    if genre != 'All':
        tmp_df = tmp_df[tmp_df[COL_GENRE].str.contains(genre)]
    if year != 'All':
        tmp_df = tmp_df[tmp_df[COL_YEAR] == year]
    return tmp_df.sort_values(COL_TITLE)

interact(lambda title, genre, year: view(title=title, genre=genre, year=year), title='', genre=genres, year=years);    

interactive(children=(Text(value='', description='title'), Dropdown(description='genre', options=('All', 'Acti…

In [20]:
# A test payload to show the format
items = {'UserID': 0, 
           'key': 0,
           'genre_unknown': 0,
           'genre_Action': 0,
           'genre_Adventure': 0,
           'genre_Animation': 0.25,
           'genre_Children\'s': 0.75,
           'genre_Comedy': 0,
           'genre_Crime': 0,
           'genre_Documentary': 0,
           'genre_Drama': 0,
           'genre_Fantasy': 0,
           'genre_Film-Noir': 0,
           'genre_Horror': 0,
           'genre_Musical': 0,
           'genre_Mystery': 0,
           'genre_Romance': 0,
           'genre_Sci-Fi': 0,
           'genre_Thriller': 0,
           'genre_War': 0,
           'genre_Western':0}

In [21]:
if service.compute_type == 'AKS':
    url = service.scoring_uri

    # Setup authentication using one of the keys from service
    headers = dict(Authorization='Bearer {}'.format(service.get_keys()[0]))
else:
    url = 'http://localhost:8889/score'
    headers = None

print(headers)
print('Service URI: {}'.format(url))

{'Authorization': 'Bearer ve9PgvhP3jVZEdnabnbrVZOgilEAEiJy'}
Service URI: http://52.170.115.80:80/api/v1/service/aksrecolgbmnew1m/score


In [25]:
response = requests.post(url=url, json=items, headers=headers)
if response.status_code != 200:
    print(response.content, response.status_code)
else:
    result = pd.DataFrame.from_dict(response.json(), orient='columns')

print(result)

      UserID  ItemID  prediction  \
1272     0.0    2917    4.361917   
1273     0.0     431    4.361917   
1279     0.0     521    4.361917   
135      0.0    3735    4.361917   
2101     0.0    3529    4.361917   
2179     0.0     869    4.361917   
2214     0.0    3783    4.361917   
2962     0.0    1598    4.361917   
2996     0.0    3335    4.361917   
982      0.0     293    4.361917   

                                                  Title  \
1272                                   Body Heat (1981)   
1273                               Carlito's Way (1993)   
1279                           Romeo Is Bleeding (1993)   
135                                      Serpico (1973)   
2101             Postman Always Rings Twice, The (1981)   
2179                                 Kansas City (1996)   
2214                                    Croupier (1998)   
2962                          Desperate Measures (1998)   
2996                                   Jail Bait (1954)   
982   Profess

# Cleanup

In [179]:
# clean up temporary directory
os.chdir(CURRENT_DIR)
TEMP_DIR.cleanup()