diff --git a/examples/00_quick_start/als_movielens.ipynb b/examples/00_quick_start/als_movielens.ipynb index 1f1b1a0bf0..5c8541b628 100644 --- a/examples/00_quick_start/als_movielens.ipynb +++ b/examples/00_quick_start/als_movielens.ipynb @@ -2,34 +2,46 @@ "cells": [ { "cell_type": "markdown", + "metadata": {}, "source": [ "Copyright (c) Microsoft Corporation. All rights reserved.\n", "\n", "Licensed under the MIT License." - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "# Running ALS on MovieLens (PySpark)\n", "\n", "Matrix factorization by [ALS](https://spark.apache.org/docs/latest/api/python/_modules/pyspark/ml/recommendation.html#ALS) (Alternating Least Squares) is a well known collaborative filtering algorithm.\n", "\n", "This notebook provides an example of how to utilize and evaluate ALS PySpark ML (DataFrame-based API) implementation, meant for large-scale distributed datasets. We use a smaller dataset in this example to run ALS efficiently on multiple cores of a [Data Science Virtual Machine](https://azure.microsoft.com/en-gb/services/virtual-machines/data-science-virtual-machines/)." - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "**Note**: This notebook requires a PySpark environment to run properly. Please follow the steps in [SETUP.md](../../SETUP.md) to install the PySpark environment." 
- ], - "metadata": {} + ] }, { "cell_type": "code", "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "System version: 3.8.0 (default, Nov 6 2019, 21:49:08) \n", + "[GCC 7.3.0]\n", + "Spark version: 3.2.0\n" + ] + } + ], "source": [ "# set the environment path to find Recommenders\n", "import sys\n", @@ -39,6 +51,8 @@ "from pyspark.sql import SparkSession\n", "from pyspark.sql.types import StructType, StructField\n", "from pyspark.sql.types import StringType, FloatType, IntegerType, LongType\n", + "import warnings\n", + "warnings.simplefilter(action='ignore', category=FutureWarning)\n", "\n", "from recommenders.utils.timer import Timer\n", "from recommenders.datasets import movielens\n", @@ -49,30 +63,24 @@ "\n", "print(\"System version: {}\".format(sys.version))\n", "print(\"Spark version: {}\".format(pyspark.__version__))\n" - ], - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "System version: 3.6.8 |Anaconda, Inc.| (default, Dec 30 2018, 01:22:34) \n", - "[GCC 7.3.0]\n", - "Spark version: 2.3.1\n" - ] - } - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "Set the default parameters." - ], - "metadata": {} + ] }, { "cell_type": "code", "execution_count": 2, + "metadata": { + "tags": [ + "parameters" + ] + }, + "outputs": [], "source": [ "# top k items to recommend\n", "TOP_K = 10\n", @@ -85,69 +93,52 @@ "COL_ITEM = \"MovieId\"\n", "COL_RATING = \"Rating\"\n", "COL_TIMESTAMP = \"Timestamp\"" - ], - "outputs": [], - "metadata": { - "tags": [ - "parameters" - ] - } + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "### 0. Set up Spark context\n", "\n", "The following settings work well for debugging locally on VM - change when running on a cluster. We set up a giant single executor with many threads and specify memory cap. 
" - ], - "metadata": {} + ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, + "metadata": {}, + "outputs": [], "source": [ "# the following settings work well for debugging locally on VM - change when running on a cluster\n", "# set up a giant single executor with many threads and specify memory cap\n", - "spark = start_or_get_spark(\"ALS PySpark\", memory=\"16g\")" - ], - "outputs": [], - "metadata": {} + "spark = start_or_get_spark(\"ALS PySpark\", memory=\"16g\")\n", + "spark.conf.set(\"spark.sql.analyzer.failAmbiguousSelfJoin\", \"false\")" + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "### 1. Download the MovieLens dataset" - ], - "metadata": {} + ] }, { "cell_type": "code", - "execution_count": 5, - "source": [ - "# Note: The DataFrame-based API for ALS currently only supports integers for user and item ids.\n", - "schema = StructType(\n", - " (\n", - " StructField(COL_USER, IntegerType()),\n", - " StructField(COL_ITEM, IntegerType()),\n", - " StructField(COL_RATING, FloatType()),\n", - " StructField(COL_TIMESTAMP, LongType()),\n", - " )\n", - ")\n", - "\n", - "data = movielens.load_spark_df(spark, size=MOVIELENS_DATA_SIZE, schema=schema)\n", - "data.show()" - ], + "execution_count": 4, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stderr", + "output_type": "stream", "text": [ - "100%|██████████| 4.81k/4.81k [00:00<00:00, 19.9kKB/s]\n" + "100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4.81k/4.81k [00:05<00:00, 882KB/s]\n", + " \r" ] }, { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "+------+-------+------+---------+\n", "|UserId|MovieId|Rating|Timestamp|\n", @@ -178,48 +169,63 @@ ] } ], - "metadata": {} + "source": [ + "# Note: The DataFrame-based API for ALS currently only supports integers for user and item ids.\n", + "schema = StructType(\n", + " 
(\n", + " StructField(COL_USER, IntegerType()),\n", + " StructField(COL_ITEM, IntegerType()),\n", + " StructField(COL_RATING, FloatType()),\n", + " StructField(COL_TIMESTAMP, LongType()),\n", + " )\n", + ")\n", + "\n", + "data = movielens.load_spark_df(spark, size=MOVIELENS_DATA_SIZE, schema=schema)\n", + "data.show()" + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "### 2. Split the data using the Spark random splitter provided in utilities" - ], - "metadata": {} + ] }, { "cell_type": "code", - "execution_count": 6, - "source": [ - "train, test = spark_random_split(data, ratio=0.75, seed=123)\n", - "print (\"N train\", train.cache().count())\n", - "print (\"N test\", test.cache().count())" - ], + "execution_count": 5, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ - "N train 75193\n", - "N test 24807\n" + "N train 75018\n", + "N test 24982\n" ] } ], - "metadata": {} + "source": [ + "train, test = spark_random_split(data, ratio=0.75, seed=123)\n", + "print (\"N train\", train.cache().count())\n", + "print (\"N test\", test.cache().count())" + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "### 3. Train the ALS model on the training data, and get the top-k recommendations for our testing data\n", "\n", "To predict movie ratings, we use the rating data in the training set as users' explicit feedback. The hyperparameters used in building the model are referenced from [here](http://mymedialite.net/examples/datasets.html). We do not constrain the latent factors (`nonnegative = False`) in order to allow for both positive and negative preferences towards movies.\n", "Timing will vary depending on the machine being used to train." 
- ], - "metadata": {} + ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 6, + "metadata": {}, + "outputs": [], "source": [ "header = {\n", " \"userCol\": COL_USER,\n", @@ -238,42 +244,65 @@ " seed=42,\n", " **header\n", ")" - ], - "outputs": [], - "metadata": {} + ] }, { "cell_type": "code", - "execution_count": 8, - "source": [ - "with Timer() as train_time:\n", - " model = als.fit(train)\n", - "\n", - "print(\"Took {} seconds for training.\".format(train_time.interval))" - ], + "execution_count": 7, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ - "Took 3.2410509269684553 seconds for training.\n" + "Took 7.5410127229988575 seconds for training.\n" ] } ], - "metadata": {} + "source": [ + "with Timer() as train_time:\n", + " model = als.fit(train)\n", + "\n", + "print(\"Took {} seconds for training.\".format(train_time.interval))" + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "In the movie recommendation use case, recommending movies that have been rated by the users do not make sense. Therefore, the rated movies are removed from the recommended items.\n", "\n", "In order to achieve this, we recommend all movies to all users, and then remove the user-movie pairs that exist in the training dataset." 
- ], - "metadata": {} + ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[Stage 126:====================================================>(198 + 2) / 200]\r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Took 25.246142672998758 seconds for prediction.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\r", + " \r" + ] + } + ], "source": [ "with Timer() as test_time:\n", "\n", @@ -298,159 +327,218 @@ " top_all.cache().count()\n", "\n", "print(\"Took {} seconds for prediction.\".format(test_time.interval))" - ], - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Took 10.559875106438994 seconds for prediction.\n" - ] - } - ], - "metadata": {} + ] }, { "cell_type": "code", - "execution_count": 10, - "source": [ - "top_all.show()" - ], + "execution_count": 9, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "+------+-------+----------+\n", "|UserId|MovieId|prediction|\n", "+------+-------+----------+\n", - "| 1| 587| 3.0676804|\n", - "| 1| 869| 2.4396753|\n", - "| 1| 1208| 3.2788403|\n", - "| 1| 1357| 2.0567489|\n", - "| 1| 1677| 2.9661644|\n", - "| 2| 80| 2.3442159|\n", - "| 2| 472| 3.060428|\n", - "| 2| 582| 3.489215|\n", - "| 2| 838| 1.0985656|\n", - "| 2| 975| 1.8764799|\n", - "| 2| 1260| 3.0814102|\n", - "| 2| 1381| 3.288192|\n", - "| 2| 1530| 1.9368806|\n", - "| 3| 22| 4.2560363|\n", - "| 3| 57| 3.295701|\n", - "| 3| 89| 4.983886|\n", - "| 3| 367| 2.5427854|\n", - "| 3| 1091| 1.4424214|\n", - "| 3| 1167| 2.2066739|\n", - "| 3| 1499| 3.368075|\n", + "| 1| 587| 4.1602826|\n", + "| 1| 869| 2.7732863|\n", + "| 1| 1208| 2.033383|\n", + "| 1| 1348| 1.0019257|\n", + "| 1| 1357| 0.9430026|\n", + "| 1| 1677| 2.8777318|\n", + "| 2| 80| 2.351385|\n", + "| 2| 472| 2.5865319|\n", + "| 2| 582| 
3.9548612|\n", + "| 2| 838| 0.9482963|\n", + "| 2| 975| 3.1133535|\n", + "| 2| 1260| 1.9871743|\n", + "| 2| 1325| 1.2368056|\n", + "| 2| 1381| 3.5477588|\n", + "| 2| 1530| 2.08829|\n", + "| 3| 22| 3.1524537|\n", + "| 3| 57| 3.6980162|\n", + "| 3| 89| 3.9733813|\n", + "| 3| 367| 3.6629045|\n", + "| 3| 1091| 0.9144474|\n", "+------+-------+----------+\n", "only showing top 20 rows\n", "\n" ] } ], - "metadata": {} + "source": [ + "top_all.show()" + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "### 4. Evaluate how well ALS performs" - ], - "metadata": {} + ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + } + ], "source": [ "rank_eval = SparkRankingEvaluation(test, top_all, k = TOP_K, col_user=COL_USER, col_item=COL_ITEM, \n", " col_rating=COL_RATING, col_prediction=\"prediction\", \n", " relevancy_method=\"top_k\")" - ], - "outputs": [], - "metadata": {} + ] }, { "cell_type": "code", - "execution_count": 12, - "source": [ - "print(\"Model:\\tALS\",\n", - " \"Top K:\\t%d\" % rank_eval.k,\n", - " \"MAP:\\t%f\" % rank_eval.map_at_k(),\n", - " \"NDCG:\\t%f\" % rank_eval.ndcg_at_k(),\n", - " \"Precision@K:\\t%f\" % rank_eval.precision_at_k(),\n", - " \"Recall@K:\\t%f\" % rank_eval.recall_at_k(), sep='\\n')" - ], + "execution_count": 11, + "metadata": {}, "outputs": [ { + "name": "stderr", "output_type": "stream", + "text": [ + "[Stage 463:> (0 + 2) / 2]\r" + ] + }, + { "name": "stdout", + "output_type": "stream", "text": [ "Model:\tALS\n", "Top K:\t10\n", - "MAP:\t0.005734\n", - "NDCG:\t0.047460\n", - "Precision@K:\t0.051911\n", - "Recall@K:\t0.017514\n" + "MAP:\t0.006527\n", + "NDCG:\t0.051718\n", + "Precision@K:\t0.051274\n", + "Recall@K:\t0.018840\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\r", + " \r" ] } ], - "metadata": {} + "source": [ + "print(\"Model:\\tALS\",\n", + " \"Top 
K:\\t%d\" % rank_eval.k,\n", + " \"MAP:\\t%f\" % rank_eval.map_at_k(),\n", + " \"NDCG:\\t%f\" % rank_eval.ndcg_at_k(),\n", + " \"Precision@K:\\t%f\" % rank_eval.precision_at_k(),\n", + " \"Recall@K:\\t%f\" % rank_eval.recall_at_k(), sep='\\n')" + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "### 5. Evaluate rating prediction" - ], - "metadata": {} + ] }, { "cell_type": "code", - "execution_count": 13, - "source": [ - "# Generate predicted ratings.\n", - "prediction = model.transform(test)\n", - "prediction.cache().show()\n" - ], + "execution_count": 12, + "metadata": {}, "outputs": [ { + "name": "stderr", "output_type": "stream", + "text": [ + "[Stage 500:=============================================> (171 + 3) / 200]\r" + ] + }, + { "name": "stdout", + "output_type": "stream", "text": [ "+------+-------+------+---------+----------+\n", "|UserId|MovieId|Rating|Timestamp|prediction|\n", "+------+-------+------+---------+----------+\n", - "| 406| 148| 3.0|879540276| 2.2832825|\n", - "| 27| 148| 3.0|891543129| 1.7940072|\n", - "| 606| 148| 3.0|878150506| 3.7863157|\n", - "| 916| 148| 2.0|880843892| 2.3045797|\n", - "| 236| 148| 4.0|890117028| 1.9480721|\n", - "| 602| 148| 4.0|888638517| 3.1172547|\n", - "| 663| 148| 4.0|889492989| 2.7976327|\n", - "| 372| 148| 5.0|876869915| 4.170663|\n", - "| 190| 148| 4.0|891033742| 3.6491241|\n", - "| 1| 148| 2.0|875240799| 2.829558|\n", - "| 297| 148| 3.0|875239619| 2.1554093|\n", - "| 178| 148| 4.0|882824325| 3.932391|\n", - "| 308| 148| 3.0|887740788| 2.9132738|\n", - "| 923| 148| 4.0|880387474| 3.5403519|\n", - "| 54| 148| 3.0|880937490| 3.165133|\n", - "| 430| 148| 2.0|877226047| 2.891675|\n", - "| 92| 148| 2.0|877383934| 2.6483998|\n", - "| 447| 148| 4.0|878854729| 3.1101565|\n", - "| 374| 148| 4.0|880392992| 2.2130618|\n", - "| 891| 148| 5.0|891639793| 3.138905|\n", + "| 580| 148| 4.0|884125773| 3.4059548|\n", + "| 406| 148| 3.0|879540276| 2.7134619|\n", + "| 916| 148| 2.0|880843892| 2.2241986|\n", + "| 663| 
148| 4.0|889492989| 2.714362|\n", + "| 330| 148| 4.0|876544781| 4.52321|\n", + "| 935| 148| 4.0|884472892| 4.3838587|\n", + "| 308| 148| 3.0|887740788| 2.6169493|\n", + "| 20| 148| 5.0|879668713| 4.3721194|\n", + "| 923| 148| 4.0|880387474| 3.9818575|\n", + "| 455| 148| 3.0|879110346| 3.0764186|\n", + "| 15| 148| 3.0|879456049| 2.9913845|\n", + "| 374| 148| 4.0|880392992| 3.2223384|\n", + "| 880| 148| 2.0|880167030| 2.8111982|\n", + "| 677| 148| 4.0|889399265| 3.8451843|\n", + "| 49| 148| 1.0|888068195| 1.3751594|\n", + "| 244| 148| 2.0|880605071| 2.6781514|\n", + "| 84| 148| 4.0|883452274| 3.6721768|\n", + "| 627| 148| 3.0|879530463| 2.6362069|\n", + "| 434| 148| 3.0|886724797| 3.0973828|\n", + "| 793| 148| 4.0|875104498| 2.2886577|\n", "+------+-------+------+---------+----------+\n", "only showing top 20 rows\n", "\n" ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\r", + "[Stage 500:=================================================> (186 + 3) / 200]\r", + "\r", + " \r" + ] } ], - "metadata": {} + "source": [ + "# Generate predicted ratings.\n", + "prediction = model.transform(test)\n", + "prediction.cache().show()\n" + ] }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[Stage 775:==============================================> (174 + 2) / 200]\r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Model:\tALS rating prediction\n", + "RMSE:\t0.967434\n", + "MAE:\t0.753340\n", + "Explained variance:\t0.265916\n", + "R squared:\t0.259532\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\r", + " \r" + ] + } + ], "source": [ "rating_eval = SparkRatingEvaluation(test, prediction, col_user=COL_USER, col_item=COL_ITEM, \n", " col_rating=COL_RATING, col_prediction=\"prediction\")\n", @@ -460,25 +548,225 @@ " \"MAE:\\t%f\" % rating_eval.mae(),\n", " \"Explained 
variance:\\t%f\" % rating_eval.exp_var(),\n", " \"R squared:\\t%f\" % rating_eval.rsquared(), sep='\\n')" - ], + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, "outputs": [ { + "data": { + "application/scrapbook.scrap.json+json": { + "data": 0.006527288768086336, + "encoder": "json", + "name": "map", + "version": 1 + } + }, + "metadata": { + "scrapbook": { + "data": true, + "display": false, + "name": "map" + } + }, + "output_type": "display_data" + }, + { + "name": "stderr", "output_type": "stream", - "name": "stdout", "text": [ - "Model:\tALS rating prediction\n", - "RMSE:\t0.967296\n", - "MAE:\t0.753306\n", - "Explained variance:\t0.261864\n", - "R squared:\t0.255480\n" + " \r" + ] + }, + { + "data": { + "application/scrapbook.scrap.json+json": { + "data": 0.051717802220247217, + "encoder": "json", + "name": "ndcg", + "version": 1 + } + }, + "metadata": { + "scrapbook": { + "data": true, + "display": false, + "name": "ndcg" + } + }, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" ] + }, + { + "data": { + "application/scrapbook.scrap.json+json": { + "data": 0.05127388535031851, + "encoder": "json", + "name": "precision", + "version": 1 + } + }, + "metadata": { + "scrapbook": { + "data": true, + "display": false, + "name": "precision" + } + }, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\r", + "[Stage 904:> (0 + 2) / 2]\r", + "\r", + " \r" + ] + }, + { + "data": { + "application/scrapbook.scrap.json+json": { + "data": 0.018840283525491316, + "encoder": "json", + "name": "recall", + "version": 1 + } + }, + "metadata": { + "scrapbook": { + "data": true, + "display": false, + "name": "recall" + } + }, + "output_type": "display_data" + }, + { + "data": { + "application/scrapbook.scrap.json+json": { + "data": 0.9674342234414528, + "encoder": "json", + "name": "rmse", + "version": 1 + } + }, + "metadata": { + "scrapbook": { 
+ "data": true, + "display": false, + "name": "rmse" + } + }, + "output_type": "display_data" + }, + { + "data": { + "application/scrapbook.scrap.json+json": { + "data": 0.7533395161385739, + "encoder": "json", + "name": "mae", + "version": 1 + } + }, + "metadata": { + "scrapbook": { + "data": true, + "display": false, + "name": "mae" + } + }, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "data": { + "application/scrapbook.scrap.json+json": { + "data": 0.2659161968930053, + "encoder": "json", + "name": "exp_var", + "version": 1 + } + }, + "metadata": { + "scrapbook": { + "data": true, + "display": false, + "name": "exp_var" + } + }, + "output_type": "display_data" + }, + { + "data": { + "application/scrapbook.scrap.json+json": { + "data": 0.2595322728476255, + "encoder": "json", + "name": "rsquared", + "version": 1 + } + }, + "metadata": { + "scrapbook": { + "data": true, + "display": false, + "name": "rsquared" + } + }, + "output_type": "display_data" + }, + { + "data": { + "application/scrapbook.scrap.json+json": { + "data": 7.5410127229988575, + "encoder": "json", + "name": "train_time", + "version": 1 + } + }, + "metadata": { + "scrapbook": { + "data": true, + "display": false, + "name": "train_time" + } + }, + "output_type": "display_data" + }, + { + "data": { + "application/scrapbook.scrap.json+json": { + "data": 25.246142672998758, + "encoder": "json", + "name": "test_time", + "version": 1 + } + }, + "metadata": { + "scrapbook": { + "data": true, + "display": false, + "name": "test_time" + } + }, + "output_type": "display_data" } ], - "metadata": {} - }, - { - "cell_type": "code", - "execution_count": null, "source": [ "if is_jupyter():\n", " # Record results with papermill for tests\n", @@ -494,26 +782,24 @@ " sb.glue(\"rsquared\", rating_eval.rsquared())\n", " sb.glue(\"train_time\", train_time.interval)\n", " sb.glue(\"test_time\", test_time.interval)" - ], - "outputs": [], - 
"metadata": {} + ] }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 15, + "metadata": {}, + "outputs": [], "source": [ "# cleanup spark instance\n", "spark.stop()" - ], - "outputs": [], - "metadata": {} + ] } ], "metadata": { "kernelspec": { - "display_name": "Python (reco_pyspark)", + "display_name": "Python (reco)", "language": "python", - "name": "reco_pyspark" + "name": "reco" }, "language_info": { "codemirror_mode": { @@ -525,9 +811,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.0" + "version": "3.8.0" } }, "nbformat": 4, "nbformat_minor": 1 -} \ No newline at end of file +} diff --git a/examples/02_model_collaborative_filtering/als_deep_dive.ipynb b/examples/02_model_collaborative_filtering/als_deep_dive.ipynb index 0d90bb65d4..aff9541b6e 100644 --- a/examples/02_model_collaborative_filtering/als_deep_dive.ipynb +++ b/examples/02_model_collaborative_filtering/als_deep_dive.ipynb @@ -2,31 +2,32 @@ "cells": [ { "cell_type": "markdown", + "metadata": {}, "source": [ "Copyright (c) Microsoft Corporation. All rights reserved.\n", "\n", "Licensed under the MIT License." - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "# Spark Collaborative Filtering (ALS) Deep Dive" - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "Spark MLlib provides a collaborative filtering algorithm that can be used for training a matrix factorization model, which predicts explicit or implicit ratings of users on items for recommendations.\n", "\n", "This notebook presents a deep dive into the Spark collaborative filtering algorithm." - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "## 1 Matrix factorization algorithm\n", "\n", @@ -53,11 +54,11 @@ "Owing to the term of $q_{i}^{T}p_{u}$ the loss function is non-convex. Gradient descent method can be applied but this will incur expensive computations. 
An Alternating Least Square (ALS) algorithm was therefore developed to overcome this issue. \n", "\n", "The basic idea of ALS is to learn one of $q$ and $p$ at a time for optimization while keeping the other as constant. This makes the objective at each iteration convex and solvable. The alternating between $q$ and $p$ stops when there is convergence to the optimal. It is worth noting that this iterative computation can be parallelised and/or distributed, which makes the algorithm desirable for use cases where the dataset is large and thus the user-item rating matrix is super sparse (as is typical in recommendation scenarios). A comprehensive discussion of ALS and its distributed computation can be found [here](http://stanford.edu/~rezab/classes/cme323/S15/notes/lec14.pdf)." - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "## 2 Spark Mllib implementation\n", "\n", @@ -66,28 +67,40 @@ "* The uniqueness of ALS implementation is that it distributes the matrix factorization model training by using \"Alternating Least Square\" method. \n", "* In the training method, there are parameters that can be selected to control the model performance.\n", "* Both explicit and implicit ratings are supported by Spark ALS model." - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "## 3 Spark ALS based MovieLens recommender\n", "\n", "In the following code, the MovieLens-100K dataset is used to illustrate the ALS algorithm in Spark." - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "**Note**: This notebook requires a PySpark environment to run properly. Please follow the steps in [SETUP.md](https://github.com/Microsoft/Recommenders/blob/master/SETUP.md#dependencies-setup) to install the PySpark environment." 
- ], - "metadata": {} + ] }, { "cell_type": "code", "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "System version: 3.8.0 (default, Nov 6 2019, 21:49:08) \n", + "[GCC 7.3.0]\n", + "Pandas version: 1.3.5\n", + "PySpark version: 3.2.0\n" + ] + } + ], "source": [ "# set the environment path to find Recommenders\n", "import sys\n", @@ -97,6 +110,8 @@ "import seaborn as sns\n", "import sys\n", "import pandas as pd\n", + "import warnings\n", + "warnings.simplefilter(action='ignore', category=FutureWarning)\n", "\n", "import pyspark\n", "from pyspark.sql import SparkSession\n", @@ -116,31 +131,24 @@ "print(\"System version: {}\".format(sys.version))\n", "print(\"Pandas version: {}\".format(pd.__version__))\n", "print(\"PySpark version: {}\".format(pyspark.__version__))" - ], - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "System version: 3.6.9 (default, Jan 26 2021, 15:33:00) \n", - "[GCC 8.4.0]\n", - "Pandas version: 1.1.5\n", - "PySpark version: 2.4.8\n" - ] - } - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "Data column names" - ], - "metadata": {} + ] }, { "cell_type": "code", "execution_count": 2, + "metadata": { + "tags": [ + "parameters" + ] + }, + "outputs": [], "source": [ "MOVIELENS_DATA_SIZE = \"100k\"\n", "\n", @@ -149,17 +157,13 @@ "COL_RATING = \"Rating\"\n", "COL_PREDICTION = \"prediction\"\n", "COL_TIMESTAMP = \"Timestamp\"" - ], - "outputs": [], - "metadata": { - "tags": [ - "parameters" - ] - } + ] }, { "cell_type": "code", "execution_count": 3, + "metadata": {}, + "outputs": [], "source": [ "schema = StructType(\n", " (\n", @@ -169,101 +173,101 @@ " StructField(COL_TIMESTAMP, LongType()),\n", " )\n", ")" - ], - "outputs": [], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "Model hyper parameters - these parameters are selected with reference to the benchmarking results 
[here](http://mymedialite.net/examples/datasets.html)." - ], - "metadata": {} + ] }, { "cell_type": "code", "execution_count": 4, + "metadata": {}, + "outputs": [], "source": [ "RANK = 10\n", "MAX_ITER = 15\n", "REG_PARAM = 0.05" - ], - "outputs": [], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "Number of recommended items" - ], - "metadata": {} + ] }, { "cell_type": "code", "execution_count": 5, + "metadata": {}, + "outputs": [], "source": [ "K = 10" - ], - "outputs": [], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "Initialize a Spark session." - ], - "metadata": {} + ] }, { "cell_type": "code", "execution_count": 6, - "source": [ - "spark = start_or_get_spark(\"ALS Deep Dive\", memory=\"16g\")" - ], + "metadata": {}, "outputs": [], - "metadata": {} + "source": [ + "spark = start_or_get_spark(\"ALS Deep Dive\", memory=\"16g\")\n", + "spark.conf.set(\"spark.sql.analyzer.failAmbiguousSelfJoin\", \"false\")" + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "### 3.1 Load and prepare data" - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "Data is read from csv into a Spark DataFrame." 
- ], - "metadata": {} + ] }, { "cell_type": "code", "execution_count": 7, - "source": [ - "dfs = movielens.load_spark_df(spark=spark, size=MOVIELENS_DATA_SIZE, schema=schema)" - ], + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stderr", + "output_type": "stream", "text": [ - "100%|██████████| 4.81k/4.81k [00:00<00:00, 20.5kKB/s]\n" + "100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4.81k/4.81k [00:08<00:00, 593KB/s]\n", + " \r" ] } ], - "metadata": {} + "source": [ + "dfs = movielens.load_spark_df(spark=spark, size=MOVIELENS_DATA_SIZE, schema=schema)" + ] }, { "cell_type": "code", "execution_count": 8, - "source": [ - "dfs.show(5)" - ], + "metadata": { + "scrolled": true + }, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "+------+-------+------+---------+\n", "|UserId|MovieId|Rating|Timestamp|\n", @@ -279,43 +283,45 @@ ] } ], - "metadata": { - "scrolled": true - } + "source": [ + "dfs.show(5)" + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "Data is then randomly split by 80-20 ratio for training and testing." - ], - "metadata": {} + ] }, { "cell_type": "code", "execution_count": 9, + "metadata": {}, + "outputs": [], "source": [ "dfs_train, dfs_test = spark_random_split(dfs, ratio=0.75, seed=42)" - ], - "outputs": [], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "### 3.2 Train a movielens model " - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "It is worth noting that Spark ALS model allows dropping cold users to favor a robust evaluation with the testing data. In case there are cold users, Spark ALS implementation allows users to drop cold users in order to make sure evaluations on the prediction results are sound." 
- ], - "metadata": {} + ] }, { "cell_type": "code", "execution_count": 10, + "metadata": {}, + "outputs": [], "source": [ "als = ALS(\n", " maxIter=MAX_ITER, \n", @@ -328,38 +334,56 @@ ")\n", "\n", "model = als.fit(dfs_train)" - ], - "outputs": [], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "### 3.3 Prediction with the model\n", "\n", "The trained model can be used to predict ratings with a given test data." - ], - "metadata": {} + ] }, { "cell_type": "code", "execution_count": 11, + "metadata": {}, + "outputs": [], "source": [ "dfs_pred = model.transform(dfs_test).drop(COL_RATING)" - ], - "outputs": [], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "With the prediction results, the model performance can be evaluated." - ], - "metadata": {} + ] }, { "cell_type": "code", "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "RMSE score = 0.9636472784840003\n", + "MAE score = 0.7508415106649321\n", + "R2 score = 0.2661953136123826\n", + "Explained variance score = 0.271522993322584\n" + ] + } + ], "source": [ "evaluations = SparkRatingEvaluation(\n", " dfs_test, \n", @@ -377,31 +401,70 @@ " \"Explained variance score = {}\".format(evaluations.exp_var()),\n", " sep=\"\\n\"\n", ")" - ], - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "RMSE score = 0.9726930349322086\n", - "MAE score = 0.7565710909806911\n", - "R2 score = 0.24411065820407096\n", - "Explained variance score = 0.249700271662727\n" - ] - } - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "Oftentimes ranking metrics are also of interest to data scientists. Note usually ranking metrics apply to the scenario of recommending a list of items. 
In our case, the recommended items should be different from those that have been rated by the users. " - ], - "metadata": {} + ] }, { "cell_type": "code", "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[Stage 477:> (0 + 1) / 1]\r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+------+-------+----------+\n", + "|UserId|MovieId|prediction|\n", + "+------+-------+----------+\n", + "| 1| 3| 2.6714876|\n", + "| 1| 7| 4.319914|\n", + "| 1| 9| 4.151373|\n", + "| 1| 20| 3.829036|\n", + "| 1| 33| 3.5674067|\n", + "| 1| 36| 1.1223706|\n", + "| 1| 43| 2.3707652|\n", + "| 1| 46| 3.959927|\n", + "| 1| 48| 4.6534643|\n", + "| 1| 50| 5.2104144|\n", + "| 1| 52| 4.1401386|\n", + "| 1| 63| 2.2337403|\n", + "| 1| 65| 3.5661814|\n", + "| 1| 84| 2.0204403|\n", + "| 1| 113| 4.335376|\n", + "| 1| 117| 3.3492455|\n", + "| 1| 118| 2.856406|\n", + "| 1| 119| 4.807717|\n", + "| 1| 161| 2.8039317|\n", + "| 1| 190| 3.963532|\n", + "+------+-------+----------+\n", + "only showing top 20 rows\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\r", + "[Stage 481:> (0 + 1) / 1]\r", + "\r", + " \r" + ] + } + ], "source": [ "# Get the cross join of all user-item pairs and score them.\n", "users = dfs_train.select(COL_USER).distinct()\n", @@ -420,46 +483,39 @@ " .select('pred.' + COL_USER, 'pred.' + COL_ITEM, 'pred.' 
+ \"prediction\")\n", "\n", "dfs_pred_final.show()" - ], + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, "outputs": [ { + "name": "stderr", "output_type": "stream", + "text": [ + "[Stage 560:> (0 + 2) / 2]\r" + ] + }, + { "name": "stdout", + "output_type": "stream", "text": [ - "+------+-------+----------+\n", - "|UserId|MovieId|prediction|\n", - "+------+-------+----------+\n", - "| 1| 587| 3.2763875|\n", - "| 1| 869| 1.996331|\n", - "| 1| 1208| 3.0924819|\n", - "| 1| 1677| 3.0549564|\n", - "| 2| 80| 2.2266486|\n", - "| 2| 303| 3.5071766|\n", - "| 2| 472| 2.4076686|\n", - "| 2| 582| 4.137449|\n", - "| 2| 838| 1.6214753|\n", - "| 2| 975| 2.7880914|\n", - "| 2| 1260| 3.155648|\n", - "| 2| 1325| 1.2494813|\n", - "| 2| 1381| 3.712147|\n", - "| 2| 1530| 2.04168|\n", - "| 3| 22| 2.5458775|\n", - "| 3| 57| 1.7472819|\n", - "| 3| 89| 3.85607|\n", - "| 3| 367| 3.2235723|\n", - "| 3| 1091| 1.5452085|\n", - "| 3| 1167| 3.5050836|\n", - "+------+-------+----------+\n", - "only showing top 20 rows\n", - "\n" + "Precision@k = 0.04379639448568401\n", + "Recall@k = 0.014286194686756822\n", + "NDCG@k = 0.03730295615527768\n", + "Mean average precision = 0.0034619726118607337\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\r", + " \r" ] } ], - "metadata": {} - }, - { - "cell_type": "code", - "execution_count": 14, "source": [ "evaluations = SparkRankingEvaluation(\n", " dfs_test, \n", @@ -478,23 +534,11 @@ " \"Mean average precision = {}\".format(evaluations.map_at_k()),\n", " sep=\"\\n\"\n", ")" - ], - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Precision@k = 0.03170731707317073\n", - "Recall@k = 0.012679519170565132\n", - "NDCG@k = 0.02914424248125332\n", - "Mean average precision = 0.0033674440032626088\n" - ] - } - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "### 3.4 Fine tune the model\n", "\n", @@ -507,47 +551,56 @@ "|`maxIters`|Maximum 
number of iterations|10|The more iterations the better the model converges to the optimal point.|\n", "\n", "It is always a good practice to start model building with default parameter values and then sweep the parameter in a range to find the optimal combination of parameters. The following parameter set is used for training ALS models for comparison study purposes." - ], - "metadata": {} + ] }, { "cell_type": "code", "execution_count": 15, + "metadata": {}, + "outputs": [], "source": [ "param_dict = {\n", " \"rank\": [10, 15, 20],\n", " \"regParam\": [0.001, 0.1, 1.0]\n", "}" - ], - "outputs": [], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "Generate a dictionary for each parameter combination which can then be fed into model training." - ], - "metadata": {} + ] }, { "cell_type": "code", "execution_count": 16, + "metadata": {}, + "outputs": [], "source": [ "param_grid = generate_param_grid(param_dict)" - ], - "outputs": [], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "Train models with parameters specified in the parameter grid. Evaluate the model with, for example, the RMSE metric, and then record the metrics for visualization." - ], - "metadata": {} + ] }, { "cell_type": "code", "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + } + ], "source": [ "rmse_score = []\n", "\n", @@ -577,166 +630,195 @@ "\n", "rmse_score = [float('%.4f' % x) for x in rmse_score]\n", "rmse_score_array = np.reshape(rmse_score, (len(param_dict[\"rank\"]), len(param_dict[\"regParam\"]))) " - ], - "outputs": [], - "metadata": {} + ] }, { "cell_type": "code", "execution_count": 18, + "metadata": {}, + "outputs": [], "source": [ "rmse_df = pd.DataFrame(data=rmse_score_array, index=pd.Index(param_dict[\"rank\"], name=\"rank\"), \n", " columns=pd.Index(param_dict[\"regParam\"], name=\"reg. 
parameter\"))" - ], - "outputs": [], - "metadata": {} + ] }, { "cell_type": "code", "execution_count": 19, - "source": [ - "fig, ax = plt.subplots()\n", - "sns.heatmap(rmse_df, cbar=False, annot=True, fmt=\".4g\")" - ], + "metadata": {}, "outputs": [ { - "output_type": "execute_result", "data": { "text/plain": [ "" ] }, + "execution_count": 19, "metadata": {}, - "execution_count": 19 + "output_type": "execute_result" }, { - "output_type": "display_data", "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXwAAAEGCAYAAABmXi5tAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuNCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8QVMy6AAAACXBIWXMAAAsTAAALEwEAmpwYAAAitElEQVR4nO3deXxU1d3H8c9vMlkhLAnIjmjBRxDRyiKIIFoFRFGkAloQcK2i4AIIWhXQtooIVn1wwaosPoJg2cSFpYWKCBIEAVksyKKBgLILJCSZnOePGQKBJKTKZEju9/16zYuZe86993dzX/nOzblnBnPOISIipZ8v0gWIiEjxUOCLiHiEAl9ExCMU+CIiHqHAFxHxCH+kCyjI3i5tNH2ohKo8Y0OkS5BfYf/gVpEuQX6FMk9PsoLadIUvIuIRCnwREY9Q4IuIeIQCX0TEIxT4IiIeocAXEfEIBb6IiEco8EVEPEKBLyLiEQp8ERGPUOCLiHiEAl9ExCMU+CIiHqHAFxHxCAW+iIhHKPBFRDxCgS8i4hEKfBERj1Dgi4h4hAJfRMQjFPgiIh6hwBcR8QgFvoiIRyjwRUQ8QoEvIuIRCnwREY9Q4IuIeIQCX0TEIxT4IiIeocAXEfEIBb6IiEf4I11AaZBw36NEN26B27+PA/1vP6k9uklL4m65A5yDQIDDY/+XwPrVRNWpS8LdD2PxCbicHDKmvkvWF/MBKPv0y1h8AgC+chXI3rieQyOeKNbjKs3atW3DqFFPE+Xz8fY7E3l+xOg87bVr1+DvY0ZRqXISe/fso2fvfmzblsZFF13A6FeeJbFcWQKBAM8+9wpTpswE4Mo2LRk+/EliYqJZvnw1d9/Tn0AgAMAVrVswcuQwoqP97N61h6uuvrnYj7k0iun0R/znXYI7dID00QNPao86vzExV3XFOQc5ATI/GU/O99/iO6cBMe175vbzVarOkSkvE1i/DN+5DYlp2x3MIDODI9New+3ZWZyHFTbmnIt0Dfna26XNmVlYPvz1G+Ey0inzwOP5Bj5x8ZCRDkBU7XMp88hQDjzUE1+1muAcOTu2YRWTKTd8DAce6oU7fDDP6mX6DyMrZRGZn80phqP59SrP2BDpEgrl8/lYt2Yh7TvcSmpqGksWf0yP2/qwbt2xuidNfIOPPp7HhAlTuLJNS3r16kbv2/tRr965OOfYuHEz1apVYemST2jYqA0HDvzMpo1Ladu+Gxs2bGLokAFs3ZrKO2MnUb58ORZ+NoPrru/ODz9sp3LlZH76aXcEfwKF2z+4VaRLKDLf2edDZgaxne/PN/CJiYXMIwBYldrEdX2Q9Ff65+0TX4aEB1/i8Mg+kJVJfL8XyXhvBG7XdvxNr8FXsy6Z014rhqM5Pco8PckKatOQzmmQvW4V7uDPBXcIhT0AcXHBK30gJy2VnB3bAHB7d5Ozfy9WrnzedeMT8De8hMyUz0932Z7VrOlv+e67LWze/D1ZWVlMnjyDGzq2y9Onfv16zJ+/CID5CxZxQ8e
2AGzYsImNGzcDkJa2kx9/2k3lyskkJ1ckMzOTDRs2ATBv3md0vqkDALfechPTp3/CDz9sBzijw76kydm6Hpd+qOAOobAHsJjYfLv4GzQnsOFryMoMLXFYXPCva4tLwP289zRVG3kK/GIS3exyyv1tPGUfe45Drw0/qT2q7vmYP5qcndvzLI9pejnZ3yyH9MPFVWqpV71GVX5IPfZzTt2WRvXqVfP0WbVqLTd1uhaATp2upVy5RJKSKubp07TJxcTERPPdd1vYtWsPfr+fxpc0AqBz5+uoWas6APXqnUuFCuX559wpfLnkE3r00HBOcYqq35T4viOJ6z6II9NfP6ndf2ELsld/kfv6yIwxxPUYRHz/0fgvakXWwhnFWW5YKfCLSdbSzznwUE8OPf8E8d3uzNNmFZIo0/dxDr06PPfq/6iYy39H5uf/LM5SBXh00DO0bt2clKWzad2qOampabnj8QBVq57F2LEvc9ddj3B0WLR7jz6MfGEoixfN4uDBQwQCOQD4/VE0vqQRHW/sSYfr/sCfHnuIevXOjchxeVFgXQrpr/QnY+ILxFzVNU+bla2Ar0ptAhtX5i6LbtGBjHeHkz7yfrJXLCCm/W3FXXLYhCXwzay8mT1nZuvNbI+Z7TazdaFlFQpZ7x4zW2Zmy8Zu2l5QtxIte90qfFWqYYmhoZv4BMo+9hzpE98isGFtnr6WWJ6ouueTtXxJBCotvbZv20GtmtVzX9esUY3t23fk6ZOWtpMuXe+mabN2PPlU8C+y/fsPAJCYWJaZM8bz5FPD+XLp8tx1lnz5FW2u6kyLltezcOGS3OGdbdvSmDN3AYcPp7N7914Wfr6ERo0ahPsw5QQ5W9djFc+ChMTcZVENW5C9LgVyQm/mCYn4qp5NTupGALK/WUxUrfMiUW5YhOsKfzKwF2jjnEtyziUDV4aWTS5oJefcGOdcE+dck97nVi+oW4njq1oj93nUOfWw6Gjcz/vB76fswGfI/Pccspb8+6T1optfQdZXi48bW5TTIWXZ19Stew516tQiOjqarl1v5MNZeW+IJydXxCx472vwoL6MHTcJgOjoaP4x5S3effcDpk79KM86lSsnAxATE8PAAfczZswEAGZ+OJuWlzUjKiqK+Pg4mjX7LevXn9k3tksLS6qS+9xXrQ74o+Hwsftt/gsvI3v1omMrZBzCYuOx5GoARP2mETk/bSuucsMuXNMy6zjn8gxUO+d2AMPN7I4w7TNiyjz4JP4LLsYSy1P+9SmkT34HooI/2sy5M4m+tDWxV7TFBQKQeYSDLz4NQEyLK/HXvwhLLE/Mle0BODz6OQJbglcXMS2vImP6e5E5qFIsEAjw4ENP8PFH7xHl8zF23PusXfsfhg4ZwLKvVjJr1lyuuOIy/vLMYzgcCxcuoW+/PwHQpUtHWrW6lKTkivTsGRweuPOuh1m5cg0DHrmPDtddjc/n4403xjN/QTBI1q/fyOw581mxfB45OTm8/fZE1qz5NmLHX5rE3twX3zkNsIRE4vuPJmv+B+CLAiB72Tz8DS7Ff3Gr4O9ediZHJr+Uu65VqIyVTyZny7pjG8zJ4cjMN4m75eHgUF36oXzH/UuqsEzLNLM5wDxgnHNuZ2hZFaA3cI1z7upTbaMkTcuUvM70aZlSuJI0LVNOFolpmd2AZODfoTH8PcACIAnoEqZ9iohIIcIypOOc2wsMCj3yMLPbgXfCsV8RESlYJKZlDovAPkVEPC8sV/hmtqqgJqBKAW0iIhJG4ZqlUwVoR3Aa5vEM+OLk7iIiEm7hCvxZQFnn3NcnNpjZgjDtU0REChGum7Z3FtL2h3DsU0RECqfv0hER8QgFvoiIRyjwRUQ8QoEvIuIRCnwREY9Q4IuIeIQCX0TEIxT4IiIeocAXEfEIBb6IiEco8EVEPEKBLyLiEQp8ERGPUOCLiHiEAl9ExCMU+CIiHqHAFxHxCAW+iIhHKPBFRDxCgS8i4hEKfBERj1Dgi4h4hAJfRMQjFPg
iIh7hj3QBBSk7+s1IlyC/1Iw2ka5ARPKhK3wREY9Q4IuIeIQCX0TEIxT4IiIeocAXEfEIBb6IiEco8EVEPEKBLyLiEQp8ERGPUOCLiHiEAl9ExCMU+CIiHqHAFxHxCAW+iIhHKPBFRDxCgS8i4hEKfBERj1Dgi4h4hAJfRMQjFPgiIh6hwBcR8QgFvoiIRyjwRUQ8QoEvIuIRCnwREY9Q4IuIeIQCX0TEI4oU+GYWm8+ypNNfjoiIhEtRr/Cnmln00RdmVg2YG56SREQkHIoa+NOByWYWZWZ1gNnAY+EqSkRETj9/UTo55940sxiCwV8H+KNz7osw1iUiIqdZoYFvZo8c/xKoDXwNNDez5s65UWGsTURETqNTXeEnnvB6agHLPe2JZ//GZ1+kkFSxPNPHv3pS+9IVq+j32J+pUa0KAFe3voz7br81tz0QCNDt7oc5q1Iyrz4/BIBBT49gzfqN+P1RNKx/HkMGPkC0v0h/kEkRtGvbhlGjnibK5+Ptdyby/IjRedpr167B38eMolLlJPbu2UfP3v3Yti2N2rVr8MGUt/D5fERH+xk9+h3GvDkBgEt+eyFvvfUi8XFxfPLpv3j4kacAGP7sE1x3/TVkZmayadNW7rzrEfbvP1Dsx1waxXT6I/7zLsEdOkD66IEntUed35iYq7rinIOcAJmfjCfn+2/xndOAmPY9c/v5KlXnyJSXCaxfhu/chsS07Q5mkJnBkWmv4fbsLM7DChtzzkW6hnxl/bjhzCwsH8u+/oaE+Dge/8uoAgN/7MRpuWF+onGTprHm240cPHQ4t89ni1No1bwJAI8OG0Hjixpyy00dwncQp1F8zTaRLqFQPp+PdWsW0r7DraSmprFk8cf0uK0P69ZtyO0zaeIbfPTxPCZMmMKVbVrSq1c3et/ej+joaMyMzMxMypRJYOWKf9HqihtJS9vJ4kWzeOjhp/hy6XJmzZzA/45+m09nz+eaq1vzr/mLCAQCPPvXxwF47PG/RurwT2n/4FaRLqHIfGefD5kZxHa+P9/AJyYWMo8AYFVqE9f1QdJf6Z+3T3wZEh58icMj+0BWJvH9XiTjvRG4XdvxN70GX826ZE57rRiO5vQo8/QkK6itqNMyzzOzMWY2x8z+dfRx+kos2Zpc3JDy5X7ZHz07ftzFZ4tT+P31bfMsb92iKWaGmXFh/fPY+dOu01GqAM2a/pbvvtvC5s3fk5WVxeTJM7ihY7s8ferXr8f8+YsAmL9gETd0DJ6frKwsMjMzAYiNjcXnC/4KVa16FonlEvly6XIAJvzfB9xwQ3sA5s77jEAgAMCSL5dTo0a18B+kR+RsXY9LP1Rwh1DYA1jMSbPLAfA3aE5gw9eQlRla4rC4hOA6cQm4n/eepmojr6izdKYAK4AngIHHPaSIVq5ZT+feD3DvgCFs3Lw1d/nwl8fwSJ87MF/+b8pZ2dl8OHs+l196SXGVWupVr1GVH1K3575O3ZZG9epV8/RZtWotN3W6FoBOna6lXLlEkpIqAlCzZnWWfzWXLZtSGPHCaNLSdlKjelW2pablrr8tNY0aJ2wT4Pbet/Dp7PnhOCwpQFT9psT3HUlc90Ecmf76Se3+C1uQvfrYHJQjM8YQ12MQ8f1H47+oFVkLZxRnuWFV1MDPds695pxb6pz76ujjdBdjZveY2TIzW/b38ZNO9+YjpsF5dZk75W2mjv1f/vD76+n3+J8BWLBoKUkVK3DB/9QtcN0/j3yVxhdfQOOLGhZXuQI8OugZWrduTsrS2bRu1ZzU1LTcq/TU1O1c0vga/qd+S3re1oWzzqpUpG0+Nrgf2dnZvPfe1FN3ltMmsC6F9Ff6kzHxBWKu6pqnzcpWwFelNoGNK3OXRbfoQMa7w0kfeT/ZKxYQ0/624i45bIoa+B+aWR8zq2ZmSUcfBXU2s/bHPS9vZm+Z2Soze8/MqhS0nnNujHOuiXOuyV09b/kvDuPMVrZMAgkJ8UBwqCY7O8DefftZsXotCxZ9Sds
udzBw6PMsXb6KQU+/kLveq++8x959B3j0gbsiVXqptH3bDmrVrJ77umaNamzfviNPn7S0nXTpejdNm7XjyaeGA5x0ozUtbSffrPmWyy+/lG3bd1Cj5rGhmho1q7HtuG32vK0r13W4mtt6PhCOQ5IiyNm6Hqt4FiQcG36NatiC7HUpkBN8MychEV/Vs8lJ3QhA9jeLiap1XiTKDYuiBn4vgkM4XwBfhR7LCul//B2pkUAa0BFIAd7478ss2Xbt3svRm+Or135LTo6jQvlyPHxvb/45dRxzprzNiKGP0uySRgx/agAAH3w4m0VLl/P80IG548RyeqQs+5q6dc+hTp1aREdH07XrjXw4a06ePsnJFTELDrMNHtSXseOCf3HWqFGNuLg4ACpUKE/Lls34z3++Y8eOH/n5wM9c2iw49HZb95v58MPZQHBG0IAB99Gpc2/S0zOK6zAFsKRj15e+anXAHw2Hf85d5r/wMrJXLzq2QsYhLDYeSw6+eUf9phE5P20rrnLDrqgfvDrnV+yjiXPu4tDzF82s16/Y1hlp4NDnSVmxmn37D/C7zr3oc0d3srOzAejWqQNzFnzO+9M/ISrKR1xsLCOGPpobJgV5ZuRoqlU5i+73Bt8ATpzKKb9cIBDgwYee4OOP3iPK52PsuPdZu/Y/DB0ygGVfrWTWrLlcccVl/OWZx3A4Fi5cQt9+fwKg/vl1ef75p3AuOGtv1KjX+eab9QA80Pfx3GmZn86ezyefBuc1vPS3PxMbG8unnwTfNL78cjn3PzA4MgdfysTe3BffOQ2whETi+48ma/4H4IsCIHvZPPwNLsV/cStcIADZmRyZ/FLuulahMlY+mZwt645tMCeHIzPfJO6Wh4MXaemH8h33L6mKPC3TzBoCDYC4o8ucc+ML6JsKjCL4Ya37gd+40I7MbJVzrtGp9leSpmVKXmf6tEwpXEmaliknK2xaZpGu8M1sCNCGYOB/DFwLfA7kG/jAmxz7cNY4oBLwk5lVJfhJXRERKWZF/ejmzcBFwArn3O2hG6/vFtTZOTesgOU7zExz0kREIqCodwMznHM5QLaZlQN+BGr9wn3m+2YgIiLhdcorfAveXVxlZhUIDtV8BRwEFheyzqqCmoACp2WKiEj4nDLwnXPOzJo55/YBr5vZp0A551xBoQ7BUG8HnPiZZCM4tVNERIpZUcfwl5tZU+dcinNuSxH6zwLKOue+PrHBzBYUvTwRETldihr4lwLdzWwrcIjglboraHqlc+7OgjbknPvDf12liIj8akUN/Han7iIiImeyon7Sduupe4mIyJlMX9IiIuIRCnwREY9Q4IuIeIQCX0TEIxT4IiIeocAXEfEIBb6IiEco8EVEPEKBLyLiEQp8ERGPUOCLiHiEAl9ExCMU+CIiHqHAFxHxCAW+iIhHKPBFRDxCgS8i4hEKfBERj1Dgi4h4hAJfRMQjFPgiIh6hwBcR8Qh/pAsokC8q0hWIiJQqusIXEfEIBb6IiEco8EVEPEKBLyLiEQp8ERGPUOCLiHiEAl9ExCMU+CIiHqHAFxHxCAW+iIhHKPBFRDxCgS8i4hEKfBERj1Dgi4h4hAJfRMQjFPgiIh6hwBcR8QgFvoiIRyjwRUQ8QoEvIuIRCnwREY9Q4IuIeIQCX0TEIxT4IiIeocAXEfEIBb6IiEco8EVEPEKBLyLiEQp8ERGPUOCLiHiEAl9ExCP8kS6gNHjir6P4bNFSkipWYPq7r5/UvnT5KvoNHkaNalUBuPqKy7jvju657YFAgG539uOsypV4dcQwAJxzvDxmHHPmf47P56PbTdfRo8uNxXNAHtCubRtGjXqaKJ+Pt9+ZyPMjRudpr127Bn8fM4pKlZPYu2cfPXv3Y9u2NC666AJGv/IsieXKEggEePa5V5gyZSYAV7ZpyfDhTxITE83y5au5+57+BAIB+j9yL7fe2hkAvz+K+ufXo2r1Ruzdu6+4D7vUien0R/znXYI7dID00QNPao86vzExV3X
FOQc5ATI/GU/O99/iO6cBMe175vbzVarOkSkvE1i/DN+5DYlp2x3MIDODI9New+3ZWZyHFTbmnIt0DfnK2rXpzCwsH8u+Xk1CfDyPP/NCgYE/duI/csP8ROMmTWXN+g0cPHQ4t8+0j+awdPkq/vKnR/D5fOzeu4/kihXCeRinTXz1VpEuoVA+n491axbSvsOtpKamsWTxx/S4rQ/r1m3I7TNp4ht89PE8JkyYwpVtWtKrVzd6396PevXOxTnHxo2bqVatCkuXfELDRm04cOBnNm1cStv23diwYRNDhwxg69ZU3hk7Kc++r7/uGh7sdzfXtOta3IddZPsHn9nn73i+s8+HzAxiO9+fb+ATEwuZRwCwKrWJ6/og6a/0z9snvgwJD77E4ZF9ICuT+H4vkvHeCNyu7fibXoOvZl0yp71WDEdzepR5epIV1KYhndOgycUXUr5c4i9ad8ePP/HZF0v5fcd2eZa/P+0j7rv9D/h8wVNUUsK+JGjW9Ld8990WNm/+nqysLCZPnsENJ/z869evx/z5iwCYv2ARN3RsC8CGDZvYuHEzAGlpO/nxp91UrpxMcnJFMjMz2bBhEwDz5n1G55s6nLTvbt1uZNL708N4dN6Ss3U9Lv1QwR1CYQ9gMbH5dvE3aE5gw9eQlRla4rC4hOA6cQm4n/eepmojT4FfTFZ+s47Ovfpwb/8n2bhpa+7y4S+9wSN97sQs76n4YVsan/zz33S9ox/39n+SrT9sK+6SS63qNaryQ+r23Nep29KoXr1qnj6rVq3lpk7XAtCp07WUK5dIUlLFPH2aNrmYmJhovvtuC7t27cHv99P4kkYAdO58HTVrVc/TPz4+jnZt2zB12sfhOCwpQFT9psT3HUlc90EcmX7yX+D+C1uQvfqL3NdHZowhrscg4vuPxn9RK7IWzijOcsMqLIFvZuXN7DkzW29me8xst5mtCy2rEI59nska/M9vmPuPcUwd9yp/+H1H+j32NAALFn1JUsUKXHB+vZPWyczKIjYmhslvv8zvO7bnyb++WNxle9qjg56hdevmpCydTetWzUlNTSMQCOS2V616FmPHvsxddz3C0WHR7j36MPKFoSxeNIuDBw8RCOTk2eb117fli8XLNHZfzALrUkh/pT8ZE18g5qq8Q2lWtgK+KrUJbFyZuyy6RQcy3h1O+sj7yV6xgJj2txV3yWETriv8ycBeoI1zLsk5lwxcGVo2uaCVzOweM1tmZsv+Pn5imEorfmXLlCEhIR6A1pc1Izs7m7379rNi1VoWfL6Etr/vxcAhz7H0q5UMGvY8AFUrV+LqK1oCwZu8//luc8TqL222b9tBrZrHrr5r1qjG9u078vRJS9tJl65307RZO558ajgA+/cfACAxsSwzZ4znyaeG8+XS5bnrLPnyK9pc1ZkWLa9n4cIlucM7R3XreoOGcyIoZ+t6rOJZkHBs+DWqYQuy16VATujNPCERX9WzyUndCED2N4uJqnVeJMoNi3AFfh3n3HDnXO5vkXNuh3NuOHB2QSs558Y455o455rc1fPWMJVW/Hbt3pN7Fbh67bfkOEeF8uV4+L7b+ef0d5nzj3GMGDaYZo0vYviQRwG4qnULli4PXnWkrFjN2bVqRKz+0iZl2dfUrXsOderUIjo6mq5db+TDWXPy9ElOrohZ8N7X4EF9GTsuePM1Ojqaf0x5i3ff/YCpUz/Ks07lyskAxMTEMHDA/YwZMyG3rVy5RFq3as7MmbPDeWhyAkuqkvvcV60O+KPh8M+5y/wXXkb26kXHVsg4hMXGY8nVAIj6TSNyfio9w6nhmpa51cweBcY553YCmFkVoDfwQ5j2GTEDhzxHyopV7Nt3gN916kGfO28jOzsbgG43Xcec+Z/z/rSPiPJHERcTw4hhg3PDpCB39ujKoGHPM+H96STExzFs8EPFcCTeEAgEePChJ/j4o/eI8vkYO+591q79D0OHDGDZVyuZNWsuV1xxGX955jEcjoULl9C3358A6NK
lI61aXUpSckV69gwOD9x518OsXLmGAY/cR4frrsbn8/HGG+OZv+BYkHS68VrmzvuMw4fTI3LMpVXszX3xndMAS0gkvv9osuZ/AL4oALKXzcPf4FL8F7fCBQKQncmRyS/lrmsVKmPlk8nZsu7YBnNyODLzTeJueTh4kZZ+KN9x/5IqLNMyzawiMBi4EagCOGAnMBMY7pzbc6ptlKRpmZLXmT4tUwpXkqZlyskKm5YZlit859xeM3sHmAsscc4dPNpmZu2BT8OxXxERKVi4Zun0A2YADwDfmNnxHxH9azj2KSIihQvXGP7dQGPn3EEzqwN8YGZ1nHMvAYUPXouISFiEK/B9R4dxnHNbzKwNwdA/GwW+iEhEhGta5k4zu/joi1D4Xw9UAi4M0z5FRKQQ4Qr8nkCeT7I457Kdcz2B1mHap4iIFCJcs3RSC2lbVFCbiIiEj748TUTEIxT4IiIeocAXEfEIBb6IiEco8EVEPEKBLyLiEQp8ERGPUOCLiHiEAl9ExCMU+CIiHqHAFxHxCAW+iIhHKPBFRDxCgS8i4hEKfBERj1Dgi4h4hAJfRMQjFPgiIh6hwBcR8QgFvoiIRyjwRUQ8QoEvIuIRCnwREY9Q4IuIeIQCX0TEI8w5F+kaPMnM7nHOjYl0HfLL6PyVXF4+d7rCj5x7Il2A/Co6fyWXZ8+dAl9ExCMU+CIiHqHAjxxPjiGWIjp/JZdnz51u2oqIeISu8EVEPEKBLyLiEQr808DM2pvZt2a20cwG59Mea2bvh9q/NLM6x7U9Flr+rZm1O27522b2o5l9U0yHIScownltbWbLzSzbzG6ORI2Sv1P9/ljQy6Fzu8rMLinuGiNBgf8rmVkUMBq4FmgA3GpmDU7odiew1zlXF3gRGB5atwFwC3AB0B54NbQ9gLGhZRIBRTyv3wO9gfeKtzopgrEU/vtzLVAv9LgHeK0Yaoo4Bf6v1wzY6Jzb5JzLBCYBN57Q50ZgXOj5B8DvzMxCyyc554445zYDG0Pbwzn3GbCnOA5A8nXK8+qc2+KcWwXkRKJAKVgRfn9uBMa7oCVABTOrVjzVRY4C/9erAfxw3OvU0LJ8+zjnsoH9QHIR15XI0Lkp3Tx5fhX4IiIeocD/9bYBtY57XTO0LN8+ZuYHygO7i7iuRIbOTenmyfOrwP/1UoB6ZnaOmcUQvAk784Q+M4Feoec3A/9ywU+8zQRuCc3iOYfgDaSlxVS3FK4o51VKrplAz9BsnebAfudcWqSLCjd/pAso6Zxz2Wb2ADAbiALeds6tMbOngWXOuZnAW8AEM9tI8EbSLaF115jZZGAtkA3c75wLAJjZRKANUMnMUoEhzrm3ivnwPKso59XMmgLTgIpARzMb5py7IIJlS0h+vz9ANIBz7nXgY6ADwYkSh4HbI1Np8dJXK4iIeISGdEREPEKBLyLiEQp8ERGPUOCLiHiEAl9ExCMU+CJnIDN7PNI1SOmjaZlSYoS+cM6cc2fEl5WZWdTRz02EYdsHnXNlz5R6pHTQFb6c0cysTug76ccD3wC1zGygmaWEvsd82HF9nwz1/dzMJprZgFNsu7eZzTCzBWa2wcyGHNc23cy+MrM1ZnbPccsPmtlIM1sJtDCzp0K1fGNmY0JvSoS2+aKZLTOzdWbW1Mymhvbz5+O218PMlprZ12b2hplFmdlzQHxo2f8V1C+/ek7LD11KL+ecHnqcsQ+gDsGvH24eet2W4H9CbQQvWGYBrYGmwNdAHJAIbAAGnGLbvYE0gt9cGk/wDaVJqC0p9O/R5cmh1w7oetw2ko57PgHoGHq+ABgeev4gsB2oBsQS/GbGZKA+8CEQHer3KtAz9PzgcdstrF+eevTQo7CHvlpBSoKtLvid5RAM/LbAitDrsgS/gygRmOGcywAyzOzDIm57rnNuN4CZTQUuB5YB/czsplCfWqF97AYCwD+OW/9KM3sUSACSgDUEwxmOfffOamCNC31Xi5ltCm3zcqAxkBL6wyAe+DGfGn9
XSL8T6xEpkAJfSoJDxz034Fnn3BvHdzCzh37htk+8ieXMrA1wNdDCOXfYzBYQ/MsBIMMd+76jOIJX202ccz+Y2dDj+gEcCf2bc9zzo6/9oWMZ55x77BQ1FtYvtx6RU9EYvpQ0s4E7zKwsgJnVMLOzgEUEv8AsLtR2fRG3d42ZJZlZPNAptJ3yBP9LysNmdj7QvIB1j4b7rtA+/9v/1/afwM2h+gnVcXaoLcvMoovQT6TIdIUvJYpzbo6Z1QcWh4Y3DgI9nHMpZjYTWAXsJDiMsh/AzO4Nrft6PptcSnBIpCbwrnNumZmtBu41s3XAt8CSfNbDObfPzN4kOMa/g+BXKv83x7LWzJ4A5piZD8gC7ge2ErxPscrMljvnuhfST6TINC1TSg0zK+ucO2hmCcBnwD3OueWF9O9NcDjmgeKqUSSSdIUvpckYM2tAcKhlXGFhL+JFusIXEfEI3bQVEfEIBb6IiEco8EVEPEKBLyLiEQp8ERGP+H8bcalIQKGLvQAAAABJRU5ErkJggg==", + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXwAAAEGCAYAAABmXi5tAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/YYfK9AAAACXBIWXMAAAsTAAALEwEAmpwYAAAi60lEQVR4nO3de5xN9f7H8ddn7z0XlyEz5C45dTouHZVLcgqdnFAJEemUSHeFyiWnoptEN+midBLRjVKEilOc5JfGpTiik2unGUNyZ4yZ2fv7+2Nvk8GMqezZmfV+Ph770V7f9V1rfb5W+z1rvntZzDmHiIiUfL5YFyAiIsVDgS8i4hEKfBERj1Dgi4h4hAJfRMQjArEuoCA7u1+o24dOYBXf/S7WJcivtGtg81iXIL9BmeFTraB1usIXEfEIBb6IiEco8EVEPEKBLyLiEQp8ERGPUOCLiHiEAl9ExCMU+CIiHqHAFxHxCAW+iIhHKPBFRDxCgS8i4hEKfBERj1Dgi4h4hAJfRMQjFPgiIh6hwBcR8QgFvoiIRyjwRUQ8QoEvIuIRCnwREY9Q4IuIeIQCX0TEIxT4IiIeocAXEfEIBb6IiEco8EVEPEKBLyLiEQp8ERGPUOCLiHiEAl9ExCMCsS6gJCh18yDizm6G272TPYOuP2J9oNFfKNW1F4QcLhRk/2vPEfzvSvyn/IFS19+JlS4DoSBZ771OzqJ5edsldu1NXLOWEApxYO4Msj+eVpzDKtHaXNyKp556CL/Px/hX32TU48/nW1+rVnX+Oe4pKlZKZsf2nfTo2Zf09AwaNqzP88+OIKlcWYLBICMee5apU2fkbffwQ4Pp3PkygsEgL730Gs89P57u3TsxcMBtmBl79+yjzx1DWLFiVXEPuUSKv+JWAmc0wu3bxf4xdx+x3l+3MfGtr8I5B6Eg2bMmEPr+WwCsfEUSOt2ClU8BIGvio7idW/HVaUB8u2sxf4Bg+nqy3xsLoVCxjitaFPjHQfa/PyL74/cofduQo67PXbmUPUsXAuCrVYcyfYexZ8B1uAMHyBw7gtDmdKxCCknDXyJ3RSoucx/xLdviSzmZPXdfB85h5U4qxhGVbD6fjzHPDKftJd1JS8tg0Rez+WDmHFavXpPXZ9TIoUx6/R0mTZrKha3+wvBHhtCzV18yM/fT8/p+rF27gapVK5O66EPmzJnPrl27ua5HV2rUqEb9Bi1wzlGpUjhINm74gb9e1IWdO3fRts2FvPjCSJqf3z5Wwy9RcpfNJ3fRRyR0uf2o64PrVrJ/9QAArHItErvfxf7R/QFI6HI72fOnEVq3AuITwYXAjITOfcga/xBuWwZxF3UjcHYrcpd+Wkwjii5N6RwHwW9X4PbuLrjDgay8t5aQCDgAQpvTCG1OB8Dt2IbbvTMv2ONbX07WtIngwn3d7p3RKN2TmjY5m3XrNrJhw//IyclhypTp
XN6+Tb4+deuezrx54R/S8+Yv5PL2FwOwZs161q7dAEBGxhZ+3LotL9hvubkHjwx/Onw1CWzdug2ALxYtYefOXQAs+nIZ1atXjf4gPSK0cTUuc2/BHbIP+ezFJ+Z9nqxSDfD5w2F/sF9ONpRKgmAublsGAMG1y/HXPzdq9Re3qAS+mZU3s8fM7Fsz225m28xsdaTtpGgc8/curvH5JD0xkTKDRpD50qgj1vv/8CcIBAht2QSAr3I14s67kLLDX6TM4MfwVale3CWXWNWqV+GHtE15y2npGVSrViVfnxUrVtGpYzsAOnZsR7lySSQnV8jXp0njs4iPj2Pduo0A1KlTm65XXs6iL2Yzc8YkTjvt1COOfX2vq/jo43lHtEv0+Os1pVT/0ST2GMKBaWMB8FWsisvaR8LVA0jsM4q4tteC+SBzN/j8+KrXASDQ4Dx85SvGsvzjKlpX+FOAHUAr51yycy4FuDDSNiVKx/xdy1nyOXsGXMe+J+8n8cr88/x2UjKlbxtC5osjf74CiYuHnGz23nsL2Z/OovTNg2JRtmcNGvwwLVo0Y3Hqx7S4oBlpaRkEg8G89VWqnMyECWO44Ya78q7oExLiyco6QLPzLuGf49/gn+OezLfPVi2b06tXd4b849FiHYvXBVelsn90f7JeH0V8627hRp8ff+26ZH/4Gllj78FX4WQC57QC4MDbo4m/pCeJt47AHdiPcyVj/h6iF/i1nXMjnXObDzY45zY750YCpxS0kZndZGZLzGzJhLWbCup2Qgt+uwLfyVWxpHLhhlKlKTNoBFlvv0Jw7eq8fqFtW8lJXQBAzuIF+GvViUW5JdKm9M3UrFEtb7lG9aps2rQ5X5+MjC1c2fVGmjRtw/1DRwKwa1d42i4pqSwzpr/G/UNH8mXqsrxt0tIzeO/92QC8//6HnHlm3bx1Z55Zl5defJwrOl/P9u07ojY2KVho42osuTKUTsLt3kYoYyNux48QChFcvRhftfBvZKEfviPr5aFkjR1CaOMq3E8lJ4uiFfjfm9kgM6t8sMHMKpvZYOCHgjZyzo1zzjV2zjXueVq1grqdcHyVfx6Lv/bpWFwcbs9u8Acoc9fD5CyYQ07qZ/m2yVnyOYH6ZwMQqNuQYEZasdZcki1e8jWnnXYqtWvXJC4ujq5dO/DBzDn5+qSkVMDMALhn8B1MmPgWAHFxcbw79RUmT36HadNm5dtmxoyPaNWyOQAtW5zHd2vWA1CzZjWmvv0yPXv1Y02kTYqHJf88VeerdioE4iBzD6G0dZBYGkqHL7x8dRoQ+jHyGSsTuRjzB4hr0ZGc1LnFXXbUROsunW7APcC/zezkSNsWYAZwZZSOGTOl77iPQN2zsKTylHtuClnvTICAH4Dsf31AXNMWxLdoA7m5uOwD7BvzEABx57Ui8Kc/4ytbjvgWbQHIfPExgt+v48CMNyh9+30ktOuCy9pP5rgnYjW8EicYDNKv/33MnvUGfp+PCRPfZtWq73hg2ACWLF3OzJlzadmyOcMfHoLDsWDBIu7oey8AV17ZngsuOJfklAr06NEVgN433Mny5d8wctTzTJr4HP363ci+vZncfMtAAO67905SUirw7LPhqZzc3FyanXdJbAZfwiR07YevTn2sdBKlBr1IzidTwB/+7OWmziVQ/1wCZ7fEhYKQk82Bt54Ob+hCZH84iVK9hwJGcNN6cpd8AkDcBR0InHEOmI+c1I8JrV8Zo9Edf3Zw/rHYDmjWyzn36rH67ex+YfEWJsdVxXe/i3UJ8ivtGtg81iXIb1Bm+FQraF0sbst8MAbHFBHxvKhM6ZjZioJWAZULWCciIlEUrTn8ykAbwrdhHsqA/4vSMUVEpBDRCvyZQFnn3NeHrzCz+VE6poiIFCIqge+c613IuqujcUwRESmcnqUjIuIRCnwREY9Q4IuIeIQCX0TEIxT4IiIeocAXEfEIBb6IiEco8EVEPEKBLyLiEQp8ERGPUOCLiHiEAl9ExCMU+CIiHqHA
FxHxCAW+iIhHKPBFRDxCgS8i4hEKfBERj1Dgi4h4hAJfRMQjFPgiIh6hwBcR8QgFvoiIRyjwRUQ8QoEvIuIRCnwREY9Q4IuIeEQg1gUUJHHw3bEuQX4De/fmWJcgIofRFb6IiEco8EVEPEKBLyLiEQp8ERGPUOCLiHiEAl9ExCMU+CIiHqHAFxHxCAW+iIhHKPBFRDxCgS8i4hEKfBERj1Dgi4h4hAJfRMQjFPgiIh6hwBcR8QgFvoiIRyjwRUQ8QoEvIuIRCnwREY9Q4IuIeIQCX0TEIxT4IiIeocAXEfEIBb6IiEco8EVEPKJIgW9mCUdpSz7+5YiISLQU9Qp/mpnFHVwws6rA3OiUJCIi0VDUwH8fmGJmfjOrDXwMDIlWUSIicvwFitLJOfeymcUTDv7awM3Ouf+LYl0iInKcFRr4ZnbXoYtALeBroJmZNXPOPRXF2kRE5Dg61hV+0mHL0wpo97ShY9/is2WrSS5XlmlPDiyw38q1/6PH/c8yst81/K1ZQwCefn0mC5atBuCmzq1p2/xsAO5/4U2WrFpPUulEAB667Sr+VLt6lEfiHRdf3IqnnnoIv8/H+Fff5PHHn8+3vlat6rw87ikqVUpm+/adXNezL+npGTRsWJ/nnh1BUrmyhIJBRjz2LFOnzgCgVau/MGrk/cTFx/HVsv9w4013EwwG8/bZuFFDFiyYwd+vuY1p02YV63hLqvgrbiVwRiPcvl3sH3P3Eev9dRsT3/oqnHMQCpI9awKh778FwMpXJKHTLVj5FACyJj6K27kVX50GxLe7FvMHCKavJ/u9sRAKFeu4oqXQwHfOPVhchZzIOrRsQvc253Pv828W2CcYCjH6jVmc9+c/5rV9tmwV325IY8qou8jOyeWGB8dy/ll1KRsJ+buuuSzvB4McPz6fjzHPDKfdJd1JS8tg0RezmTlzDqtXr8nrM3LkUCa//g6TJk2lVau/MPyRIfTs1ZfMzP30ur4fa9duoGrVyny56EPmzJnP7t17GP/KaNq07caaNesZNmwAPa69klcnvJV3zEcfvZe5c/8dq2GXSLnL5pO76CMSutx+1PXBdSvZv3oAAFa5Fond72L/6P4AJHS5nez50witWwHxieBCYEZC5z5kjX8Ity2DuIu6ETi7FblLPy2mEUVXUW/L/KOZjTOzOWb26cFXIf3bHvK+vJm9YmYrzOwNM6t8PAr/PWlU7w+UK1u60D5vfvg5rc89k+TyZfPa1qdt4Zy6fyDg91M6MYHTT6nKwuXfRrtcz2va5GzWrdvIhg3/Iycnh7enTKd9+zb5+tStezrz5i0EYP78hbRvfzEAa9asZ+3aDQBkZGxh69ZtVKqUQkpKBbKzs1mzZj0A//rXZ3TqdEne/m7vcz3vvTeLrVu3FccQPSO0cTUuc2/BHbKz8t5afCI4F35fqQb4/OGwP9gvJxtKJUEwF7ctA4Dg2uX4658btfqLW1Hv0pkKfAXcBww85FWQRw95/ySQAbQHFgMv/fIyT2xbtu/i08X/oevfmudr/+Mp1fi/r79l/4Fsduzey+Jv1rL5p515659960O6DHyCxydOJzsnt5irLrmqVa9CWtqmvOX09AyqV6uSr8+KFavo1LEdAB07tqNcuSSSkyvk69Ok8VnExcexbt1GfvppO4FAgEbn/BmAzldcSs2a1cLHq1aFDh3a8uJLr0VzWFIAf72mlOo/msQeQzgwbSwAvopVcVn7SLh6AIl9RhHX9lowH2TuBp8fX/U6AAQanIevfMVYln9cFekuHSDXOTf2Vx6jsXPurMj7p83suoI6mtlNwE0Az93Xh96d2xbU9YTy+IT36X/1Zfh8+X++Nm94Bt+s+4Hr7n+WCuXK0PD0U/BH+vTtfikVT0oiJzfIQ+OmMn76p9zS5eJYlO9Jgwc/zDPPPEKPHl1ZsGARaWkZ+ebjq1Q5mVcnjKH39f3D88PANdfcxhNPPEBCQjxz//UZwWB43vfJJx/kH/94NK+fFK/g
qlT2r0rFV7su8a27kfXqw+Dz469dl/3PDcTt+omEbncSOCc8dXPg7dHEX9ITAnEE1yzHuZIxfw9FD/wPzOw24D3gwMFG59z2AvqfHLnDx4ByZmbu5//bC/ytwjk3DhgHkPX1zBLz6fhmfRqDx0wCYMfufSz46lv8fh9/bXImN17RmhuvaA3APWMmc0q1SgBUqlAOgPi4AB1aNWHizPkxqb0k2pS+mRo1quUtV69elfRNm/P1ycjYQteuNwJQpkxpOnW6lF27dgOQlFSWGdNfY+jQkXyZuixvm0VfLuXCv14BQOvWLTj99PBVYqNz/szkyS8AULFiMm3b/pXc3FxmzPg4eoOUI4Q2rsaSK0PpJNzubYQyNuJ2/AhAcPVifDVPh6UQ+uE7sl4eCoD/tD/jq1g1lmUfV0UN/INX5YdO4zigTgH9X+bnO3kmAhWBrWZWhfBtnZ7y4XP35r2//4U3aXFOPf7a5EyCoRB79u3npKQyfPf9Jr77PoNH+oS/1N26YzeVKpTDOce8xSs5rWaVgnYvv9DiJV9z2mmnUrt2TdLTN9Otaweu7dEnX5+UlAps374T5xyDB9/BhInhL1/j4uJ4Z+orTJ78zhF32lSqlMLWrduIj49n4IA+jHhsDAB/POO8vD6v/PNpZs3+l8K+mFhyFdz28A9zX7VTIRAHmXsI7d8HiaWhdDnI3I2vTgNC6evCG5UpB/t2gz9AXIuOZM+fVsgRTixF/YtXp/6SnRZ0d49zbrOZzfsl+zoRDH5mEktWrWPnnn387daHuPXKNuRGfv0/fN7+ULm5QXoNC98OWKZUAo/ecTUBvx+AIc++zo7de3EOzqhdjftv7BL9gXhEMBikX//7mDXrDfw+HxMmvs2qVd8xbNgAli5dzsyZc2nZsjmPPDwEh+PzBYu4o2/4h/aVV7bnggvOJSWlAj16dAWg9w13snz5N9x9161ccmlrfD4f4156jfnzF8ZymJ6Q0LUfvjr1sdJJlBr0IjmfTIHIZyg3dS6B+ucSOLslLhSEnGwOvPV0eEMXIvvDSZTqPRQwgpvWk7vkEwDiLuhA4IxzwHzkpH5MaP3KGI3u+LOiziuaWQOgHpB4sM0594u/hTKz/znnah2rX0ma0vGipKY3x7oE+ZV2Diz4IkV+/8oMn2oFrSvSFb6ZDQNaEQ782UA74HPgqIFvZisK2hVQ4m7LFBE5ERR1Dr8L0BD4yjnXK3Iv/eRC+lcG2gA7Dms3QM/gERGJgaIGfpZzLmRmuWZWDvgRqFlI/5lAWefc14evMLP5v7hKERH5zY4Z+GZmwAozO4nw3TdLgb3AFwVt45zrXci6q395mSIi8lsdM/Cdc87MmjrndgIvmtlHQDnnXEHz9CIi8jtU1EcrLDOzJgDOuY0KexGRE09R5/DPBf5uZt8D+wh/+eqcc3+OWmUiInJcFTXw2xy7i4iI/J4V9W/afh/tQkREJLqKOocvIiInOAW+iIhHKPBFRDxCgS8i4hEKfBERj1Dgi4h4hAJfRMQjFPgiIh6hwBcR8QgFvoiIRyjwRUQ8QoEvIuIRCnwREY9Q4IuIeIQCX0TEIxT4IiIeocAXEfEIBb6IiEco8EVEPEKBLyLiEQp8ERGPUOCLiHhEINYFFMRfo16sS5DfwMW6ABE5gq7wRUQ8QoEvIuIRCnwREY9Q4IuIeIQCX0TEIxT4IiIeocAXEfEIBb6IiEco8EVEPEKBLyLiEQp8ERGPUOCLiHiEAl9ExCMU+CIiHqHAFxHxCAW+iIhHKPBFRDxCgS8i4hEKfBERj1Dgi4h4hAJfRMQjFPgiIh6hwBcR8QgFvoiIRyjwRUQ8QoEvIuIRCnwREY9Q4IuIeIQCX0TEIxT4IiIeocAXEfGIQKwLKAnue/QpPluYSnKFk3h/8otHrE9dtoK+9zxI9apVAGjdsjm3Xv/3vPXBYJBuvftycqWKvPD4gwAMfmAk33y7hkAgQIN6f2TYoL7EBXS6jpc2F7fiqacewu/z
Mf7VNxn1+PP51teqVZ1/jnuKipWS2bF9Jz169iU9PYOGDevz/LMjSCpXlmAwyIjHnmXq1BkAzP90GmWTygJwcqUUFi/5ms5detOyxXlMe3c8Gzb+AMD778/mkeGji3W8JVX8FbcSOKMRbt8u9o+5+4j1/rqNiW99Fc45CAXJnjWB0PffAmDlK5LQ6RasfAoAWRMfxe3ciq9OA+LbXYv5AwTT15P93lgIhYp1XNGiBDkOOl7yN67ufDn/ePiJAvuc07BBXpgfbvLU6dSpXYu9+zLz2i69+EIeGzYIgEEPjOTdDz7iqk6XHd/CPcrn8zHmmeG0vaQ7aWkZLPpiNh/MnMPq1Wvy+owaOZRJr7/DpElTubDVXxj+yBB69upLZuZ+el7fj7VrN1C1amVSF33InDnz2bVrN63+ekXe9lPeHseMD+bkLX/+eSodOl1XrOP0gtxl88ld9BEJXW4/6vrgupXsXz0AAKtci8Tud7F/dH8AErrcTvb8aYTWrYD4RHAhMCOhcx+yxj+E25ZB3EXdCJzditylnxbTiKIrKlM6ZlbezB4zs2/NbLuZbTOz1ZG2k6JxzFhqfNaZlC+X9Ku23fzjVj77v1Q6t2+Tr71F86aYGWbGmXXPYMuPPx2PUgVo2uRs1q3byIYN/yMnJ4cpU6Zz+WF//nXrns68eQsBmDd/IZe3vxiANWvWs3btBgAyMrbw49ZtVKqUkm/bpKSyXNjqL0yf/lExjMbbQhtX4zL3FtwhOyvvrcUngnPh95VqgM8fDvuD/XKyoVQSBHNx2zIACK5djr/+uVGrv7hFaw5/CrADaOWcS3bOpQAXRtqmROmYv2vLV67miutu45a772ft+u/z2kc+8xJ33dYbs6OfipzcXD74+BPOP7dxcZVa4lWrXoUf0jblLaelZ1CtWpV8fVasWEWnju0A6NixHeXKJZGcXCFfnyaNzyI+Po516zbma+/QoS2fzlvInj0/B1GzZo1YumQuM2dMol69Px7nEUlh/PWaUqr/aBJ7DOHAtLEA+CpWxWXtI+HqAST2GUVc22vBfJC5G3x+fNXrABBocB6+8hVjWf5xFa3Ar+2cG+mc23ywwTm32Tk3EjglSsf83ap3xh+Y++5Epk18gas7t6fvkIcAmL/wS5IrnET9P51e4LaPPPE8jRo2oNFZDYqrXAEGDX6YFi2asTj1Y1pc0Iy0tAyCwWDe+ipVTmbChDHccMNd4fnhQ1zVtQNvvf1+3vKyr/5DndOa0qjx33j+hVd5d+r44hqGAMFVqewf3Z+s10cR37pbuNHnx1+7LtkfvkbW2HvwVTiZwDmtADjw9mjiL+lJ4q0jcAf241zJmL+H6AX+92Y2yMwqH2wws8pmNhj4oaCNzOwmM1tiZkv++dqbUSqt+JUtU4bSpUsB4ama3NxcduzcxVcrVjH/80Vc3Pk6Bg57jNSlyxn84Ki87V4Y/zo7du5iUN+bYlV6ibQpfTM1a1TLW65RvSqbNm3O1ycjYwtXdr2RJk3bcP/QkQDs2rUbCE/ZzJj+GvcPHcmXqcvybZeSUoEmTc5m9uxP8tr27NnLvsj3Mx9+9ClxcQFSUvL/tiDRF9q4GkuuDKWTcLu3EcrYiNvxI4RCBFcvxlft1HC/H74j6+WhZI0dQmjjKtxPm46x5xNHtL607QbcA/w7EvoO2ALMALoWtJFzbhwwDiDnp/WuoH4nmp+2bScluQJmxn9W/ZeQc5xUvhx33tqLO2/tBYTv5Jnw5ruMjHxR+86Mj1j45VJeGTMCn093zx5Pi5d8zWmnnUrt2jVJT99M164duLZHn3x9UlIqsH37Tpxz3DP4DiZMfAuAuLg43p36CpMnv8O0abOO2HfnKy5j1ux/ceDAgby2ypUrsWXLViA8DeTz+di2bUcURygHWXIV3PbwD3NftVMhEAeZewjt3weJpaF0Ocjcja9OA0Lp68IblSkH+3aDP0Bci45kz58WwxEcX1EJfOfcDjN7FZgLLHLO5U1m
mllboER9mzVw2GMs/moFO3fu5qKO13Bb72vJzc0FoFunS5kz73Pefm8W/oCfxPh4Hn/wHsys0H0+/MSzVK18Mn+/6S7gyFs55dcLBoP0638fs2e9gd/nY8LEt1m16jseGDaAJUuXM3PmXFq2bM7wh4fgcCxYsIg7+t4LwJVXtueCC84lOaUCPXqEr11633Any5d/A0C3rpcfcYtn5ysu5eabe5CbGyRrfxZ/v+a24h1wCZbQtR++OvWx0kmUGvQiOZ9MAb8fgNzUuQTqn0vg7Ja4UBBysjnw1tPhDV2I7A8nUar3UMAIblpP7pLwb2VxF3QgcMY5YD5yUj8mtH5ljEZ3/Nnh84/HZadmfYE+wGrgLKCfc256ZN0y59w5x9pHSbrC96JS1S6IdQnyK+0a2DzWJchvUGb41AKvJqM1pXMj0Mg5t9fMagPvmFlt59wzQOGXtiIiEhXRCnzfwWkc59xGM2tFOPRPQYEvIhIT0fo2cIuZnXVwIRL+lwEVgTOjdEwRESlEtAK/B5DvPjfnXK5zrgfQIkrHFBGRQkTrLp20QtYtjMYxRUSkcLrBW0TEIxT4IiIeocAXEfEIBb6IiEco8EVEPEKBLyLiEQp8ERGPUOCLiHiEAl9ExCMU+CIiHqHAFxHxCAW+iIhHKPBFRDxCgS8i4hEKfBERj1Dgi4h4hAJfRMQjFPgiIh6hwBcR8QgFvoiIRyjwRUQ8QoEvIuIRCnwREY9Q4IuIeIQCX0TEIxT4IiIeocAXEfEIc87FugZPMrObnHPjYl2H/Do6fycuL587XeHHzk2xLkB+E52/E5dnz50CX0TEIxT4IiIeocCPHU/OIZYgOn8nLs+eO31pKyLiEbrCFxHxCAW+iIhHKPCPAzNra2b/NbO1ZnbPUdYnmNnbkfVfmlntQ9YNibT/18zaHNI+3sx+NLOVxTQMOUwRzmsLM1tmZrlm1iUWNcrRHevzY2FjIud2hZmdU9w1xoIC/zcyMz/wPNAOqAd0N7N6h3XrDexwzp0GPA2MjGxbD7gKqA+0BV6I7A9gQqRNYqCI5/V/QE/gjeKtTopgAoV/ftoBp0deNwFji6GmmFPg/3ZNgbXOufXOuWzgLaDDYX06ABMj798BLjIzi7S/5Zw74JzbAKyN7A/n3GfA9uIYgBzVMc+rc26jc24FEIpFgVKwInx+OgCvubBFwElmVrV4qosdBf5vVx344ZDltEjbUfs453KBXUBKEbeV2NC5Kdk8eX4V+CIiHqHA/+3SgZqHLNeItB21j5kFgPLAtiJuK7Ghc1OyefL8KvB/u8XA6WZ2qpnFE/4SdsZhfWYA10XedwE+deG/8TYDuCpyF8+phL9ASi2muqVwRTmvcuKaAfSI3K3TDNjlnMuIdVHRFoh1ASc651yumd0OfAz4gfHOuW/M7CFgiXNuBvAKMMnM1hL+IumqyLbfmNkUYBWQC/RxzgUBzOxNoBVQ0czSgGHOuVeKeXieVZTzamZNgPeACkB7M3vQOVc/hmVLxNE+P0AcgHPuRWA2cAnhGyUygV6xqbR46dEKIiIeoSkdERGPUOCLiHiEAl9ExCMU+CIiHqHAFxHxCAW+yO+Qmf0j1jVIyaPbMuWEEXngnDnnfhcPKzMz/8G/NxGFfe91zpX9vdQjJYOu8OV3zcxqR55J/xqwEqhpZgPNbHHkOeYPHtL3/kjfz83sTTMbcIx99zSz6WY238zWmNmwQ9a9b2ZLzewbM7vpkPa9ZvakmS0HzjOzoZFaVprZuMgPJSL7fNrMlpjZajNrYmbTIsd55JD9XWNmqWb2tZm9ZGZ+M3sMKBVpe72gfker57j8oUvJ5ZzTS6/f7QuoTfjxw80iyxcT/keojfAFy0ygBdAE+BpIBJKANcCAY+y7J5BB+MmlpQj/QGkcWZcc+e/B9pTIsgO6HrKP5EPeTwLaR97PB0ZG3vcDNgFVgQTCT2ZMAeoCHwBxkX4vAD0i7/cest/C+uWrRy+9Cnvp0QpyIvje
hZ9ZDuHAvxj4KrJclvAziJKA6c65LCDLzD4o4r7nOue2AZjZNOB8YAnQ18w6RfrUjBxjGxAE3j1k+wvNbBBQGkgGviEczvDzs3f+A3zjIs9qMbP1kX2eDzQCFkd+MSgF/HiUGi8qpN/h9YgUSIEvJ4J9h7w3YIRz7qVDO5hZ/1+578O/xHJm1gpoDZznnMs0s/mEf3MAyHI/P+8okfDVdmPn3A9m9sAh/QAORP4bOuT9weVAZCwTnXNDjlFjYf3y6hE5Fs3hy4nmY+B6MysLYGbVzexkYCHhB5glRtZdVsT9/c3Mks2sFNAxsp/yhP9Jykwz+xPQrIBtD4b7T5Fj/tJ/1/YToEukfiJ1nBJZl2NmcUXoJ1JkusKXE4pzbo6Z1QW+iExv7AWucc4tNrMZwApgC+FplF0AZnZLZNsXj7LLVMJTIjWAyc65JWb2H+AWM1sN/BdYdJTtcM7tNLOXCc/xbyb8SOVfMpZVZnYfMMfMfEAO0Af4nvD3FCvMbJlz7u+F9BMpMt2WKSWGmZV1zu01s9LAZ8BNzrllhfTvSXg65vbiqlEklnSFLyXJODOrR3iqZWJhYS/iRbrCFxHxCH1pKyLiEQp8ERGPUOCLiHiEAl9ExCMU+CIiHvH/Lufbq4FeUs0AAAAASUVORK5CYII=\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" - } + }, + "output_type": "display_data" } ], - "metadata": {} + "source": [ + "fig, ax = plt.subplots()\n", + "sns.heatmap(rmse_df, cbar=False, annot=True, fmt=\".4g\")" + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "The calculated RMSE scores can be visualized to comparatively study how model performance is affected by different parameters." - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "It can be seen from this visualization that RMSE first decreases and then increases as rank increases, due to overfitting. When the rank equals 20 and the regularization parameter equals 0.1, the model achieves the lowest RMSE score." - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "### 3.5 Top K recommendation" - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "#### 3.5.1 Top k for all users (items)" - ], - "metadata": {} + ] }, { "cell_type": "code", "execution_count": 20, + "metadata": {}, + "outputs": [], "source": [ "dfs_rec = model.recommendForAllUsers(10)" - ], - "outputs": [], - "metadata": {} + ] }, { "cell_type": "code", "execution_count": 21, - "source": [ - "dfs_rec.show(10)" - ], + "metadata": {}, "outputs": [ { + "name": "stderr", "output_type": "stream", + "text": [ + "[Stage 2198:> (0 + 1) / 1]\r" + ] + }, + { "name": "stdout", + "output_type": "stream", "text": [ "+------+--------------------+\n", "|UserId| recommendations|\n", "+------+--------------------+\n", - "| 471|[[814, 3.7504895]...|\n", - "| 463|[[814, 3.1264873]...|\n", - "| 833|[[814, 3.3154662]...|\n", - "| 496|[[814, 3.055388],...|\n", - "| 148|[[814, 4.03012], ...|\n", - "| 540|[[814, 3.8661027]...|\n", - "| 392|[[814, 4.119951],...|\n", - "| 243|[[814, 3.748784],...|\n", - "| 623|[[814, 3.9018161]...|\n", - "| 737|[[814, 3.8507497]...|\n", + "| 1|[{1536, 3.8911996...|\n", + "| 3|[{1536, 3.106386}...|\n", + "| 5|[{1536, 
3.1345377...|\n", + "| 6|[{1536, 3.735431}...|\n", + "| 9|[{1536, 4.2483497...|\n", + "| 12|[{1536, 4.44719},...|\n", + "| 13|[{1536, 3.3997424...|\n", + "| 15|[{1536, 3.1361642...|\n", + "| 16|[{1536, 4.565808}...|\n", + "| 17|[{1536, 3.0498183...|\n", "+------+--------------------+\n", "only showing top 10 rows\n", "\n" ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\r", + " \r" + ] } ], - "metadata": {} + "source": [ + "dfs_rec.show(10)" + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "#### 3.5.2 Top k for a selected set of users (items)" - ], - "metadata": {} + ] }, { "cell_type": "code", "execution_count": 22, + "metadata": {}, + "outputs": [], "source": [ "users = dfs_train.select(als.getUserCol()).distinct().limit(3)\n", "\n", "dfs_rec_subset = model.recommendForUserSubset(users, 10)" - ], - "outputs": [], - "metadata": {} + ] }, { "cell_type": "code", "execution_count": 23, - "source": [ - "dfs_rec_subset.show(10)" - ], + "metadata": {}, "outputs": [ { + "name": "stderr", "output_type": "stream", + "text": [ + "[Stage 2228:=================================================> (93 + 2) / 100]\r" + ] + }, + { "name": "stdout", + "output_type": "stream", "text": [ "+------+--------------------+\n", "|UserId| recommendations|\n", "+------+--------------------+\n", - "| 471|[[814, 3.7504895]...|\n", - "| 463|[[814, 3.1264873]...|\n", - "| 148|[[814, 4.03012], ...|\n", + "| 471|[{1536, 3.3287532...|\n", + "| 463|[{1536, 3.1125846...|\n", + "| 148|[{1536, 3.907574}...|\n", "+------+--------------------+\n", "\n" ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\r", + " \r" + ] } ], - "metadata": {} + "source": [ + "dfs_rec_subset.show(10)" + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "#### 3.5.3 Run-time considerations for top-k recommendations\n", "\n", @@ -745,28 +827,28 @@ "* Inner products of user-item pairs are calculated individually instead of leveraging matrix block 
multiplication features which are available in certain contemporary computing acceleration libraries (e.g., BLAS).\n", "\n", "More details about possible optimizations of the top k recommendations in Spark can be found [here](https://engineeringblog.yelp.com/2018/05/scaling-collaborative-filtering-with-pyspark.html)." - ], - "metadata": {} + ] }, { "cell_type": "code", "execution_count": 24, + "metadata": {}, + "outputs": [], "source": [ "# cleanup spark instance\n", "spark.stop()" - ], - "outputs": [], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "## References" - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "1. Yehuda Koren, Robert Bell, and Chris Volinsky, \"Matrix Factorization Techniques for Recommender Systems\n", "\", ACM Computer, Vol. 42, Issue 8, pp 30-37, Aug., 2009.\n", @@ -776,8 +858,7 @@ "4. Seaborn. url: https://seaborn.pydata.org/\n", "5. Scaling collaborative filtering with PySpark. url: https://engineeringblog.yelp.com/2018/05/scaling-collaborative-filtering-with-pyspark.html\n", "6. Matrix Completion via Alternating Least Square (ALS). 
url: http://stanford.edu/~rezab/classes/cme323/S15/notes/lec14.pdf" - ], - "metadata": {} + ] } ], "metadata": { @@ -786,8 +867,9 @@ "hash": "7ec2189bea0434770dca7423a25e631e1cca9c4e2b4ff137a82f4dff32ac9607" }, "kernelspec": { - "name": "python3", - "display_name": "Python 3.6.9 64-bit ('.env': venv)" + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" }, "language_info": { "codemirror_mode": { @@ -799,9 +881,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.9" + "version": "3.8.0" } }, "nbformat": 4, "nbformat_minor": 2 -} \ No newline at end of file +} diff --git a/examples/02_model_content_based_filtering/mmlspark_lightgbm_criteo.ipynb b/examples/02_model_content_based_filtering/mmlspark_lightgbm_criteo.ipynb index 96a9d03284..ede0ff3e81 100644 --- a/examples/02_model_content_based_filtering/mmlspark_lightgbm_criteo.ipynb +++ b/examples/02_model_content_based_filtering/mmlspark_lightgbm_criteo.ipynb @@ -2,15 +2,16 @@ "cells": [ { "cell_type": "markdown", + "metadata": {}, "source": [ "Copyright (c) Microsoft Corporation. All rights reserved.\n", "\n", "Licensed under the MIT License." - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "# Content-Based Personalization with LightGBM on Spark\n", "\n", @@ -22,30 +23,42 @@ "[MMLSpark](https://github.com/Azure/mmlspark) library, which allows LightGBM to be called in a Spark environment and be computed distributely.\n", "\n", "This scenario is a good example of **implicit feedback**, where binary labels indicate the interaction between a user and an item. This contrasts with explicit feedback, where the user explicitely rate the content, for example from 1 to 5. 
\n" - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "## Global Settings and Imports" - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "This notebook can be run in a Spark environment in a DSVM or in Azure Databricks. For more details about the installation process, please refer to the [setup instructions](../../SETUP.md).\n", "\n", "**NOTE for Azure Databricks:**\n", "* A python script is provided to simplify setting up Azure Databricks with the correct dependencies. Run ```python tools/databricks_install.py -h``` for more details.\n", "* MMLSpark should not be run on a cluster with autoscaling enabled. Disable the flag in the Azure Databricks Cluster configuration before running this notebook." - ], - "metadata": {} + ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "MMLSpark version: com.microsoft.azure:synapseml_2.12:0.9.5\n", + "System version: 3.8.0 (default, Nov 6 2019, 21:49:08) \n", + "[GCC 7.3.0]\n", + "PySpark version: 3.2.0\n" + ] + } + ], "source": [ "import os\n", "import sys\n", @@ -54,9 +67,12 @@ "import pyspark\n", "from pyspark.ml import PipelineModel\n", "from pyspark.ml.feature import FeatureHasher\n", + "import warnings\n", + "warnings.simplefilter(action='ignore', category=FutureWarning)\n", "import papermill as pm\n", "import scrapbook as sb\n", "\n", + "\n", "from recommenders.utils.notebook_utils import is_databricks\n", "from recommenders.utils.spark_utils import start_or_get_spark\n", "from recommenders.datasets.criteo import load_spark_df\n", @@ -64,35 +80,31 @@ "\n", "# Setup MML Spark\n", "from recommenders.utils.spark_utils import MMLSPARK_REPO, MMLSPARK_PACKAGE\n", + "\n", + "# On Spark >3.0.0,<3.2.0, the following should be set:\n", + "# MMLSPARK_PACKAGE = \"com.microsoft.azure:synapseml_2.12:0.9.4\"\n", "packages = 
[MMLSPARK_PACKAGE]\n", "repos = [MMLSPARK_REPO]\n", "spark = start_or_get_spark(packages=packages, repositories=repos)\n", "dbutils = None\n", "print(\"MMLSpark version: {}\".format(MMLSPARK_PACKAGE))\n", "\n", - "from mmlspark.train import ComputeModelStatistics\n", - "from mmlspark.lightgbm import LightGBMClassifier\n", + "from synapse.ml.train import ComputeModelStatistics\n", + "from synapse.ml.lightgbm import LightGBMClassifier\n", "\n", "print(\"System version: {}\".format(sys.version))\n", "print(\"PySpark version: {}\".format(pyspark.version.__version__))" - ], - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "MMLSpark version: com.microsoft.ml.spark:mmlspark_2.11:0.18.1\n", - "System version: 3.6.10 |Anaconda, Inc.| (default, May 8 2020, 02:54:21) \n", - "[GCC 7.3.0]\n", - "PySpark version: 2.4.3\n" - ] - } - ], - "metadata": {} + ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, + "metadata": { + "tags": [ + "parameters" + ] + }, + "outputs": [], "source": [ "# Criteo data size, it can be \"sample\" or \"full\"\n", "DATA_SIZE = \"sample\"\n", @@ -107,43 +119,33 @@ "\n", "# Model name\n", "MODEL_NAME = 'lightgbm_criteo.mml'" - ], - "outputs": [], - "metadata": { - "tags": [ - "parameters" - ] - } + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "## Data Preparation\n", "\n", "The [Criteo Display Advertising Challenge](https://www.kaggle.com/c/criteo-display-ad-challenge) (Criteo DAC) dataset is a well-known industry benchmarking dataset for developing CTR prediction models, and is used frequently by research papers. The original dataset contains over 45M rows, but there is also a down-sampled dataset which has 100,000 rows (this can be used by setting `DATA_SIZE = \"sample\"`). Each row corresponds to a display ad served by Criteo and the first column is indicates whether this ad has been clicked or not.

\n", "The dataset contains 1 label column and 39 feature columns, where 13 columns are integer values (int00-int12) and 26 columns are categorical features (cat00-cat25).

\n", "What the columns represent is not provided, but for this case we can consider the integer and categorical values as features representing the user and / or item content. The label is binary and is an example of implicit feedback indicating a user's interaction with an item. With this dataset we can demonstrate how to build a model that predicts the probability of a user interacting with an item based on available user and item content features.\n" - ], - "metadata": {} + ] }, { "cell_type": "code", - "execution_count": 4, - "source": [ - "raw_data = load_spark_df(size=DATA_SIZE, spark=spark, dbutils=dbutils)\n", - "# visualize data\n", - "raw_data.limit(2).toPandas().head()" - ], + "execution_count": 3, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stderr", + "output_type": "stream", "text": [ - "100%|██████████| 8.58k/8.58k [00:01<00:00, 5.15kKB/s]\n" + "100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8.58k/8.58k [00:06<00:00, 1.24kKB/s]\n", + " \r" ] }, { - "output_type": "execute_result", "data": { "text/html": [ "
\n", @@ -257,53 +259,59 @@ "[2 rows x 40 columns]" ] }, + "execution_count": 3, "metadata": {}, - "execution_count": 4 + "output_type": "execute_result" } ], - "metadata": {} + "source": [ + "raw_data = load_spark_df(size=DATA_SIZE, spark=spark, dbutils=dbutils)\n", + "# visualize data\n", + "raw_data.limit(2).toPandas().head()" + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "### Feature Processing\n", "The feature data provided has many missing values across both integer and categorical feature fields. In addition the categorical features have many distinct values, so effectively cleaning and representing the feature data is an important step prior to training a model.

\n", "One of the simplest ways of managing both features that have missing values as well as high cardinality is to use the hashing trick. The [FeatureHasher](http://spark.apache.org/docs/latest/ml-features.html#featurehasher) transformer will pass integer values through and will hash categorical features into a sparse vector of lower dimensionality, which can be used effectively by LightGBM.

\n", "First, the dataset is splitted randomly for training and testing and feature processing is applied to each dataset." - ], - "metadata": {} + ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, + "metadata": {}, + "outputs": [], "source": [ "raw_train, raw_test = spark_random_split(raw_data, ratio=0.8, seed=42)" - ], - "outputs": [], - "metadata": {} + ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, + "metadata": {}, + "outputs": [], "source": [ "columns = [c for c in raw_data.columns if c != 'label']\n", "feature_processor = FeatureHasher(inputCols=columns, outputCol='features')" - ], - "outputs": [], - "metadata": {} + ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 6, + "metadata": {}, + "outputs": [], "source": [ "train = feature_processor.transform(raw_train)\n", "test = feature_processor.transform(raw_test)" - ], - "outputs": [], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "## Model Training\n", "In MMLSpark, the LightGBM implementation for binary classification is invoked using the `LightGBMClassifier` class and specifying the objective as `\"binary\"`. In this instance, the occurrence of positive labels is quite low, so setting the `isUnbalance` flag to true helps account for this imbalance.

\n", @@ -315,12 +323,13 @@ "- `learningRate`: the learning rate for training across trees\n", "- `featureFraction`: the fraction of features used for training a tree\n", "- `earlyStoppingRound`: round at which early stopping can be applied to avoid overfitting" - ], - "metadata": {} + ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 7, + "metadata": {}, + "outputs": [], "source": [ "lgbm = LightGBMClassifier(\n", " labelCol=\"label\",\n", @@ -336,132 +345,154 @@ " featureFraction=FEATURE_FRACTION,\n", " earlyStoppingRound=EARLY_STOPPING_ROUND\n", ")" - ], - "outputs": [], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "### Model Training and Evaluation" - ], - "metadata": {} + ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + } + ], "source": [ "model = lgbm.fit(train)\n" - ], - "outputs": [], - "metadata": {} + ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 9, + "metadata": {}, + "outputs": [], "source": [ "predictions = model.transform(test)" - ], - "outputs": [], - "metadata": {} + ] }, { "cell_type": "code", - "execution_count": 11, - "source": [ - "evaluator = (\n", - " ComputeModelStatistics()\n", - " .setScoredLabelsCol(\"prediction\")\n", - " .setLabelCol(\"label\")\n", - " .setEvaluationMetric(\"AUC\")\n", - ")\n", - "\n", - "result = evaluator.transform(predictions)\n", - "auc = result.select(\"AUC\").collect()[0][0]\n", - "result.show()" - ], + "execution_count": 10, + "metadata": {}, "outputs": [ { + "name": "stderr", "output_type": "stream", + "text": [ + " \r" + ] + }, + { "name": "stdout", + "output_type": "stream", "text": [ "+---------------+------------------+\n", "|evaluation_type| AUC|\n", "+---------------+------------------+\n", - "| Classification|0.6892773832319504|\n", + "| Classification|0.6590485347443004|\n", 
"+---------------+------------------+\n", "\n" ] } ], - "metadata": {} + "source": [ + "evaluator = (\n", + " ComputeModelStatistics()\n", + " .setScoredLabelsCol(\"prediction\")\n", + " .setLabelCol(\"label\")\n", + " .setEvaluationMetric(\"AUC\")\n", + ")\n", + "\n", + "result = evaluator.transform(predictions)\n", + "auc = result.select(\"AUC\").collect()[0][0]\n", + "result.show()" + ] }, { "cell_type": "code", - "execution_count": 10, - "source": [ - "# Record results with papermill for tests\n", - "sb.glue(\"auc\", auc)" - ], + "execution_count": 11, + "metadata": {}, "outputs": [ { - "output_type": "display_data", "data": { - "application/papermill.record+json": { - "auc": 0.6870253907336659 + "application/scrapbook.scrap.json+json": { + "data": 0.6590485347443004, + "encoder": "json", + "name": "auc", + "version": 1 } }, - "metadata": {} + "metadata": { + "scrapbook": { + "data": true, + "display": false, + "name": "auc" + } + }, + "output_type": "display_data" } ], - "metadata": {} + "source": [ + "# Record results with papermill for tests\n", + "sb.glue(\"auc\", auc)" + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "## Model Saving \n", "The full pipeline for operating on raw data including feature processing and model prediction can be saved and reloaded for use in another workflow." 
- ], - "metadata": {} + ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, + "metadata": {}, + "outputs": [], "source": [ "# save model\n", "pipeline = PipelineModel(stages=[feature_processor, model])\n", "pipeline.write().overwrite().save(MODEL_NAME)" - ], - "outputs": [], - "metadata": {} + ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, + "metadata": {}, + "outputs": [], "source": [ "# cleanup spark instance\n", "if not is_databricks():\n", " spark.stop()" - ], - "outputs": [], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "## Additional Reading\n", "\\[1\\] Guolin Ke, Qi Meng, Thomas Finley, Taifeng Wang, Wei Chen, Weidong Ma, Qiwei Ye, and Tie-Yan Liu. 2017. LightGBM: A highly efficient gradient boosting decision tree. In Advances in Neural Information Processing Systems. 3146–3154. https://papers.nips.cc/paper/6907-lightgbm-a-highly-efficient-gradient-boosting-decision-tree.pdf
\n", "\\[2\\] MML Spark: https://mmlspark.blob.core.windows.net/website/index.html
\n" - ], - "metadata": {} + ] } ], "metadata": { "celltoolbar": "Tags", "kernelspec": { - "display_name": "reco_full", + "display_name": "Python (reco)", "language": "python", - "name": "conda-env-reco_full-py" + "name": "reco" }, "language_info": { "codemirror_mode": { @@ -473,9 +504,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.10" + "version": "3.8.0" } }, "nbformat": 4, "nbformat_minor": 2 -} \ No newline at end of file +} diff --git a/examples/03_evaluate/als_movielens_diversity_metrics.ipynb b/examples/03_evaluate/als_movielens_diversity_metrics.ipynb index 289d5b93fd..12a7ce0ed6 100644 --- a/examples/03_evaluate/als_movielens_diversity_metrics.ipynb +++ b/examples/03_evaluate/als_movielens_diversity_metrics.ipynb @@ -2,15 +2,16 @@ "cells": [ { "cell_type": "markdown", + "metadata": {}, "source": [ "Copyright (c) Microsoft Corporation. All rights reserved.\n", "\n", "Licensed under the MIT License." - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "# Apply Diversity Metrics \n", "## -- Compare ALS and Random Recommenders on MovieLens (PySpark)\n", @@ -40,11 +41,11 @@ "The comparision results show that the ALS recommender outperforms the random recommender on ranking metrics (Precision@k, Recall@k, NDCG@k, and\tMean average precision), while the random recommender outperforms ALS recommender on diversity metrics. This is because ALS is optimized for estimating the item rating as accurate as possible, therefore it performs well on accuracy metrics including rating and ranking metrics. As a side effect, the items being recommended tend to be popular items, which are the items mostly sold or viewed. It leaves the [long-tail items](https://github.com/microsoft/recommenders/blob/main/GLOSSARY.md) having less chance to get introduced to the users. This is the reason why ALS is not performing as well as a random recommender on diversity metrics. 
\n", "\n", "From the algorithmic point of view, items in the tail suffer from the cold-start problem, making them hard for recommendation systems to use. However, from the business point of view, oftentimes the items in the tail can be highly profitable, since, depending on supply, business can apply a higher margin to them. Recommendation systems that optimize metrics like novelty and diversity, can help to find users willing to get these long tail items. Usually there is a trade-off between one type of metric vs. another. One should decide which set of metrics to optimize based on business scenarios." - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "**Coverage**\n", "\n", @@ -64,11 +65,11 @@ "p(i|R) = \\frac{|M_r (i)|}{|\\textrm{reco_df}|}\n", "$$\n", "and $M_r (i)$ denotes the users who are recommended item $i$.\n" - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "\n", "**Diversity**\n", @@ -88,11 +89,11 @@ "$$\n", "\\textrm{diversity} = 1 - \\textrm{IL}\n", "$$\n" - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "\n", "**Novelty**\n", @@ -111,11 +112,11 @@ "$$\n", "\\textrm{novelty} = \\sum_{i \\in N_r} \\frac{|M_r (i)|}{|\\textrm{reco_df}|} \\textrm{novelty}(i)\n", "$$\n" - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "**Serendipity**\n", "\n", @@ -130,19 +131,30 @@ "\\textrm{serendipity} = \\frac{1}{|M|} \\sum_{u \\in M_r}\n", "\\frac{1}{|N_r (u)|} \\sum_{i \\in N_r (u)} \\big(1 - \\textrm{expectedness}(i|u) \\big) \\, \\textrm{relevance}(i)\n", "$$\n" - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "**Note**: This notebook requires a PySpark environment to run properly. Please follow the steps in [SETUP.md](https://github.com/Microsoft/Recommenders/blob/master/SETUP.md#dependencies-setup) to install the PySpark environment." 
- ], - "metadata": {} + ] }, { "cell_type": "code", "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "System version: 3.8.0 (default, Nov 6 2019, 21:49:08) \n", + "[GCC 7.3.0]\n", + "Spark version: 3.2.0\n" + ] + } + ], "source": [ "# set the environment path to find Recommenders\n", "%load_ext autoreload\n", @@ -156,6 +168,8 @@ "from pyspark.sql.types import FloatType, IntegerType, LongType, StructType, StructField\n", "from pyspark.ml.feature import Tokenizer, StopWordsRemover\n", "from pyspark.ml.feature import HashingTF, CountVectorizer, VectorAssembler\n", + "import warnings\n", + "warnings.simplefilter(action='ignore', category=FutureWarning)\n", "\n", "from recommenders.utils.timer import Timer\n", "from recommenders.datasets import movielens\n", @@ -171,31 +185,25 @@ "\n", "print(\"System version: {}\".format(sys.version))\n", "print(\"Spark version: {}\".format(pyspark.__version__))\n" - ], - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "System version: 3.6.9 (default, Jan 26 2021, 15:33:00) \n", - "[GCC 8.4.0]\n", - "Spark version: 2.4.8\n" - ] - } - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "\n", "Set the default parameters." - ], - "metadata": {} + ] }, { "cell_type": "code", "execution_count": 2, + "metadata": { + "tags": [ + "parameters" + ] + }, + "outputs": [], "source": [ "# top k items to recommend\n", "TOP_K = 10\n", @@ -209,72 +217,54 @@ "COL_RATING=\"Rating\"\n", "COL_TITLE=\"Title\"\n", "COL_GENRE=\"Genre\"" - ], - "outputs": [], - "metadata": { - "tags": [ - "parameters" - ] - } + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "### 1. Set up Spark context\n", "\n", "The following settings work well for debugging locally on VM - change when running on a cluster. We set up a giant single executor with many threads and specify memory cap. 
" - ], - "metadata": {} + ] }, { "cell_type": "code", "execution_count": 3, + "metadata": {}, + "outputs": [], "source": [ "# the following settings work well for debugging locally on VM - change when running on a cluster\n", "# set up a giant single executor with many threads and specify memory cap\n", "\n", "spark = start_or_get_spark(\"ALS PySpark\", memory=\"16g\")\n", - "\n", + "spark.conf.set(\"spark.sql.analyzer.failAmbiguousSelfJoin\", \"false\")\n", "spark.conf.set(\"spark.sql.crossJoin.enabled\", \"true\")" - ], - "outputs": [], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "### 2. Download the MovieLens dataset" - ], - "metadata": {} + ] }, { "cell_type": "code", "execution_count": 4, - "source": [ - "# Note: The DataFrame-based API for ALS currently only supports integers for user and item ids.\n", - "schema = StructType(\n", - " (\n", - " StructField(COL_USER, IntegerType()),\n", - " StructField(COL_ITEM, IntegerType()),\n", - " StructField(COL_RATING, FloatType()),\n", - " StructField(\"Timestamp\", LongType()),\n", - " )\n", - ")\n", - "\n", - "data = movielens.load_spark_df(spark, size=MOVIELENS_DATA_SIZE, schema=schema, title_col=COL_TITLE, genres_col=COL_GENRE)\n", - "data.show()" - ], + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stderr", + "output_type": "stream", "text": [ - "100%|██████████| 4.81k/4.81k [00:00<00:00, 20.1kKB/s]\n" + "100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4.81k/4.81k [00:05<00:00, 862KB/s]\n", + " \r" ] }, { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "+-------+------+------+---------+--------------------+------+\n", "|MovieId|UserId|Rating|Timestamp| Title| Genre|\n", @@ -305,73 +295,116 @@ ] } ], - "metadata": {} + "source": [ + "# Note: The DataFrame-based API for ALS currently only supports integers for user and item 
ids.\n", + "schema = StructType(\n", + " (\n", + " StructField(COL_USER, IntegerType()),\n", + " StructField(COL_ITEM, IntegerType()),\n", + " StructField(COL_RATING, FloatType()),\n", + " StructField(\"Timestamp\", LongType()),\n", + " )\n", + ")\n", + "\n", + "data = movielens.load_spark_df(spark, size=MOVIELENS_DATA_SIZE, schema=schema, title_col=COL_TITLE, genres_col=COL_GENRE)\n", + "data.show()" + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "#### Split the data using the Spark random splitter provided in utilities" - ], - "metadata": {} + ] }, { "cell_type": "code", "execution_count": 5, - "source": [ - "train_df, test_df = spark_random_split(data.select(COL_USER, COL_ITEM, COL_RATING), ratio=0.75, seed=123)\n", - "print (\"N train_df\", train_df.cache().count())\n", - "print (\"N test_df\", test_df.cache().count())" - ], + "metadata": {}, "outputs": [ { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "N train_df 75147\n" + ] + }, + { + "name": "stderr", "output_type": "stream", + "text": [ + "[Stage 19:================================================> (178 + 3) / 200]\r" + ] + }, + { "name": "stdout", + "output_type": "stream", "text": [ - "N train_df 75066\n", - "N test_df 24934\n" + "N test_df 24853\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\r", + " \r" ] } ], - "metadata": {} + "source": [ + "train_df, test_df = spark_random_split(data.select(COL_USER, COL_ITEM, COL_RATING), ratio=0.75, seed=123)\n", + "print (\"N train_df\", train_df.cache().count())\n", + "print (\"N test_df\", test_df.cache().count())" + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "#### Get all possible user-item pairs" - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "Note: We assume that training data contains all users and all catalog items. 
" - ], - "metadata": {} + ] }, { "cell_type": "code", "execution_count": 6, + "metadata": {}, + "outputs": [], "source": [ "users = train_df.select(COL_USER).distinct()\n", "items = train_df.select(COL_ITEM).distinct()\n", "user_item = users.crossJoin(items)" - ], - "outputs": [], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "### 3. Train the ALS model on the training data, and get the top-k recommendations for our testing data\n", "\n", "To predict movie ratings, we use the rating data in the training set as users' explicit feedback. The hyperparameters used in building the model are referenced from [here](http://mymedialite.net/examples/datasets.html). We do not constrain the latent factors (`nonnegative = False`) in order to allow for both positive and negative preferences towards movies.\n", "Timing will vary depending on the machine being used to train." - ], - "metadata": {} + ] }, { "cell_type": "code", "execution_count": 7, + "metadata": {}, + "outputs": [], "source": [ "header = {\n", " \"userCol\": COL_USER,\n", @@ -390,42 +423,86 @@ " seed=42,\n", " **header\n", ")" - ], - "outputs": [], - "metadata": {} + ] }, { "cell_type": "code", "execution_count": 8, - "source": [ - "with Timer() as train_time:\n", - " model = als.fit(train_df)\n", - "\n", - "print(\"Took {} seconds for training.\".format(train_time.interval))" - ], + "metadata": {}, "outputs": [ { + "name": "stderr", "output_type": "stream", + "text": [ + " \r" + ] + }, + { "name": "stdout", + "output_type": "stream", "text": [ - "Took 4.189040212018881 seconds for training.\n" + "Took 10.75137371000028 seconds for training.\n" ] } ], - "metadata": {} + "source": [ + "with Timer() as train_time:\n", + " model = als.fit(train_df)\n", + "\n", + "print(\"Took {} seconds for training.\".format(train_time.interval))" + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "In the movie recommendation use case, recommending movies that have been rated by the 
users does not make sense. Therefore, the rated movies are removed from the recommended items.\n", "\n", "In order to achieve this, we recommend all movies to all users, and then remove the user-movie pairs that exist in the training dataset." - ], - "metadata": {} + ] }, { "cell_type": "code", "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1464772\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[Stage 235:> (0 + 2) / 2]\r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "9430\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\r", + " \r" + ] + } + ], "source": [ "# Score all user-item pairs\n", "dfs_pred = model.transform(user_item)\n", @@ -446,31 +523,22 @@ "top_k_reco = top_all.select(\"*\", F.row_number().over(window).alias(\"rank\")).filter(F.col(\"rank\") <= TOP_K).drop(\"rank\")\n", " \n", "print(top_k_reco.count())" - ], - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "1464853\n", - "9430\n" - ] - } - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "### 4. Random Recommender\n", "\n", "We define a recommender which randomly recommends unseen items to each user. " - ], - "metadata": {} + ] }, { "cell_type": "code", "execution_count": 10, + "metadata": {}, + "outputs": [], "source": [ "# random recommender\n", "window = Window.partitionBy(COL_USER).orderBy(F.rand())\n", @@ -491,20 +559,20 @@ " .filter(F.col(\"score\") <= TOP_K)\n", " .drop(COL_RATING)\n", ")" - ], - "outputs": [], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "### 5. 
ALS vs Random Recommenders Performance Comparison" - ], - "metadata": {} + ] }, { "cell_type": "code", "execution_count": 11, + "metadata": {}, + "outputs": [], "source": [ "def get_ranking_results(ranking_eval):\n", " metrics = {\n", @@ -525,13 +593,13 @@ " \"serendipity\": diversity_eval.serendipity()\n", " }\n", " return metrics " - ], - "outputs": [], - "metadata": {} + ] }, { "cell_type": "code", "execution_count": 12, + "metadata": {}, + "outputs": [], "source": [ "def generate_summary(data, algo, k, ranking_metrics, diversity_metrics):\n", " summary = {\"Data\": data, \"Algo\": algo, \"K\": k}\n", @@ -546,20 +614,28 @@ " summary.update(ranking_metrics)\n", " summary.update(diversity_metrics)\n", " return summary" - ], - "outputs": [], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "#### ALS Recommender Performance Results" - ], - "metadata": {} + ] }, { "cell_type": "code", "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + } + ], "source": [ "als_ranking_eval = SparkRankingEvaluation(\n", " test_df, \n", @@ -573,13 +649,21 @@ ")\n", "\n", "als_ranking_metrics = get_ranking_results(als_ranking_eval)" - ], - "outputs": [], - "metadata": {} + ] }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + } + ], "source": [ "als_diversity_eval = SparkDiversityEvaluation(\n", " train_df = train_df, \n", @@ -589,29 +673,37 @@ ")\n", "\n", "als_diversity_metrics = get_diversity_results(als_diversity_eval)" - ], - "outputs": [], - "metadata": {} + ] }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 15, + "metadata": {}, + "outputs": [], "source": [ "als_results = generate_summary(MOVIELENS_DATA_SIZE, \"als\", TOP_K, als_ranking_metrics, als_diversity_metrics)" - ], - "outputs": [], - "metadata": {} + ] }, { 
"cell_type": "markdown", + "metadata": {}, "source": [ "#### Random Recommender Performance Results" - ], - "metadata": {} + ] }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + } + ], "source": [ "random_ranking_eval = SparkRankingEvaluation(\n", " test_df,\n", @@ -624,13 +716,21 @@ ")\n", "\n", "random_ranking_metrics = get_ranking_results(random_ranking_eval)" - ], - "outputs": [], - "metadata": {} + ] }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + } + ], "source": [ "random_diversity_eval = SparkDiversityEvaluation(\n", " train_df = train_df, \n", @@ -640,48 +740,43 @@ ")\n", " \n", "random_diversity_metrics = get_diversity_results(random_diversity_eval)" - ], - "outputs": [], - "metadata": {} + ] }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 18, + "metadata": {}, + "outputs": [], "source": [ "random_results = generate_summary(MOVIELENS_DATA_SIZE, \"random\", TOP_K, random_ranking_metrics, random_diversity_metrics)" - ], - "outputs": [], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "#### Result Comparison" - ], - "metadata": {} + ] }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 19, + "metadata": {}, + "outputs": [], "source": [ "cols = [\"Data\", \"Algo\", \"K\", \"Precision@k\", \"Recall@k\", \"NDCG@k\", \"Mean average precision\",\"catalog_coverage\", \"distributional_coverage\",\"novelty\", \"diversity\", \"serendipity\" ]\n", "df_results = pd.DataFrame(columns=cols)\n", "\n", "df_results.loc[1] = als_results \n", "df_results.loc[2] = random_results " - ], - "outputs": [], - "metadata": {} + ] }, { "cell_type": "code", - "execution_count": 23, - "source": [ - "df_results" - ], + "execution_count": 20, + 
"metadata": {}, "outputs": [ { - "output_type": "execute_result", "data": { "text/html": [ "
\n", @@ -722,30 +817,30 @@ " 100k\n", " als\n", " 10\n", - " 0.047296\n", - " 0.016015\n", - " 0.043097\n", - " 0.004579\n", - " 0.385793\n", - " 7.967257\n", - " 11.659776\n", - " 0.892277\n", - " 0.878733\n", + " 0.044374\n", + " 0.015567\n", + " 0.040657\n", + " 0.004202\n", + " 0.374158\n", + " 7.989889\n", + " 11.740626\n", + " 0.890659\n", + " 0.879359\n", " \n", " \n", " 2\n", " 100k\n", " random\n", " 10\n", - " 0.016543\n", - " 0.005566\n", - " 0.016373\n", - " 0.001441\n", - " 0.994489\n", - " 10.541850\n", - " 12.136439\n", - " 0.922613\n", - " 0.892511\n", + " 0.018259\n", + " 0.006516\n", + " 0.018537\n", + " 0.002038\n", + " 0.998775\n", + " 10.543160\n", + " 12.180267\n", + " 0.923302\n", + " 0.892897\n", " \n", " \n", "\n", @@ -753,43 +848,48 @@ ], "text/plain": [ " Data Algo K Precision@k Recall@k NDCG@k Mean average precision \\\n", - "1 100k als 10 0.047296 0.016015 0.043097 0.004579 \n", - "2 100k random 10 0.016543 0.005566 0.016373 0.001441 \n", + "1 100k als 10 0.044374 0.015567 0.040657 0.004202 \n", + "2 100k random 10 0.018259 0.006516 0.018537 0.002038 \n", "\n", " catalog_coverage distributional_coverage novelty diversity \\\n", - "1 0.385793 7.967257 11.659776 0.892277 \n", - "2 0.994489 10.541850 12.136439 0.922613 \n", + "1 0.374158 7.989889 11.740626 0.890659 \n", + "2 0.998775 10.543160 12.180267 0.923302 \n", "\n", " serendipity \n", - "1 0.878733 \n", - "2 0.892511 " + "1 0.879359 \n", + "2 0.892897 " ] }, + "execution_count": 20, "metadata": {}, - "execution_count": 23 + "output_type": "execute_result" } ], - "metadata": {} + "source": [ + "df_results" + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "#### Conclusion\n", "The comparision results show that the ALS recommender outperforms the random recommender on ranking metrics (Precision@k, Recall@k, NDCG@k, and\tMean average precision), while the random recommender outperforms ALS recommender on diversity metrics. 
This is because ALS is optimized for estimating the item rating as accurately as possible; therefore, it performs well on accuracy metrics, including rating and ranking metrics. As a side effect, the items being recommended tend to be popular items, which are the items most often sold or viewed. This leaves the less popular long-tail items with less chance of being introduced to the users. This is the reason why ALS does not perform as well as a random recommender on diversity metrics. " - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "### 6. Calculate diversity metrics using item feature vector based item-item similarity\n", "In the above section we calculated diversity metrics using item co-occurrence count based item-item similarity. In scenarios where item features are available, we may want to calculate item-item similarity based on item feature vectors. In this section, we show how to calculate diversity metrics using item feature vector based item-item similarity."
- ], - "metadata": {} + ] }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 21, + "metadata": {}, + "outputs": [], "source": [ "# Get movie features \"title\" and \"genres\"\n", "movies = (\n", @@ -799,13 +899,13 @@ " .withColumn(COL_TITLE, F.regexp_replace(F.col(COL_TITLE), \"[\\(),:^0-9]\", \"\")) # remove year from title\n", " .drop(\"count\") # remove unused columns\n", ")" - ], - "outputs": [], - "metadata": {} + ] }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 22, + "metadata": {}, + "outputs": [], "source": [ "# tokenize \"title\" column\n", "title_tokenizer = Tokenizer(inputCol=COL_TITLE, outputCol=\"title_words\")\n", @@ -814,13 +914,51 @@ "# remove stop words\n", "remover = StopWordsRemover(inputCol=\"title_words\", outputCol=\"text\")\n", "clean_data = remover.transform(tokenized_data).drop(COL_TITLE, \"title_words\")" - ], - "outputs": [], - "metadata": {} + ] }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[Stage 1441:============================================> (172 + 2) / 200]\r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+-------+------------------------------------------------------------------------------------+\n", + "|MovieId|features |\n", + "+-------+------------------------------------------------------------------------------------+\n", + "|29 |(1043,[158,269,1025,1026,1029,1031],[1.0,1.0,1.0,1.0,1.0,1.0]) |\n", + "|26 |(1043,[54,139,1025],[1.0,1.0,1.0]) |\n", + "|1677 |(1043,[260,902,1024],[1.0,1.0,1.0]) |\n", + "|964 |(1043,[416,429,1024,1025],[1.0,1.0,1.0,1.0]) |\n", + "|474 |(1043,[112,302,329,517,540,787,933,1032,1034],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0])|\n", + "|1258 |(1043,[114,799,1025,1028],[1.0,1.0,1.0,1.0]) |\n", + "|541 |(1043,[635,910,1026,1029],[1.0,1.0,1.0,1.0]) |\n", + "|1224 |(1043,[978,1024],[1.0,1.0]) |\n", + "|558 
|(1043,[231,524,1024,1027,1041],[1.0,1.0,1.0,1.0,1.0]) |\n", + "|191 |(1043,[206,1024,1035],[1.0,1.0,1.0]) |\n", + "+-------+------------------------------------------------------------------------------------+\n", + "only showing top 10 rows\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\r", + " \r" + ] + } + ], "source": [ "# convert text input into feature vectors\n", "\n", @@ -841,43 +979,36 @@ "feature_data = assembler.transform(vectorized_data).select(COL_ITEM, \"features\")\n", "\n", "feature_data.show(10, False)" - ], - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "+------+---------------------------------------------+\n", - "|ItemId|features |\n", - "+------+---------------------------------------------+\n", - "|167 |(1043,[128,544,1025],[1.0,1.0,1.0]) |\n", - "|1343 |(1043,[38,300,1024],[1.0,1.0,1.0]) |\n", - "|1607 |(1043,[592,821,1024],[1.0,1.0,1.0]) |\n", - "|966 |(1043,[389,502,1028],[1.0,1.0,1.0]) |\n", - "|9 |(1043,[11,342,1014,1024],[1.0,1.0,1.0,1.0]) |\n", - "|1230 |(1043,[597,740,902,1025],[1.0,1.0,1.0,1.0]) |\n", - "|1118 |(1043,[702,1025],[1.0,1.0]) |\n", - "|673 |(1043,[169,690,1027,1040],[1.0,1.0,1.0,1.0]) |\n", - "|879 |(1043,[909,1026,1027,1034],[1.0,1.0,1.0,1.0])|\n", - "|66 |(1043,[256,1025,1028],[1.0,1.0,1.0]) |\n", - "+------+---------------------------------------------+\n", - "only showing top 10 rows\n", - "\n" - ] - } - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "The *features* column is represented with a SparseVector object. For example, in the feature vector (1043,[128,544,1025],[1.0,1.0,1.0]), 1043 is the vector length, indicating the vector consisting of 1043 item features. The values at index positions 128,544,1025 are 1.0, and the values at other positions are all 0. 
" - ], - "metadata": {} + ] }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.8742459916963194\n", + "0.8891175823541189\n" + ] + } + ], "source": [ "als_eval = SparkDiversityEvaluation(\n", " train_df = train_df, \n", @@ -892,22 +1023,29 @@ "als_serendipity=als_eval.serendipity()\n", "print(als_diversity)\n", "print(als_serendipity)" - ], + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, "outputs": [ { + "name": "stderr", "output_type": "stream", + "text": [ + " \r" + ] + }, + { "name": "stdout", + "output_type": "stream", "text": [ - "0.8738984131037538\n", - "0.8873467159479473\n" + "0.896073781038039\n", + "0.8925253230847529\n" ] } ], - "metadata": {} - }, - { - "cell_type": "code", - "execution_count": 28, "source": [ "random_eval = SparkDiversityEvaluation(\n", " train_df = train_df, \n", @@ -922,28 +1060,18 @@ "random_serendipity=random_eval.serendipity()\n", "print(random_diversity)\n", "print(random_serendipity)" - ], - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "0.8982144953920664\n", - "0.8941807579293202\n" - ] - } - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "It's interesting that the value of diversity and serendipity changes when using different item-item similarity calculation approach, for both ALS algorithm and random recommender. The diversity and serendipity of random recommender are still higher than ALS algorithm. " - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "### References\n", "The metric definitions / formulations are based on the following references:\n", @@ -951,24 +1079,27 @@ "- G. Shani and A. Gunawardana, Evaluating recommendation systems, Recommender Systems Handbook pp. 
257-297, 2010.\n", "- E. Yan, Serendipity: Accuracy’s unpopular best friend in recommender Systems, eugeneyan.com, April 2020\n", "- Y.C. Zhang, D.Ó. Séaghdha, D. Quercia and T. Jambor, Auralist: introducing serendipity into music recommendation, WSDM 2012\n" - ], - "metadata": {} + ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 26, + "metadata": {}, + "outputs": [], "source": [ "# cleanup spark instance\n", "spark.stop()" - ], - "outputs": [], - "metadata": {} + ] } ], "metadata": { + "interpreter": { + "hash": "7ec2189bea0434770dca7423a25e631e1cca9c4e2b4ff137a82f4dff32ac9607" + }, "kernelspec": { - "name": "python3", - "display_name": "Python 3.6.9 64-bit ('.env': venv)" + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" }, "language_info": { "codemirror_mode": { @@ -980,12 +1111,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.9" - }, - "interpreter": { - "hash": "7ec2189bea0434770dca7423a25e631e1cca9c4e2b4ff137a82f4dff32ac9607" + "version": "3.8.0" } }, "nbformat": 4, "nbformat_minor": 1 -} \ No newline at end of file +} diff --git a/recommenders/utils/spark_utils.py b/recommenders/utils/spark_utils.py index 42be301cd9..b294879375 100644 --- a/recommenders/utils/spark_utils.py +++ b/recommenders/utils/spark_utils.py @@ -9,7 +9,7 @@ except ImportError: pass # skip this import if we are in pure python environment -MMLSPARK_PACKAGE = "com.microsoft.ml.spark:mmlspark:1.0.0-rc3-184-3314e164-SNAPSHOT" +MMLSPARK_PACKAGE = "com.microsoft.azure:synapseml_2.12:0.9.5" MMLSPARK_REPO = "https://mmlspark.azureedge.net/maven" # We support Spark v3, but in case you wish to use v2, set # MMLSPARK_PACKAGE = "com.microsoft.ml.spark:mmlspark_2.11:0.18.1" diff --git a/setup.py b/setup.py index 15b7a08aae..5530d36044 100644 --- a/setup.py +++ b/setup.py @@ -74,7 +74,7 @@ "spark": [ "databricks_cli>=0.8.6,<1", "pyarrow>=0.12.1,<7.0.0", - "pyspark>=2.4.5,<3.2.0", + 
"pyspark>=2.4.5,<4.0.0", ], "dev": [ "black>=18.6b4,<21",