From cec970cfcec7a7498d4a576b67584d4b1a672184 Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Mon, 25 Jan 2021 14:59:04 +0000 Subject: [PATCH 01/14] :bug: --- tests/smoke/test_notebooks_gpu.py | 37 +++++++++++++++++++++++-------- 1 file changed, 28 insertions(+), 9 deletions(-) diff --git a/tests/smoke/test_notebooks_gpu.py b/tests/smoke/test_notebooks_gpu.py index b2b81199d9..27e92a734e 100644 --- a/tests/smoke/test_notebooks_gpu.py +++ b/tests/smoke/test_notebooks_gpu.py @@ -1,8 +1,10 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. -import papermill as pm + import pytest +import papermill as pm +import scrapbook as sb from reco_utils.common.gpu_utils import get_number_gpus from tests.notebooks_common import OUTPUT_NOTEBOOK, KERNEL_NAME @@ -48,7 +50,9 @@ def test_ncf_deep_dive_smoke(notebooks): TOP_K=10, MOVIELENS_DATA_SIZE="100k", EPOCHS=1, BATCH_SIZE=1024 ), ) - results = pm.read_notebook(OUTPUT_NOTEBOOK).dataframe.set_index("name")["value"] + results = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.dataframe.set_index("name")[ + "value" + ] # There is too much variability to do an approx equal, just adding top values assert results["map"] == pytest.approx(0.0370396, rel=TOL, abs=ABS_TOL) @@ -71,7 +75,9 @@ def test_fastai_smoke(notebooks): kernel_name=KERNEL_NAME, parameters=dict(TOP_K=10, MOVIELENS_DATA_SIZE="100k", EPOCHS=1), ) - results = pm.read_notebook(OUTPUT_NOTEBOOK).dataframe.set_index("name")["value"] + results = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.dataframe.set_index("name")[ + "value" + ] assert results["rmse"] == pytest.approx(0.959352, rel=TOL, abs=ABS_TOL) assert results["mae"] == pytest.approx(0.766504, rel=TOL, abs=ABS_TOL) @@ -99,7 +105,9 @@ def test_xdeepfm_smoke(notebooks): RANDOM_SEED=42, ), ) - results = pm.read_notebook(OUTPUT_NOTEBOOK).dataframe.set_index("name")["value"] + results = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.dataframe.set_index("name")[ + "value" + ] assert results["res_syn"]["auc"] == pytest.approx(0.5043, rel=TOL, abs=ABS_TOL) assert results["res_syn"]["logloss"] == pytest.approx(0.7046, rel=TOL, abs=ABS_TOL) @@ -125,7 +133,10 @@ def test_wide_deep_smoke(notebooks, tmp): pm.execute_notebook( notebook_path, OUTPUT_NOTEBOOK, kernel_name=KERNEL_NAME, parameters=params ) - results = pm.read_notebook(OUTPUT_NOTEBOOK).dataframe.set_index("name")["value"] + results = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.dataframe.set_index("name")[ + "value" + ] + assert results["rmse"] == pytest.approx(1.06034, rel=TOL, abs=ABS_TOL) assert results["mae"] == pytest.approx(0.876228, rel=TOL, abs=ABS_TOL) assert results["ndcg_at_k"] == pytest.approx(0.181513, rel=TOL, abs=ABS_TOL) @@ -142,7 +153,9 @@ def test_naml_smoke(notebooks): kernel_name=KERNEL_NAME, parameters=dict(epochs=1, seed=42, MIND_type="demo"), ) - results = pm.read_notebook(OUTPUT_NOTEBOOK).dataframe.set_index("name")["value"] + results = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.dataframe.set_index("name")[ + "value" + ] assert results["res_syn"]["group_auc"] == pytest.approx( 0.5801, rel=TOL, abs=ABS_TOL @@ -160,7 +173,9 @@ def test_nrms_smoke(notebooks): kernel_name=KERNEL_NAME, parameters=dict(epochs=1, seed=42, MIND_type="demo"), ) - results = pm.read_notebook(OUTPUT_NOTEBOOK).dataframe.set_index("name")["value"] + results = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.dataframe.set_index("name")[ + "value" + ] assert results["res_syn"]["group_auc"] == pytest.approx( 0.5768, rel=TOL, abs=ABS_TOL @@ -178,7 +193,9 @@ def test_npa_smoke(notebooks): kernel_name=KERNEL_NAME, parameters=dict(epochs=1, seed=42, MIND_type="demo"), ) - results = pm.read_notebook(OUTPUT_NOTEBOOK).dataframe.set_index("name")["value"] + results = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.dataframe.set_index("name")[ + "value" + ] assert results["res_syn"]["group_auc"] == pytest.approx( 0.5861, rel=TOL, abs=ABS_TOL @@ -196,7 +213,9 @@ def test_lstur_smoke(notebooks): kernel_name=KERNEL_NAME, parameters=dict(epochs=1, seed=40, MIND_type="demo"), ) - results = pm.read_notebook(OUTPUT_NOTEBOOK).dataframe.set_index("name")["value"] + results = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.dataframe.set_index("name")[ + "value" + ] assert results["res_syn"]["group_auc"] == pytest.approx( 0.5977, rel=TOL, abs=ABS_TOL From f23bb0254e36b2e44b0681a8a024769dcea3a0a3 Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Mon, 25 Jan 2021 15:01:33 +0000 Subject: [PATCH 02/14] :bug: --- tests/smoke/test_notebooks_gpu.py | 6 +++-- tests/smoke/test_notebooks_pyspark.py | 14 +++++++---- tests/smoke/test_notebooks_python.py | 34 ++++++++++++++++++--------- 3 files changed, 37 insertions(+), 17 deletions(-) diff --git a/tests/smoke/test_notebooks_gpu.py b/tests/smoke/test_notebooks_gpu.py index 27e92a734e..1aa977274b 100644 --- a/tests/smoke/test_notebooks_gpu.py +++ b/tests/smoke/test_notebooks_gpu.py @@ -30,8 +30,10 @@ def test_ncf_smoke(notebooks): kernel_name=KERNEL_NAME, parameters=dict(TOP_K=10, MOVIELENS_DATA_SIZE="100k", EPOCHS=1, BATCH_SIZE=256), ) - results = pm.read_notebook(OUTPUT_NOTEBOOK).dataframe.set_index("name")["value"] - + results = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.dataframe.set_index("name")[ + "value" + ] + assert results["map"] == pytest.approx(0.0409234, rel=TOL, abs=ABS_TOL) assert results["ndcg"] == pytest.approx(0.1773, rel=TOL, abs=ABS_TOL) assert results["precision"] == pytest.approx(0.160127, rel=TOL, abs=ABS_TOL) diff --git a/tests/smoke/test_notebooks_pyspark.py b/tests/smoke/test_notebooks_pyspark.py index 550de2e1b5..482a40215f 100644 --- a/tests/smoke/test_notebooks_pyspark.py +++ b/tests/smoke/test_notebooks_pyspark.py @@ -4,6 +4,8 @@ import sys import pytest import papermill as pm +import scrapbook as sb + from tests.notebooks_common import OUTPUT_NOTEBOOK, KERNEL_NAME @@ -21,8 +23,10 @@ def test_als_pyspark_smoke(notebooks): kernel_name=KERNEL_NAME, parameters=dict(TOP_K=10, MOVIELENS_DATA_SIZE="100k"), ) - nb = pm.read_notebook(OUTPUT_NOTEBOOK) - results = nb.dataframe.set_index("name")["value"] + + results = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.dataframe.set_index("name")[ + "value" + ] assert results["map"] == pytest.approx(0.0052, rel=TOL, abs=ABS_TOL) assert results["ndcg"] == pytest.approx(0.0463, rel=TOL, abs=ABS_TOL) @@ -45,6 +49,8 @@ def test_mmlspark_lightgbm_criteo_smoke(notebooks): kernel_name=KERNEL_NAME, parameters=dict(DATA_SIZE="sample", NUM_ITERATIONS=50, EARLY_STOPPING_ROUND=10), ) - nb = pm.read_notebook(OUTPUT_NOTEBOOK) - results = nb.dataframe.set_index("name")["value"] + + results = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.dataframe.set_index("name")[ + "value" + ] assert results["auc"] == pytest.approx(0.68895, rel=TOL, abs=ABS_TOL) diff --git a/tests/smoke/test_notebooks_python.py b/tests/smoke/test_notebooks_python.py index aa45890520..89d100fd3a 100644 --- a/tests/smoke/test_notebooks_python.py +++ b/tests/smoke/test_notebooks_python.py @@ -3,6 +3,8 @@ import pytest import papermill as pm +import scrapbook as sb + from tests.notebooks_common import OUTPUT_NOTEBOOK, KERNEL_NAME @@ -20,7 +22,9 @@ def test_sar_single_node_smoke(notebooks): kernel_name=KERNEL_NAME, parameters=dict(TOP_K=10, MOVIELENS_DATA_SIZE="100k"), ) - results = pm.read_notebook(OUTPUT_NOTEBOOK).dataframe.set_index("name")["value"] + results = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.dataframe.set_index("name")[ + "value" + ] assert results["map"] == pytest.approx(0.110591, rel=TOL, abs=ABS_TOL) assert results["ndcg"] == pytest.approx(0.382461, rel=TOL, abs=ABS_TOL) @@ -38,7 +42,9 @@ def test_baseline_deep_dive_smoke(notebooks): kernel_name=KERNEL_NAME, parameters=dict(TOP_K=10, MOVIELENS_DATA_SIZE="100k"), ) - results = pm.read_notebook(OUTPUT_NOTEBOOK).dataframe.set_index("name")["value"] + results = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.dataframe.set_index("name")[ + "value" + ] assert results["rmse"] == pytest.approx(1.054252, rel=TOL, abs=ABS_TOL) assert results["mae"] == pytest.approx(0.846033, rel=TOL, abs=ABS_TOL) @@ -60,7 +66,9 @@ def test_surprise_svd_smoke(notebooks): kernel_name=KERNEL_NAME, parameters=dict(MOVIELENS_DATA_SIZE="100k"), ) - results = pm.read_notebook(OUTPUT_NOTEBOOK).dataframe.set_index("name")["value"] + results = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.dataframe.set_index("name")[ + "value" + ] assert results["rmse"] == pytest.approx(0.96, rel=TOL, abs=ABS_TOL) assert results["mae"] == pytest.approx(0.75, rel=TOL, abs=ABS_TOL) @@ -82,7 +90,9 @@ def test_vw_deep_dive_smoke(notebooks): kernel_name=KERNEL_NAME, parameters=dict(MOVIELENS_DATA_SIZE="100k"), ) - results = pm.read_notebook(OUTPUT_NOTEBOOK).dataframe.set_index("name")["value"] + results = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.dataframe.set_index("name")[ + "value" + ] assert results["rmse"] == pytest.approx(0.985920, rel=TOL, abs=ABS_TOL) assert results["mae"] == pytest.approx(0.71292, rel=TOL, abs=ABS_TOL) @@ -111,7 +121,9 @@ def test_lightgbm_quickstart_smoke(notebooks): METRIC="auc", ), ) - results = pm.read_notebook(OUTPUT_NOTEBOOK).dataframe.set_index("name")["value"] + results = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.dataframe.set_index("name")[ + "value" + ] assert results["res_basic"]["auc"] == pytest.approx(0.7674, rel=TOL, abs=ABS_TOL) assert results["res_basic"]["logloss"] == pytest.approx( @@ -133,13 +145,16 @@ def test_cornac_bpr_smoke(notebooks): kernel_name=KERNEL_NAME, parameters=dict(MOVIELENS_DATA_SIZE="100k"), ) - results = pm.read_notebook(OUTPUT_NOTEBOOK).dataframe.set_index("name")["value"] + results = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.dataframe.set_index("name")[ + "value" + ] assert results["map"] == pytest.approx(0.1091, rel=TOL, abs=ABS_TOL) assert results["ndcg"] == pytest.approx(0.4034, rel=TOL, abs=ABS_TOL) assert results["precision"] == pytest.approx(0.3550, rel=TOL, abs=ABS_TOL) assert results["recall"] == pytest.approx(0.1802, rel=TOL, abs=ABS_TOL) + @pytest.mark.smoke def test_mind_utils(notebooks, tmp): notebook_path = notebooks["mind_utils"] @@ -148,8 +163,5 @@ def test_mind_utils(notebooks, tmp): notebook_path, OUTPUT_NOTEBOOK, kernel_name=KERNEL_NAME, - parameters=dict( - mind_type="small", - word_embedding_dim=300 - ), - ) \ No newline at end of file + parameters=dict(mind_type="small", word_embedding_dim=300), + ) From 15bf25dee977a2ce4dd75151f033f3bc7b832201 Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Mon, 25 Jan 2021 15:08:20 +0000 Subject: [PATCH 03/14] :bug: --- tests/integration/test_notebooks_gpu.py | 76 +++++++++++++++------ tests/integration/test_notebooks_pyspark.py | 13 ++-- tests/integration/test_notebooks_python.py | 66 ++++++++++-------- 3 files changed, 103 insertions(+), 52 deletions(-) diff --git a/tests/integration/test_notebooks_gpu.py b/tests/integration/test_notebooks_gpu.py index cd47a5cd4b..84631d7084 100644 --- a/tests/integration/test_notebooks_gpu.py +++ b/tests/integration/test_notebooks_gpu.py @@ -1,12 +1,13 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. -import papermill as pm +import os import pytest +import papermill as pm +import scrapbook as sb from reco_utils.common.gpu_utils import get_number_gpus from tests.notebooks_common import OUTPUT_NOTEBOOK, KERNEL_NAME -import os TOL = 0.5 @@ -48,7 +49,9 @@ def test_ncf_integration(notebooks, size, epochs, expected_values, seed): TOP_K=10, MOVIELENS_DATA_SIZE=size, EPOCHS=epochs, BATCH_SIZE=512, SEED=seed ), ) - results = pm.read_notebook(OUTPUT_NOTEBOOK).dataframe.set_index("name")["value"] + results = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.dataframe.set_index("name")[ + "value" + ] for key, value in expected_values.items(): assert results[key] == pytest.approx(value, rel=TOL, abs=ABS_TOL) @@ -93,7 +96,9 @@ def test_ncf_deep_dive_integration( SEED=seed, ), ) - results = pm.read_notebook(OUTPUT_NOTEBOOK).dataframe.set_index("name")["value"] + results = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.dataframe.set_index("name")[ + "value" + ] for key, value in expected_values.items(): assert results[key] == pytest.approx(value, rel=TOL, abs=ABS_TOL) @@ -123,14 +128,15 @@ def test_ncf_deep_dive_integration( ) def test_fastai_integration(notebooks, size, epochs, expected_values): notebook_path = notebooks["fastai"] - pm.execute_notebook(notebook_path, OUTPUT_NOTEBOOK, kernel_name=KERNEL_NAME) pm.execute_notebook( notebook_path, OUTPUT_NOTEBOOK, kernel_name=KERNEL_NAME, parameters=dict(TOP_K=10, MOVIELENS_DATA_SIZE=size, EPOCHS=epochs), ) - results = pm.read_notebook(OUTPUT_NOTEBOOK).dataframe.set_index("name")["value"] + results = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.dataframe.set_index("name")[ + "value" + ] for key, value in expected_values.items(): assert results[key] == pytest.approx(value, rel=TOL, abs=ABS_TOL) @@ -168,7 +174,9 @@ def test_xdeepfm_integration( RANDOM_SEED=seed, ), ) - results = pm.read_notebook(OUTPUT_NOTEBOOK).dataframe.set_index("name")["value"] + results = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.dataframe.set_index("name")[ + "value" + ] for key, value in expected_values.items(): assert results[key]["auc"] == pytest.approx(value["auc"], rel=TOL, abs=ABS_TOL) @@ -215,7 +223,10 @@ def test_wide_deep_integration(notebooks, size, steps, expected_values, seed, tm pm.execute_notebook( notebook_path, OUTPUT_NOTEBOOK, kernel_name=KERNEL_NAME, parameters=params ) - results = pm.read_notebook(OUTPUT_NOTEBOOK).dataframe.set_index("name")["value"] + results = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.dataframe.set_index("name")[ + "value" + ] + for key, value in expected_values.items(): assert results[key] == pytest.approx(value, rel=TOL, abs=ABS_TOL) @@ -250,7 +261,10 @@ def test_slirec_quickstart_integration( pm.execute_notebook( notebook_path, OUTPUT_NOTEBOOK, kernel_name=KERNEL_NAME, parameters=params ) - results = pm.read_notebook(OUTPUT_NOTEBOOK).dataframe.set_index("name")["value"] + results = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.dataframe.set_index("name")[ + "value" + ] + for key, value in expected_values.items(): assert results[key]["auc"] == pytest.approx(value["auc"], rel=TOL, abs=ABS_TOL) assert results[key]["logloss"] == pytest.approx( @@ -278,14 +292,19 @@ def test_slirec_quickstart_integration( ) ], ) -def test_nrms_quickstart_integration(notebooks, epochs, seed, MIND_type, expected_values): +def test_nrms_quickstart_integration( + notebooks, epochs, seed, MIND_type, expected_values +): notebook_path = notebooks["nrms_quickstart"] params = {"epochs": epochs, "seed": seed, "MIND_type": MIND_type} pm.execute_notebook( notebook_path, OUTPUT_NOTEBOOK, kernel_name=KERNEL_NAME, parameters=params ) - results = pm.read_notebook(OUTPUT_NOTEBOOK).dataframe.set_index("name")["value"] + results = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.dataframe.set_index("name")[ + "value" + ] + for key, value in expected_values.items(): assert results[key]["group_auc"] == pytest.approx( value["group_auc"], rel=TOL, abs=ABS_TOL @@ -321,14 +340,19 @@ def test_nrms_quickstart_integration(notebooks, epochs, seed, MIND_type, expecte ) ], ) -def test_naml_quickstart_integration(notebooks, epochs, seed, MIND_type, expected_values): +def test_naml_quickstart_integration( + notebooks, epochs, seed, MIND_type, expected_values +): notebook_path = notebooks["naml_quickstart"] params = {"epochs": epochs, "seed": seed, "MIND_type": MIND_type} pm.execute_notebook( notebook_path, OUTPUT_NOTEBOOK, kernel_name=KERNEL_NAME, parameters=params ) - results = pm.read_notebook(OUTPUT_NOTEBOOK).dataframe.set_index("name")["value"] + results = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.dataframe.set_index("name")[ + "value" + ] + for key, value in expected_values.items(): assert results[key]["group_auc"] == pytest.approx( value["group_auc"], rel=TOL, abs=ABS_TOL @@ -364,14 +388,19 @@ def test_naml_quickstart_integration(notebooks, epochs, seed, MIND_type, expecte ) ], ) -def test_lstur_quickstart_integration(notebooks, epochs, seed, MIND_type, expected_values): +def test_lstur_quickstart_integration( + notebooks, epochs, seed, MIND_type, expected_values +): notebook_path = notebooks["lstur_quickstart"] params = {"epochs": epochs, "seed": seed, "MIND_type": MIND_type} pm.execute_notebook( notebook_path, OUTPUT_NOTEBOOK, kernel_name=KERNEL_NAME, parameters=params ) - results = pm.read_notebook(OUTPUT_NOTEBOOK).dataframe.set_index("name")["value"] + results = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.dataframe.set_index("name")[ + "value" + ] + for key, value in expected_values.items(): assert results[key]["group_auc"] == pytest.approx( value["group_auc"], rel=TOL, abs=ABS_TOL @@ -407,14 +436,19 @@ def test_lstur_quickstart_integration(notebooks, epochs, seed, MIND_type, expect ) ], ) -def test_npa_quickstart_integration(notebooks, epochs, seed, MIND_type, expected_values): +def test_npa_quickstart_integration( + notebooks, epochs, seed, MIND_type, expected_values +): notebook_path = notebooks["npa_quickstart"] params = {"epochs": epochs, "seed": seed, "MIND_type": MIND_type} pm.execute_notebook( notebook_path, OUTPUT_NOTEBOOK, kernel_name=KERNEL_NAME, parameters=params ) - results = pm.read_notebook(OUTPUT_NOTEBOOK).dataframe.set_index("name")["value"] + results = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.dataframe.set_index("name")[ + "value" + ] + for key, value in expected_values.items(): assert results[key]["group_auc"] == pytest.approx( value["group_auc"], rel=TOL, abs=ABS_TOL @@ -470,7 +504,9 @@ def test_lightgcn_deep_dive_integration( item_file=os.path.join(data_path, r"item_embeddings"), ), ) - results = pm.read_notebook(OUTPUT_NOTEBOOK).dataframe.set_index("name")["value"] + results = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.dataframe.set_index("name")[ + "value" + ] for key, value in expected_values.items(): assert results[key] == pytest.approx(value, rel=TOL, abs=ABS_TOL) @@ -486,7 +522,9 @@ def test_dkn_quickstart_integration(notebooks): kernel_name=KERNEL_NAME, parameters=dict(epochs=5, batch_size=500), ) - results = pm.read_notebook(OUTPUT_NOTEBOOK).dataframe.set_index("name")["value"] + results = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.dataframe.set_index("name")[ + "value" + ] assert results["res"]["auc"] == pytest.approx(0.5651, rel=TOL, abs=ABS_TOL) assert results["res"]["mean_mrr"] == pytest.approx(0.1639, rel=TOL, abs=ABS_TOL) diff --git a/tests/integration/test_notebooks_pyspark.py b/tests/integration/test_notebooks_pyspark.py index 256967292f..9872c606a6 100644 --- a/tests/integration/test_notebooks_pyspark.py +++ b/tests/integration/test_notebooks_pyspark.py @@ -4,6 +4,8 @@ import sys import pytest import papermill as pm +import scrapbook as sb + from tests.notebooks_common import OUTPUT_NOTEBOOK, KERNEL_NAME @@ -21,8 +23,9 @@ def test_als_pyspark_integration(notebooks): kernel_name=KERNEL_NAME, parameters=dict(TOP_K=10, MOVIELENS_DATA_SIZE="1m"), ) - nb = pm.read_notebook(OUTPUT_NOTEBOOK) - results = nb.dataframe.set_index("name")["value"] + results = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.dataframe.set_index("name")[ + "value" + ] assert results["map"] == pytest.approx(0.00201, rel=TOL, abs=ABS_TOL) assert results["ndcg"] == pytest.approx(0.02516, rel=TOL, abs=ABS_TOL) @@ -46,6 +49,8 @@ def test_mmlspark_lightgbm_criteo_integration(notebooks): kernel_name=KERNEL_NAME, parameters=dict(DATA_SIZE="full", NUM_ITERATIONS=50, EARLY_STOPPING_ROUND=10), ) - nb = pm.read_notebook(OUTPUT_NOTEBOOK) - results = nb.dataframe.set_index("name")["value"] + results = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.dataframe.set_index("name")[ + "value" + ] + assert results["auc"] == pytest.approx(0.68895, rel=TOL, abs=ABS_TOL) diff --git a/tests/integration/test_notebooks_python.py b/tests/integration/test_notebooks_python.py index 7068457df4..4264d5b60c 100644 --- a/tests/integration/test_notebooks_python.py +++ b/tests/integration/test_notebooks_python.py @@ -1,13 +1,15 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. -import papermill as pm -import pytest import sys +import pytest +import papermill as pm +import scrapbook as sb from reco_utils.tuning.nni.nni_utils import check_experiment_status, NNI_STATUS_URL from tests.notebooks_common import OUTPUT_NOTEBOOK, KERNEL_NAME + TOL = 0.05 ABS_TOL = 0.05 @@ -44,7 +46,9 @@ def test_sar_single_node_integration(notebooks, size, expected_values): kernel_name=KERNEL_NAME, parameters=dict(TOP_K=10, MOVIELENS_DATA_SIZE=size), ) - results = pm.read_notebook(OUTPUT_NOTEBOOK).dataframe.set_index("name")["value"] + results = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.dataframe.set_index("name")[ + "value" + ] for key, value in expected_values.items(): assert results[key] == pytest.approx(value, rel=TOL, abs=ABS_TOL) @@ -68,14 +72,15 @@ def test_sar_single_node_integration(notebooks, size, expected_values): ) def test_baseline_deep_dive_integration(notebooks, size, expected_values): notebook_path = notebooks["baseline_deep_dive"] - pm.execute_notebook(notebook_path, OUTPUT_NOTEBOOK, kernel_name=KERNEL_NAME) pm.execute_notebook( notebook_path, OUTPUT_NOTEBOOK, kernel_name=KERNEL_NAME, parameters=dict(TOP_K=10, MOVIELENS_DATA_SIZE=size), ) - results = pm.read_notebook(OUTPUT_NOTEBOOK).dataframe.set_index("name")["value"] + results = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.dataframe.set_index("name")[ + "value" + ] for key, value in expected_values.items(): assert results[key] == pytest.approx(value, rel=TOL, abs=ABS_TOL) @@ -109,7 +114,9 @@ def test_surprise_svd_integration(notebooks, size, expected_values): kernel_name=KERNEL_NAME, parameters=dict(MOVIELENS_DATA_SIZE=size), ) - results = pm.read_notebook(OUTPUT_NOTEBOOK).dataframe.set_index("name")["value"] + results = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.dataframe.set_index("name")[ + "value" + ] for key, value in expected_values.items(): assert results[key] == pytest.approx(value, rel=TOL, abs=ABS_TOL) @@ -142,7 +149,9 @@ def test_vw_deep_dive_integration(notebooks, size, expected_values): kernel_name=KERNEL_NAME, parameters=dict(MOVIELENS_DATA_SIZE=size, TOP_K=10), ) - results = pm.read_notebook(OUTPUT_NOTEBOOK).dataframe.set_index("name")["value"] + results = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.dataframe.set_index("name")[ + "value" + ] for key, value in expected_values.items(): assert results[key] == pytest.approx(value, rel=TOL, abs=ABS_TOL) @@ -179,11 +188,14 @@ def test_wikidata_integration(notebooks, tmp): MOVIELENS_DATA_SIZE="100k", MOVIELENS_SAMPLE=True, MOVIELENS_SAMPLE_SIZE=5 ), ) + results = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.dataframe.set_index("name")[ + "value" + ] - results = pm.read_notebook(OUTPUT_NOTEBOOK).dataframe.set_index("name")["value"] # NOTE: The return number should be always 5, but sometimes we get less because wikidata is unstable assert results["length_result"] >= 1 + @pytest.mark.integration def test_mind_utils_integration(notebooks, tmp): notebook_path = notebooks["mind_utils"] @@ -191,12 +203,12 @@ def test_mind_utils_integration(notebooks, tmp): notebook_path, OUTPUT_NOTEBOOK, kernel_name=KERNEL_NAME, - parameters=dict( - mind_type="small", word_embedding_dim=300 - ), + parameters=dict(mind_type="small", word_embedding_dim=300), ) + results = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.dataframe.set_index("name")[ + "value" + ] - results = pm.read_notebook(OUTPUT_NOTEBOOK).dataframe.set_index("name")["value"] assert results["utils_state"]["vert_num"] == 17 assert results["utils_state"]["subvert_num"] == 17 assert results["utils_state"]["word_num"] == 31029 @@ -205,6 +217,7 @@ def test_mind_utils_integration(notebooks, tmp): assert results["utils_state"]["embedding_exist_num_all"] == 48422 assert results["utils_state"]["uid2index"] == 50000 + @pytest.mark.integration @pytest.mark.parametrize( "size, expected_values", @@ -221,7 +234,9 @@ def test_cornac_bpr_integration(notebooks, size, expected_values): kernel_name=KERNEL_NAME, parameters=dict(MOVIELENS_DATA_SIZE=size), ) - results = pm.read_notebook(OUTPUT_NOTEBOOK).dataframe.set_index("name")["value"] + results = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.dataframe.set_index("name")[ + "value" + ] for key, value in expected_values.items(): assert results[key] == pytest.approx(value, rel=TOL, abs=ABS_TOL) @@ -235,31 +250,24 @@ def test_xlearn_fm_integration(notebooks): kernel_name=KERNEL_NAME, parameters=dict(LEARNING_RATE=0.2, EPOCH=10), ) - results = pm.read_notebook(OUTPUT_NOTEBOOK).dataframe.set_index("name")["value"] + results = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.dataframe.set_index("name")[ + "value" + ] assert results["auc_score"] == pytest.approx(0.75, rel=TOL, abs=ABS_TOL) @pytest.mark.integration @pytest.mark.parametrize( - "expected_values", - [ - ( - { - "rmse": 0.4969, - "mae": 0.4761 - } - ) - ], + "expected_values", [({"rmse": 0.4969, "mae": 0.4761})], ) def test_geoimc_integration(notebooks, expected_values): notebook_path = notebooks["geoimc_quickstart"] - pm.execute_notebook( - notebook_path, - OUTPUT_NOTEBOOK, - kernel_name=KERNEL_NAME - ) - results = pm.read_notebook(OUTPUT_NOTEBOOK).dataframe.set_index("name")["value"] + pm.execute_notebook(notebook_path, OUTPUT_NOTEBOOK, kernel_name=KERNEL_NAME) + results = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.dataframe.set_index("name")[ + "value" + ] for key, value in expected_values.items(): assert results[key] == pytest.approx(value, rel=TOL, abs=ABS_TOL) + From f3851a17bf6d0b186f992dee200738b1252759b4 Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Mon, 25 Jan 2021 15:41:40 +0000 Subject: [PATCH 04/14] :bug: --- tests/smoke/test_notebooks_python.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/tests/smoke/test_notebooks_python.py b/tests/smoke/test_notebooks_python.py index 89d100fd3a..347502b18e 100644 --- a/tests/smoke/test_notebooks_python.py +++ b/tests/smoke/test_notebooks_python.py @@ -15,7 +15,6 @@ @pytest.mark.smoke def test_sar_single_node_smoke(notebooks): notebook_path = notebooks["sar_single_node"] - pm.execute_notebook(notebook_path, OUTPUT_NOTEBOOK, kernel_name=KERNEL_NAME) pm.execute_notebook( notebook_path, OUTPUT_NOTEBOOK, @@ -35,7 +34,6 @@ def test_sar_single_node_smoke(notebooks): @pytest.mark.smoke def test_baseline_deep_dive_smoke(notebooks): notebook_path = notebooks["baseline_deep_dive"] - pm.execute_notebook(notebook_path, OUTPUT_NOTEBOOK, kernel_name=KERNEL_NAME) pm.execute_notebook( notebook_path, OUTPUT_NOTEBOOK, @@ -59,7 +57,6 @@ def test_baseline_deep_dive_smoke(notebooks): @pytest.mark.smoke def test_surprise_svd_smoke(notebooks): notebook_path = notebooks["surprise_svd_deep_dive"] - pm.execute_notebook(notebook_path, OUTPUT_NOTEBOOK, kernel_name=KERNEL_NAME) pm.execute_notebook( notebook_path, OUTPUT_NOTEBOOK, @@ -83,7 +80,6 @@ def test_surprise_svd_smoke(notebooks): @pytest.mark.smoke def test_vw_deep_dive_smoke(notebooks): notebook_path = notebooks["vowpal_wabbit_deep_dive"] - pm.execute_notebook(notebook_path, OUTPUT_NOTEBOOK, kernel_name=KERNEL_NAME) pm.execute_notebook( notebook_path, OUTPUT_NOTEBOOK, @@ -107,7 +103,6 @@ def test_vw_deep_dive_smoke(notebooks): @pytest.mark.smoke def test_lightgbm_quickstart_smoke(notebooks): notebook_path = notebooks["lightgbm_quickstart"] - pm.execute_notebook(notebook_path, OUTPUT_NOTEBOOK, kernel_name=KERNEL_NAME) pm.execute_notebook( notebook_path, OUTPUT_NOTEBOOK, @@ -138,7 +133,6 @@ def test_lightgbm_quickstart_smoke(notebooks): @pytest.mark.smoke def test_cornac_bpr_smoke(notebooks): notebook_path = notebooks["cornac_bpr_deep_dive"] - pm.execute_notebook(notebook_path, OUTPUT_NOTEBOOK, kernel_name=KERNEL_NAME) pm.execute_notebook( notebook_path, OUTPUT_NOTEBOOK, From 75ce8d46559b7f51e54b11b3f15496ee6ef6a424 Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Mon, 25 Jan 2021 15:56:33 +0000 Subject: [PATCH 05/14] version --- tools/generate_conda_file.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/generate_conda_file.py b/tools/generate_conda_file.py index b8ab12748c..2fda8fa5cf 100644 --- a/tools/generate_conda_file.py +++ b/tools/generate_conda_file.py @@ -88,7 +88,7 @@ "xlearn": "xlearn==0.40a1", "transformers": "transformers==2.5.0", "tensorflow": "tensorflow==1.15.2", - "nteract-scrapbook": "nteract-scrapbook>=0.4.0", + "scrapbook": "scrapbook>=0.5.0", } PIP_GPU = { From 86f1ca79330467f68631fe7b49f3ea3682eaef85 Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Tue, 26 Jan 2021 14:20:43 +0000 Subject: [PATCH 06/14] trick to copy-past faster --- SETUP.md | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/SETUP.md b/SETUP.md index a5938c8e07..6aaab4de6f 100644 --- a/SETUP.md +++ b/SETUP.md @@ -120,18 +120,21 @@ You also need to find where Spark is installed and set `SPARK_HOME` variable, on Then, create the file `$RECO_ENV/etc/conda/activate.d/env_vars.sh` and add: - #!/bin/sh - RECO_ENV=$(conda env list | grep reco_pyspark | awk '{print $NF}') - export PYSPARK_PYTHON=$RECO_ENV/bin/python - export PYSPARK_DRIVER_PYTHON=$RECO_ENV/bin/python - export SPARK_HOME=/dsvm/tools/spark/current +```bash +#!/bin/sh +RECO_ENV=$(conda env list | grep reco_pyspark | awk '{print $NF}') +export PYSPARK_PYTHON=$RECO_ENV/bin/python +export PYSPARK_DRIVER_PYTHON=$RECO_ENV/bin/python +export SPARK_HOME=/dsvm/tools/spark/current +``` This will export the variables every time we do `conda activate reco_pyspark`. To unset these variables when we deactivate the environment, create the file `$RECO_ENV/etc/conda/deactivate.d/env_vars.sh` and add: - #!/bin/sh - unset PYSPARK_PYTHON - unset PYSPARK_DRIVER_PYTHON - +```bash +#!/bin/sh +unset PYSPARK_PYTHON +unset PYSPARK_DRIVER_PYTHON +``` From 1c439497382f69a14c4d20be9dd02f608c3048af Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Tue, 26 Jan 2021 14:30:59 +0000 Subject: [PATCH 07/14] mind --- examples/01_prepare_data/mind_utils.ipynb | 40 ++++------------------- 1 file changed, 7 insertions(+), 33 deletions(-) diff --git a/examples/01_prepare_data/mind_utils.ipynb b/examples/01_prepare_data/mind_utils.ipynb index 2a49199c8f..6a231c352f 100644 --- a/examples/01_prepare_data/mind_utils.ipynb +++ b/examples/01_prepare_data/mind_utils.ipynb @@ -46,14 +46,12 @@ "import sys\n", "sys.path.append(\"../../\")\n", "import os\n", - "import papermill as pm\n", "import pandas as pd\n", "from collections import Counter\n", "from tqdm import tqdm\n", "import pickle\n", "import numpy as np\n", - "\n", - "print(\"System version: {}\".format(sys.version))\n", + "import scrapbook as sb\n", "\n", "from tempfile import TemporaryDirectory\n", "from reco_utils.dataset.mind import (download_mind,\n", @@ -62,7 +60,9 @@ " load_glove_matrix,\n", " word_tokenize\n", " )\n", - "from reco_utils.dataset.download_utils import unzip_file" + "from reco_utils.dataset.download_utils import unzip_file\n", + "\n", + "print(\"System version: {}\".format(sys.version))\n" ] }, { @@ -418,37 +418,11 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/apple/miniconda/envs/reco_base/lib/python3.6/site-packages/ipykernel_launcher.py:1: DeprecationWarning: Function record is deprecated and will be removed in verison 1.0.0 (current version 0.19.1). Please see `scrapbook.glue` (nteract-scrapbook) as a replacement for this functionality.\n", - " \"\"\"Entry point for launching an IPython kernel.\n" - ] - }, - { - "data": { - "application/papermill.record+json": { - "utils_state": { - "embedding_exist_num": 22408, - "embedding_exist_num_all": 37634, - "subvert_num": 17, - "uid2index": 5000, - "vert_num": 17, - "word_num": 23404, - "word_num_all": 41074 - } - } - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ - "pm.record(\"utils_state\", utils_state)" + "sb.glue(\"utils_state\", utils_state)" ] } ], From 2bf35d46eedd022678bf17bec6f339e40af06f04 Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Tue, 26 Jan 2021 14:45:35 +0000 Subject: [PATCH 08/14] :bug: --- examples/00_quick_start/sar_movielens.ipynb | 12 ++++------- examples/template.ipynb | 8 +++---- tests/integration/test_notebooks_gpu.py | 24 ++++++++++----------- tests/integration/test_notebooks_pyspark.py | 4 ++-- tests/integration/test_notebooks_python.py | 18 ++++++++-------- tests/smoke/test_notebooks_gpu.py | 18 ++++++++-------- tests/smoke/test_notebooks_pyspark.py | 4 ++-- tests/smoke/test_notebooks_python.py | 12 +++++------ tests/unit/test_notebook_utils.py | 1 - 9 files changed, 48 insertions(+), 53 deletions(-) diff --git a/examples/00_quick_start/sar_movielens.ipynb b/examples/00_quick_start/sar_movielens.ipynb index 7cd9d429f3..3baeff99c8 100644 --- a/examples/00_quick_start/sar_movielens.ipynb +++ b/examples/00_quick_start/sar_movielens.ipynb @@ -455,13 +455,9 @@ "metadata": { "celltoolbar": "Tags", "kernelspec": { - "display_name": "Python 3.6.12 64-bit ('sb_full': conda)", - "metadata": { - "interpreter": { - "hash": "f28711ae1fad89778b64817fc2d746effb845deda73edae96b2473c20b2d4f70" - } - }, - "name": "python3" + "display_name": "Python (reco_base)", + "language": "python", + "name": "reco_base" }, "language_info": { "codemirror_mode": { @@ -473,7 +469,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.12-final" + "version": "3.6.11" } }, "nbformat": 4, diff --git a/examples/template.ipynb b/examples/template.ipynb index e8db35cb79..cfa88e3c10 100644 --- a/examples/template.ipynb +++ b/examples/template.ipynb @@ -259,9 +259,9 @@ "metadata": { "celltoolbar": "Tags", "kernelspec": { - "display_name": "Python (recommender)", + "display_name": "Python (reco_base)", "language": "python", - "name": "recommender" + "name": "reco_base" }, "language_info": { "codemirror_mode": { @@ -273,9 +273,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.0" + "version": "3.6.11" } }, "nbformat": 4, "nbformat_minor": 2 -} \ No newline at end of file +} diff --git a/tests/integration/test_notebooks_gpu.py b/tests/integration/test_notebooks_gpu.py index 84631d7084..4c7780abe9 100644 --- a/tests/integration/test_notebooks_gpu.py +++ b/tests/integration/test_notebooks_gpu.py @@ -50,7 +50,7 @@ def test_ncf_integration(notebooks, size, epochs, expected_values, seed): ), ) results = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.dataframe.set_index("name")[ - "value" + "data" ] for key, value in expected_values.items(): @@ -97,7 +97,7 @@ def test_ncf_deep_dive_integration( ), ) results = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.dataframe.set_index("name")[ - "value" + "data" ] for key, value in expected_values.items(): @@ -135,7 +135,7 @@ def test_fastai_integration(notebooks, size, epochs, expected_values): parameters=dict(TOP_K=10, MOVIELENS_DATA_SIZE=size, EPOCHS=epochs), ) results = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.dataframe.set_index("name")[ - "value" + "data" ] for key, value in expected_values.items(): @@ -175,7 +175,7 @@ def test_xdeepfm_integration( ), ) results = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.dataframe.set_index("name")[ - "value" + "data" ] for key, value in expected_values.items(): @@ -224,7 +224,7 @@ def test_wide_deep_integration(notebooks, size, steps, expected_values, seed, tm notebook_path, OUTPUT_NOTEBOOK, kernel_name=KERNEL_NAME, parameters=params ) results = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.dataframe.set_index("name")[ - "value" + "data" ] for key, value in expected_values.items(): @@ -262,7 +262,7 @@ def test_slirec_quickstart_integration( notebook_path, OUTPUT_NOTEBOOK, kernel_name=KERNEL_NAME, parameters=params ) results = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.dataframe.set_index("name")[ - "value" + "data" ] for key, value in expected_values.items(): @@ -302,7 +302,7 @@ def test_nrms_quickstart_integration( notebook_path, OUTPUT_NOTEBOOK, kernel_name=KERNEL_NAME, parameters=params ) results = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.dataframe.set_index("name")[ - "value" + "data" ] for key, value in expected_values.items(): @@ -350,7 +350,7 @@ def test_naml_quickstart_integration( notebook_path, OUTPUT_NOTEBOOK, kernel_name=KERNEL_NAME, parameters=params ) results = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.dataframe.set_index("name")[ - "value" + "data" ] for key, value in expected_values.items(): @@ -398,7 +398,7 @@ def test_lstur_quickstart_integration( notebook_path, OUTPUT_NOTEBOOK, kernel_name=KERNEL_NAME, parameters=params ) results = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.dataframe.set_index("name")[ - "value" + "data" ] for key, value in expected_values.items(): @@ -446,7 +446,7 @@ def test_npa_quickstart_integration( notebook_path, OUTPUT_NOTEBOOK, kernel_name=KERNEL_NAME, parameters=params ) results = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.dataframe.set_index("name")[ - "value" + "data" ] for key, value in expected_values.items(): @@ -505,7 +505,7 @@ def test_lightgcn_deep_dive_integration( ), ) results = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.dataframe.set_index("name")[ - "value" + "data" ] for key, value in expected_values.items(): @@ -523,7 +523,7 @@ def test_dkn_quickstart_integration(notebooks): parameters=dict(epochs=5, batch_size=500), ) results = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.dataframe.set_index("name")[ - "value" + "data" ] assert results["res"]["auc"] == pytest.approx(0.5651, rel=TOL, abs=ABS_TOL) diff --git a/tests/integration/test_notebooks_pyspark.py b/tests/integration/test_notebooks_pyspark.py index 9872c606a6..d8716b01e7 100644 --- a/tests/integration/test_notebooks_pyspark.py +++ b/tests/integration/test_notebooks_pyspark.py @@ -24,7 +24,7 @@ def test_als_pyspark_integration(notebooks): parameters=dict(TOP_K=10, MOVIELENS_DATA_SIZE="1m"), ) results = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.dataframe.set_index("name")[ - "value" + "data" ] assert results["map"] == pytest.approx(0.00201, rel=TOL, abs=ABS_TOL) @@ -50,7 +50,7 @@ def test_mmlspark_lightgbm_criteo_integration(notebooks): parameters=dict(DATA_SIZE="full", NUM_ITERATIONS=50, EARLY_STOPPING_ROUND=10), ) results = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.dataframe.set_index("name")[ - "value" + "data" ] assert results["auc"] == pytest.approx(0.68895, rel=TOL, abs=ABS_TOL) diff --git a/tests/integration/test_notebooks_python.py b/tests/integration/test_notebooks_python.py index 4264d5b60c..ce0a8ee89e 100644 --- a/tests/integration/test_notebooks_python.py +++ b/tests/integration/test_notebooks_python.py @@ -47,7 +47,7 @@ def test_sar_single_node_integration(notebooks, size, expected_values): parameters=dict(TOP_K=10, MOVIELENS_DATA_SIZE=size), ) results = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.dataframe.set_index("name")[ - "value" + "data" ] for key, value in expected_values.items(): @@ -79,7 +79,7 @@ def test_baseline_deep_dive_integration(notebooks, size, expected_values): parameters=dict(TOP_K=10, MOVIELENS_DATA_SIZE=size), ) results = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.dataframe.set_index("name")[ - "value" + "data" ] for key, value in expected_values.items(): @@ -115,7 +115,7 @@ def test_surprise_svd_integration(notebooks, size, expected_values): parameters=dict(MOVIELENS_DATA_SIZE=size), ) results = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.dataframe.set_index("name")[ - "value" + "data" ] for key, value in expected_values.items(): @@ -150,7 +150,7 @@ def test_vw_deep_dive_integration(notebooks, size, expected_values): parameters=dict(MOVIELENS_DATA_SIZE=size, TOP_K=10), ) results = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.dataframe.set_index("name")[ - "value" + "data" ] for key, value in expected_values.items(): @@ -189,7 +189,7 @@ def test_wikidata_integration(notebooks, tmp): ), ) results = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.dataframe.set_index("name")[ - "value" + "data" ] # NOTE: The return number should be always 5, but sometimes we get less because wikidata is unstable @@ -206,7 +206,7 @@ def test_mind_utils_integration(notebooks, tmp): parameters=dict(mind_type="small", word_embedding_dim=300), ) results = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.dataframe.set_index("name")[ - "value" + "data" ] assert results["utils_state"]["vert_num"] == 17 @@ -235,7 +235,7 @@ def test_cornac_bpr_integration(notebooks, size, expected_values): parameters=dict(MOVIELENS_DATA_SIZE=size), ) results = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.dataframe.set_index("name")[ - "value" + "data" ] for key, value in expected_values.items(): @@ -251,7 +251,7 @@ def test_xlearn_fm_integration(notebooks): parameters=dict(LEARNING_RATE=0.2, EPOCH=10), ) results = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.dataframe.set_index("name")[ - "value" + "data" ] assert results["auc_score"] == pytest.approx(0.75, rel=TOL, abs=ABS_TOL) @@ -265,7 +265,7 @@ def test_geoimc_integration(notebooks, expected_values): notebook_path = notebooks["geoimc_quickstart"] pm.execute_notebook(notebook_path, OUTPUT_NOTEBOOK, kernel_name=KERNEL_NAME) results = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.dataframe.set_index("name")[ - "value" + "data" ] for key, value in expected_values.items(): diff --git a/tests/smoke/test_notebooks_gpu.py b/tests/smoke/test_notebooks_gpu.py index 1aa977274b..fbd8fcf0a0 100644 --- a/tests/smoke/test_notebooks_gpu.py +++ b/tests/smoke/test_notebooks_gpu.py @@ -31,7 +31,7 @@ def test_ncf_smoke(notebooks): parameters=dict(TOP_K=10, MOVIELENS_DATA_SIZE="100k", EPOCHS=1, BATCH_SIZE=256), ) results = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.dataframe.set_index("name")[ - "value" + "data" ] assert results["map"] == pytest.approx(0.0409234, rel=TOL, abs=ABS_TOL) @@ -53,7 +53,7 @@ def test_ncf_deep_dive_smoke(notebooks): ), ) results = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.dataframe.set_index("name")[ - "value" + "data" ] # There is too much variability to do an approx equal, just adding top values @@ -78,7 +78,7 @@ def test_fastai_smoke(notebooks): parameters=dict(TOP_K=10, MOVIELENS_DATA_SIZE="100k", EPOCHS=1), ) results = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.dataframe.set_index("name")[ - "value" + "data" ] assert results["rmse"] == pytest.approx(0.959352, rel=TOL, abs=ABS_TOL) @@ -108,7 +108,7 @@ def test_xdeepfm_smoke(notebooks): ), ) results = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.dataframe.set_index("name")[ - "value" + "data" ] assert results["res_syn"]["auc"] == pytest.approx(0.5043, rel=TOL, abs=ABS_TOL) @@ -136,7 +136,7 @@ def test_wide_deep_smoke(notebooks, tmp): notebook_path, OUTPUT_NOTEBOOK, kernel_name=KERNEL_NAME, parameters=params ) results = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.dataframe.set_index("name")[ - "value" + "data" ] assert results["rmse"] == pytest.approx(1.06034, rel=TOL, abs=ABS_TOL) @@ -156,7 +156,7 @@ def test_naml_smoke(notebooks): parameters=dict(epochs=1, seed=42, MIND_type="demo"), ) results = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.dataframe.set_index("name")[ - "value" + "data" ] assert results["res_syn"]["group_auc"] == pytest.approx( @@ -176,7 +176,7 @@ def test_nrms_smoke(notebooks): parameters=dict(epochs=1, seed=42, MIND_type="demo"), ) results = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.dataframe.set_index("name")[ - "value" + "data" ] assert results["res_syn"]["group_auc"] == pytest.approx( @@ -196,7 +196,7 @@ def test_npa_smoke(notebooks): parameters=dict(epochs=1, seed=42, MIND_type="demo"), ) results = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.dataframe.set_index("name")[ - "value" + "data" ] assert results["res_syn"]["group_auc"] == pytest.approx( @@ -216,7 +216,7 @@ def test_lstur_smoke(notebooks): parameters=dict(epochs=1, seed=40, MIND_type="demo"), ) results = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.dataframe.set_index("name")[ - "value" + "data" ] assert results["res_syn"]["group_auc"] == pytest.approx( diff --git a/tests/smoke/test_notebooks_pyspark.py b/tests/smoke/test_notebooks_pyspark.py index 482a40215f..6489fced54 100644 --- a/tests/smoke/test_notebooks_pyspark.py +++ b/tests/smoke/test_notebooks_pyspark.py @@ -25,7 +25,7 @@ def test_als_pyspark_smoke(notebooks): ) results = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.dataframe.set_index("name")[ - "value" + "data" ] assert results["map"] == pytest.approx(0.0052, rel=TOL, abs=ABS_TOL) @@ -51,6 +51,6 @@ def test_mmlspark_lightgbm_criteo_smoke(notebooks): ) results = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.dataframe.set_index("name")[ - "value" + "data" ] assert results["auc"] == pytest.approx(0.68895, rel=TOL, abs=ABS_TOL) diff --git a/tests/smoke/test_notebooks_python.py b/tests/smoke/test_notebooks_python.py index 347502b18e..66eb92e5c1 100644 --- a/tests/smoke/test_notebooks_python.py +++ b/tests/smoke/test_notebooks_python.py @@ -22,7 +22,7 @@ def test_sar_single_node_smoke(notebooks): parameters=dict(TOP_K=10, MOVIELENS_DATA_SIZE="100k"), ) results = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.dataframe.set_index("name")[ - "value" + "data" ] assert results["map"] == pytest.approx(0.110591, rel=TOL, abs=ABS_TOL) @@ -41,7 +41,7 @@ def test_baseline_deep_dive_smoke(notebooks): parameters=dict(TOP_K=10, MOVIELENS_DATA_SIZE="100k"), ) results = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.dataframe.set_index("name")[ - "value" + "data" ] assert results["rmse"] == pytest.approx(1.054252, rel=TOL, abs=ABS_TOL) @@ -64,7 +64,7 @@ def test_surprise_svd_smoke(notebooks): parameters=dict(MOVIELENS_DATA_SIZE="100k"), ) results = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.dataframe.set_index("name")[ - "value" + "data" ] assert results["rmse"] == pytest.approx(0.96, rel=TOL, abs=ABS_TOL) @@ -87,7 +87,7 @@ def test_vw_deep_dive_smoke(notebooks): parameters=dict(MOVIELENS_DATA_SIZE="100k"), ) results = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.dataframe.set_index("name")[ - "value" + "data" ] assert results["rmse"] == pytest.approx(0.985920, rel=TOL, abs=ABS_TOL) @@ -117,7 +117,7 @@ def test_lightgbm_quickstart_smoke(notebooks): ), ) results = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.dataframe.set_index("name")[ - "value" + "data" ] assert results["res_basic"]["auc"] == pytest.approx(0.7674, rel=TOL, abs=ABS_TOL) @@ -140,7 +140,7 @@ def test_cornac_bpr_smoke(notebooks): parameters=dict(MOVIELENS_DATA_SIZE="100k"), ) results = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.dataframe.set_index("name")[ - "value" + "data" ] assert results["map"] == pytest.approx(0.1091, rel=TOL, abs=ABS_TOL) diff --git a/tests/unit/test_notebook_utils.py b/tests/unit/test_notebook_utils.py index ea6bab9902..1e193a3bc0 100644 --- a/tests/unit/test_notebook_utils.py +++ b/tests/unit/test_notebook_utils.py @@ -23,7 +23,6 @@ def test_is_jupyter(): nb = sb.read_notebook(OUTPUT_NOTEBOOK) df = nb.papermill_dataframe result_is_jupyter = df.loc[df["name"] == "is_jupyter", "value"].values[0] - assert result_is_jupyter assert result_is_jupyter is True result_is_databricks = df.loc[df["name"] == "is_databricks", "value"].values[0] assert result_is_databricks is False From 979987abe4eff3d6411aea3b667d4e9507949b02 Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Tue, 26 Jan 2021 14:49:37 +0000 Subject: [PATCH 09/14] :bug: --- tests/unit/test_notebook_utils.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/unit/test_notebook_utils.py b/tests/unit/test_notebook_utils.py index 1e193a3bc0..b1b712a5cd 100644 --- a/tests/unit/test_notebook_utils.py +++ b/tests/unit/test_notebook_utils.py @@ -21,11 +21,11 @@ def test_is_jupyter(): path, OUTPUT_NOTEBOOK, kernel_name=KERNEL_NAME, ) nb = sb.read_notebook(OUTPUT_NOTEBOOK) - df = nb.papermill_dataframe - result_is_jupyter = df.loc[df["name"] == "is_jupyter", "value"].values[0] - assert result_is_jupyter is True - result_is_databricks = df.loc[df["name"] == "is_databricks", "value"].values[0] - assert result_is_databricks is False + df = nb.scraps.dataframe + result_is_jupyter = df.loc[df["name"] == "is_jupyter", "data"].values[0] + assert result_is_jupyter == True # is True not allowed + result_is_databricks = df.loc[df["name"] == "is_databricks", "data"].values[0] + assert result_is_databricks == False # @pytest.mark.notebooks From de49515c7b030e39f324e23dea5f0691f7737900 Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Tue, 26 Jan 2021 15:45:25 +0000 Subject: [PATCH 10/14] weird --- tests/smoke/test_mind.py | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/tests/smoke/test_mind.py b/tests/smoke/test_mind.py index bebc69c631..72426ac4f7 100644 --- a/tests/smoke/test_mind.py +++ b/tests/smoke/test_mind.py @@ -6,39 +6,38 @@ import requests from reco_utils.dataset.mind import download_mind, extract_mind + @pytest.mark.smoke @pytest.mark.parametrize("url, content_length, etag", - [("https://recodatasets.z20.web.core.windows.net/newsrec/MINDdemo_train.zip", - "17372879", "0x8D82C63E386D09C"), + [ + ("https://recodatasets.z20.web.core.windows.net/newsrec/MINDdemo_train.zip", + "17372879", '"0x8D8B8AD5B233930"'), # NOTE: the z20 blob returns the etag with "" ("https://recodatasets.z20.web.core.windows.net/newsrec/MINDdemo_dev.zip", - "10080022", "0x8D82C6434EC3CEE"), + "10080022", '"0x8D8B8AD5B188839"'), ("https://recodatasets.z20.web.core.windows.net/newsrec/MINDdemo_utils.zip", - "97292694", "0x8D87F362FF7FB26"), + "97292694", '"0x8D8B8AD5B126C3B"'), ("https://mind201910small.blob.core.windows.net/release/MINDsmall_train.zip", - "52952752","0x8D834F2EB31BDEC"), + "52952752", "0x8D834F2EB31BDEC"), ("https://mind201910small.blob.core.windows.net/release/MINDsmall_dev.zip", - "30945572","0x8D834F2EBA8D865"), + "30945572", "0x8D834F2EBA8D865"), ("https://mind201910small.blob.core.windows.net/release/MINDsmall_utils.zip", "155178106", "0x8D87F67F4AEB960"), ("https://mind201910small.blob.core.windows.net/release/MINDlarge_train.zip", - "530196631","0x8D8244E90C15C07"), + "530196631", "0x8D8244E90C15C07"), ("https://mind201910small.blob.core.windows.net/release/MINDlarge_dev.zip", - "103456245","0x8D8244E92005849"), + "103456245", "0x8D8244E92005849"), ("https://mind201910small.blob.core.windows.net/release/MINDlarge_utils.zip", "150359301", "0x8D87F67E6CA4364"), ]) def test_mind_url(url, content_length, etag): - """ Test file sizes and etags. - Covers train, dev and utils files for demo, small and large datasets. - """ url_headers = requests.head(url).headers assert url_headers["Content-Length"] == content_length assert url_headers["ETag"] == etag + @pytest.mark.smoke @pytest.mark.parametrize("size",[("demo"),("small")]) def test_extract_mind(size,tmp): - """ Test file download and extration for demo and small datasets """ train_zip, valid_zip = download_mind(size, dest_path=tmp) train_path, valid_path = extract_mind(train_zip, valid_zip) From e2aad281e2e13fe6c57becedcac177f4b740426a Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Tue, 26 Jan 2021 18:30:00 +0000 Subject: [PATCH 11/14] :bug: --- tests/integration/test_mind.py | 22 ++++++++++++++++++++++ tests/integration/test_notebooks_python.py | 22 ---------------------- 2 files changed, 22 insertions(+), 22 deletions(-) diff --git a/tests/integration/test_mind.py b/tests/integration/test_mind.py index 52cedd8607..b26a8d0258 100644 --- a/tests/integration/test_mind.py +++ b/tests/integration/test_mind.py @@ -37,3 +37,25 @@ def test_extract_mind(tmp): assert statinfo.st_size == 59055351 statinfo = os.stat(os.path.join(valid_path, "relation_embedding.vec")) assert statinfo.st_size == 1044588 + + +@pytest.mark.integration +def test_mind_utils_integration(notebooks, tmp): + notebook_path = notebooks["mind_utils"] + pm.execute_notebook( + notebook_path, + OUTPUT_NOTEBOOK, + kernel_name=KERNEL_NAME, + parameters=dict(mind_type="small", word_embedding_dim=300), + ) + results = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.dataframe.set_index("name")[ + "data" + ] + + assert results["utils_state"]["vert_num"] == 17 + assert results["utils_state"]["subvert_num"] == 17 + assert results["utils_state"]["word_num"] == 23404 + assert results["utils_state"]["word_num_all"] == 55028 + assert results["utils_state"]["embedding_exist_num"] == 29081 + assert results["utils_state"]["embedding_exist_num_all"] == 48422 + assert results["utils_state"]["uid2index"] == 50000 diff --git a/tests/integration/test_notebooks_python.py b/tests/integration/test_notebooks_python.py index ce0a8ee89e..20c4115eb5 100644 --- a/tests/integration/test_notebooks_python.py +++ b/tests/integration/test_notebooks_python.py @@ -196,28 +196,6 @@ def test_wikidata_integration(notebooks, tmp): assert results["length_result"] >= 1 -@pytest.mark.integration -def test_mind_utils_integration(notebooks, tmp): - notebook_path = notebooks["mind_utils"] - pm.execute_notebook( - notebook_path, - OUTPUT_NOTEBOOK, - kernel_name=KERNEL_NAME, - parameters=dict(mind_type="small", word_embedding_dim=300), - ) - results = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.dataframe.set_index("name")[ - "data" - ] - - assert results["utils_state"]["vert_num"] == 17 - assert results["utils_state"]["subvert_num"] == 17 - assert results["utils_state"]["word_num"] == 31029 - assert results["utils_state"]["word_num_all"] == 55028 - assert results["utils_state"]["embedding_exist_num"] == 29081 - assert results["utils_state"]["embedding_exist_num_all"] == 48422 - assert results["utils_state"]["uid2index"] == 50000 - - @pytest.mark.integration @pytest.mark.parametrize( "size, expected_values", From b7675c059032bb9d1d5b9d0fec89dc0d91c66558 Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Tue, 26 Jan 2021 18:31:46 +0000 Subject: [PATCH 12/14] :bug: --- tests/integration/test_mind.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/integration/test_mind.py b/tests/integration/test_mind.py index b26a8d0258..1960b5b113 100644 --- a/tests/integration/test_mind.py +++ b/tests/integration/test_mind.py @@ -1,8 +1,11 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. -import pytest import os +import pytest +import papermill as pm +import scrapbook as sb + from reco_utils.dataset.mind import download_mind, extract_mind From 1c077a7b2614e9239cd5af9f0471e160069cd352 Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Tue, 26 Jan 2021 18:33:47 +0000 Subject: [PATCH 13/14] :bug: --- tests/integration/test_mind.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/integration/test_mind.py b/tests/integration/test_mind.py index 1960b5b113..168fdc171a 100644 --- a/tests/integration/test_mind.py +++ b/tests/integration/test_mind.py @@ -7,6 +7,7 @@ import scrapbook as sb from reco_utils.dataset.mind import download_mind, extract_mind +from tests.notebooks_common import OUTPUT_NOTEBOOK, KERNEL_NAME @pytest.mark.integration From a3fe1249b1f501d74ad3da55da46b3e879dc2e70 Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Tue, 26 Jan 2021 23:25:49 +0000 Subject: [PATCH 14/14] :bug: --- tests/integration/test_mind.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/integration/test_mind.py b/tests/integration/test_mind.py index 168fdc171a..088b4b8b38 100644 --- a/tests/integration/test_mind.py +++ b/tests/integration/test_mind.py @@ -59,7 +59,7 @@ def test_mind_utils_integration(notebooks, tmp): assert results["utils_state"]["vert_num"] == 17 assert results["utils_state"]["subvert_num"] == 17 assert results["utils_state"]["word_num"] == 23404 - assert results["utils_state"]["word_num_all"] == 55028 - assert results["utils_state"]["embedding_exist_num"] == 29081 - assert results["utils_state"]["embedding_exist_num_all"] == 48422 - assert results["utils_state"]["uid2index"] == 50000 + assert results["utils_state"]["word_num_all"] == 41074 + assert results["utils_state"]["embedding_exist_num"] == 22408 + assert results["utils_state"]["embedding_exist_num_all"] == 37634 + assert results["utils_state"]["uid2index"] == 5000