
Bug with papermill read notebook #1289

Merged: 14 commits, Feb 1, 2021
21 changes: 12 additions & 9 deletions SETUP.md
@@ -120,18 +120,21 @@ You also need to find where Spark is installed and set `SPARK_HOME` variable, on

Then, create the file `$RECO_ENV/etc/conda/activate.d/env_vars.sh` and add:

#!/bin/sh
RECO_ENV=$(conda env list | grep reco_pyspark | awk '{print $NF}')
export PYSPARK_PYTHON=$RECO_ENV/bin/python
export PYSPARK_DRIVER_PYTHON=$RECO_ENV/bin/python
export SPARK_HOME=/dsvm/tools/spark/current
```bash
#!/bin/sh
RECO_ENV=$(conda env list | grep reco_pyspark | awk '{print $NF}')
export PYSPARK_PYTHON=$RECO_ENV/bin/python
export PYSPARK_DRIVER_PYTHON=$RECO_ENV/bin/python
export SPARK_HOME=/dsvm/tools/spark/current
```

This exports the variables every time we run `conda activate reco_pyspark`. To unset these variables when we deactivate the environment, create the file `$RECO_ENV/etc/conda/deactivate.d/env_vars.sh` and add:

#!/bin/sh
unset PYSPARK_PYTHON
unset PYSPARK_DRIVER_PYTHON

```bash
#!/bin/sh
unset PYSPARK_PYTHON
unset PYSPARK_DRIVER_PYTHON
```

</details>

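Once both hook scripts are in place, their effect is easy to confirm from inside the environment. A quick sanity check (hypothetical, not part of the setup files themselves):

```python
import os

# After `conda activate reco_pyspark`, all three variables should print real
# paths; after `conda deactivate`, the first two should read "<not set>"
# again (the deactivate hook leaves SPARK_HOME exported).
for var in ("PYSPARK_PYTHON", "PYSPARK_DRIVER_PYTHON", "SPARK_HOME"):
    print(f"{var} = {os.environ.get(var, '<not set>')}")
```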
12 changes: 4 additions & 8 deletions examples/00_quick_start/sar_movielens.ipynb
@@ -455,13 +455,9 @@
"metadata": {
"celltoolbar": "Tags",
"kernelspec": {
"display_name": "Python 3.6.12 64-bit ('sb_full': conda)",
"metadata": {
"interpreter": {
"hash": "f28711ae1fad89778b64817fc2d746effb845deda73edae96b2473c20b2d4f70"
}
},
"name": "python3"
"display_name": "Python (reco_base)",
"language": "python",
"name": "reco_base"
},
"language_info": {
"codemirror_mode": {
@@ -473,7 +469,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.12-final"
"version": "3.6.11"
}
},
"nbformat": 4,
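The kernelspec edit above repoints the notebook from one contributor's local interpreter (and its machine-specific hash) to the shared `reco_base` kernel, so the notebook runs under whichever environment registered that name. As a rough sketch of how such a kernel name gets onto a machine, assuming `ipykernel` is installed in the `reco_base` conda environment:

```python
# Sketch only: register the currently active environment under the kernel
# name and display name that the notebook metadata now references.
from ipykernel.kernelspec import install

install(user=True, kernel_name="reco_base", display_name="Python (reco_base)")
```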
40 changes: 7 additions & 33 deletions examples/01_prepare_data/mind_utils.ipynb
@@ -46,14 +46,12 @@
"import sys\n",
"sys.path.append(\"../../\")\n",
"import os\n",
"import papermill as pm\n",
"import pandas as pd\n",
"from collections import Counter\n",
"from tqdm import tqdm\n",
"import pickle\n",
"import numpy as np\n",
"\n",
"print(\"System version: {}\".format(sys.version))\n",
"import scrapbook as sb\n",
"\n",
"from tempfile import TemporaryDirectory\n",
"from reco_utils.dataset.mind import (download_mind,\n",
@@ -62,7 +60,9 @@
" load_glove_matrix,\n",
" word_tokenize\n",
" )\n",
"from reco_utils.dataset.download_utils import unzip_file"
"from reco_utils.dataset.download_utils import unzip_file\n",
"\n",
"print(\"System version: {}\".format(sys.version))\n"
]
},
{
@@ -418,37 +418,11 @@
},
{
"cell_type": "code",
"execution_count": 17,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/apple/miniconda/envs/reco_base/lib/python3.6/site-packages/ipykernel_launcher.py:1: DeprecationWarning: Function record is deprecated and will be removed in verison 1.0.0 (current version 0.19.1). Please see `scrapbook.glue` (nteract-scrapbook) as a replacement for this functionality.\n",
" \"\"\"Entry point for launching an IPython kernel.\n"
]
},
{
"data": {
"application/papermill.record+json": {
"utils_state": {
"embedding_exist_num": 22408,
"embedding_exist_num_all": 37634,
"subvert_num": 17,
"uid2index": 5000,
"vert_num": 17,
"word_num": 23404,
"word_num_all": 41074
}
}
},
"metadata": {},
"output_type": "display_data"
}
],
"outputs": [],
"source": [
"pm.record(\"utils_state\", utils_state)"
"sb.glue(\"utils_state\", utils_state)"
]
}
],
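The last cell change swaps the deprecated `pm.record` for `sb.glue`, which attaches ("glues") a named, serializable value to the executed notebook so that callers can read it back afterwards. A minimal sketch of the notebook-side call, with illustrative values:

```python
import scrapbook as sb

# Glue a named payload into the notebook's outputs; anything JSON-serializable
# works, and the name is what readers use to look the value up later.
utils_state = {"vert_num": 17, "word_num": 23404}  # illustrative values
sb.glue("utils_state", utils_state)
```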
8 changes: 4 additions & 4 deletions examples/template.ipynb
@@ -259,9 +259,9 @@
"metadata": {
"celltoolbar": "Tags",
"kernelspec": {
"display_name": "Python (recommender)",
"display_name": "Python (reco_base)",
"language": "python",
"name": "recommender"
"name": "reco_base"
},
"language_info": {
"codemirror_mode": {
@@ -273,9 +273,9 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.0"
"version": "3.6.11"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
}
28 changes: 27 additions & 1 deletion tests/integration/test_mind.py
@@ -1,9 +1,13 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

import pytest
import os
import pytest
import papermill as pm
import scrapbook as sb

from reco_utils.dataset.mind import download_mind, extract_mind
from tests.notebooks_common import OUTPUT_NOTEBOOK, KERNEL_NAME


@pytest.mark.integration
@@ -37,3 +41,25 @@ def test_extract_mind(tmp):
assert statinfo.st_size == 59055351
statinfo = os.stat(os.path.join(valid_path, "relation_embedding.vec"))
assert statinfo.st_size == 1044588


@pytest.mark.integration
def test_mind_utils_integration(notebooks, tmp):
notebook_path = notebooks["mind_utils"]
pm.execute_notebook(
notebook_path,
OUTPUT_NOTEBOOK,
kernel_name=KERNEL_NAME,
parameters=dict(mind_type="small", word_embedding_dim=300),
)
results = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.dataframe.set_index("name")[
"data"
]

assert results["utils_state"]["vert_num"] == 17
assert results["utils_state"]["subvert_num"] == 17
assert results["utils_state"]["word_num"] == 23404
assert results["utils_state"]["word_num_all"] == 41074
assert results["utils_state"]["embedding_exist_num"] == 22408
assert results["utils_state"]["embedding_exist_num_all"] == 37634
assert results["utils_state"]["uid2index"] == 5000
76 changes: 57 additions & 19 deletions tests/integration/test_notebooks_gpu.py
@@ -1,12 +1,13 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

import papermill as pm
import os
import pytest
import papermill as pm
import scrapbook as sb

from reco_utils.common.gpu_utils import get_number_gpus
from tests.notebooks_common import OUTPUT_NOTEBOOK, KERNEL_NAME
import os


TOL = 0.5
@@ -48,7 +49,9 @@ def test_ncf_integration(notebooks, size, epochs, expected_values, seed):
TOP_K=10, MOVIELENS_DATA_SIZE=size, EPOCHS=epochs, BATCH_SIZE=512, SEED=seed
),
)
results = pm.read_notebook(OUTPUT_NOTEBOOK).dataframe.set_index("name")["value"]
results = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.dataframe.set_index("name")[
"data"
]

for key, value in expected_values.items():
assert results[key] == pytest.approx(value, rel=TOL, abs=ABS_TOL)
@@ -93,7 +96,9 @@ def test_ncf_deep_dive_integration(
SEED=seed,
),
)
results = pm.read_notebook(OUTPUT_NOTEBOOK).dataframe.set_index("name")["value"]
results = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.dataframe.set_index("name")[
"data"
]

for key, value in expected_values.items():
assert results[key] == pytest.approx(value, rel=TOL, abs=ABS_TOL)
@@ -123,14 +128,15 @@ def test_ncf_deep_dive_integration(
)
def test_fastai_integration(notebooks, size, epochs, expected_values):
notebook_path = notebooks["fastai"]
pm.execute_notebook(notebook_path, OUTPUT_NOTEBOOK, kernel_name=KERNEL_NAME)
pm.execute_notebook(
notebook_path,
OUTPUT_NOTEBOOK,
kernel_name=KERNEL_NAME,
parameters=dict(TOP_K=10, MOVIELENS_DATA_SIZE=size, EPOCHS=epochs),
)
results = pm.read_notebook(OUTPUT_NOTEBOOK).dataframe.set_index("name")["value"]
results = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.dataframe.set_index("name")[
"data"
]

for key, value in expected_values.items():
assert results[key] == pytest.approx(value, rel=TOL, abs=ABS_TOL)
@@ -168,7 +174,9 @@ def test_xdeepfm_integration(
RANDOM_SEED=seed,
),
)
results = pm.read_notebook(OUTPUT_NOTEBOOK).dataframe.set_index("name")["value"]
results = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.dataframe.set_index("name")[
"data"
]

for key, value in expected_values.items():
assert results[key]["auc"] == pytest.approx(value["auc"], rel=TOL, abs=ABS_TOL)
@@ -215,7 +223,10 @@ def test_wide_deep_integration(notebooks, size, steps, expected_values, seed, tm
pm.execute_notebook(
notebook_path, OUTPUT_NOTEBOOK, kernel_name=KERNEL_NAME, parameters=params
)
results = pm.read_notebook(OUTPUT_NOTEBOOK).dataframe.set_index("name")["value"]
results = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.dataframe.set_index("name")[
"data"
]

for key, value in expected_values.items():
assert results[key] == pytest.approx(value, rel=TOL, abs=ABS_TOL)

@@ -250,7 +261,10 @@ def test_slirec_quickstart_integration(
pm.execute_notebook(
notebook_path, OUTPUT_NOTEBOOK, kernel_name=KERNEL_NAME, parameters=params
)
results = pm.read_notebook(OUTPUT_NOTEBOOK).dataframe.set_index("name")["value"]
results = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.dataframe.set_index("name")[
"data"
]

for key, value in expected_values.items():
assert results[key]["auc"] == pytest.approx(value["auc"], rel=TOL, abs=ABS_TOL)
assert results[key]["logloss"] == pytest.approx(
@@ -278,14 +292,19 @@ def test_slirec_quickstart_integration(
)
],
)
def test_nrms_quickstart_integration(notebooks, epochs, seed, MIND_type, expected_values):
def test_nrms_quickstart_integration(
notebooks, epochs, seed, MIND_type, expected_values
):
notebook_path = notebooks["nrms_quickstart"]

params = {"epochs": epochs, "seed": seed, "MIND_type": MIND_type}
pm.execute_notebook(
notebook_path, OUTPUT_NOTEBOOK, kernel_name=KERNEL_NAME, parameters=params
)
results = pm.read_notebook(OUTPUT_NOTEBOOK).dataframe.set_index("name")["value"]
results = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.dataframe.set_index("name")[
"data"
]

for key, value in expected_values.items():
assert results[key]["group_auc"] == pytest.approx(
value["group_auc"], rel=TOL, abs=ABS_TOL
@@ -321,14 +340,19 @@ def test_nrms_quickstart_integration(notebooks, epochs, seed, MIND_type, expecte
)
],
)
def test_naml_quickstart_integration(notebooks, epochs, seed, MIND_type, expected_values):
def test_naml_quickstart_integration(
notebooks, epochs, seed, MIND_type, expected_values
):
notebook_path = notebooks["naml_quickstart"]

params = {"epochs": epochs, "seed": seed, "MIND_type": MIND_type}
pm.execute_notebook(
notebook_path, OUTPUT_NOTEBOOK, kernel_name=KERNEL_NAME, parameters=params
)
results = pm.read_notebook(OUTPUT_NOTEBOOK).dataframe.set_index("name")["value"]
results = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.dataframe.set_index("name")[
"data"
]

for key, value in expected_values.items():
assert results[key]["group_auc"] == pytest.approx(
value["group_auc"], rel=TOL, abs=ABS_TOL
@@ -364,14 +388,19 @@ def test_naml_quickstart_integration(notebooks, epochs, seed, MIND_type, expecte
)
],
)
def test_lstur_quickstart_integration(notebooks, epochs, seed, MIND_type, expected_values):
def test_lstur_quickstart_integration(
notebooks, epochs, seed, MIND_type, expected_values
):
notebook_path = notebooks["lstur_quickstart"]

params = {"epochs": epochs, "seed": seed, "MIND_type": MIND_type}
pm.execute_notebook(
notebook_path, OUTPUT_NOTEBOOK, kernel_name=KERNEL_NAME, parameters=params
)
results = pm.read_notebook(OUTPUT_NOTEBOOK).dataframe.set_index("name")["value"]
results = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.dataframe.set_index("name")[
"data"
]

for key, value in expected_values.items():
assert results[key]["group_auc"] == pytest.approx(
value["group_auc"], rel=TOL, abs=ABS_TOL
@@ -407,14 +436,19 @@ def test_lstur_quickstart_integration(notebooks, epochs, seed, MIND_type, expect
)
],
)
def test_npa_quickstart_integration(notebooks, epochs, seed, MIND_type, expected_values):
def test_npa_quickstart_integration(
notebooks, epochs, seed, MIND_type, expected_values
):
notebook_path = notebooks["npa_quickstart"]

params = {"epochs": epochs, "seed": seed, "MIND_type": MIND_type}
pm.execute_notebook(
notebook_path, OUTPUT_NOTEBOOK, kernel_name=KERNEL_NAME, parameters=params
)
results = pm.read_notebook(OUTPUT_NOTEBOOK).dataframe.set_index("name")["value"]
results = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.dataframe.set_index("name")[
"data"
]

for key, value in expected_values.items():
assert results[key]["group_auc"] == pytest.approx(
value["group_auc"], rel=TOL, abs=ABS_TOL
@@ -470,7 +504,9 @@ def test_lightgcn_deep_dive_integration(
item_file=os.path.join(data_path, r"item_embeddings"),
),
)
results = pm.read_notebook(OUTPUT_NOTEBOOK).dataframe.set_index("name")["value"]
results = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.dataframe.set_index("name")[
"data"
]

for key, value in expected_values.items():
assert results[key] == pytest.approx(value, rel=TOL, abs=ABS_TOL)
@@ -486,7 +522,9 @@ def test_dkn_quickstart_integration(notebooks):
kernel_name=KERNEL_NAME,
parameters=dict(epochs=5, batch_size=500),
)
results = pm.read_notebook(OUTPUT_NOTEBOOK).dataframe.set_index("name")["value"]
results = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.dataframe.set_index("name")[
"data"
]

assert results["res"]["auc"] == pytest.approx(0.5651, rel=TOL, abs=ABS_TOL)
assert results["res"]["mean_mrr"] == pytest.approx(0.1639, rel=TOL, abs=ABS_TOL)
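All of these assertions rely on `pytest.approx` with both a relative and an absolute tolerance. When both are given, pytest accepts the value if either tolerance is met (the effective tolerance is `max(rel * |expected|, abs)`), which keeps metrics near zero from failing on relative error alone. A small illustration:

```python
import pytest

TOL = 0.5       # relative tolerance, as defined in the tests above
ABS_TOL = 0.05  # absolute tolerance (illustrative; the file defines its own)

# Relative error is large, but |0.001 - 0.04| = 0.039 <= 0.05, so this passes.
assert 0.001 == pytest.approx(0.04, rel=TOL, abs=ABS_TOL)

# Absolute gap is 4.0 > 0.05, but 4.0 <= 0.5 * 14.0 = 7.0, so this passes too.
assert 10.0 == pytest.approx(14.0, rel=TOL, abs=ABS_TOL)
```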