Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

debug issue with pysar test and tolerance #109

Closed
miguelgfierro opened this issue Oct 22, 2018 · 0 comments
Closed

debug issue with pysar test and tolerance #109

miguelgfierro opened this issue Oct 22, 2018 · 0 comments
Assignees
Milestone

Comments

@miguelgfierro
Copy link
Collaborator

On some machines the tests pass with a tolerance of 1e-8, but on others they fail.

We got this error on Prometheus when running tests/unit/test_sar_singlenode.py:

(py36) miguel@prometheus:~/repos/Recommenders$ pytest tests/unit/test_sar_singlenode.py 
=================================================================================== test session starts ====================================================================================
platform linux -- Python 3.6.5, pytest-3.6.4, py-1.7.0, pluggy-0.7.1
rootdir: /home/miguel/repos/Recommenders, inifile:
collected 15 items                                                                                                                                                                         

tests/unit/test_sar_singlenode.py ...........FFFF                                                                                                                                    [100%]

========================================================================================= FAILURES =========================================================================================
____________________________________________________________________________________ test_user_affinity ____________________________________________________________________________________

demo_usage_data =                  UserId    MovieId     Timestamp  Rating  exponential  rating_exponential
0      0003000098E85347  DQF...076
11837  00030000822E3BAE  DAF-00448  1.416292e+09       1     0.009076            0.009076

[11838 rows x 6 columns]
sar_settings = {'ATOL': 1e-08, 'FILE_DIR': 'http://recodatasets.blob.core.windows.net/sarunittest/', 'TEST_USER_ID': '0003000098E85347'}
header = {'col_item': 'MovieId', 'col_rating': 'Rating', 'col_timestamp': 'Timestamp', 'col_user': 'UserId'}

    def test_user_affinity(demo_usage_data, sar_settings, header):
        time_now = demo_usage_data[header["col_timestamp"]].max()
        model = SARSingleNodeReference(
            remove_seen=True,
            similarity_type="cooccurrence",
            timedecay_formula=True,
            time_decay_coefficient=30,
            time_now=time_now,
            **header
        )
        _apply_sar_hash_index(model, demo_usage_data, None, header)
        model.fit(demo_usage_data)
    
        true_user_affinity, items = load_affinity(sar_settings["FILE_DIR"] + "user_aff.csv")
        user_index = model.user_map_dict[sar_settings["TEST_USER_ID"]]
        test_user_affinity = np.reshape(
            np.array(
                _rearrange_to_test(
                    model.user_affinity, None, items, None, model.item_map_dict
                )[user_index,].todense()
            ),
            -1,
        )
>       assert np.allclose(
            true_user_affinity.astype(test_user_affinity.dtype),
            test_user_affinity,
            atol=sar_settings["ATOL"],
        )
E       AssertionError: assert False
E        +  where False = <function allclose at 0x7f6110e1d730>(array([0.        , 0.        , 0.        , 0.        , 0.        ,\n       0.        , 0.        , 0.        , 0.      ...       , 0.        , 0.        ,\n       0.        , 0.        , 0.15181286, 1.        , 0.        ,\n       0.        ]), array([0.        , 0.        , 0.        , 0.        , 0.        ,\n       0.        , 0.        , 0.        , 0.      ...       , 0.        , 0.        ,\n       0.        , 0.        , 0.15195908, 1.        , 0.        ,\n       0.        ]), atol=1e-08)
E        +    where <function allclose at 0x7f6110e1d730> = np.allclose
E        +    and   array([0.        , 0.        , 0.        , 0.        , 0.        ,\n       0.        , 0.        , 0.        , 0.      ...       , 0.        , 0.        ,\n       0.        , 0.        , 0.15181286, 1.        , 0.        ,\n       0.        ]) = <built-in method astype of numpy.ndarray object at 0x7f60fc6adee0>(dtype('float64'))
E        +      where <built-in method astype of numpy.ndarray object at 0x7f60fc6adee0> = array(['0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0',\n       '0', '0.0221122254449968', '0', '0', '0..., '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0',\n       '0', '0.151812861826336', '1', '0', '0'], dtype='<U18').astype
E        +      and   dtype('float64') = array([0.        , 0.        , 0.        , 0.        , 0.        ,\n       0.        , 0.        , 0.        , 0.      ...       , 0.        , 0.        ,\n       0.        , 0.        , 0.15195908, 1.        , 0.        ,\n       0.        ]).dtype

tests/unit/test_sar_singlenode.py:201: AssertionError
___________________________________________________________________________ test_userpred[3-cooccurrence-count] ____________________________________________________________________________

threshold = 3, similarity_type = 'cooccurrence', file = 'count', header = {'col_item': 'MovieId', 'col_rating': 'Rating', 'col_timestamp': 'Timestamp', 'col_user': 'UserId'}
sar_settings = {'ATOL': 1e-08, 'FILE_DIR': 'http://recodatasets.blob.core.windows.net/sarunittest/', 'TEST_USER_ID': '0003000098E85347'}
demo_usage_data =                  UserId    MovieId     Timestamp  Rating  exponential  rating_exponential
0      0003000098E85347  DQF...076
11837  00030000822E3BAE  DAF-00448  1.416292e+09       1     0.009076            0.009076

[11838 rows x 6 columns]

    @pytest.mark.parametrize(
        "threshold,similarity_type,file",
        [(3, "cooccurrence", "count"), (3, "jaccard", "jac"), (3, "lift", "lift")],
    )
    def test_userpred(
        threshold, similarity_type, file, header, sar_settings, demo_usage_data
    ):
        time_now = demo_usage_data[header["col_timestamp"]].max()
        model = SARSingleNodeReference(
            remove_seen=True,
            similarity_type=similarity_type,
            timedecay_formula=True,
            time_decay_coefficient=30,
            time_now=time_now,
            threshold=threshold,
            **header
        )
        _apply_sar_hash_index(model, demo_usage_data, None, header)
        model.fit(demo_usage_data)
    
        true_items, true_scores = load_userpred(
            sar_settings["FILE_DIR"]
            + "userpred_"
            + file
            + str(threshold)
            + "_userid_only.csv"
        )
        test_results = model.recommend_k_items(
            demo_usage_data[
                demo_usage_data[header["col_user"]] == sar_settings["TEST_USER_ID"]
            ],
            top_k=10,
        )
        test_items = list(test_results[header["col_item"]])
        test_scores = np.array(test_results["prediction"])
        assert true_items == test_items
>       assert np.allclose(true_scores, test_scores, atol=sar_settings["ATOL"])
E       assert False
E        +  where False = <function allclose at 0x7f6110e1d730>(array([40.96870941, 40.37760085, 19.55002941, 18.10756063, 13.24775154,\n       12.67358812, 12.49898911, 12.0359004 , 10.91842008, 10.91185623]), array([41.00239015, 40.41649126, 19.5650067 , 18.12114858, 13.26051135,\n       12.6742369 , 12.50043289, 12.047493  , 10.92893636, 10.92236618]), atol=1e-08)
E        +    where <function allclose at 0x7f6110e1d730> = np.allclose

tests/unit/test_sar_singlenode.py:245: AssertionError
_______________________________________________________________________________ test_userpred[3-jaccard-jac] _______________________________________________________________________________

threshold = 3, similarity_type = 'jaccard', file = 'jac', header = {'col_item': 'MovieId', 'col_rating': 'Rating', 'col_timestamp': 'Timestamp', 'col_user': 'UserId'}
sar_settings = {'ATOL': 1e-08, 'FILE_DIR': 'http://recodatasets.blob.core.windows.net/sarunittest/', 'TEST_USER_ID': '0003000098E85347'}
demo_usage_data =                  UserId    MovieId     Timestamp  Rating  exponential  rating_exponential
0      0003000098E85347  DQF...076
11837  00030000822E3BAE  DAF-00448  1.416292e+09       1     0.009076            0.009076

[11838 rows x 6 columns]

    @pytest.mark.parametrize(
        "threshold,similarity_type,file",
        [(3, "cooccurrence", "count"), (3, "jaccard", "jac"), (3, "lift", "lift")],
    )
    def test_userpred(
        threshold, similarity_type, file, header, sar_settings, demo_usage_data
    ):
        time_now = demo_usage_data[header["col_timestamp"]].max()
        model = SARSingleNodeReference(
            remove_seen=True,
            similarity_type=similarity_type,
            timedecay_formula=True,
            time_decay_coefficient=30,
            time_now=time_now,
            threshold=threshold,
            **header
        )
        _apply_sar_hash_index(model, demo_usage_data, None, header)
        model.fit(demo_usage_data)
    
        true_items, true_scores = load_userpred(
            sar_settings["FILE_DIR"]
            + "userpred_"
            + file
            + str(threshold)
            + "_userid_only.csv"
        )
        test_results = model.recommend_k_items(
            demo_usage_data[
                demo_usage_data[header["col_user"]] == sar_settings["TEST_USER_ID"]
            ],
            top_k=10,
        )
        test_items = list(test_results[header["col_item"]])
        test_scores = np.array(test_results["prediction"])
        assert true_items == test_items
>       assert np.allclose(true_scores, test_scores, atol=sar_settings["ATOL"])
E       assert False
E        +  where False = <function allclose at 0x7f6110e1d730>(array([0.0616357 , 0.04918001, 0.04247487, 0.04009872, 0.03847229,\n       0.03839772, 0.03251167, 0.02474822, 0.02432458, 0.0224889 ]), array([0.06163639, 0.04921205, 0.04247624, 0.04011545, 0.03848885,\n       0.03843471, 0.0325135 , 0.02477206, 0.02432508, 0.02249099]), atol=1e-08)
E        +    where <function allclose at 0x7f6110e1d730> = np.allclose

tests/unit/test_sar_singlenode.py:245: AssertionError
________________________________________________________________________________ test_userpred[3-lift-lift] ________________________________________________________________________________

threshold = 3, similarity_type = 'lift', file = 'lift', header = {'col_item': 'MovieId', 'col_rating': 'Rating', 'col_timestamp': 'Timestamp', 'col_user': 'UserId'}
sar_settings = {'ATOL': 1e-08, 'FILE_DIR': 'http://recodatasets.blob.core.windows.net/sarunittest/', 'TEST_USER_ID': '0003000098E85347'}
demo_usage_data =                  UserId    MovieId     Timestamp  Rating  exponential  rating_exponential
0      0003000098E85347  DQF...076
11837  00030000822E3BAE  DAF-00448  1.416292e+09       1     0.009076            0.009076

[11838 rows x 6 columns]

    @pytest.mark.parametrize(
        "threshold,similarity_type,file",
        [(3, "cooccurrence", "count"), (3, "jaccard", "jac"), (3, "lift", "lift")],
    )
    def test_userpred(
        threshold, similarity_type, file, header, sar_settings, demo_usage_data
    ):
        time_now = demo_usage_data[header["col_timestamp"]].max()
        model = SARSingleNodeReference(
            remove_seen=True,
            similarity_type=similarity_type,
            timedecay_formula=True,
            time_decay_coefficient=30,
            time_now=time_now,
            threshold=threshold,
            **header
        )
        _apply_sar_hash_index(model, demo_usage_data, None, header)
        model.fit(demo_usage_data)
    
        true_items, true_scores = load_userpred(
            sar_settings["FILE_DIR"]
            + "userpred_"
            + file
            + str(threshold)
            + "_userid_only.csv"
        )
        test_results = model.recommend_k_items(
            demo_usage_data[
                demo_usage_data[header["col_user"]] == sar_settings["TEST_USER_ID"]
            ],
            top_k=10,
        )
        test_items = list(test_results[header["col_item"]])
        test_scores = np.array(test_results["prediction"])
        assert true_items == test_items
>       assert np.allclose(true_scores, test_scores, atol=sar_settings["ATOL"])
E       assert False
E        +  where False = <function allclose at 0x7f6110e1d730>(array([0.00134902, 0.00084695, 0.00072497, 0.00072133, 0.00066855,\n       0.0006003 , 0.00045299, 0.00045202, 0.00041803, 0.00034772]), array([0.00134902, 0.00084696, 0.00072513, 0.00072134, 0.00066871,\n       0.00060031, 0.00045312, 0.00045204, 0.00041804, 0.00034806]), atol=1e-08)
E        +    where <function allclose at 0x7f6110e1d730> = np.allclose

tests/unit/test_sar_singlenode.py:245: AssertionError
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

2 participants