Skip to content

Commit

Permalink
v0.3.2
Browse files Browse the repository at this point in the history
  • Loading branch information
sergioburdisso committed Nov 12, 2019
1 parent 09298e7 commit ab9e8be
Show file tree
Hide file tree
Showing 5 changed files with 155 additions and 42 deletions.
7 changes: 4 additions & 3 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
docs/_build
build/
__extra__
pyss3/__pycache__
.pytest_cache
build/
docs/_build
*/__pycache__
tests/pyss3
*.pyc
*~
*.ipynb
Expand Down
14 changes: 14 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,17 @@

All notable changes to PySS3 will be documented here.

## [0.3.2] 2019-11-12

### Added
- Summary operators: now it is possible to use user-defined summary operators, the following static methods were added to the ``SS3`` class: `summary_op_ngrams`, `summary_op_sentences`, and `summary_op_paragraphs`.

## [0.3.1] 2019-11-11

### Added
- update: improved some docstrings
- update: revised the README.md / PyPI description file

### Fixed
- Python 2 and 3 compatibility problem with scikit-learn (using version 0.20.1 from now on)
- PyPi: setup.py: `long_description_content_type` set to `'text/markdown'`
4 changes: 4 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -64,8 +64,12 @@ Each dot represents an experiment/evaluation performed using that particular com

### The somewhat standard way

(TODO: tutorial WIP)

### The "Command-Line" way

(TODO: tutorial WIP)

## Installation


Expand Down
142 changes: 116 additions & 26 deletions pyss3/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
from functools import reduce
from six.moves import xrange

__version__ = "0.3.1"
__version__ = "0.3.2"

ENCODING = "utf-8"

Expand Down Expand Up @@ -428,12 +428,9 @@ def __classify_sentence__(self, sent, prep, json=False):
get_word = self.get_word
if not json:
words_cvs = [classify_trans(seq) for _, seq in sent]

if words_cvs:
cv = reduce(vsum, words_cvs)
else:
return self.__zero_cv__
return cv
return SS3.summary_op_ngrams(words_cvs)
return self.__zero_cv__
else:
get_tip = self.__trie_node__
local_value = self.__lv__
Expand All @@ -449,7 +446,7 @@ def __classify_sentence__(self, sent, prep, json=False):
]
return {
"words": info,
"cv": reduce(vsum, [v["cv"] for v in info]),
"cv": SS3.summary_op_ngrams([v["cv"] for v in info]),
"wmv": reduce(vmax, [v["cv"] for v in info]) # word max value
}

Expand All @@ -462,10 +459,8 @@ def __classify_paragraph__(self, paragraph, prep, json=False):
if sent
]
if sents_cvs:
cv = reduce(vsum, sents_cvs)
else:
cv = self.__zero_cv__
return cv
return SS3.summary_op_sentences(sents_cvs)
return self.__zero_cv__
else:
info = [
self.__classify_sentence__(sent, prep=prep, json=True)
Expand All @@ -474,7 +469,7 @@ def __classify_paragraph__(self, paragraph, prep, json=False):
]
if info:
sents_cvs = [v["cv"] for v in info]
cv = reduce(vsum, sents_cvs)
cv = SS3.summary_op_sentences(sents_cvs)
wmv = reduce(vmax, [v["wmv"] for v in info])
else:
cv = self.__zero_cv__
Expand Down Expand Up @@ -715,6 +710,102 @@ def __save_cat_vocab__(self, icat, path, n_grams):
f.close()
Print.info("\t[ %s stored in '%s'" % (term, voc_path))

@staticmethod
def summary_op_ngrams(cvs):
    """
    Summary operator for n-gram confidence vectors.

    By default it returns the addition of all the given
    confidence vectors. However, in case you want to use a
    custom summary operator, this function must be replaced
    as shown in the following example:

    >>> def dummy_summary_op(cvs):
    >>>     return cvs[0]
    >>> ...
    >>> SS3.summary_op_ngrams = dummy_summary_op
    >>> ...
    >>> clf = SS3()

    Note that any function receiving a list of (confidence)
    vectors and returning a single (confidence) vector
    could be used. In the example above the summary operator
    is replaced by the user-defined ``dummy_summary_op`` which
    ignores all confidence vectors, returning only the confidence
    vector of the first n-gram (which, besides being an
    illustrative example, makes no real sense).

    :param cvs: a list of n-gram confidence vectors
    :type cvs: list (of list of float)
    :returns: a sentence confidence vector
    :rtype: list (of float)
    """
    return reduce(vsum, cvs)

@staticmethod
def summary_op_sentences(cvs):
    """
    Summary operator for sentence confidence vectors.

    By default it returns the addition of all the given
    confidence vectors. However, in case you want to use a
    custom summary operator, this function must be replaced
    as shown in the following example:

    >>> def dummy_summary_op(cvs):
    >>>     return cvs[0]
    >>> ...
    >>> SS3.summary_op_sentences = dummy_summary_op
    >>> ...
    >>> clf = SS3()

    Note that any function receiving a list of (confidence)
    vectors and returning a single (confidence) vector
    could be used. In the example above the summary operator
    is replaced by the user-defined ``dummy_summary_op`` which
    ignores all confidence vectors, returning only the confidence
    vector of the first sentence (which, besides being an
    illustrative example, makes no real sense).

    :param cvs: a list of sentence confidence vectors
    :type cvs: list (of list of float)
    :returns: a paragraph confidence vector
    :rtype: list (of float)
    """
    return reduce(vsum, cvs)

@staticmethod
def summary_op_paragraphs(cvs):
    """
    Summary operator for paragraph confidence vectors.

    By default it returns the addition of all the given
    confidence vectors. However, in case you want to use a
    custom summary operator, this function must be replaced
    as shown in the following example:

    >>> def dummy_summary_op(cvs):
    >>>     return cvs[0]
    >>> ...
    >>> SS3.summary_op_paragraphs = dummy_summary_op
    >>> ...
    >>> clf = SS3()

    Note that any function receiving a list of (confidence)
    vectors and returning a single (confidence) vector
    could be used. In the example above the summary operator
    is replaced by the user-defined ``dummy_summary_op`` which
    ignores all confidence vectors, returning only the confidence
    vector of the first paragraph (which, besides being an
    illustrative example, makes no real sense).

    :param cvs: a list of paragraph confidence vectors
    :type cvs: list (of list of float)
    :returns: the document confidence vector
    :rtype: list (of float)
    """
    return reduce(vsum, cvs)

def get_name(self):
"""
Return the model's name.
Expand Down Expand Up @@ -1505,7 +1596,7 @@ def classify(self, doc, prep=True, sort=True, json=False):
if parag
]
if paragraphs_cvs:
cv = reduce(vsum, paragraphs_cvs)
cv = SS3.summary_op_paragraphs(paragraphs_cvs)
else:
cv = self.__zero_cv__
if sort:
Expand All @@ -1523,27 +1614,26 @@ def classify(self, doc, prep=True, sort=True, json=False):
for parag in doc.split(PARA_DELTR)
if parag
]

nbr_cats = len(self.__categories__)
confidence_vector = reduce(vsum, [v["cv"] for v in info])
max_v = max(confidence_vector)
cv = SS3.summary_op_paragraphs([v["cv"] for v in info])
max_v = max(cv)

if max_v > 1:
confidence_vector_norm = map(
lambda x: x / max_v, confidence_vector
)
norm_cv = map(lambda x: x / max_v, cv)
else:
confidence_vector_norm = confidence_vector
confidence_vector_norm_sorted = sorted(
[
(i, nv, confidence_vector[i])
for i, nv in enumerate(confidence_vector_norm)
],
norm_cv = cv

norm_cv_sorted = sorted(
[(i, nv, cv[i]) for i, nv in enumerate(norm_cv)],
key=lambda e: -e[1]
)

return {
"pars": info,
"cv": confidence_vector,
"cv": cv,
"wmv": reduce(vmax, [v["wmv"] for v in info]),
"cvns": confidence_vector_norm_sorted,
"cvns": norm_cv_sorted,
"ci": [self.get_category_name(ic) for ic in xrange(nbr_cats)]
}

Expand Down
30 changes: 17 additions & 13 deletions tests/test_pyss3.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,10 @@

DATASET_FOLDER = "dataset"

x_train = []
y_train = []
x_train = [] # loaded later from disk
y_train = [] # loaded later from disk
x_test = [
"sports nfl nba superbowl football team soccer learns jersey air bowl hockey "
"sports nfl nba superbowl soccer football team. learns jersey air bowl hockey.\n"
"baseball helmet mccutchen jordan curry poker",

"travel pictures images moment glamour canvas photoshoot lens dslr portrait "
Expand Down Expand Up @@ -59,7 +59,6 @@
for file in listdir(dataset_path):
with open(path.join(dataset_path, file), encoding="utf-8") as cat_file:
docs = cat_file.readlines()
print(path.splitext(file)[0])
x_train.extend(docs)
y_train.extend([path.splitext(file)[0]] * len(docs))

Expand All @@ -69,7 +68,7 @@ def argmax(lst):
return max(range(len(lst)), key=lst.__getitem__)


def perform_tests_with(clf):
def perform_tests_with(clf, cv):
"""Perform some tests with the given classifier."""
assert clf.get_category_index("SpOrTs") == clf.get_category_index("sports")

Expand All @@ -92,6 +91,7 @@ def perform_tests_with(clf):

y_pred = clf.predict_proba(x_test)
assert y_test == [clf.get_category_name(argmax(cv)) for cv in y_pred]
assert [round(p, 5) for p in y_pred[0]] == cv

y_pred = clf.predict_proba(["bla bla bla"])
assert y_pred[0] == [0] * len(clf.get_categories())
Expand Down Expand Up @@ -124,15 +124,15 @@ def test_pyss3_functions():
assert pyss3.sigmoid(1, 1) == .5
assert pyss3.sigmoid(.2, .2) == .5
assert pyss3.sigmoid(.5, .5) == .5
assert round(pyss3.sigmoid(0, .5), 5) == round(.002472623156634768, 5)
assert round(pyss3.sigmoid(1, .5), 5) == round(.9975273768433652, 5)
assert round(pyss3.sigmoid(1, 2), 5) == round(.0474258731775668, 5)
assert round(pyss3.sigmoid(0, .5), 5) == .00247
assert round(pyss3.sigmoid(1, .5), 5) == .99753
assert round(pyss3.sigmoid(1, 2), 5) == .04743

assert pyss3.mad([1, 1, 1], 3) == (1, .0)
assert pyss3.mad([1, 1, 1], 3) == (1, .0)
assert pyss3.mad([], 1) == (0, .0)
assert round(pyss3.mad([1, 2, 1], 3)[1], 5) == round(.3333333333333333, 5)
assert round(pyss3.mad([1, 10, 1], 3)[1], 5) == round(3.0, 5)
assert round(pyss3.mad([1, 2, 1], 3)[1], 5) == .33333
assert round(pyss3.mad([1, 10, 1], 3)[1], 5) == 3.0

with pytest.raises(IndexError):
pyss3.mad([], 0)
Expand All @@ -156,23 +156,27 @@ def test_pyss3_ss3():

clf.fit(x_train, y_train)

perform_tests_with(clf)
perform_tests_with(clf, [.00032, .00056, 0, 0, 0, .00019, .0021, 7.03651])

clf = SS3(
s=.32, l=1.24, p=1.1, a=0, name="test-3grams",
cv_m=STR_NORM_GV_XAI, sn_m=STR_XAI
)
clf.fit(x_train, y_train, n_grams=3)

perform_tests_with(clf)
perform_tests_with(clf, [.00037, .0006, 0, 0, 0, .00028, .00082, 9.03427])

pred = clf.classify("android mobile and video games", json=True)
assert pred["pars"][0]["sents"][0]["words"][0]["lexeme"] == "android mobile"
assert pred["pars"][0]["sents"][0]["words"][-1]["lexeme"] == "video games"
assert argmax(pred["cv"]) == clf.get_category_index("science&technology")
assert [round(p, 5) for p in pred["cv"]] == [0, 0, 0, 0, 0, 0, 3.8183, 0, 0]

pred = clf.classify("playing football soccer", json=True)
assert pred["pars"][0]["sents"][0]["words"][-1]["lexeme"] == "football soccer"
assert argmax(pred["cv"]) == clf.get_category_index("sports")
assert [round(p, 5) for p in pred["cv"]] == [0, 0, 0, 0, 0, .53463, 0, 1.70975, 0]

test_pyss3_ss3()

# if __name__ == "__main__":
# test_pyss3_ss3()

0 comments on commit ab9e8be

Please sign in to comment.