Skip to content

Commit

Permalink
v0.3.2
Browse files Browse the repository at this point in the history
  • Loading branch information
sergioburdisso committed Nov 12, 2019
1 parent 09298e7 commit ab9e8be
Show file tree
Hide file tree
Showing 5 changed files with 155 additions and 42 deletions.
7 changes: 4 additions & 3 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
docs/_build
build/
__extra__
pyss3/__pycache__
.pytest_cache
build/
docs/_build
*/__pycache__
tests/pyss3
*.pyc
*~
*.ipynb
Expand Down
14 changes: 14 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,17 @@

All notable changes to PySS3 will be documented here.

## [0.3.2] 2019-11-12

### Added
- Summary operators: now it is possible to use user-defined summary operators, the following static methods were added to the ``SS3`` class: `summary_op_ngrams`, `summary_op_sentences`, and `summary_op_paragraphs`.

## [0.3.1] 2019-11-11

### Added
- update: improved some docstrings
- update: revised the README.md / PyPI description file

### Fixed
- Python 2 and 3 compatibility problem with scikit-learn (using version 0.20.1 from now on)
- PyPi: setup.py: `long_description_content_type` set to `'text/markdown'`
4 changes: 4 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -64,8 +64,12 @@ Each dot represents an experiment/evaluation performed using that particular com

### The somewhat standard way

(TODO: tutorial WIP)

### The "Command-Line" way

(TODO: tutorial WIP)

## Installation


Expand Down
142 changes: 116 additions & 26 deletions pyss3/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
from functools import reduce
from six.moves import xrange

__version__ = "0.3.1"
__version__ = "0.3.2"

ENCODING = "utf-8"

Expand Down Expand Up @@ -428,12 +428,9 @@ def __classify_sentence__(self, sent, prep, json=False):
get_word = self.get_word
if not json:
words_cvs = [classify_trans(seq) for _, seq in sent]

if words_cvs:
cv = reduce(vsum, words_cvs)
else:
return self.__zero_cv__
return cv
return SS3.summary_op_ngrams(words_cvs)
return self.__zero_cv__
else:
get_tip = self.__trie_node__
local_value = self.__lv__
Expand All @@ -449,7 +446,7 @@ def __classify_sentence__(self, sent, prep, json=False):
]
return {
"words": info,
"cv": reduce(vsum, [v["cv"] for v in info]),
"cv": SS3.summary_op_ngrams([v["cv"] for v in info]),
"wmv": reduce(vmax, [v["cv"] for v in info]) # word max value
}

Expand All @@ -462,10 +459,8 @@ def __classify_paragraph__(self, paragraph, prep, json=False):
if sent
]
if sents_cvs:
cv = reduce(vsum, sents_cvs)
else:
cv = self.__zero_cv__
return cv
return SS3.summary_op_sentences(sents_cvs)
return self.__zero_cv__
else:
info = [
self.__classify_sentence__(sent, prep=prep, json=True)
Expand All @@ -474,7 +469,7 @@ def __classify_paragraph__(self, paragraph, prep, json=False):
]
if info:
sents_cvs = [v["cv"] for v in info]
cv = reduce(vsum, sents_cvs)
cv = SS3.summary_op_sentences(sents_cvs)
wmv = reduce(vmax, [v["wmv"] for v in info])
else:
cv = self.__zero_cv__
Expand Down Expand Up @@ -715,6 +710,102 @@ def __save_cat_vocab__(self, icat, path, n_grams):
f.close()
Print.info("\t[ %s stored in '%s'" % (term, voc_path))

@staticmethod
def summary_op_ngrams(cvs):
    """
    Summary operator for n-gram confidence vectors.

    By default it returns the addition of all the given
    confidence vectors. However, in case you want to use a
    custom summary operator, this function must be replaced
    as shown in the following example:

    >>> def dummy_summary_op(cvs):
    >>>     return cvs[0]
    >>> ...
    >>> SS3.summary_op_ngrams = dummy_summary_op
    >>> ...
    >>> clf = SS3()

    Note that any function receiving a list of (confidence)
    vectors and returning a single (confidence) vector
    could be used. In the example above the summary operator
    is replaced by the user-defined ``dummy_summary_op`` which
    ignores all confidence vectors, returning only the confidence
    vector of the first n-gram (which, besides being an
    illustrative example, makes no real sense).

    :param cvs: a list of n-gram confidence vectors
    :type cvs: list (of list of float)
    :returns: a sentence confidence vector
    :rtype: list (of float)
    """
    return reduce(vsum, cvs)

@staticmethod
def summary_op_sentences(cvs):
    """
    Summary operator for sentence confidence vectors.

    By default it returns the addition of all the given
    confidence vectors. However, in case you want to use a
    custom summary operator, this function must be replaced
    as shown in the following example:

    >>> def dummy_summary_op(cvs):
    >>>     return cvs[0]
    >>> ...
    >>> SS3.summary_op_sentences = dummy_summary_op
    >>> ...
    >>> clf = SS3()

    Note that any function receiving a list of (confidence)
    vectors and returning a single (confidence) vector
    could be used. In the example above the summary operator
    is replaced by the user-defined ``dummy_summary_op`` which
    ignores all confidence vectors, returning only the confidence
    vector of the first sentence (which, besides being an
    illustrative example, makes no real sense).

    :param cvs: a list of sentence confidence vectors
    :type cvs: list (of list of float)
    :returns: a paragraph confidence vector
    :rtype: list (of float)
    """
    return reduce(vsum, cvs)

@staticmethod
def summary_op_paragraphs(cvs):
    """
    Summary operator for paragraph confidence vectors.

    By default it returns the addition of all the given
    confidence vectors. However, in case you want to use a
    custom summary operator, this function must be replaced
    as shown in the following example:

    >>> def dummy_summary_op(cvs):
    >>>     return cvs[0]
    >>> ...
    >>> SS3.summary_op_paragraphs = dummy_summary_op
    >>> ...
    >>> clf = SS3()

    Note that any function receiving a list of (confidence)
    vectors and returning a single (confidence) vector
    could be used. In the example above the summary operator
    is replaced by the user-defined ``dummy_summary_op`` which
    ignores all confidence vectors, returning only the confidence
    vector of the first paragraph (which, besides being an
    illustrative example, makes no real sense).

    :param cvs: a list of paragraph confidence vectors
    :type cvs: list (of list of float)
    :returns: the document confidence vector
    :rtype: list (of float)
    """
    return reduce(vsum, cvs)

def get_name(self):
"""
Return the model's name.
Expand Down Expand Up @@ -1505,7 +1596,7 @@ def classify(self, doc, prep=True, sort=True, json=False):
if parag
]
if paragraphs_cvs:
cv = reduce(vsum, paragraphs_cvs)
cv = SS3.summary_op_paragraphs(paragraphs_cvs)
else:
cv = self.__zero_cv__
if sort:
Expand All @@ -1523,27 +1614,26 @@ def classify(self, doc, prep=True, sort=True, json=False):
for parag in doc.split(PARA_DELTR)
if parag
]

nbr_cats = len(self.__categories__)
confidence_vector = reduce(vsum, [v["cv"] for v in info])
max_v = max(confidence_vector)
cv = SS3.summary_op_paragraphs([v["cv"] for v in info])
max_v = max(cv)

if max_v > 1:
confidence_vector_norm = map(
lambda x: x / max_v, confidence_vector
)
norm_cv = map(lambda x: x / max_v, cv)
else:
confidence_vector_norm = confidence_vector
confidence_vector_norm_sorted = sorted(
[
(i, nv, confidence_vector[i])
for i, nv in enumerate(confidence_vector_norm)
],
norm_cv = cv

norm_cv_sorted = sorted(
[(i, nv, cv[i]) for i, nv in enumerate(norm_cv)],
key=lambda e: -e[1]
)

return {
"pars": info,
"cv": confidence_vector,
"cv": cv,
"wmv": reduce(vmax, [v["wmv"] for v in info]),
"cvns": confidence_vector_norm_sorted,
"cvns": norm_cv_sorted,
"ci": [self.get_category_name(ic) for ic in xrange(nbr_cats)]
}

Expand Down
30 changes: 17 additions & 13 deletions tests/test_pyss3.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,10 @@

DATASET_FOLDER = "dataset"

x_train = []
y_train = []
x_train = [] # loaded later from disk
y_train = [] # loaded later from disk
x_test = [
"sports nfl nba superbowl football team soccer learns jersey air bowl hockey "
"sports nfl nba superbowl soccer football team. learns jersey air bowl hockey.\n"
"baseball helmet mccutchen jordan curry poker",

"travel pictures images moment glamour canvas photoshoot lens dslr portrait "
Expand Down Expand Up @@ -59,7 +59,6 @@
for file in listdir(dataset_path):
with open(path.join(dataset_path, file), encoding="utf-8") as cat_file:
docs = cat_file.readlines()
print(path.splitext(file)[0])
x_train.extend(docs)
y_train.extend([path.splitext(file)[0]] * len(docs))

Expand All @@ -69,7 +68,7 @@ def argmax(lst):
return max(range(len(lst)), key=lst.__getitem__)


def perform_tests_with(clf):
def perform_tests_with(clf, cv):
"""Perform some tests with the given classifier."""
assert clf.get_category_index("SpOrTs") == clf.get_category_index("sports")

Expand All @@ -92,6 +91,7 @@ def perform_tests_with(clf):

y_pred = clf.predict_proba(x_test)
assert y_test == [clf.get_category_name(argmax(cv)) for cv in y_pred]
assert [round(p, 5) for p in y_pred[0]] == cv

y_pred = clf.predict_proba(["bla bla bla"])
assert y_pred[0] == [0] * len(clf.get_categories())
Expand Down Expand Up @@ -124,15 +124,15 @@ def test_pyss3_functions():
assert pyss3.sigmoid(1, 1) == .5
assert pyss3.sigmoid(.2, .2) == .5
assert pyss3.sigmoid(.5, .5) == .5
assert round(pyss3.sigmoid(0, .5), 5) == round(.002472623156634768, 5)
assert round(pyss3.sigmoid(1, .5), 5) == round(.9975273768433652, 5)
assert round(pyss3.sigmoid(1, 2), 5) == round(.0474258731775668, 5)
assert round(pyss3.sigmoid(0, .5), 5) == .00247
assert round(pyss3.sigmoid(1, .5), 5) == .99753
assert round(pyss3.sigmoid(1, 2), 5) == .04743

assert pyss3.mad([1, 1, 1], 3) == (1, .0)
assert pyss3.mad([1, 1, 1], 3) == (1, .0)
assert pyss3.mad([], 1) == (0, .0)
assert round(pyss3.mad([1, 2, 1], 3)[1], 5) == round(.3333333333333333, 5)
assert round(pyss3.mad([1, 10, 1], 3)[1], 5) == round(3.0, 5)
assert round(pyss3.mad([1, 2, 1], 3)[1], 5) == .33333
assert round(pyss3.mad([1, 10, 1], 3)[1], 5) == 3.0

with pytest.raises(IndexError):
pyss3.mad([], 0)
Expand All @@ -156,23 +156,27 @@ def test_pyss3_ss3():

clf.fit(x_train, y_train)

perform_tests_with(clf)
perform_tests_with(clf, [.00032, .00056, 0, 0, 0, .00019, .0021, 7.03651])

clf = SS3(
s=.32, l=1.24, p=1.1, a=0, name="test-3grams",
cv_m=STR_NORM_GV_XAI, sn_m=STR_XAI
)
clf.fit(x_train, y_train, n_grams=3)

perform_tests_with(clf)
perform_tests_with(clf, [.00037, .0006, 0, 0, 0, .00028, .00082, 9.03427])

pred = clf.classify("android mobile and video games", json=True)
assert pred["pars"][0]["sents"][0]["words"][0]["lexeme"] == "android mobile"
assert pred["pars"][0]["sents"][0]["words"][-1]["lexeme"] == "video games"
assert argmax(pred["cv"]) == clf.get_category_index("science&technology")
assert [round(p, 5) for p in pred["cv"]] == [0, 0, 0, 0, 0, 0, 3.8183, 0, 0]

pred = clf.classify("playing football soccer", json=True)
assert pred["pars"][0]["sents"][0]["words"][-1]["lexeme"] == "football soccer"
assert argmax(pred["cv"]) == clf.get_category_index("sports")
assert [round(p, 5) for p in pred["cv"]] == [0, 0, 0, 0, 0, .53463, 0, 1.70975, 0]

test_pyss3_ss3()

# if __name__ == "__main__":
# test_pyss3_ss3()

0 comments on commit ab9e8be

Please sign in to comment.