Add load_from_files_multilabel() to Dataset (#6)
A new method (``load_from_files_multilabel``) was added to the
``Dataset`` class to load multilabel datasets from disk. More details
about this new method can be found in the API documentation
(https://pyss3.rtfd.io/en/latest/api/index.html#pyss3.util.Dataset.load_from_files_multilabel).

Resolves: #6
sergioburdisso committed May 12, 2020
1 parent e2ecda4 commit 2cee50e
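
A minimal usage sketch of the new method, assuming datasets laid out as described in the docstring added below (all paths and file names here are hypothetical):

    from pyss3.util import Dataset

    # Folder mode: `docs_path` is a folder with one .txt file per document, and
    # `labels_path` has one "<document_name> <label>" pair per line
    # (whitespace-separated by default).
    x_train, y_train = Dataset.load_from_files_multilabel(
        "datasets/example/train_docs",       # hypothetical folder of .txt files
        "datasets/example/train_labels.tsv"  # hypothetical labels file
    )

    # Single-file mode: `docs_path` holds all documents separated by `sep_doc`,
    # and `labels_path` has one `sep_label`-separated label list per document,
    # in the same order (an empty line means "no labels").
    x_test, y_test = Dataset.load_from_files_multilabel(
        "datasets/example/test_docs.txt",
        "datasets/example/test_labels.txt",
        sep_label=";", sep_doc="\n"
    )

    # Both calls return (documents, labels), where labels is a list of label
    # lists, e.g. [['labelA', 'labelB'], ['labelA'], []].
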
Showing 7 changed files with 194 additions and 11 deletions.
146 changes: 136 additions & 10 deletions pyss3/util.py
@@ -2,13 +2,13 @@
"""This is a helper module with utility classes and functions."""
from __future__ import print_function
from io import open
from os import listdir, makedirs, path
from os import listdir, makedirs, path, remove as remove_file
from tqdm import tqdm

from math import ceil
from numpy import mean, linspace, arange
from itertools import product
from os import remove as remove_file
from collections import defaultdict

from numpy import mean, linspace, arange
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import StratifiedKFold
@@ -1506,12 +1506,12 @@ def grid_search(


class Dataset:
"""A helper class with methods to read/write datasets."""
"""A helper class with methods to read datasets from disk."""

@staticmethod
def load_from_files(data_path, folder_label=True, as_single_doc=False, sep_doc="\n"):
"""
Load category documents from disk.
def load_from_files(data_path, folder_label=True, as_single_doc=False, sep_doc='\n'):
r"""
Load training/test documents and category labels from disk.
:param data_path: the training or the test set path
:type data_path: str
@@ -1521,8 +1521,12 @@ def load_from_files(data_path, folder_label=True, as_single_doc=False, sep_doc="
:type folder_label: bool
:param as_single_doc: read the documents as a single (and big) document
(default: False)
:type folder_label: bool
:returns: the (x_train, y_train) or the (x_test, y_test) pairs.
:type as_single_doc: bool
:param sep_doc: the separator/delimiter used to separate each document
when loading training/test documents from a single file. Valid
only when ``folder_label=False``. (default: ``'\n'``)
:type sep_doc: str
:returns: the (x_train, y_train) or (x_test, y_test) pairs.
:rtype: tuple
"""
x_data = []
@@ -1586,6 +1590,128 @@ def load_from_files(data_path, folder_label=True, as_single_doc=False, sep_doc="

return x_data, y_data

@staticmethod
def load_from_files_multilabel(docs_path, labels_path, sep_label=None, sep_doc='\n'):
r"""
Multilabel version of the ``Dataset.load_from_files()`` function.
Load training/test documents and category labels from disk.
:param docs_path: the file or the folder containing the training/test
documents.
:type docs_path: str
:param labels_path: the file containing the labels for each document.
* if ``docs_path`` is a file, then the ``labels_path`` file
should contain a line with the corresponding list of category
labels for each document in ``docs_path``. For
instance, if ``sep_doc='\n'`` and the content
of ``docs_path`` is:
.. parsed-literal::
this is document 1
this is document 2
this is document 3
then, if ``sep_label=';'``, the ``labels_path``
file should contain the labels for each document
(in order) separated by ``';'``, as follows:
.. parsed-literal::
labelA;labelB
labelA
labelB;labelC
* if ``docs_path`` is a folder containing the documents, then
the ``labels_path`` file should contain a line for each document and
category label. Each line should have the following format:
``document_name<the sep_label>label``. For instance, if the
``docs_path`` folder contains the following 3 documents:
.. parsed-literal::
doc1.txt
doc2.txt
doc3.txt
Then, following the above example, the ``labels_path`` file should be:
.. parsed-literal::
doc1 labelA
doc1 labelB
doc2 labelA
doc3 labelB
doc3 labelC
:type labels_path: str
:param sep_label: the separator/delimiter used to separate either each label (if
``docs_path`` is a file) or the document name from its category
(if ``docs_path`` is a folder).
(default: ``';'`` when ``docs_path`` is a file, the ``'\s+'`` regular
expression otherwise).
:type sep_label: str
:param sep_doc: the separator/delimiter used to separate each document
when loading training/test documents from a single file. Valid
only when ``docs_path`` is a file. (default: ``'\n'``)
:type sep_doc: str
:returns: the (x_train, y_train) or (x_test, y_test) pairs.
:rtype: tuple
"""
x_data = []
y_data = []
cat_info = defaultdict(int)

Print.info("reading files...")

if path.isdir(docs_path):
sep_label = sep_label or r'\s+' # default separator

with open(labels_path, "r", encoding=ENCODING) as flabels:
doc_labels_raw = [re.split(sep_label, l.rstrip())
for l in flabels.read().split('\n')]
doc_labels = {}

for doc_name, label in doc_labels_raw:
if doc_name not in doc_labels:
doc_labels[doc_name] = [label]
else:
doc_labels[doc_name].append(label)
cat_info[label] += 1

for doc_name in tqdm(doc_labels, desc="Loading documents"):
file_name = doc_name + ".txt" if '.' not in doc_name else doc_name
with open(path.join(docs_path, file_name), "r", encoding=ENCODING) as fdoc:
x_data.append(fdoc.read())
y_data.append(doc_labels[doc_name])

else:
sep_label = sep_label or ';'

with open(labels_path, "r", encoding=ENCODING) as flabels:
y_data = [re.split(sep_label, l) if l else []
for l in tqdm(flabels.read().split('\n'))]
with open(docs_path, "r", encoding=ENCODING) as fdocs:
x_data = fdocs.read().split(sep_doc)

if len(x_data) != len(y_data):
x_data = x_data[:len(y_data)]

for labels in y_data:
for i, label in enumerate(labels):
label = label.strip()
labels[i] = label
cat_info[label] += 1

Print.info("%d categories found" % len(cat_info))
for cat in cat_info:
Print.info(
"'%s'%s"
%
(cat, " (%d documents)" % cat_info[cat]),
offset=4
)

return x_data, y_data

# TODO: save_to_files(x_train, y_train, x_test, y_test)


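A small aside (not part of the diff): in folder mode ``sep_label`` defaults to the ``'\s+'`` regular expression, so label files such as the ``file_labels.tsv`` fixture below may separate the document name from its label with a tab or with any run of spaces. A quick sketch of that default split:

    import re

    # Mirrors the folder-mode parsing above: re.split(sep_label, line.rstrip())
    # with the default sep_label of r'\s+'.
    print(re.split(r'\s+', "doc1\tlabelA".rstrip()))    # ['doc1', 'labelA']
    print(re.split(r'\s+', "doc1    labelA".rstrip()))  # ['doc1', 'labelA']
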
4 changes: 4 additions & 0 deletions tests/dataset_ml/file_labels.tsv
@@ -0,0 +1,4 @@
a catA
a catB
a catC
b catA
15 changes: 15 additions & 0 deletions tests/dataset_ml/train/docs.txt
@@ -0,0 +1,15 @@
this is the first document
>>>>>
second document is toxic, severe_toxic, obscene, and insult.
>>>>>
this is document number 3
>>>>>
the 4th document goes like this one :P
>>>>>
document 5
>>>>>
this is the synthetic document number 6
>>>>>
7, 7 is the number of this document
>>>>>
this is the last document, and it is really toxic!
8 changes: 8 additions & 0 deletions tests/dataset_ml/train/labels.txt
@@ -0,0 +1,8 @@

toxic,severe_toxic,obscene,insult





toxic
3 changes: 3 additions & 0 deletions tests/dataset_ml/train_files/a.txt
@@ -0,0 +1,3 @@
this is the first document!!

:)
3 changes: 3 additions & 0 deletions tests/dataset_ml/train_files/b.txt
@@ -0,0 +1,3 @@
and this is the

SECOND!!
26 changes: 25 additions & 1 deletion tests/test_util.py
@@ -3,15 +3,17 @@
from os import path
from pyss3 import SS3, InvalidCategoryError
from shutil import rmtree
from pyss3.util import Evaluation, Dataset, RecursiveDefaultDict, Print, VERBOSITY
from pyss3.util import Dataset, Evaluation, RecursiveDefaultDict, Print, VERBOSITY

import sys
import pytest
import pyss3

DATASET_FOLDER = "dataset_mr"
DATASET_MULTILABEL_FOLDER = "dataset_ml"
PY3 = sys.version_info[0] >= 3
DATASET_PATH = path.join(path.abspath(path.dirname(__file__)), DATASET_FOLDER)
DATASET_MULTILABEL_PATH = path.join(path.abspath(path.dirname(__file__)), DATASET_MULTILABEL_FOLDER)
TMP_FOLDER = "tests/ss3_models/"


@@ -238,3 +240,25 @@ def test_evaluation(mocker):
# different tag

rmtree("./tests/ss3_models", ignore_errors=True)


def test_dataset():
"""Test Dataset class."""
x_train, y_train = Dataset.load_from_files_multilabel(
path.join(DATASET_MULTILABEL_PATH, "train_files"),
path.join(DATASET_MULTILABEL_PATH, "file_labels.tsv")
)

assert x_train == ['this is the first document!!\n\n:)', 'and this is the\n\nSECOND!!']
assert y_train == [['catA', 'catB', 'catC'], ['catA']]

x_train, y_train = Dataset.load_from_files_multilabel(
path.join(DATASET_MULTILABEL_PATH, "train/docs.txt"),
path.join(DATASET_MULTILABEL_PATH, "train/labels.txt"),
sep_label=",",
sep_doc="\n>>>>>\n"
)

assert len(x_train) == len(y_train) and len(y_train) == 8
assert y_train == [[], ['toxic', 'severe_toxic', 'obscene', 'insult'],
[], [], [], [], [], ['toxic']]
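
Not part of this commit, but a common next step: since ``pyss3.util`` already relies on scikit-learn, the per-document label lists returned above can be turned into a binary indicator matrix with ``MultiLabelBinarizer``. A hedged sketch, reusing the single-file test fixture loaded in ``test_dataset()`` (paths relative to the repository root):

    from sklearn.preprocessing import MultiLabelBinarizer
    from pyss3.util import Dataset

    # Reload the single-file fixture used in test_dataset() above.
    x_train, y_train = Dataset.load_from_files_multilabel(
        "tests/dataset_ml/train/docs.txt",
        "tests/dataset_ml/train/labels.txt",
        sep_label=",", sep_doc="\n>>>>>\n"
    )

    # MultiLabelBinarizer maps the list of label lists (possibly empty) to an
    # (n_docs, n_labels) 0/1 indicator matrix with one column per category.
    mlb = MultiLabelBinarizer()
    y_train_bin = mlb.fit_transform(y_train)
    print(mlb.classes_)       # ['insult' 'obscene' 'severe_toxic' 'toxic']
    print(y_train_bin.shape)  # (8, 4)
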
