FEA/MAINT Add column-wise transforms & refactor TableVectorizer (#902)
Co-authored-by: Théo Jolivet <57430673+TheooJ@users.noreply.github.com>
jeromedockes and TheooJ committed May 28, 2024
1 parent 42b5f90 commit 5b30ddd
Showing 48 changed files with 5,220 additions and 2,517 deletions.
8 changes: 8 additions & 0 deletions CHANGES.rst
@@ -14,6 +14,14 @@ It is currently undergoing fast development and backward compatibility is not en

Major changes
-------------
* The :class:`TableVectorizer` now consistently applies the same transformation
  across different calls to `transform`. There have also been some breaking
  changes to its behaviour: (i) all transformations are now applied
  independently to each column, i.e. it no longer performs multivariate
  transformations; (ii) in ``specific_transformers`` the same column may not be
  used twice (i.e. passed through 2 different transformers).
  :pr:`902` by :user:`Jérôme Dockès <jeromedockes>`.

* Added the :class:`MultiAggJoiner` that allows augmenting a main table with
  multiple auxiliary tables. :pr:`876` by :user:`Théo Jolivet <TheooJ>`.

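A minimal sketch of the new ``TableVectorizer`` behaviour described in the first changelog entry above (editorial illustration, not part of this commit; it assumes ``specific_transformers`` accepts ``(transformer, columns)`` pairs):

import pandas as pd
from sklearn.preprocessing import OrdinalEncoder
from skrub import TableVectorizer

X = pd.DataFrame({"city": ["Paris", "London", "Paris"], "temp": [12.5, 9.0, 11.0]})

# Each column is transformed independently, and a given column may appear in
# at most one ``specific_transformers`` entry.
vectorizer = TableVectorizer(specific_transformers=[(OrdinalEncoder(), ["city"])])
out_fit = vectorizer.fit_transform(X)
out_again = vectorizer.transform(X)  # the same per-column transformations are re-applied
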
11 changes: 10 additions & 1 deletion doc/api.rst
@@ -83,6 +83,7 @@ This page lists all available functions and classes of `skrub`.
   GapEncoder
   MinHashEncoder
   SimilarityEncoder
   ToCategorical

.. raw:: html

@@ -98,10 +99,18 @@ This page lists all available functions and classes of `skrub`.

.. autosummary::
   :toctree: generated/
   :template: function.rst
   :template: class.rst
   :nosignatures:
   :caption: Converting datetime columns in a table

   ToDatetime


.. autosummary::
   :toctree: generated/
   :template: function.rst
   :nosignatures:

   to_datetime

.. raw:: html
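For the new datetime entries above, a short sketch of the intended usage (editorial illustration, not part of this commit; it assumes ``to_datetime`` parses datetime-looking columns of a dataframe and leaves the others untouched, with ``ToDatetime`` as the corresponding transformer class):

import pandas as pd
from skrub import to_datetime

df = pd.DataFrame({"when": ["2024-05-28", "2024-05-29"], "value": [1, 2]})
converted = to_datetime(df)
print(converted.dtypes)  # "when" becomes a datetime column, "value" is unchanged
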
226 changes: 168 additions & 58 deletions examples/01_encodings.py

Large diffs are not rendered by default.

10 changes: 5 additions & 5 deletions examples/03_datetime_encoder.py
@@ -88,12 +88,12 @@

encoder = make_column_transformer(
    (OneHotEncoder(handle_unknown="ignore"), ["city"]),
    (DatetimeEncoder(add_day_of_the_week=True, resolution="minute"), ["date.utc"]),
    (DatetimeEncoder(add_weekday=True, resolution="minute"), "date.utc"),
    remainder="drop",
)

X_enc = encoder.fit_transform(X)
pprint(encoder.get_feature_names_out())
# pprint(encoder.get_feature_names_out())

###############################################################################
# We see that the encoder is working as expected: the ``"date.utc"`` column has
@@ -119,7 +119,7 @@
# Here, for example, we want it to extract the day of the week.

table_vec = TableVectorizer(
    datetime_transformer=DatetimeEncoder(add_day_of_the_week=True),
    datetime_transformer=DatetimeEncoder(add_weekday=True),
).fit(X)
pprint(table_vec.get_feature_names_out())

@@ -257,7 +257,7 @@
from sklearn.inspection import permutation_importance

table_vec = TableVectorizer(
    datetime_transformer=DatetimeEncoder(add_day_of_the_week=True),
    datetime_transformer=DatetimeEncoder(add_weekday=True),
)

# In this case, we don't use a pipeline, because we want to compute the
@@ -280,8 +280,8 @@
y="importances", x="feature_names", title="Feature Importances", figsize=(12, 9)
)
plt.tight_layout()
plt.show()

###############################################################################
# We can see that the total seconds since Epoch and the hour of the day
# are the most important features, which seems reasonable.
#
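The recurring change in this example is the rename of the ``DatetimeEncoder`` parameter ``add_day_of_the_week`` to ``add_weekday`` (editorial note, not part of this commit; the ``_weekday`` feature-name suffix is taken from the ``timestamp_weekday`` column used in the next example below):

from skrub import DatetimeEncoder, TableVectorizer

# add_weekday=True replaces the old add_day_of_the_week=True; the generated
# feature is suffixed with "_weekday" instead of "_day_of_week".
table_vec = TableVectorizer(datetime_transformer=DatetimeEncoder(add_weekday=True))
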
22 changes: 11 additions & 11 deletions examples/08_join_aggregation.py
@@ -88,9 +88,8 @@


table_vectorizer = TableVectorizer(
    datetime_transformer=DatetimeEncoder(add_day_of_the_week=True)
    datetime_transformer=DatetimeEncoder(add_weekday=True)
)
table_vectorizer.set_output(transform="pandas")
X_date_encoded = table_vectorizer.fit_transform(X)
X_date_encoded.head()

@@ -103,19 +102,19 @@


def make_barplot(x, y, title):
    fig, ax = plt.subplots(layout="constrained")
    norm = plt.Normalize(y.min(), y.max())
    cmap = plt.get_cmap("magma")

    sns.barplot(x=x, y=y, palette=cmap(norm(y)))
    plt.title(title)
    plt.xticks(rotation=30)
    plt.ylabel(None)
    plt.tight_layout()
    sns.barplot(x=x, y=y, palette=cmap(norm(y)), ax=ax)
    ax.set_title(title)
    ax.set_xticks(ax.get_xticks(), labels=ax.get_xticklabels(), rotation=30)
    ax.set_ylabel(None)


# 0 is Monday, 6 is Sunday

daily_volume = X_date_encoded["timestamp_day_of_week"].value_counts().sort_index()
daily_volume = X_date_encoded["timestamp_weekday"].value_counts().sort_index()

make_barplot(
    x=daily_volume.index,
@@ -287,9 +286,10 @@ def baseline_r2(X, y, train_idx, test_idx):

# we only keep the last 5 of the 10 results
# because the initial size of the train set is rather small
sns.boxplot(results.tail(5), palette="magma")
plt.ylabel("R2 score")
plt.title("Hyper parameters grid-search results")
fig, ax = plt.subplots(layout="constrained")
sns.boxplot(results.tail(5), palette="magma", ax=ax)
ax.set_ylabel("R2 score")
ax.set_title("Hyper parameters grid-search results")
plt.tight_layout()

###############################################################################
6 changes: 5 additions & 1 deletion skrub/__init__.py
@@ -5,7 +5,7 @@

from ._agg_joiner import AggJoiner, AggTarget
from ._check_dependencies import check_dependencies
from ._datetime_encoder import DatetimeEncoder, to_datetime
from ._datetime_encoder import DatetimeEncoder
from ._deduplicate import compute_ngram_distance, deduplicate
from ._fuzzy_join import fuzzy_join
from ._gap_encoder import GapEncoder
@@ -16,6 +16,8 @@
from ._select_cols import DropCols, SelectCols
from ._similarity_encoder import SimilarityEncoder
from ._table_vectorizer import TableVectorizer
from ._to_categorical import ToCategorical
from ._to_datetime import ToDatetime, to_datetime

check_dependencies()

@@ -25,6 +27,7 @@

__all__ = [
"DatetimeEncoder",
"ToDatetime",
"Joiner",
"fuzzy_join",
"GapEncoder",
@@ -34,6 +37,7 @@
"TableVectorizer",
"deduplicate",
"compute_ngram_distance",
"ToCategorical",
"to_datetime",
"AggJoiner",
"MultiAggJoiner",
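For context (editorial note, not part of this commit): with the ``__init__.py`` changes above, the new estimators are importable from the package root alongside the existing helper:

from skrub import ToCategorical, ToDatetime, to_datetime
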
184 changes: 184 additions & 0 deletions skrub/_check_input.py
@@ -0,0 +1,184 @@
import warnings

import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_is_fitted

from . import _dataframe as sbd
from . import _join_utils, _utils
from ._dispatch import dispatch

__all__ = ["CheckInputDataFrame"]


def _column_names_to_strings(column_names):
    non_string = [c for c in column_names if not isinstance(c, str)]
    if not non_string:
        return column_names
    warnings.warn(
        f"Some column names are not strings: {non_string}. All column names"
        " must be strings; converting to strings."
    )
    return list(map(str, column_names))


def _deduplicated_column_names(column_names):
    duplicates = _utils.get_duplicates(column_names)
    if not duplicates:
        return column_names
    warnings.warn(
        f"Found duplicated column names: {duplicates}. Please make sure column names"
        " are unique. Renaming columns that have duplicated names."
    )
    return _join_utils.pick_column_names(column_names)


def _cleaned_column_names(column_names):
    return _deduplicated_column_names(_column_names_to_strings(column_names))


@dispatch
def _check_not_pandas_sparse(df):
    pass


@_check_not_pandas_sparse.specialize("pandas")
def _check_not_pandas_sparse_pandas(df):
    import pandas as pd

    sparse_cols = [
        col for col in df.columns if isinstance(df[col].dtype, pd.SparseDtype)
    ]
    if sparse_cols:
        raise TypeError(
            f"Columns {sparse_cols} are sparse Pandas series, but dense "
            "data is required. Use ``df[col].sparse.to_dense()`` to convert "
            "a series from sparse to dense."
        )


def _check_is_dataframe(df):
    if not sbd.is_dataframe(df):
        raise TypeError(
            "Only pandas and polars DataFrames are supported. Cannot handle X of"
            f" type: {type(df)}."
        )


def _collect_lazyframe(df):
    if not sbd.is_lazyframe(df):
        return df
    warnings.warn(
        "At the moment, skrub only works on eager DataFrames, calling collect()."
    )
    return sbd.collect(df)


class CheckInputDataFrame(TransformerMixin, BaseEstimator):
"""Check the dataframe entering a skrub pipeline.
This transformer ensures that:
- The input is a dataframe.
- Numpy arrays are converted to pandas dataframes with a warning.
- The dataframe library is the same during ``fit`` and ``transform``, e.g.
fitting on a polars dataframe and then transforming a pandas dataframe is
not allowed.
- A TypeError is raised otherwise.
- Column names are unique strings.
- Non-strings are cast to strings.
- A random suffix is added to duplicated names.
- If either of these operations is needed, a warning is emitted.
- Only applies to pandas; polars column names are always unique strings.
- The input is not sparse.
- A TypeError is raised otherwise.
- The input is not a ``LazyFrame``.
- A ``LazyFrame`` is ``collect``ed with a warning.
- The column names are the same during ``fit`` and ``transform``.
- A ValueError is raised otherwise.
Attributes
----------
module_name_ : str
The name of the dataframe module, 'polars' or 'pandas'.
feature_names_in_ : list
The column names of the input (before cleaning).
n_features_in_ : int
The number of input columns.
feature_names_out_ : list of str
The column names after converting to string and deduplication.
"""

    def fit(self, X, y=None):
        self.fit_transform(X, y)
        return self

    def fit_transform(self, X, y=None):
        del y
        X = self._handle_array(X)
        _check_is_dataframe(X)
        self.module_name_ = sbd.dataframe_module_name(X)
        # TODO check schema (including dtypes) not just names.
        # Need to decide how strict we should be about types
        column_names = sbd.column_names(X)
        self.feature_names_in_ = column_names
        self.n_features_in_ = len(column_names)
        self.feature_names_out_ = _cleaned_column_names(column_names)
        if sbd.column_names(X) != self.feature_names_out_:
            X = sbd.set_column_names(X, self.feature_names_out_)
        _check_not_pandas_sparse(X)
        X = _collect_lazyframe(X)
        return X

    def transform(self, X):
        check_is_fitted(self, "module_name_")
        X = self._handle_array(X)
        _check_is_dataframe(X)
        module_name = sbd.dataframe_module_name(X)
        if module_name != self.module_name_:
            raise TypeError(
                f"Pipeline was fitted to a {self.module_name_} dataframe "
                f"but is being applied to a {module_name} dataframe. "
                "This is likely to produce errors and is not supported."
            )
        column_names = sbd.column_names(X)
        if column_names != self.feature_names_in_:
            import difflib

            diff = "\n".join(
                difflib.Differ().compare(self.feature_names_in_, column_names)
            )
            message = (
                f"Columns of dataframes passed to fit() and transform() differ:\n{diff}"
            )
            raise ValueError(message)
        if sbd.column_names(X) != self.feature_names_out_:
            X = sbd.set_column_names(X, self.feature_names_out_)
        _check_not_pandas_sparse(X)
        X = _collect_lazyframe(X)
        return X

    def _handle_array(self, X):
        if not isinstance(X, np.ndarray):
            return X
        if X.ndim != 2:
            raise ValueError(
                "Input should be a DataFrame. Found an array with incompatible shape:"
                f" {X.shape}."
            )
        warnings.warn(
            "Only pandas and polars DataFrames are supported, but input is a Numpy"
            " array. Please convert Numpy arrays to DataFrames before passing them to"
            " skrub transformers. Converting to pandas DataFrame with columns"
            " ['0', '1', …]."
        )
        import pandas as pd

        columns = list(map(str, range(X.shape[1])))
        X = pd.DataFrame(X, columns=columns)
        return X

    # set_output api compatibility

    def get_feature_names_out(self):
        return self.feature_names_out_
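
A brief usage sketch of the checks documented in ``CheckInputDataFrame`` above (editorial illustration, not part of this commit; it imports from the private module path ``skrub._check_input`` shown in the diff header):

import numpy as np
import pandas as pd
from skrub._check_input import CheckInputDataFrame

check = CheckInputDataFrame()

# A 2D numpy array is accepted with a warning and converted to a pandas
# DataFrame whose columns are named "0", "1", ...
X_checked = check.fit_transform(np.asarray([[1.0, 2.0], [3.0, 4.0]]))
print(check.feature_names_out_)  # ['0', '1']

# Passing a dataframe with different column names to transform() raises a
# ValueError that shows a diff of the two lists of names.
try:
    check.transform(pd.DataFrame({"a": [1.0], "b": [2.0]}))
except ValueError as exc:
    print(exc)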