In [1]:
import sys

# Add your local path
sys.path.append('/workspaces/tubular')

In [2]:
from __future__ import annotations

from typing import TYPE_CHECKING

import narwhals as nw
import numpy as np
import pandas as pd
import polars as pl
from sklearn.cluster import KMeans

from tubular.base import BaseTransformer, DataFrameMethodTransformer
from tubular.mixins import (
    CheckNumericMixin,
    DropOriginalMixin,
    NewColumnNameMixin,
    TwoColumnMixin,
)

if TYPE_CHECKING:
    from narwhals.typing import (
        FrameT,
        IntoSeriesT,
    )

In [3]:

class BaseNumericTransformer(BaseTransformer, CheckNumericMixin):
    """
    Extends BaseTransformer for datetime scenarios.

    Parameters
    ----------
    columns : List[str]
        List of columns to be operated on.

    **kwargs
        Arbitrary keyword arguments passed onto BaseTransformer.init method.

    Attributes
    ----------
    columns : List[str]
        List of columns to be operated on

    polars_compatible : bool
        class attribute, indicates whether transformer has been converted to polars/pandas agnostic narwhals framework
    FITS: bool
        class attribute, indicates whether transform requires fit to be run first

    """

    polars_compatible = True

    FITS = False

    def __init__(self, columns: list[str], **kwargs: dict[str, bool]) -> None:
        super().__init__(columns=columns, **kwargs)

    @nw.narwhalify
    def fit(
        self,
        X: FrameT,
        y: nw.Series | None = None,
    ) -> BaseNumericTransformer:
        """Base fit method. Validates data and attributes prior to the child objects fit logic.

        Parameters
        ----------
        X : pd/pl.DataFrame
            A dataframe containing the required columns

        y : pd/pl.Series | None
            Required for pipeline.

        """

        super().fit(X, y)

        CheckNumericMixin.check_numeric_columns(self, X[self.columns])

        return self

    @nw.narwhalify
    def transform(self, X: FrameT) -> FrameT:
        """Base transform method. Validates data and attributes prior to the child objects tranform logic.

        Parameters
        ----------
        X : pd/pl.DataFrame
            Data to transform.

        Returns
        -------
        X : pd/pl.DataFrame
            Validated data

        """

        X = super().transform(X)

        CheckNumericMixin.check_numeric_columns(self, X[self.columns])

        return X


In [43]:
from sklearn import datasets

# Load the iris dataset
iris = datasets.load_iris()
df = iris.data
df = pl.DataFrame(df).select('column_0').rename({'column_0':'a'})
df = nw.from_native(df)

In [50]:
kmeans = KMeans(n_clusters=5)
kmeans.fit_predict(df)

  super()._check_params_vs_input(X, default_n_init=10)


array([0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 2, 0, 0, 0, 2, 2, 2, 0, 2, 0, 2, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 2, 0, 1, 4, 1, 2, 1, 2, 4, 0, 1, 0, 0, 4, 4, 4, 2, 1,
       2, 2, 4, 2, 4, 4, 4, 4, 4, 1, 1, 1, 4, 2, 2, 2, 2, 4, 2, 4, 1, 4,
       2, 2, 2, 4, 2, 0, 2, 2, 2, 4, 0, 2, 4, 2, 1, 4, 1, 3, 0, 3, 1, 3,
       1, 4, 1, 2, 2, 4, 1, 3, 3, 4, 1, 2, 3, 4, 1, 3, 4, 4, 4, 3, 3, 3,
       4, 4, 4, 3, 4, 4, 4, 1, 1, 1, 2, 1, 1, 1, 4, 1, 4, 4], dtype=int32)

In [45]:
groups = kmeans.predict(df)
groups = nw.from_native(pl.DataFrame(groups)).rename({'column_0':'groups'})
groups = groups.with_row_index()

In [46]:
df2 = nw.from_native(df)
df2 = df2.with_row_index()

In [47]:
results = df2.join(groups, on = 'index')
bins_max = results.group_by('groups').agg(
    nw.col('a').max()
    ).sort("a").select('a').to_numpy().ravel()

In [49]:
bins_max

array([5.2, 5.8, 6.4, 7.1, 7.9])

In [None]:
class OneDKmeansTransformer(BaseNumericTransformer):
    """Transformer that generates a new column based on kmeans algorithm.
    Transformer runs the kmean algorithm based on given number of clusters and then identifies the bins' cuts based on the results.
    Finally it passes them into the a cut function.

    Parameters
    ----------
    column : str
        Name of the column to discretise.

    new_column_name : str
        Name given to the new discrete column.

    n_clusters : int, default = 8
        The number of clusters to form as well as the number of centroids to generate.

    n_init "auto" or int, default="auto"
        Number of times the k-means algorithm is run with different centroid seeds. 
        The final results is the best output of n_init consecutive runs in terms of inertia. 
        Several runs are recommended for sparse high-dimensional problems (see `Clustering sparse data with k-means <https://scikit-learn.org/stable/auto_examples/text/plot_document_clustering.html#kmeans-sparse-high-dim>`__).

        When n_init='auto', the number of runs depends on the value of init: 10 if using init='random' or init is a callable; 
        1 if using init='k-means++' or init is an array-like.

    **kwargs
        Arbitrary keyword arguments passed onto BaseTransformer.init().

    Attributes
    ----------

    polars_compatible : bool
        class attribute, indicates whether transformer has been converted to polars/pandas agnostic narwhals framework
    FITS: bool
        class attribute, indicates whether transform requires fit to be run first

    """

    polars_compatible = True

    FITS = True

    def __init__(
        self,
        column: str,
        new_column_name: str,
        n_init: str | int = 'auto',
        n_clusters: int = 8,
        **kwargs: dict[str, bool],
    ) -> None:
        if not isinstance(new_column_name, str):
            msg = f"{self.classname()}: new_column_name should be a str but got type {type(new_column_name)}"
            raise TypeError(msg)

        if not isinstance(n_clusters, int):
            msg = f"{self.classname()}: n_clusters should be a str but got type {type(n_clusters)}"
            raise TypeError(msg)

        if not (n_init=="auto" or isinstance(n_init, int)):
            msg = f"{self.classname()}: n_init should be 'auto' or int but got type {type(n_init)}"
            raise TypeError(msg)   
                
        self.n_clusters = n_clusters
        self.new_column_name = new_column_name
        self.n_init = n_init

        super().__init__(columns=column, **kwargs)

    def fit(self, X: FrameT, y: IntoSeriesT | None = None) -> OneDKmeansTransformer:
        """Fir transformer to input data.

        Parameters
        ----------
        X : pd/pl.DataFrame
            Dataframe with columns to learn scaling values from.

        y : None
            Required for pipeline.

        """

        super().fit(X, y)

        X = nw.from_native(X)

        kmeans = KMeans(
            n_clusters=self.n_clusters,
            n_init=self.n_init,
        )

        groups = kmeans.fit_predict(X.select(self.columns))
        if  nw.get_native_namespace(X).__name__ == "pandas":
            groups = nw.from_native(pd.DataFrame(groups)).rename({0:'groups'})
        if  nw.get_native_namespace(X).__name__ == "polars":
            groups = nw.from_native(pl.DataFrame(groups)).rename({'column_0':'groups'})
        groups = groups.with_row_index()
        
        results = X.with_row_index().join(groups, on = 'index')
        bins_max = results.group_by('groups').agg(
            nw.col('a').max()
            ).sort("a").select('a').to_numpy().ravel()

        return bins_max
    
    def transform(self, X : FrameT) -> FrameT:
        X = super().transform(X)
        
        return X
    
iris = datasets.load_iris()
df = iris.data
df = pl.DataFrame(df).select('column_0').rename({'column_0':'a'})
bins = OneDKmeansTransformer(column='a', new_column_name='new').fit(X=df)
print(bins)

df = pd.DataFrame(df).rename(columns={0:'a'})
bins = OneDKmeansTransformer(column='a', new_column_name='new').fit(X=df)
print(bins)

[4.7 5.1 5.5 5.9 6.2 6.7 7.3 7.9]
[4.6 4.9 5.2 5.6 6.1 6.6 7.2 7.9]
