In [1]:
import numpy as np
from sklearn.datasets import load_iris
from sklearn.preprocessing import KBinsDiscretizer

In [2]:
# Range experiments. Each entry is [from, to, step, n_bins, strategy]; the
# samples are range(from, to, step) and strategy is "Q" (quantile) or
# "U" (uniform).
experiments_range = [
    # Zero-based ranges: every (to, n_bins) pair first with "Q", then with "U".
    [0, stop, 1, bins, code]
    for code in ("Q", "U")
    for stop, bins in ((100, 4), (50, 4), (100, 3), (50, 3), (10, 3))
] + [
    # One-based ranges ending at 10..15, three bins, "Q" then "U" for each end.
    [1, stop, 1, 3, code]
    for stop in range(10, 16)
    for code in ("Q", "U")
]

# Vector experiments. Each entry is (n_bins, values); every vector is run with
# both strategies by the cell that writes the fixtures.
experiments_vectors = [
    (3, [3.0, 1.0, 1.0] * 3),
    # Ascending sequences 1.0 .. last for last = 12, 13, 14, 15.
    *((3, [float(v) for v in range(1, last + 1)]) for last in (12, 13, 14, 15)),
    # The values 1.0 .. 15.0 in shuffled order.
    (3, [15.0, 8.0, 12.0, 14.0, 6.0, 1.0, 13.0, 11.0, 10.0, 9.0, 7.0, 4.0, 3.0, 5.0, 2.0]),
    # Repeated values.
    (3, [0.0, 1.0, 1.0, 1.0, 2.0, 2.0, 3.0, 3.0, 3.0, 4.0]),
]

In [3]:
def write_lists(file, data, cuts):
    """Write one experiment's results as two comma-separated text lines.

    The first line holds the discretized values as integers, the second the
    cut points rounded to 5 decimal places. Each line ends with a newline,
    and an empty sequence produces an empty line.

    Args:
        file: Writable text stream.
        data: Discretized values, either a flat sequence or the
            (n_samples, 1) array that ``KBinsDiscretizer.transform`` returns.
        cuts: Sequence of bin edges.
    """
    # Flatten first: iterating an (n_samples, 1) array yields one-element
    # arrays, and int() on those is deprecated since NumPy 1.25.
    bin_indices = np.ravel(data)
    file.write(", ".join(f"{int(index):d}" for index in bin_indices))
    file.write("\n")
    # round() is applied to the edge as-is so the text matches earlier runs.
    file.write(", ".join(str(round(edge, 5)) for edge in cuts))
    file.write("\n")

def _make_discretizer(n_bins, strategy):
    """Build the ordinal-encoding KBinsDiscretizer used by every experiment.

    Args:
        n_bins: Number of bins to produce.
        strategy: One-letter code; "Q" selects the quantile strategy and any
            other value selects the uniform strategy.
    """
    return KBinsDiscretizer(
        n_bins=n_bins,
        encode="ordinal",
        strategy="quantile" if strategy.strip() == "Q" else "uniform",
    )


# Regenerate datasets/tests.txt. Every experiment is written as a marker line
# (RANGE or VECTOR), a line describing the input, the discretized data, and
# the cut points found by KBinsDiscretizer.
with open("datasets/tests.txt", "w") as file:
    # Header describing the layout of a RANGE record.
    file.write("#\n")
    file.write("# from, to, step, #bins, Q/U\n")
    file.write("# discretized data\n")
    file.write("# cut points\n")
    file.write("#\n")
    #
    # Range experiments: the samples are range(from_, to_, step_).
    #
    file.write("#\n")
    file.write("# Range experiments\n")
    file.write("#\n")
    for from_, to_, step_, bins_, strategy in experiments_range:
        file.write("RANGE\n")
        disc = _make_discretizer(bins_, strategy)
        # KBinsDiscretizer expects 2-D input: one single-feature row per sample.
        data = [[x] for x in range(from_, to_, step_)]
        result = disc.fit_transform(data)
        file.write(f"{from_}, {to_}, {step_}, {bins_}, {strategy}\n")
        # Pass a flat vector of bin indices rather than an (n, 1) column.
        write_lists(file, result.ravel(), disc.bin_edges_[0])
    #
    # Vector experiments: explicit sample vectors, each run with both strategies.
    #
    file.write("#\n")
    file.write("# Vector experiments\n")
    file.write("#\n")
    for n_bins, experiment in experiments_vectors:
        for strategy in ["Q", "U"]:
            file.write("VECTOR\n")
            file.write(f"{strategy}{n_bins}{experiment}\n")
            disc = _make_discretizer(n_bins, strategy)
            data = [[x] for x in experiment]
            result = disc.fit_transform(data)
            write_lists(file, result.ravel(), disc.bin_edges_[0])
    #
    # Vector experiments iris: every feature column with 3 and 4 bins.
    #
    file.write("#\n")
    file.write("# Vector experiments with iris\n")
    file.write("#\n")
    X, y = load_iris(return_X_y=True)
    for i in range(X.shape[1]):
        # The column does not depend on n_bins or strategy, so slice it once.
        experiment = X[:, i]
        for n_bins in [3, 4]:
            for strategy in ["Q", "U"]:
                file.write("VECTOR\n")
                file.write(f"{strategy}{n_bins}{experiment.tolist()}\n")
                disc = _make_discretizer(n_bins, strategy)
                data = [[x] for x in experiment]
                result = disc.fit_transform(data)
                write_lists(file, result.ravel(), disc.bin_edges_[0])



In [10]:
# Check which bin a value gets when it sits exactly on a uniform cut point.
X = [[value] for value in range(100)]
disc = KBinsDiscretizer(strategy="uniform", encode="ordinal", n_bins=3)
result = disc.fit_transform(X)
print("Cut points", disc.bin_edges_)

# Sample indices just below, on, and just above the two interior cut points.
test = [32, 33, 34, 65, 66, 67]
print("\n".join(f"i={i!r} X[{i}]={X[i]} result[{i}]={result[i]}" for i in test))

Cut points [array([ 0., 33., 66., 99.])]
i=32 X[32]=[32] result[32]=[0.]
i=33 X[33]=[33] result[33]=[1.]
i=34 X[34]=[34] result[34]=[1.]
i=65 X[65]=[65] result[65]=[1.]
i=66 X[66]=[66] result[66]=[2.]
i=67 X[67]=[67] result[67]=[2.]


In [15]:
import numpy as np

# Look up the same sample values among the interior cut points (first and last
# edge dropped) with both searchsorted conventions. In the recorded output,
# side="right" matches the KBinsDiscretizer bin indices printed by the
# previous cell.
interior_edges = disc.bin_edges_[0][1:-1]
for label, side in (("right", "right"), ("left ", "left")):
    print(label, np.searchsorted(interior_edges, test, side=side))

right [0 1 1 1 2 2]
left  [0 0 1 1 1 2]
