In [1]:
from sklearn.datasets import load_diabetes

In [2]:
import sumnplot as sp
from sumnplot.discretisation import (
    EqualWidthDiscretiser,
    EqualWeightDiscretiser,
    QuantileDiscretiser,
)

In [3]:
sp.__version__

'0.3.0'

# Set up data
Make the column `s1` positive so it can be treated as weights.

In [4]:
X, y = load_diabetes(return_X_y=True, as_frame=True)

In [5]:
X["s1"] = X["s1"] - X["s1"].min()

In [6]:
X.head()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
0,0.038076,0.05068,0.061696,0.021872,0.082557,-0.034821,-0.043401,-0.002592,0.019908,-0.017646
1,-0.001882,-0.044642,-0.051474,-0.026328,0.118332,-0.019163,0.074412,-0.039493,-0.06833,-0.092204
2,0.085299,0.05068,0.044451,-0.005671,0.081181,-0.034194,-0.032356,-0.002592,0.002864,-0.02593
3,-0.089063,-0.044642,-0.011595,-0.036656,0.138971,0.024991,-0.036038,0.034309,0.022692,-0.009362
4,0.005383,-0.044642,-0.036385,0.021872,0.130716,0.015596,0.008142,-0.002592,-0.031991,-0.046641


# Equal Width

In [7]:
equal_width = EqualWidthDiscretiser(variable="age", n=10)

In [8]:
equal_width.fit_transform(X)

0        (0.0235, 0.0453]
1        (-0.02, 0.00175]
2        (0.0671, 0.0889]
3       (-0.108, -0.0854]
4       (0.00175, 0.0235]
              ...        
437      (0.0235, 0.0453]
438      (-0.02, 0.00175]
439      (0.0235, 0.0453]
440    (-0.0636, -0.0418]
441    (-0.0636, -0.0418]
Name: age, Length: 442, dtype: category
Categories (11, object): [(-0.108, -0.0854] < (-0.0854, -0.0636] < (-0.0636, -0.0418] < (-0.0418, -0.02] ... (0.0453, 0.0671] < (0.0671, 0.0889] < (0.0889, 0.111] < Null]

In [9]:
equal_width.fit_transform(X).value_counts(dropna=False).sort_index()

(-0.108, -0.0854]     24
(-0.0854, -0.0636]    23
(-0.0636, -0.0418]    47
(-0.0418, -0.02]      53
(-0.02, 0.00175]      55
(0.00175, 0.0235]     85
(0.0235, 0.0453]      69
(0.0453, 0.0671]      47
(0.0671, 0.0889]      33
(0.0889, 0.111]        6
Null                   0
Name: age, dtype: int64

# Equal weight

In [10]:
equal_weight = EqualWeightDiscretiser(variable="age", n=10)

In [11]:
equal_weight.fit_transform(X, sample_weight=X["s1"])

0         (0.0344, 0.0453]
1        (-0.02, -0.00188]
2          (0.0666, 0.111]
3        (-0.108, -0.0565]
4      (-0.00188, 0.00902]
              ...         
437       (0.0344, 0.0453]
438      (-0.02, -0.00188]
439       (0.0344, 0.0453]
440     (-0.0565, -0.0418]
441     (-0.0565, -0.0418]
Name: age, Length: 442, dtype: category
Categories (11, object): [(-0.108, -0.0565] < (-0.0565, -0.0418] < (-0.0418, -0.02] < (-0.02, -0.00188] ... (0.0344, 0.0453] < (0.0453, 0.0666] < (0.0666, 0.111] < Null]

In [12]:
equal_weight.fit_transform(X, sample_weight=X["s1"]).value_counts(
    dropna=False
).sort_index()

(-0.108, -0.0565]      58
(-0.0565, -0.0418]     47
(-0.0418, -0.02]       49
(-0.02, -0.00188]      48
(-0.00188, 0.00902]    41
(0.00902, 0.0199]      44
(0.0199, 0.0344]       42
(0.0344, 0.0453]       41
(0.0453, 0.0666]       33
(0.0666, 0.111]        39
Null                    0
Name: age, dtype: int64

In [13]:
X.groupby(equal_weight.fit_transform(X, sample_weight=X["s1"]))["s1"].sum()

age
(-0.108, -0.0565]      5.562977
(-0.0565, -0.0418]     5.950996
(-0.0418, -0.02]       5.985395
(-0.02, -0.00188]      6.376166
(-0.00188, 0.00902]    5.546466
(0.00902, 0.0199]      5.461157
(0.0199, 0.0344]       5.241004
(0.0344, 0.0453]       5.495556
(0.0453, 0.0666]       4.788316
(0.0666, 0.111]        5.629023
Null                   0.000000
Name: s1, dtype: float64

# Quantile bucketing

In [14]:
quantile_buckets = QuantileDiscretiser(
    variable="age", quantiles=(0, 0.25, 0.5, 0.75, 1.0)
)

In [15]:
quantile_buckets.fit_transform(X)

0       (0.00538, 0.0381]
1      (-0.0382, 0.00538]
2         (0.0381, 0.111]
3       (-0.108, -0.0382]
4      (-0.0382, 0.00538]
              ...        
437       (0.0381, 0.111]
438    (-0.0382, 0.00538]
439       (0.0381, 0.111]
440     (-0.108, -0.0382]
441     (-0.108, -0.0382]
Name: age, Length: 442, dtype: category
Categories (5, object): [(-0.108, -0.0382] < (-0.0382, 0.00538] < (0.00538, 0.0381] < (0.0381, 0.111] < Null]

In [16]:
quantile_buckets.fit_transform(X).value_counts(dropna=False).sort_index()

(-0.108, -0.0382]     111
(-0.0382, 0.00538]    116
(0.00538, 0.0381]     112
(0.0381, 0.111]       103
Null                    0
Name: age, dtype: int64

# Weighted quantile bucketing

In [17]:
weighted_quantile_buckets = QuantileDiscretiser(
    variable="age", quantiles=(0, 0.25, 0.5, 0.75, 1.0)
)

In [18]:
weighted_quantile_buckets.fit_transform(X, sample_weight=X["s1"])

0       (0.00902, 0.0417]
1      (-0.0273, 0.00902]
2         (0.0417, 0.111]
3       (-0.108, -0.0273]
4      (-0.0273, 0.00902]
              ...        
437     (0.00902, 0.0417]
438    (-0.0273, 0.00902]
439     (0.00902, 0.0417]
440     (-0.108, -0.0273]
441     (-0.108, -0.0273]
Name: age, Length: 442, dtype: category
Categories (5, object): [(-0.108, -0.0273] < (-0.0273, 0.00902] < (0.00902, 0.0417] < (0.0417, 0.111] < Null]

In [19]:
weighted_quantile_buckets.fit_transform(X, sample_weight=X["s1"]).value_counts(
    dropna=False
).sort_index()

(-0.108, -0.0273]     137
(-0.0273, 0.00902]    106
(0.00902, 0.0417]     113
(0.0417, 0.111]        86
Null                    0
Name: age, dtype: int64

# Fit and transform

In [20]:
equal_weight2 = EqualWeightDiscretiser(variable="bmi", n=8)

In [21]:
equal_weight2.fit(X.loc[0 : X.shape[0] // 2])

EqualWeightDiscretiser(n=8, variable='bmi')

In [22]:
equal_weight2.transform(X.loc[0 : X.shape[0] // 2]).value_counts(
    dropna=False
).sort_index()

(-0.0848, -0.0579]     29
(-0.0579, -0.0375]     28
(-0.0375, -0.0205]     26
(-0.0205, -0.00621]    29
(-0.00621, 0.00996]    28
(0.00996, 0.0283]      27
(0.0283, 0.0558]       27
(0.0558, 0.129]        28
Null                    0
Name: bmi, dtype: int64

In [23]:
equal_weight2.cut_points

array([-0.08380842, -0.05794093, -0.0374625 , -0.02048696, -0.00620595,
        0.00996123,  0.02828403,  0.05576824,  0.12852056])

In [24]:
equal_weight2.transform(X.loc[X.shape[0] // 2 :]).value_counts(
    dropna=False
).sort_index()

(-0.0848, -0.0579]     16
(-0.0579, -0.0375]     26
(-0.0375, -0.0205]     41
(-0.0205, -0.00621]    31
(-0.00621, 0.00996]    25
(0.00996, 0.0283]      18
(0.0283, 0.0558]       25
(0.0558, 0.129]        33
Null                    6
Name: bmi, dtype: int64