In [1]:
from sklearn.datasets import load_diabetes

In [2]:
import sumnplot as sp
from sumnplot.discretisation import (
    EqualWidthDiscretiser,
    QuantileDiscretiser,
)
from sumnplot.summary import ColumnSummariser

In [3]:
# Display the installed sumnplot version so the outputs below can be tied to a release.
sp.__version__

'0.3.0'

# Set up data
Shift the column `s1` so that its minimum is zero (i.e. all values are non-negative), so it can be used as sample weights.

In [4]:
# Load the scikit-learn diabetes dataset as pandas objects: features in X, target in y.
X, y = load_diabetes(as_frame=True, return_X_y=True)

In [5]:
# Subtract the column minimum so the smallest `s1` value becomes zero and none are negative.
X["s1"] = X["s1"].sub(X["s1"].min())

In [6]:
# Bucket `age` with a quantile discretiser, passing the `s1` column as sample weights,
# and store the resulting buckets as a new column.
age_discretiser = QuantileDiscretiser(variable="age")
X["age_bucketed"] = age_discretiser.fit_transform(X, sample_weight=X["s1"])

In [7]:
# Preview the first five rows, including the new `age_bucketed` column.
X.head(n=5)

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,age_bucketed
0,0.038076,0.05068,0.061696,0.021872,0.082557,-0.034821,-0.043401,-0.002592,0.019908,-0.017646,"(0.0344, 0.0453]"
1,-0.001882,-0.044642,-0.051474,-0.026328,0.118332,-0.019163,0.074412,-0.039493,-0.06833,-0.092204,"(-0.02, -0.00188]"
2,0.085299,0.05068,0.044451,-0.005671,0.081181,-0.034194,-0.032356,-0.002592,0.002864,-0.02593,"(0.0666, 0.111]"
3,-0.089063,-0.044642,-0.011595,-0.036656,0.138971,0.024991,-0.036038,0.034309,0.022692,-0.009362,"(-0.108, -0.0565]"
4,0.005383,-0.044642,-0.036385,0.021872,0.130716,0.015596,0.008142,-0.002592,-0.031991,-0.046641,"(-0.00188, 0.00902]"


# Summarise by columns

## Specifying a discretiser for each column individually

In [8]:
# One discretiser object per by-column: each names the variable it buckets.
per_column_discretisers = [
    sp.discretisation.EqualWidthDiscretiser(variable="age"),
    sp.discretisation.EqualWeightDiscretiser(variable="bmi"),
    sp.discretisation.QuantileDiscretiser(variable="bp"),
]

# Summarise s1, s2 and s3 using the discretisers defined above.
column_summariser = ColumnSummariser(
    to_summarise_columns=["s1", "s2", "s3"],
    discretisers=per_column_discretisers,
)

In [9]:
# Report, for each discretiser, whether it has a `cut_points` attribute
# before any summarising has been run.
for discretiser in column_summariser.discretisers:
    has_cut_points = hasattr(discretiser, "cut_points")
    print(has_cut_points)

False
False
False


In [10]:
# NOTE(review): the recorded output of this cell is
# "AttributeError: 'ColumnSummariser' object has no attribute 'summarise_columns'",
# and every later cell is unexecuted (In [None]). The method name does not match the
# installed sumnplot release — TODO confirm the correct name and re-run the notebook.
column_summariser.summarise_columns(X)

AttributeError: 'ColumnSummariser' object has no attribute 'summarise_columns'

In [None]:
# Repeat the `cut_points` check after the summarise call — presumably to see whether
# summarising fits the discretisers (TODO confirm; this cell has no recorded output).
for discretiser in column_summariser.discretisers:
    has_cut_points = hasattr(discretiser, "cut_points")
    print(has_cut_points)

## Specifying a discretiser or categorical column name

In [None]:
# Mix an existing column name (the pre-bucketed `age_bucketed`) with discretiser
# objects in the same `discretisers` list.
by_column_specs = [
    "age_bucketed",
    sp.discretisation.EqualWeightDiscretiser(variable="bmi"),
    sp.discretisation.QuantileDiscretiser(variable="bp"),
]

column_summariser_b = ColumnSummariser(
    to_summarise_columns=["s1", "s2", "s3"],
    discretisers=by_column_specs,
)

In [None]:
# Summarise with the `s1` column supplied as sample weights.
# NOTE(review): `summarise_columns` raised AttributeError in the recorded output of
# an earlier cell — confirm the method name before relying on this call.
s1_weights = X["s1"]
column_summariser_b.summarise_columns(X, sample_weight=s1_weights)

## Specifying one discretiser to use for all columns

In [None]:
# Columns to summarise, the labels to report them under, and the columns to group by.
summary_columns = ["s1", "s2", "s3"]
summary_labels = ["label1", "label2", "label3"]
group_by_columns = ["age_bucketed", "bmi", "bp"]

# A single discretiser class is supplied for all by-columns; `discretiser_kwargs`
# is presumably forwarded when the class is instantiated (TODO confirm).
column_summariser_c = ColumnSummariser(
    to_summarise_columns=summary_columns,
    to_summarise_columns_labels=summary_labels,
    by_columns=group_by_columns,
    discretiser=EqualWidthDiscretiser,
    discretiser_kwargs={"n": 5},
)

In [None]:
# Run the summary without sample weights.
# NOTE(review): `summarise_columns` raised AttributeError in the recorded output of
# an earlier cell — confirm the method name before relying on this call.
column_summariser_c.summarise_columns(X)

## Specifying a column to divide through by in the summaries

In [None]:
# Columns to summarise and the columns to group by.
summary_columns = ["s1", "s2", "s3"]
group_by_columns = ["age_bucketed", "bmi", "bp"]

# `s1` is named as the divide column for the summaries; a single discretiser
# class is again supplied for all by-columns.
column_summariser_d = ColumnSummariser(
    to_summarise_columns=summary_columns,
    to_summarise_divide_column="s1",
    by_columns=group_by_columns,
    discretiser=EqualWidthDiscretiser,
    discretiser_kwargs={"n": 5},
)

In [None]:
# Run the summary without sample weights.
# NOTE(review): `summarise_columns` raised AttributeError in the recorded output of
# an earlier cell — confirm the method name before relying on this call.
column_summariser_d.summarise_columns(X)

## Using the `_summarise_column` method directly

In [None]:
# Arguments for a single-column summary, grouped by the pre-bucketed `age_bucketed`
# column, with custom labels and `s1` as the divide column.
single_summary_kwargs = {
    "df": X,
    "to_summarise_columns": ["s1", "s2", "s3"],
    "to_summarise_columns_labels": ["obs", "p1", "p2"],
    "to_summarise_divide_column": "s1",
    "by_column": "age_bucketed",
}

# Call the private helper on the class itself, without creating an instance.
ColumnSummariser._summarise_column(**single_summary_kwargs)

## Specifying a second group by column

In [None]:
# First grouping: `bmi` cut at the 0, 0.33, 0.66 and 1.0 quantiles.
bmi_discretiser = sp.discretisation.QuantileDiscretiser(
    variable="bmi", quantiles=(0, 0.33, 0.66, 1.0)
)

# Second grouping: `bp` cut at the 0, 0.5 and 1.0 quantiles.
bp_discretiser = sp.discretisation.QuantileDiscretiser(
    variable="bp", quantiles=(0, 0.5, 1.0)
)

# Summarise by both groupings, calling the private helper on the class itself.
ColumnSummariser._summarise_column(
    df=X,
    to_summarise_columns=["s1", "s2", "s3"],
    to_summarise_divide_column="s1",
    by_column=bmi_discretiser,
    second_by_column=bp_discretiser,
)