In [1]:
import pandas as pd
import numpy as np
from html import escape
import helpsk as hlp

In [2]:
from sklearn.datasets import fetch_openml
credit_g = fetch_openml('credit-g', version=1)
data = credit_g['data']
data['target'] = credit_g['target']

In [3]:
data.loc[:, 'target'] = data['target'] == 'good'
data.loc[0:10, 'target'] = np.nan
data.loc[:, 'target'] = data['target'].astype('boolean')
data.loc[9:20, 'checking_status'] = np.nan
data.loc[19:30, 'credit_amount'] = np.nan

In [4]:
outer_group_by = 'checking_status'
inner_group_by = 'target'
sum_by = 'credit_amount'

In [5]:
outer_group_by = 'target'
inner_group_by = 'checking_status'
sum_by = 'credit_amount'

In [6]:
data = data[[outer_group_by, inner_group_by, sum_by]]
data.head(20)

Unnamed: 0,target,checking_status,credit_amount
0,,<0,1169.0
1,,0<=X<200,5951.0
2,,no checking,2096.0
3,,<0,7882.0
4,,<0,4870.0
5,,no checking,9055.0
6,,no checking,2835.0
7,,0<=X<200,6948.0
8,,no checking,3059.0
9,,,5234.0


In [7]:
data.loc[:, outer_group_by] = hlp.pandas.fill_na(data[outer_group_by])
data.loc[:, inner_group_by] = hlp.pandas.fill_na(data[inner_group_by])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


In [8]:
data.head(20)

Unnamed: 0,target,checking_status,credit_amount
0,<Missing>,<0,1169.0
1,<Missing>,0<=X<200,5951.0
2,<Missing>,no checking,2096.0
3,<Missing>,<0,7882.0
4,<Missing>,<0,4870.0
5,<Missing>,no checking,9055.0
6,<Missing>,no checking,2835.0
7,<Missing>,0<=X<200,6948.0
8,<Missing>,no checking,3059.0
9,<Missing>,<Missing>,5234.0


In [9]:
def f(x, label):
    d = {}
    d[f'{label} Count'] = x.shape[0]
    d[f'{label} Sum'] = x[sum_by].sum()
    return pd.Series(d)

In [10]:
outer_totals = data.groupby(outer_group_by).apply(f, 'Outer')
#outer_totals.columns = pd.MultiIndex.from_tuples([(outer_group_by, 'Count1'), (outer_group_by, 'Sum1')])
outer_totals = outer_totals.reset_index(level=0, drop=False)
outer_totals

Unnamed: 0,target,Outer Count,Outer Sum
0,False,296.0,1157252.0
1,True,693.0,2035914.0
2,<Missing>,11.0,50394.0


In [11]:
hlp.validation.assert_true(outer_totals['Outer Count'].sum() == data.shape[0])

In [12]:
hlp.validation.assert_true(outer_totals['Outer Sum'].sum() == data[sum_by].sum())

In [13]:
outer_totals['Outer Count Perc'] = outer_totals['Outer Count'] / data.shape[0]
outer_totals['Outer Sum Perc'] = outer_totals['Outer Sum'] / data[sum_by].sum()
outer_totals

Unnamed: 0,target,Outer Count,Outer Sum,Outer Count Perc,Outer Sum Perc
0,False,296.0,1157252.0,0.296,0.356785
1,True,693.0,2035914.0,0.693,0.627679
2,<Missing>,11.0,50394.0,0.011,0.015537


In [14]:
hlp.validation.assert_is_close(outer_totals['Outer Count Perc'].sum(), 1)

In [15]:
hlp.validation.assert_is_close(outer_totals['Outer Sum Perc'].sum(), 1)

In [17]:
outer_totals = outer_totals[[outer_group_by, 'Outer Count', 'Outer Count Perc', 'Outer Sum', 'Outer Sum Perc']]
outer_totals

Unnamed: 0,target,Outer Count,Outer Count Perc,Outer Sum,Outer Sum Perc
0,False,296.0,0.296,1157252.0,0.356785
1,True,693.0,0.693,2035914.0,0.627679
2,<Missing>,11.0,0.011,50394.0,0.015537


In [None]:
inner_totals = data.groupby([outer_group_by, inner_group_by]).apply(f, 'Inner')
inner_totals = inner_totals.reset_index(level=1, drop=False)
inner_totals = inner_totals.reset_index(level=0, drop=False)
inner_totals

In [None]:
final = outer_totals.merge(inner_totals, on=outer_group_by, how='left')
final

In [None]:
final['Inner Count Perc'] = final['Inner Count'] / final['Outer Count']
final['Inner Sum Perc'] = final['Inner Sum'] / final['Outer Sum']
final

In [None]:
new_column_order = final.columns[0:7].tolist() + [final.columns[8]] + [final.columns[7]] + [final.columns[9]]
final = final[new_column_order]

In [None]:
final

In [None]:
outer_shifted = final[outer_group_by].shift(1)
is_first = final[outer_group_by] != outer_shifted
is_first

In [None]:
no_style_result = final
no_style_result['Outer Count'] = no_style_result['Outer Count'].where(is_first, np.nan)
no_style_result['Outer Sum'] = no_style_result['Outer Sum'].where(is_first, np.nan)
no_style_result['Outer Count Perc'] = no_style_result['Outer Count Perc'].where(is_first, np.nan)
no_style_result['Outer Sum Perc'] = no_style_result['Outer Sum Perc'].where(is_first, np.nan)
#final['Outer Sum'] = no_style_result['Outer Sum'].astype(str).where(is_first, '')
no_style_result

In [None]:
new_columns = pd.MultiIndex.from_tuples([
    (outer_group_by, outer_group_by),
    (outer_group_by, 'Count'),
    (outer_group_by, 'Count Perc'),
    (outer_group_by, 'Sum'),
    (outer_group_by, 'Sum Perc'),
    (inner_group_by, inner_group_by),
    (inner_group_by, 'Count'),
    (inner_group_by, 'Count Perc'),
    (inner_group_by, 'Sum'),
    (inner_group_by, 'Sum Perc'),
])
no_style_result.columns = new_columns
no_style_result

style:

In [None]:
style_result = no_style_result
style_result[outer_group_by] = style_result[outer_group_by].where(is_first, np.nan)
style_result

In [None]:
sum_precision = 1
style_result.style.\
    format(subset=idx[:, idx[(outer_group_by, 'target')]], na_rep='').\
    format(subset=idx[:, idx[(outer_group_by, 'Count')]], precision=0, na_rep='').\
    format(subset=idx[:, idx[(outer_group_by, 'Count Perc')]], precision=4, na_rep='', formatter='{:,.2%}'.format).\
    bar(subset=idx[:, idx[(outer_group_by, 'Count Perc')]], vmin=0, vmax=1, color=hlp.color.GRAY).\
    format(subset=idx[:, idx[(outer_group_by, 'Sum')]], precision=sum_precision, na_rep='').\
    format(subset=idx[:, idx[(outer_group_by, 'Sum Perc')]], precision=4, na_rep='', formatter='{:,.2%}'.format).\
    bar(subset=idx[:, idx[(outer_group_by, 'Sum Perc')]], vmin=0, vmax=1, color=hlp.color.GRAY).\
    format(subset=idx[:, idx[(inner_group_by, 'Count')]], precision=0).\
    format(subset=idx[:, idx[(inner_group_by, 'Count Perc')]], precision=4, na_rep='', formatter='{:,.2%}'.format).\
    bar(subset=idx[:, idx[(inner_group_by, 'Count Perc')]], vmin=0, vmax=1, color=hlp.color.GRAY).\
    format(subset=idx[:, idx[(inner_group_by, 'Sum')]], precision=sum_precision).\
    format(subset=idx[:, idx[(inner_group_by, 'Sum Perc')]], precision=4, formatter='{:,.2%}'.format).\
    bar(subset=idx[:, idx[(inner_group_by, 'Sum Perc')]], vmin=0, vmax=1, color=hlp.color.GRAY)

In [None]:
sum_precision = 1
style_result.style.\
    format(subset=idx[:, idx[('target', 'target')]], na_rep='').\
    format(subset=idx[:, idx[('target', 'Count')]], precision=0, na_rep='').\
    bar(subset=idx[:, idx[('target', 'Count')]], vmin=0, vmax=data.shape[0], color=hlp.color.GRAY).\
    format(subset=idx[:, idx[('target', 'Count Perc')]], precision=4, na_rep='', formatter='{:,.2%}'.format).\
    bar(subset=idx[:, idx[('target', 'Count Perc')]], vmin=0, vmax=1, color=hlp.color.GRAY).\
    format(subset=idx[:, idx[('target', 'Sum')]], precision=sum_precision, na_rep='').\
    bar(subset=idx[:, idx[('target', 'Sum')]], vmin=0, vmax=data[sum_by].sum(), color=hlp.color.GRAY).\
    format(subset=idx[:, idx[('target', 'Sum Perc')]], precision=4, na_rep='', formatter='{:,.2%}'.format).\
    bar(subset=idx[:, idx[('target', 'Sum Perc')]], vmin=0, vmax=1, color=hlp.color.GRAY)

In [None]:
final[outer_group_by] = final[outer_group_by].astype(str).where(is_first, '')
final['Outer Count'] = final['Outer Count'].astype(str).where(is_first, '')
final['Outer Sum'] = final['Outer Sum'].astype(str).where(is_first, '')
final

In [None]:
final = final[['target', 'Outer Count', 'checking_status', 'Inner Count', 'Perc', 'Outer Sum', 'Inner Sum', 'Sum']]
final

In [None]:
final.style.bar(vmin=0)