In [1]:
import pandas as pd
import numpy as np
from html import escape
import helpsk as hlp

In [236]:
from sklearn.datasets import fetch_openml
credit_g = fetch_openml('credit-g', version=1)
data = credit_g['data']
data['target'] = credit_g['target']

In [237]:
data.loc[:, 'target'] = data['target'] == 'good'
data.loc[0:10, 'target'] = np.nan
data.loc[:, 'target'] = data['target'].astype('boolean')
data.loc[9:20, 'checking_status'] = np.nan
data.loc[19:30, 'credit_amount'] = np.nan

- change dataset so that there is an entire category missing (e.g. no target == True and checking stauts `<Missing>`
- change dataset so there is an entire category with no sum_by  

In [238]:
indexes_to_blank = (data['target'] == True) & (data['checking_status'] == '0<=X<200')
data.loc[indexes_to_blank, 'checking_status'] = np.nan

In [239]:
indexes_to_blank = (data['target'] == False) & (data['checking_status'] == 'no checking')
data.loc[indexes_to_blank, 'credit_amount'] = np.nan

In [240]:
outer_group_by = 'checking_status'
inner_group_by = 'target'
sum_by = 'credit_amount'

In [241]:
outer_group_by = 'target'
inner_group_by = 'checking_status'
sum_by = 'credit_amount'

In [242]:
data = data[[outer_group_by, inner_group_by, sum_by]]
data.head(5)

Unnamed: 0,target,checking_status,credit_amount
0,,<0,1169.0
1,,0<=X<200,5951.0
2,,no checking,2096.0
3,,<0,7882.0
4,,<0,4870.0


In [243]:
data.loc[:, outer_group_by] = hlp.pandas.fill_na(data[outer_group_by])
data.loc[:, inner_group_by] = hlp.pandas.fill_na(data[inner_group_by])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


In [244]:
data.head(5)

Unnamed: 0,target,checking_status,credit_amount
0,<Missing>,<0,1169.0
1,<Missing>,0<=X<200,5951.0
2,<Missing>,no checking,2096.0
3,<Missing>,<0,7882.0
4,<Missing>,<0,4870.0


In [245]:
data['target'].value_counts()

True         693
False        296
<Missing>     11
Name: target, dtype: int64

In [246]:
def f(x, label):
    d = {}
    d[f'{label} Count'] = x.shape[0]
    d[f'{label} Sum'] = x[sum_by].sum()
    return pd.Series(d)

In [247]:
outer_totals = data.groupby(outer_group_by).apply(f, 'Outer')
#outer_totals.columns = pd.MultiIndex.from_tuples([(outer_group_by, 'Count1'), (outer_group_by, 'Sum1')])
outer_totals = outer_totals.reset_index(level=0, drop=False)
outer_totals

Unnamed: 0,target,Outer Count,Outer Sum
0,False,296.0,960060.0
1,True,693.0,2035914.0
2,<Missing>,11.0,50394.0


In [248]:
hlp.validation.assert_not_any_missing(outer_totals)

In [249]:
hlp.validation.assert_true(outer_totals['Outer Count'].sum() == data.shape[0])

In [250]:
hlp.validation.assert_true(outer_totals['Outer Sum'].sum() == data[sum_by].sum())

In [251]:
outer_totals['Outer Count Perc'] = outer_totals['Outer Count'] / data.shape[0]
outer_totals['Outer Sum Perc'] = outer_totals['Outer Sum'] / data[sum_by].sum()
outer_totals

Unnamed: 0,target,Outer Count,Outer Sum,Outer Count Perc,Outer Sum Perc
0,False,296.0,960060.0,0.296,0.315149
1,True,693.0,2035914.0,0.693,0.668309
2,<Missing>,11.0,50394.0,0.011,0.016542


In [252]:
hlp.validation.assert_is_close(outer_totals['Outer Count Perc'].sum(), 1)

In [253]:
hlp.validation.assert_is_close(outer_totals['Outer Sum Perc'].sum(), 1)

In [254]:
outer_totals = outer_totals[[outer_group_by, 'Outer Count', 'Outer Count Perc', 'Outer Sum', 'Outer Sum Perc']]
outer_totals

Unnamed: 0,target,Outer Count,Outer Count Perc,Outer Sum,Outer Sum Perc
0,False,296.0,0.296,960060.0,0.315149
1,True,693.0,0.693,2035914.0,0.668309
2,<Missing>,11.0,0.011,50394.0,0.016542


In [255]:
inner_totals = data.groupby([outer_group_by, inner_group_by]).apply(f, 'Inner')
inner_totals = inner_totals.reset_index(level=1, drop=False)
inner_totals = inner_totals.reset_index(level=0, drop=False)
inner_totals

Unnamed: 0,target,checking_status,Inner Count,Inner Sum
0,False,<0,131.0,442342.0
1,False,0<=X<200,101.0,474190.0
2,False,>=200,14.0,24160.0
3,False,no checking,46.0,0.0
4,False,<Missing>,4.0,19368.0
5,True,<0,135.0,384385.0
6,True,0<=X<200,,
7,True,>=200,49.0,112623.0
8,True,no checking,341.0,1009722.0
9,True,<Missing>,168.0,529184.0


In [256]:
inner_totals['checking_status'] == '0<=X<200'

0     False
1      True
2     False
3     False
4     False
5     False
6      True
7     False
8     False
9     False
10    False
11     True
12    False
13    False
14    False
Name: checking_status, dtype: bool

In [258]:
expected_na = inner_totals[(inner_totals['target'] == True) & (inner_totals['checking_status'] == '0<=X<200')]
hlp.validation.assert_true(expected_na['Inner Count'].isna().iloc[0])
hlp.validation.assert_true(expected_na['Inner Sum'].isna().iloc[0])

In [259]:
expected_0 = inner_totals[(inner_totals['target'] == False) & (inner_totals['checking_status'] == 'no checking')]

In [260]:
hlp.validation.assert_true(expected_0['Inner Sum'].iloc[0] == 0)

In [261]:
# make sure that if there are missing values, the only missing are in the count and sum columns
hlp.validation.assert_not_any_missing(inner_totals[[outer_group_by, inner_group_by]])

In [262]:
inner_totals['Inner Count'] = inner_totals['Inner Count'].fillna(0)
inner_totals['Inner Sum'] = inner_totals['Inner Sum'].fillna(0)

In [263]:
hlp.validation.assert_not_any_missing(inner_totals)

In [264]:
final = outer_totals.merge(inner_totals, on=outer_group_by, how='left')
final

Unnamed: 0,target,Outer Count,Outer Count Perc,Outer Sum,Outer Sum Perc,checking_status,Inner Count,Inner Sum
0,False,296.0,0.296,960060.0,0.315149,<0,131.0,442342.0
1,False,296.0,0.296,960060.0,0.315149,0<=X<200,101.0,474190.0
2,False,296.0,0.296,960060.0,0.315149,>=200,14.0,24160.0
3,False,296.0,0.296,960060.0,0.315149,no checking,46.0,0.0
4,False,296.0,0.296,960060.0,0.315149,<Missing>,4.0,19368.0
5,True,693.0,0.693,2035914.0,0.668309,<0,135.0,384385.0
6,True,693.0,0.693,2035914.0,0.668309,0<=X<200,0.0,0.0
7,True,693.0,0.693,2035914.0,0.668309,>=200,49.0,112623.0
8,True,693.0,0.693,2035914.0,0.668309,no checking,341.0,1009722.0
9,True,693.0,0.693,2035914.0,0.668309,<Missing>,168.0,529184.0


In [265]:
final['Inner Count Perc'] = final['Inner Count'] / final['Outer Count']
final['Inner Sum Perc'] = final['Inner Sum'] / final['Outer Sum']
final

Unnamed: 0,target,Outer Count,Outer Count Perc,Outer Sum,Outer Sum Perc,checking_status,Inner Count,Inner Sum,Inner Count Perc,Inner Sum Perc
0,False,296.0,0.296,960060.0,0.315149,<0,131.0,442342.0,0.442568,0.460744
1,False,296.0,0.296,960060.0,0.315149,0<=X<200,101.0,474190.0,0.341216,0.493917
2,False,296.0,0.296,960060.0,0.315149,>=200,14.0,24160.0,0.047297,0.025165
3,False,296.0,0.296,960060.0,0.315149,no checking,46.0,0.0,0.155405,0.0
4,False,296.0,0.296,960060.0,0.315149,<Missing>,4.0,19368.0,0.013514,0.020174
5,True,693.0,0.693,2035914.0,0.668309,<0,135.0,384385.0,0.194805,0.188802
6,True,693.0,0.693,2035914.0,0.668309,0<=X<200,0.0,0.0,0.0,0.0
7,True,693.0,0.693,2035914.0,0.668309,>=200,49.0,112623.0,0.070707,0.055318
8,True,693.0,0.693,2035914.0,0.668309,no checking,341.0,1009722.0,0.492063,0.495955
9,True,693.0,0.693,2035914.0,0.668309,<Missing>,168.0,529184.0,0.242424,0.259925


In [266]:
hlp.validation.assert_not_any_missing(final)

In [267]:
new_column_order = final.columns[0:7].tolist() + [final.columns[8]] + [final.columns[7]] + [final.columns[9]]
final = final[new_column_order]

In [268]:
final

Unnamed: 0,target,Outer Count,Outer Count Perc,Outer Sum,Outer Sum Perc,checking_status,Inner Count,Inner Count Perc,Inner Sum,Inner Sum Perc
0,False,296.0,0.296,960060.0,0.315149,<0,131.0,0.442568,442342.0,0.460744
1,False,296.0,0.296,960060.0,0.315149,0<=X<200,101.0,0.341216,474190.0,0.493917
2,False,296.0,0.296,960060.0,0.315149,>=200,14.0,0.047297,24160.0,0.025165
3,False,296.0,0.296,960060.0,0.315149,no checking,46.0,0.155405,0.0,0.0
4,False,296.0,0.296,960060.0,0.315149,<Missing>,4.0,0.013514,19368.0,0.020174
5,True,693.0,0.693,2035914.0,0.668309,<0,135.0,0.194805,384385.0,0.188802
6,True,693.0,0.693,2035914.0,0.668309,0<=X<200,0.0,0.0,0.0,0.0
7,True,693.0,0.693,2035914.0,0.668309,>=200,49.0,0.070707,112623.0,0.055318
8,True,693.0,0.693,2035914.0,0.668309,no checking,341.0,0.492063,1009722.0,0.495955
9,True,693.0,0.693,2035914.0,0.668309,<Missing>,168.0,0.242424,529184.0,0.259925


In [272]:
final[outer_group_by] = final[outer_group_by].astype(str)
final[inner_group_by] = final[inner_group_by].astype(str)

In [273]:
no_style_result = final
# no_style_result['Outer Count'] = no_style_result['Outer Count'].where(is_first, np.nan)
# no_style_result['Outer Sum'] = no_style_result['Outer Sum'].where(is_first, np.nan)
# no_style_result['Outer Count Perc'] = no_style_result['Outer Count Perc'].where(is_first, np.nan)
# no_style_result['Outer Sum Perc'] = no_style_result['Outer Sum Perc'].where(is_first, np.nan)
# #final['Outer Sum'] = no_style_result['Outer Sum'].astype(str).where(is_first, '')
# no_style_result

In [274]:
outer_shifted = final[outer_group_by].shift(1)
is_first = final[outer_group_by] != outer_shifted
is_first

0      True
1     False
2     False
3     False
4     False
5      True
6     False
7     False
8     False
9     False
10     True
11    False
12    False
13    False
14    False
Name: target, dtype: bool

In [275]:
new_columns = pd.MultiIndex.from_tuples([
    (outer_group_by, outer_group_by),
    (outer_group_by, 'Count'),
    (outer_group_by, 'Count Perc'),
    (outer_group_by, 'Sum'),
    (outer_group_by, 'Sum Perc'),
    (inner_group_by, inner_group_by),
    (inner_group_by, 'Count'),
    (inner_group_by, 'Count Perc'),
    (inner_group_by, 'Sum'),
    (inner_group_by, 'Sum Perc'),
])
no_style_result.columns = new_columns
no_style_result

Unnamed: 0_level_0,target,target,target,target,target,checking_status,checking_status,checking_status,checking_status,checking_status
Unnamed: 0_level_1,target,Count,Count Perc,Sum,Sum Perc,checking_status,Count,Count Perc,Sum,Sum Perc
0,False,296.0,0.296,960060.0,0.315149,<0,131.0,0.442568,442342.0,0.460744
1,False,296.0,0.296,960060.0,0.315149,0<=X<200,101.0,0.341216,474190.0,0.493917
2,False,296.0,0.296,960060.0,0.315149,>=200,14.0,0.047297,24160.0,0.025165
3,False,296.0,0.296,960060.0,0.315149,no checking,46.0,0.155405,0.0,0.0
4,False,296.0,0.296,960060.0,0.315149,<Missing>,4.0,0.013514,19368.0,0.020174
5,True,693.0,0.693,2035914.0,0.668309,<0,135.0,0.194805,384385.0,0.188802
6,True,693.0,0.693,2035914.0,0.668309,0<=X<200,0.0,0.0,0.0,0.0
7,True,693.0,0.693,2035914.0,0.668309,>=200,49.0,0.070707,112623.0,0.055318
8,True,693.0,0.693,2035914.0,0.668309,no checking,341.0,0.492063,1009722.0,0.495955
9,True,693.0,0.693,2035914.0,0.668309,<Missing>,168.0,0.242424,529184.0,0.259925


style:

In [276]:
style_result = no_style_result
style_result[outer_group_by] = style_result[outer_group_by].where(is_first, np.nan)
style_result

Unnamed: 0_level_0,target,target,target,target,target,checking_status,checking_status,checking_status,checking_status,checking_status
Unnamed: 0_level_1,target,Count,Count Perc,Sum,Sum Perc,checking_status,Count,Count Perc,Sum,Sum Perc
0,False,296.0,0.296,960060.0,0.315149,<0,131.0,0.442568,442342.0,0.460744
1,,,,,,0<=X<200,101.0,0.341216,474190.0,0.493917
2,,,,,,>=200,14.0,0.047297,24160.0,0.025165
3,,,,,,no checking,46.0,0.155405,0.0,0.0
4,,,,,,<Missing>,4.0,0.013514,19368.0,0.020174
5,True,693.0,0.693,2035914.0,0.668309,<0,135.0,0.194805,384385.0,0.188802
6,,,,,,0<=X<200,0.0,0.0,0.0,0.0
7,,,,,,>=200,49.0,0.070707,112623.0,0.055318
8,,,,,,no checking,341.0,0.492063,1009722.0,0.495955
9,,,,,,<Missing>,168.0,0.242424,529184.0,0.259925


In [277]:
style_result = hlp.pandas_style.html_escape_dataframe(style_result)

In [278]:
idx = pd.IndexSlice
sum_precision = 1

style_result.style.\
    format(subset=idx[:, idx[(outer_group_by, 'target')]], na_rep='').\
    format(subset=idx[:, idx[(outer_group_by, 'Count')]], precision=0, na_rep='').\
    format(subset=idx[:, idx[(outer_group_by, 'Count Perc')]], precision=4, na_rep='', formatter='{:,.2%}'.format).\
    bar(subset=idx[:, idx[(outer_group_by, 'Count Perc')]], vmin=0, vmax=1, color=hlp.color.GRAY).\
    format(subset=idx[:, idx[(outer_group_by, 'Sum')]], precision=sum_precision, na_rep='').\
    format(subset=idx[:, idx[(outer_group_by, 'Sum Perc')]], precision=4, na_rep='', formatter='{:,.2%}'.format).\
    bar(subset=idx[:, idx[(outer_group_by, 'Sum Perc')]], vmin=0, vmax=1, color=hlp.color.GRAY).\
    format(subset=idx[:, idx[(inner_group_by, 'Count')]], precision=0).\
    format(subset=idx[:, idx[(inner_group_by, 'Count Perc')]], precision=4, na_rep='', formatter='{:,.2%}'.format).\
    bar(subset=idx[:, idx[(inner_group_by, 'Count Perc')]], vmin=0, vmax=1, color=hlp.color.GRAY).\
    format(subset=idx[:, idx[(inner_group_by, 'Sum')]], precision=sum_precision).\
    format(subset=idx[:, idx[(inner_group_by, 'Sum Perc')]], precision=4, formatter='{:,.2%}'.format).\
    bar(subset=idx[:, idx[(inner_group_by, 'Sum Perc')]], vmin=0, vmax=1, color=hlp.color.GRAY)

Unnamed: 0_level_0,target,target,target,target,target,checking_status,checking_status,checking_status,checking_status,checking_status
Unnamed: 0_level_1,target,Count,Count Perc,Sum,Sum Perc,checking_status,Count,Count Perc,Sum,Sum Perc
0,False,296.0,29.60%,960060.0,31.51%,<0,131,44.26%,442342.0,46.07%
1,,,,,,0<=X<200,101,34.12%,474190.0,49.39%
2,,,,,,>=200,14,4.73%,24160.0,2.52%
3,,,,,,no checking,46,15.54%,0.0,0.00%
4,,,,,,<Missing>,4,1.35%,19368.0,2.02%
5,True,693.0,69.30%,2035914.0,66.83%,<0,135,19.48%,384385.0,18.88%
6,,,,,,0<=X<200,0,0.00%,0.0,0.00%
7,,,,,,>=200,49,7.07%,112623.0,5.53%
8,,,,,,no checking,341,49.21%,1009722.0,49.60%
9,,,,,,<Missing>,168,24.24%,529184.0,25.99%


In [None]:
sum_precision = 1
style_result.style.\
    format(subset=idx[:, idx[('target', 'target')]], na_rep='').\
    format(subset=idx[:, idx[('target', 'Count')]], precision=0, na_rep='').\
    bar(subset=idx[:, idx[('target', 'Count')]], vmin=0, vmax=data.shape[0], color=hlp.color.GRAY).\
    format(subset=idx[:, idx[('target', 'Count Perc')]], precision=4, na_rep='', formatter='{:,.2%}'.format).\
    bar(subset=idx[:, idx[('target', 'Count Perc')]], vmin=0, vmax=1, color=hlp.color.GRAY).\
    format(subset=idx[:, idx[('target', 'Sum')]], precision=sum_precision, na_rep='').\
    bar(subset=idx[:, idx[('target', 'Sum')]], vmin=0, vmax=data[sum_by].sum(), color=hlp.color.GRAY).\
    format(subset=idx[:, idx[('target', 'Sum Perc')]], precision=4, na_rep='', formatter='{:,.2%}'.format).\
    bar(subset=idx[:, idx[('target', 'Sum Perc')]], vmin=0, vmax=1, color=hlp.color.GRAY)

In [None]:
final[outer_group_by] = final[outer_group_by].astype(str).where(is_first, '')
final['Outer Count'] = final['Outer Count'].astype(str).where(is_first, '')
final['Outer Sum'] = final['Outer Sum'].astype(str).where(is_first, '')
final

In [None]:
final = final[['target', 'Outer Count', 'checking_status', 'Inner Count', 'Perc', 'Outer Sum', 'Inner Sum', 'Sum']]
final

In [None]:
final.style.bar(vmin=0)