In [1]:
import pandas
import numpy
import numpy.random

import vtreat
import vtreat.cross_plan



In [2]:
# Experiment configuration: size of the synthetic data set and number of
# cross-validation folds requested from vtreat in the cells below.
n_row = 1000
k = 5

# Fix the global numpy RNG so the synthetic data is reproducible.
numpy.random.seed(2019)

# Synthetic frame: 'x' is standard-normal noise, 'y' is a rare 0/1 outcome
# (single-trial binomial with p = 0.01).
# The draw order (x first, then y) matters for reproducing the same values.
d = pandas.DataFrame(dict(
    x=numpy.random.normal(size=n_row),
    y=numpy.random.binomial(n=1, p=0.01, size=n_row),
))

d.describe()

Unnamed: 0,x,y
count,1000.0,1000.0
mean,0.005766,0.011
std,1.024104,0.104355
min,-3.20504,0.0
25%,-0.689752,0.0
50%,0.01225,0.0
75%,0.702009,0.0
max,2.928164,1.0


In [3]:
def label_rows(d, cross_plan, *, label_column = 'group'):
    """Annotate each row of ``d`` with the cross-plan fold that applies to it.

    Adds (or overwrites) ``d[label_column]`` in place; nothing is returned.

    :param d: pandas.DataFrame to annotate (mutated in place).
    :param cross_plan: sequence of mappings, each with an ``'app'`` entry
        listing the rows belonging to that fold's application set. The
        entries are treated as row *positions* (0 .. len(d) - 1), which is
        how vtreat cross plans express them.
    :param label_column: name of the column that receives the fold number.

    Rows not listed in any ``'app'`` set keep the label 0. If a row appears
    in several sets, the last fold wins.
    """
    # Build the labels positionally in a plain array. Writing through
    # d.loc[app, ...] would interpret the positions as index *labels* and
    # fail (or mislabel rows) whenever d does not carry a default 0..n-1 index.
    labels = numpy.zeros(d.shape[0], dtype='int64')
    for fold, split in enumerate(cross_plan):
        rows = split['app']
        if len(rows) > 0:
            labels[rows] = fold
    d[label_column] = labels

In [4]:
# Cross plan API reference:
# https://github.com/WinVector/pyvtreat/blob/master/pkg/vtreat/cross_plan.py#L14

# Binomial-outcome treatment of 'x' against 'y' (target value 1), asking
# vtreat for k folds built by the KWayCrossPlanYStratified plan.
treatment_stratified = vtreat.BinomialOutcomeTreatment(
    outcome_name='y',
    outcome_target=1,
    var_list=['x'],
    params=vtreat.vtreat_parameters(dict(
        cross_validation_plan=vtreat.cross_plan.KWayCrossPlanYStratified(),
        cross_validation_k=k,
    )),
)

# Fit and transform in one step, then tag every prepared row with the fold
# recorded in the fitted treatment's cross plan.
prepared_stratified = treatment_stratified.fit_transform(d, d['y'])
label_rows(
    prepared_stratified,
    treatment_stratified.cross_plan_,
    label_column='group',
)

prepared_stratified.head()

Unnamed: 0,y,x,group
0,0,-0.217679,4
1,0,0.821455,2
2,0,1.481278,3
3,0,1.331864,4
4,0,-0.361865,2


In [5]:

# Same treatment as the stratified case, but with the plain KWayCrossPlan
# as the cross-validation plan, for comparison.
treatment_unstratified = vtreat.BinomialOutcomeTreatment(
    outcome_name='y',
    outcome_target=1,
    var_list=['x'],
    params=vtreat.vtreat_parameters(dict(
        cross_validation_plan=vtreat.cross_plan.KWayCrossPlan(),
        cross_validation_k=k,
    )),
)

# Fit and transform in one step, then tag every prepared row with the fold
# recorded in the fitted treatment's cross plan.
prepared_unstratified = treatment_unstratified.fit_transform(d, d['y'])
label_rows(
    prepared_unstratified,
    treatment_unstratified.cross_plan_,
    label_column='group',
)

prepared_unstratified.head()

Unnamed: 0,y,x,group
0,0,-0.217679,1
1,0,0.821455,3
2,0,1.481278,4
3,0,1.331864,0
4,0,-0.361865,1


In [6]:
# Per-fold outcome summary for the stratified plan: number of positives,
# positive rate, and row count. Selecting 'y' before aggregating yields flat
# column names directly, so no MultiIndex flattening is needed.
stratified_summary = (
    prepared_stratified
    .groupby('group')['y']
    .agg(['sum', 'mean', 'count'])
)
stratified_summary

Unnamed: 0_level_0,sum,mean,count
group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,3,0.015,200
1,2,0.01,200
2,2,0.01,200
3,2,0.01,200
4,2,0.01,200


In [7]:
# Spread of the per-fold positive rate (population standard deviation, ddof=0).
stratified_summary['mean'].std(ddof=0)

0.001999999999999999

In [8]:
# Per-fold outcome summary for the unstratified plan: number of positives,
# positive rate, and row count. Selecting 'y' before aggregating yields flat
# column names directly, so no MultiIndex flattening is needed.
unstratified_summary = (
    prepared_unstratified
    .groupby('group')['y']
    .agg(['sum', 'mean', 'count'])
)
unstratified_summary

Unnamed: 0_level_0,sum,mean,count
group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,4,0.02,200
1,1,0.005,200
2,2,0.01,200
3,1,0.005,200
4,3,0.015,200


In [9]:
# Spread of the per-fold positive rate (population standard deviation, ddof=0).
unstratified_summary['mean'].std(ddof=0)



0.0058309518948453

In [10]:
from data_algebra.data_ops import *

In [11]:
# data_algebra pipeline producing the per-group summary of 'y':
# sum, mean, and group size, grouped by the 'group' column.
ops = (
    describe_table(prepared_stratified)
    .project(
        {
            'sum': 'y.sum()',
            'mean': 'y.mean()',
            'size': '_size()',
        },
        group_by=['group'],
    )
)

ops.transform(prepared_stratified)

Unnamed: 0,group,sum,mean,size
0,0,3,0.015,200
1,1,2,0.01,200
2,2,2,0.01,200
3,3,2,0.01,200
4,4,2,0.01,200


In [12]:
# Apply the same `ops` pipeline (described from prepared_stratified) to the
# unstratified frame to get its per-group sum / mean / size of 'y'.
ops.transform(prepared_unstratified)

Unnamed: 0,group,sum,mean,size
0,0,4,0.02,200
1,1,1,0.005,200
2,2,2,0.01,200
3,3,1,0.005,200
4,4,3,0.015,200
