In [2]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import plotly.express as px
import plotly.figure_factory as ff

In [3]:
# Load the stratification-task dataset from the CSV that sits next to the notebook.
DATA_PATH = 'stratification_task_data_public.csv'
df = pd.read_csv(DATA_PATH)

In [4]:

# Preview the loaded frame; the rendering is truncated to the first and last rows.
df

Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,y
0,0.869,30,33.8,0,1,0.2,1992,1,1,1,1903
1,0.759,27,21.7,2,0,3.5,1995,1,1,2,1313
2,0.456,29,37.6,2,0,3.1,1993,0,0,0,1484
3,0.060,35,27.5,2,0,4.7,1988,0,0,1,1188
4,0.939,19,30.7,0,0,3.6,2003,1,1,2,842
...,...,...,...,...,...,...,...,...,...,...,...
9995,0.844,31,31.7,0,0,1.8,1992,1,0,3,1798
9996,0.342,27,32.5,1,0,1.8,1996,1,0,0,1457
9997,0.679,28,29.7,1,0,0.8,1994,1,0,2,1477
9998,0.724,22,38.4,0,0,4.6,2000,1,0,2,1484


In [5]:
# Number of rows in the dataset.
df.shape[0]

10000

In [6]:
# Pairwise Pearson correlations between all columns, rendered as a heat map.
# In the recorded output x2 and x7 are almost perfectly anti-correlated (about -0.99),
# while the strongest linear links with y are x10 and x8.
corr = df.corr(method='pearson')
corr.style.background_gradient(cmap='RdYlGn')

Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,y
x1,1.0,-0.005929,-0.012855,0.001497,-0.01404,-0.015988,0.005006,0.008268,0.009632,-0.000988,0.001949
x2,-0.005929,1.0,-0.006367,0.002223,-0.006278,0.001817,-0.994133,-0.003189,0.004515,-0.017679,-0.031074
x3,-0.012855,-0.006367,1.0,0.014967,0.006089,0.00026,0.005265,-0.006652,-0.000694,0.010359,-0.016386
x4,0.001497,0.002223,0.014967,1.0,0.012947,-0.000159,-0.003701,0.013017,0.012899,-0.001301,0.010589
x5,-0.01404,-0.006278,0.006089,0.012947,1.0,0.008639,0.007417,0.00549,-0.011906,0.010445,0.030876
x6,-0.015988,0.001817,0.00026,-0.000159,0.008639,1.0,-0.001239,0.009014,-0.004835,-0.002753,-0.002978
x7,0.005006,-0.994133,0.005265,-0.003701,0.007417,-0.001239,1.0,0.003755,-0.004342,0.017433,0.03095
x8,0.008268,-0.003189,-0.006652,0.013017,0.00549,0.009014,0.003755,1.0,0.012639,0.012351,0.136201
x9,0.009632,0.004515,-0.000694,0.012899,-0.011906,-0.004835,-0.004342,0.012639,1.0,0.010224,-0.022753
x10,-0.000988,-0.017679,0.010359,-0.001301,0.010445,-0.002753,0.017433,0.012351,0.010224,1.0,0.16944


In [7]:
# Average target value for every distinct x2, kept as a flat two-column frame.
df.groupby(['x2'], as_index=False)['y'].agg('mean')

Unnamed: 0,x2,y
0,12,572.5
1,15,677.5
2,16,746.75
3,17,897.777778
4,18,950.291667
5,19,1107.7
6,20,1058.162162
7,21,1094.780303
8,22,1146.612245
9,23,1209.298932


In [9]:
# Distribution of y for each value of x2, used to choose strata boundaries by eye.
fig = px.box(data_frame=df, x='x2', y='y')
fig.show()

In [30]:
# Strata are chosen by eye: x2 is cut into four hand-picked bands.
x2_values = df['x2']
in_tails = (x2_values < 23) | (x2_values > 37)
in_centre = (x2_values > 28) & (x2_values < 32)
in_shoulders = (
    ((x2_values >= 25) & (x2_values <= 28))
    | ((x2_values >= 32) & (x2_values <= 34))
)

# Stratum 1 is the default for every row that none of the masks below covers.
df['strat'] = 1
df.loc[in_tails, 'strat'] = 2
df.loc[in_centre, 'strat'] = 3
df.loc[in_shoulders, 'strat'] = 4

# Sanity check: mean target and relative size of every stratum.
print(df.groupby('strat')['y'].mean())
print(df['strat'].value_counts(normalize=True))

strat
1    1251.406701
2    1083.233879
3    1514.821205
4    1393.922122
Name: y, dtype: float64
strat
4    0.4674
3    0.2556
1    0.1731
2    0.1039
Name: proportion, dtype: float64


# Эталонное решение

## Без ML

In [None]:
def get_strats(df: pd.DataFrame):
    """Return a stratum label for every row of ``df``.

    The label is a four-character string of 0/1 flags, in this order:
    ``26 <= x2 <= 34``, ``28 <= x2 <= 32``, ``x10 > 1`` and ``x5 == x9``.

    :param df: frame containing the columns ``x2``, ``x10``, ``x5`` and ``x9``.
    :return (list): stratum labels, one per row, of length ``len(df)``.
    """
    labels = []
    for x2_val, x10_val, x5_val, x9_val in df[['x2', 'x10', 'x5', 'x9']].values:
        flags = (
            26 <= x2_val <= 34,
            28 <= x2_val <= 32,
            x10_val > 1,
            x5_val == x9_val,
        )
        # Each boolean flag becomes a single '0' or '1' character.
        labels.append(''.join(str(int(flag)) for flag in flags))
    return labels

## С ML

In [36]:
def calculate_strat_var(df):
    """Compute the stratified variance of ``y``.

    The result is the sum over strata of (share of rows in the stratum) times
    (sample variance of ``y`` inside the stratum).

    :param df: frame with the columns ``strat`` and ``y``.
    :return: the weighted sum of within-stratum variances.
    """
    shares = df['strat'].value_counts(normalize=True)
    within_var = df.groupby('strat')['y'].var()
    # Both series are indexed by stratum, so the product is aligned per stratum.
    return within_var.mul(shares).sum()

In [38]:
from lightgbm import LGBMRegressor

# Hold-out scheme: the model is fitted on the first half of the rows and the
# strata are built and evaluated on the second half.
half = len(df) // 2
df_train, df_test = df.iloc[:half].copy(), df.iloc[half:].copy()

feature_names = [f'x{i}' for i in range(1, 11)]
model = LGBMRegressor(num_leaves=3)
model.fit(df_train[feature_names].values, df_train['y'].values)
predict_test = model.predict(df_test[feature_names].values)

# Strata are the deciles of the predicted target: the label of a row is the
# number of quantile cut points (levels 0, 0.1, ..., 0.9) its prediction reaches.
n_strat = 10
quantiles = np.quantile(predict_test, np.linspace(0, 1 - 1 / n_strat, n_strat))
df_test['strat'] = (predict_test[:, None] >= quantiles).sum(axis=1)
print(int(calculate_strat_var(df_test)))

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000323 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 769
[LightGBM] [Info] Number of data points in the train set: 5000, number of used features: 10
[LightGBM] [Info] Start training from score 1370.817800
42289


In [102]:
# Toy A/B data: one row per unit, columns are [metric value, stratum id].
# Group A has metric 0..9 with its first 4 units in stratum 1;
# group B has metric 1..10 with its first 5 units in stratum 1.
unit_idx = np.arange(10)
metrics_strat_a_group = np.column_stack([unit_idx, unit_idx < 4]).astype(float)
metrics_strat_b_group = np.column_stack([unit_idx + 1, unit_idx < 5]).astype(float)

In [103]:
# Stack both groups row-wise into one (units x 2) array.
metrics = np.concatenate((metrics_strat_a_group, metrics_strat_b_group), axis=0)

In [123]:
# Inspect group A: column 0 is the metric, column 1 is the stratum flag.
metrics_strat_a_group

array([[0., 1.],
       [1., 1.],
       [2., 1.],
       [3., 1.],
       [4., 0.],
       [5., 0.],
       [6., 0.],
       [7., 0.],
       [8., 0.],
       [9., 0.]])

In [121]:
# Plain (unstratified) mean of the metric column in group B.
np.mean(metrics_strat_b_group[:, 0])

5.5

In [128]:
from scipy import stats

# Toy A/B data: one row per unit, columns are [metric value, stratum id].
# Group A has metric 0..9 with its first 4 units in stratum 1;
# group B has metric 1..10 with its first 5 units in stratum 1.
metrics_strat_a_group = np.zeros((10, 2,))
metrics_strat_a_group[:, 0] = np.arange(10)
metrics_strat_a_group[:, 1] = (np.arange(10) < 4).astype(float)
metrics_strat_b_group = np.zeros((10, 2,))
metrics_strat_b_group[:, 0] = np.arange(1, 11)
metrics_strat_b_group[:, 1] = (np.arange(10) < 5).astype(float)

df_a = pd.DataFrame(metrics_strat_a_group, columns=['metric', 'strat'])
df_b = pd.DataFrame(metrics_strat_b_group, columns=['metric', 'strat'])
# NOTE(review): this rebinds the notebook-wide name `df` to the pooled toy
# frame; cells that expect the CSV data must not be re-run after this one.
df = pd.concat([df_a, df_b])

# Stratum weights are estimated from the pooled sample and shared by both groups.
strats_share = df['strat'].value_counts(normalize=True)

# A stratum that is absent from a group (or has a single observation) yields a
# NaN mean/variance which `.sum()` would silently drop, biasing the estimate.
# Fail loudly instead.
for group_name, group_df in (('A', df_a), ('B', df_b)):
    strat_sizes = group_df['strat'].value_counts().reindex(strats_share.index, fill_value=0)
    if (strat_sizes < 2).any():
        sparse_strats = sorted(strat_sizes[strat_sizes < 2].index)
        raise ValueError(
            f'group {group_name} has fewer than 2 observations in strata {sparse_strats}'
        )

# Stratified mean: per-stratum means weighted by the pooled stratum shares.
mean_a = (df_a.groupby('strat')['metric'].mean() * strats_share).sum()
mean_b = (df_b.groupby('strat')['metric'].mean() * strats_share).sum()

# Stratified variance: per-stratum sample variances weighted the same way.
var_a = (df_a.groupby('strat')['metric'].var() * strats_share).sum()
var_b = (df_b.groupby('strat')['metric'].var() * strats_share).sum()

delta = mean_b - mean_a
std = (var_a / len(df_a) + var_b / len(df_b)) ** 0.5

t = delta / std
# Two-sided p-value under the normal approximation. The survival function is
# used instead of `1 - cdf`, which cancels to exactly 0 for large |t|.
p_val = 2 * stats.norm.sf(np.abs(t))
print(p_val)

0.037056218564119


In [111]:
# Relative size of every stratum in `df`.
# NOTE(review): the recorded 0.55 / 0.45 split matches the pooled toy A+B frame,
# so this cell presumably expects `df` to be that frame, not the CSV data - confirm.
df['strat'].value_counts(normalize=True)

strat
0.0    0.55
1.0    0.45
Name: proportion, dtype: float64