In [1]:
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

import numpy as np
import pandas as pd
from scipy.stats import norm
from scipy.special import logit

from category_encoders.one_hot import OneHotEncoder
from category_encoders.target_encoder import TargetEncoder
from sklearn.linear_model import LogisticRegression

### Вычисление $\pi_0$ и $\pi_1$

In [2]:
# Load only the columns used in this notebook. Identifier columns are read as
# strings, `clicks` as int, and `date_time` is parsed into timestamps.
df = pd.read_csv(
    'data.csv',
    usecols=[
        'date_time', 'zone_id', 'banner_id',
        'campaign_clicks', 'os_id', 'country_id',
        'banner_id0', 'rate0', 'g0', 'coeff_sum0',
        'banner_id1', 'rate1', 'g1', 'coeff_sum1',
        'clicks',
    ],
    parse_dates=['date_time'],
    dtype={
        'zone_id': str,
        'banner_id': str,
        'banner_id0': str,
        'banner_id1': str,
        'os_id': str,
        'country_id': str,
        'clicks': int,
    },
)
df.head()

Unnamed: 0,date_time,zone_id,banner_id,campaign_clicks,os_id,country_id,banner_id0,rate0,g0,coeff_sum0,banner_id1,rate1,g1,coeff_sum1,clicks
0,2021-09-27 00:01:30,0,0,0,0,0,1240,0.067,0.035016,-7.268846,0,0.01,0.049516,-5.369901,1
1,2021-09-26 22:54:49,1,1,0,0,1,1,0.002,0.054298,-2.657477,269,0.004,0.031942,-4.44922,1
2,2021-09-26 23:57:20,2,2,3,0,0,2,0.014,0.014096,-3.824875,21,0.014,0.014906,-3.939309,1
3,2021-09-27 00:04:30,3,3,0,1,1,3,0.012,0.015232,-3.461357,99,0.006,0.050671,-3.418403,1
4,2021-09-27 00:06:21,4,4,0,1,0,4,0.019,0.051265,-4.009026,11464230,6.79,0.032005,-2.828797,1


In [3]:
df = df.set_index('date_time')

# Split by calendar day with boolean masks instead of partial-string slices.
# The rows are not sorted by time (see the df.head() output), and recent pandas
# versions reject value-based slicing on a non-monotonic DatetimeIndex.
# Masks also keep the original row order.
test_day = pd.Timestamp('2021-10-02')
is_train = df.index < test_day
is_test_day = (df.index >= test_day) & (df.index < test_day + pd.Timedelta(days=1))

# Everything up to and including 2021-10-01 is used for fitting.
train = df[is_train].copy()

# Test-day rows where the shown banner equals candidate 0.
shown_is_banner0 = (df['banner_id'] == df['banner_id0']).to_numpy()
test0 = df[is_test_day & shown_is_banner0].copy()

# Same rows, but with candidate 1 substituted as the banner to be scored.
test1 = test0.copy()
test1['banner_id'] = test1['banner_id1']

# From here on `df` is the raw (un-encoded) filtered test-day frame; it is
# row-aligned with test0 / test1.
df = test0.copy()

# Encoders are fitted on the training period only.
one_hot = OneHotEncoder().fit(train[['os_id','country_id']])
target = TargetEncoder().fit(train[['zone_id','banner_id']], train['clicks'])

def feature_engineering(data: pd.DataFrame) -> pd.DataFrame:
    """Build the model's feature frame from raw impression rows.

    The result contains, in this order: `hour` (taken from the datetime
    index), target-encoded `zone_id` and `banner_id`, the `clicks` column,
    and the one-hot encoded `os_id` / `country_id` columns. The encoding is
    done by the module-level fitted encoders `target` and `one_hot`.

    Args:
        data: frame indexed by timestamp with the columns `zone_id`,
            `banner_id`, `os_id`, `country_id` and `clicks`.

    Returns:
        A new frame with the features and `clicks`; `data` is left untouched.
    """
    # Work on a private copy: the original overwrote zone_id / banner_id in the
    # caller's frame, so a second call on the same frame would have fed already
    # encoded values back into the target encoder.
    data = data.copy()
    data['hour'] = data.index.hour
    data[['zone_id','banner_id']] = target.transform(data[['zone_id','banner_id']])
    encoded_categoricals = one_hot.transform(data[['os_id','country_id']])
    return pd.concat(
        [data[['hour','zone_id','banner_id','clicks']], encoded_categoricals],
        axis=1,
    )

# Fit the click model on the engineered training features.
train = feature_engineering(train)
y_train = train['clicks'].values
X_train = train.drop(columns='clicks').values

model = LogisticRegression(max_iter=300)
model.fit(X_train, y_train)

# Log-odds of a click for the test rows with the original banner_id ...
test0 = feature_engineering(test0)
X_test0 = test0.drop(columns='clicks').values
test0_pred = logit(model.predict_proba(X_test0)[:, 1])

# ... and for the same rows with banner_id replaced by banner_id1.
test1 = feature_engineering(test1)
X_test1 = test1.drop(columns='clicks').values
test1_pred = logit(model.predict_proba(X_test1)[:, 1])

In [4]:
# pi_0 = P(X > 0) for X ~ Normal(mean=E, std=D), where E is the difference of
# the logged coeff_sum values and D combines g0 and g1 in quadrature.
E = df['coeff_sum0'] - df['coeff_sum1']
D = np.sqrt(df['g0'] ** 2 + df['g1'] ** 2)
pi_0 = norm.sf(0, loc=E, scale=D)

In [5]:
# Show pi_0 rounded to three decimals.
pi_0.round(3)

array([1.   , 1.   , 0.   , ..., 0.   , 0.485, 0.495])

In [6]:
# Same probability as pi_0, but the mean is the difference of the model's
# predicted log-odds for the two banners; D is reused unchanged.
E = test0_pred - test1_pred
pi_1 = norm.sf(0, loc=E, scale=D)

In [7]:
# Show pi_1 rounded to three decimals.
pi_1.round(3)

array([0.998, 1.   , 0.788, ..., 0.015, 0.654, 0.5  ])

### Расчёт CIPS

In [8]:
l = 10  # default upper bound for the importance weights pi_1 / pi_0


def cips(pi_0, pi_1, clicks=None, clip=None):
    """Clipped inverse propensity score (CIPS) estimate.

    Computes ``mean(clicks * min(pi_1 / pi_0, clip))``.

    Args:
        pi_0: array-like of probabilities in the denominator of the weight.
        pi_1: array-like of probabilities in the numerator of the weight.
        clicks: array-like of observed rewards, row-aligned with `pi_0` and
            `pi_1`. Defaults to the module-level ``df.clicks.values``.
        clip: upper bound for the weights. Defaults to the module-level ``l``.

    Returns:
        The estimate as a float.
    """
    if clicks is None:
        clicks = df.clicks.values
    if clip is None:
        clip = l

    rewards = np.asarray(clicks, dtype=float)

    # Where pi_0 == 0 the ratio is inf (pi_1 > 0) or nan (pi_1 == 0).
    # nan_to_num maps nan to 0 and inf to a huge finite value, which the clip
    # then reduces to `clip`. Silence the expected warnings locally instead of
    # relying on a global warnings filter.
    with np.errstate(divide='ignore', invalid='ignore'):
        weights = np.asarray(pi_1, dtype=float) / np.asarray(pi_0, dtype=float)
    weights = np.minimum(np.nan_to_num(weights), clip)

    return (rewards * weights).sum() / rewards.shape[0]

# CIPS for the model fitted above (LogisticRegression with max_iter=300).
cips(pi_0, pi_1)

0.07617955653069858

### Оптимизация регуляризации
Максимизируем CIPS по коэффициенту регуляризации `C`.

In [9]:
# Refit the model for each value of C and report the resulting CIPS score.
for reg_coef in np.arange(0.5, 4, 0.5):
    print(f'Fitting with reg coef: {reg_coef}')

    model = LogisticRegression(C=reg_coef, max_iter=300)
    model.fit(X_train, y_train)

    # Predicted log-odds for the original banner and for banner_id1.
    test0_pred = logit(model.predict_proba(X_test0)[:, 1])
    test1_pred = logit(model.predict_proba(X_test1)[:, 1])
    E = test0_pred - test1_pred

    pi_1 = norm.sf(0, loc=E, scale=D)
    score = cips(pi_0, pi_1)
    print(f'CIPS score: {score}\n')

Fitting with reg coef: 0.5
CIPS score: 0.07627148335876843

Fitting with reg coef: 1.0
CIPS score: 0.07617955653069858

Fitting with reg coef: 1.5
CIPS score: 0.07635101042247992

Fitting with reg coef: 2.0
CIPS score: 0.0762508382038136

Fitting with reg coef: 2.5
CIPS score: 0.0762213526337213

Fitting with reg coef: 3.0
CIPS score: 0.07627470265935617

Fitting with reg coef: 3.5
CIPS score: 0.07626234108080153



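Наилучшее значение CIPS (0.07635) достигается при C=1.5, однако различия между рассмотренными значениями C проявляются лишь в четвёртом знаке после запятой.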