In [None]:
import pandas as pd
import numpy as np

import json
import pickle

from scipy.special import logit

## Импорт сохранённой линейной модели и препроцессинг

In [10]:
with open('./zone_id_dict.json') as f:
    zone_id_dict = json.load(f)
with open('./log_reg_C_0.01.sav', 'rb') as f:
    model = pickle.load(f)
with open('./one_hot_encoder.sav', 'rb') as f:
    encoder = pickle.load(f)

In [2]:
def prepare_features(df_, zone_id_dict):
    # убираем impressions - везде 1
    df_.drop(columns=['impressions'], inplace=True) 
    
    # date_time -> день недели, час, выходной ли, месяц
    df_['day_of_week']= pd.to_datetime(df_['date_time']).dt.dayofweek
    df_['is_weekend'] = (df_['day_of_week'] > 4).astype(int)
    df_['hour']= pd.to_datetime(df_['date_time']).dt.hour
    df_['month']= pd.to_datetime(df_['date_time']).dt.month - pd.to_datetime(df_['date_time']).dt.month.min() # чтобы были 0 и 1
    
    # campaign_clicks -> непрерывные значения к категориальному признаку с интервалами: 0, 1, 2-10, 11-20, 20-200, >200
    df_['campaign_clicks_'] = 0
    df_.loc[df_['campaign_clicks'] == 1, 'campaign_clicks_'] = 1
    df_.loc[df_['campaign_clicks'] > 1, 'campaign_clicks_'] = 2
    df_.loc[df_['campaign_clicks'] > 10, 'campaign_clicks_'] = 3
    df_.loc[df_['campaign_clicks'] > 20, 'campaign_clicks_'] = 4
    df_.loc[df_['campaign_clicks'] > 200, 'campaign_clicks_'] = 5  
    
    # zone_id -> 10 классов по доле кликов, если новый, то 11
    df_['zone_id_coded'] = df_['zone_id'].map(zone_id_dict)
    df_['zone_id_coded'] = df_['zone_id_coded'].fillna(11)
    
    # и для 9, и для 10 os_id - только негативные примеры, можно объединить в один класс
    df_.loc[df_['os_id'] == 10]['os_id'] = 9
    # убираем преобразованные фичи
    df_.drop(columns=['zone_id', 'campaign_clicks'], inplace=True)
    
    # для создания интеракций
    df_['banner_str'] = df_['banner_id'].astype(str)
    
    # интеракции между индексом баннера и фичами контекста/пользователя
    sep = '_'
    df_['id_weekend'] = df_['banner_str'] + sep + df_['is_weekend'].astype(str)
    df_['id_day'] = df_['banner_str'] + sep + df_['day_of_week'].astype(str)
    df_['id_clicks'] = df_['banner_str'] + sep + df_['campaign_clicks_'].astype(str)
    df_['id_os'] = df_['banner_str'] + sep + df_['os_id'].astype(str)
    df_['id_time'] = df_['banner_str'] + sep + df_['hour'].astype(str)
    df_['id_zone'] = df_['banner_str'] + sep + df_['zone_id_coded'].astype(str)
    df_['id_country'] = df_['banner_str'] + sep + df_['country_id'].astype(str)
    
    return df_

In [41]:
df_ = pd.read_csv('/Users/evgenia/Desktop/data.csv')
df_ = df_[df_['date_time'] >= '2021-10-02 00:00:01.000000']

## Получение предиктов для пары баннеров

In [42]:
df_1 = df_.copy()
df_2 = df_.copy()
df_2['banner_id'] = df_2['banner_id1']

In [43]:
df_1 = prepare_features(df_1, zone_id_dict)
df_2 = prepare_features(df_2, zone_id_dict)

In [44]:
columns = ['banner_id', 'os_id', 'country_id', 'day_of_week', 'hour', 'month', 'campaign_clicks_',
                   'zone_id_coded', 'id_day', 'id_clicks', 'id_os', 'id_time', 'id_zone', 'id_country', 'id_weekend']

X1 = df_1[columns]
X2 = df_2[columns]

In [45]:
X_enc1 = encoder.transform(X1)
X_enc2 = encoder.transform(X2)

In [46]:
df_['coeff_sum0_new'] = logit(model.predict_proba(X_enc1)[:, 0])
df_['coeff_sum1_new'] = logit(model.predict_proba(X_enc2)[:, 0])

## Оценка вероятности того, что одна нормальная величина больше другой

In [26]:
def compare_normal_distr(mu0, sigma0, mu1, sigma1, samples_n=1000):
    diff = np.random.normal(mu0, sigma0, samples_n) - np.random.normal(mu1, sigma1, samples_n)
    return float((diff > 0).sum()) / samples_n

In [71]:
df_.loc[df_['g0'] < 0., 'g0'] = 0.
df_.loc[df_['g1'] < 0., 'g1'] = 0.

In [73]:
df_['pi_0'] = df_.apply(lambda row: compare_normal_distr(row['coeff_sum0'], row['g0'], row['coeff_sum1'], row['g1']), axis=1)
df_['pi_1'] = df_.apply(lambda row: compare_normal_distr(row['coeff_sum0_new'], row['g0'], row['coeff_sum0_new'], row['g1']), axis=1)


In [79]:
lm = 10

## Clipped IPS (lambda=10)

In [82]:
np.mean(df_['clicks'] * np.minimum(lm, df_['pi_0'] / df_['pi_1']))

0.04908145627913838