# Подсчет OPE метрики clipped ips на последнем дне. В качестве модели для оценки - линейная модель из первого задания.

Входные параметры:
* lambda = 10
* ревард равен 1 если в наблюдении был клик и 0 в противном случае

Обучение/валидация на предыдущем/текущем дне, т.к. если брать данные для обучения за больший период, то они вносят больше шума и метрики падают

* Модель logreg с фичами взаимодействий и target encoding
* best_params = {'params': {'C': 0.5, 'max_iter': 300, 'random_state': 1}
* test log_loss with best params =  0.13998072945658324

In [138]:
import pandas as pd
from functools import partial
from scipy.special import logit
from scipy.stats import norm

In [2]:
from typing import List

from sklearn.linear_model import LogisticRegression
import numpy as np
import category_encoders as ce
import logging

logging.basicConfig(format='%(asctime)s %(message)s',
                    level=logging.DEBUG)

In [10]:
columns = ['date_time', 'zone_id', 'banner_id',
       'os_id', 'country_id', 'banner_id0', 'rate0', 'g0', 'coeff_sum0',
       'banner_id1', 'rate1', 'g1', 'coeff_sum1', 'clicks']

In [11]:
len(columns)

15

In [13]:
data = pd.read_csv('../../data/data.csv',
                   parse_dates=['date_time']
                   , usecols=columns)
data.head()

Unnamed: 0,date_time,zone_id,banner_id,campaign_clicks,os_id,country_id,banner_id0,rate0,g0,coeff_sum0,banner_id1,rate1,g1,coeff_sum1,clicks
0,2021-09-27 00:01:30,0,0,0,0,0,1240,0.067,0.035016,-7.268846,0,0.01,0.049516,-5.369901,1
1,2021-09-26 22:54:49,1,1,0,0,1,1,0.002,0.054298,-2.657477,269,0.004,0.031942,-4.44922,1
2,2021-09-26 23:57:20,2,2,3,0,0,2,0.014,0.014096,-3.824875,21,0.014,0.014906,-3.939309,1
3,2021-09-27 00:04:30,3,3,0,1,1,3,0.012,0.015232,-3.461357,99,0.006,0.050671,-3.418403,1
4,2021-09-27 00:06:21,4,4,0,1,0,4,0.019,0.051265,-4.009026,11464230,6.79,0.032005,-2.828797,1


# Data cleaning

In [14]:
data['date_time'].min(), data['date_time'].max()

(Timestamp('2021-09-01 00:02:49'), Timestamp('2021-10-02 23:59:59'))

In [15]:
all_dates = sorted(data.date_time.dt.date.unique())

In [17]:
data = data[data['date_time'].dt.date>=all_dates[-3]]

In [16]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15821471 entries, 0 to 15821471
Data columns (total 7 columns):
 #   Column           Dtype         
---  ------           -----         
 0   date_time        datetime64[ns]
 1   zone_id          int16         
 2   banner_id        int16         
 3   campaign_clicks  int16         
 4   os_id            int8          
 5   country_id       int8          
 6   clicks           int8          
dtypes: datetime64[ns](1), int16(3), int8(3)
memory usage: 377.2 MB


In [19]:
cat_cols = ['zone_id', 'banner_id', 'os_id', 'country_id']
for col in cat_cols:
    data[col] = data[col].astype(str)

In [30]:
(data.banner_id0.astype(str)==data.banner_id).mean()

0.8845184103108055

In [69]:
data = data[data.banner_id0.astype(str)==data.banner_id]
data2 = data.copy()
data2['banner_id'] = data2['banner_id1']

# Process data

In [32]:
def get_feature_zone_id_country_id(df:pd.DataFrame) -> pd.DataFrame:
    return df['zone_id'] + df['country_id']

def get_feature_zone_id_os_id(df:pd.DataFrame) -> pd.DataFrame:
    return df['zone_id'] + df['os_id']

def get_feature_os_id_country_id(df:pd.DataFrame) -> pd.DataFrame:
    return df['os_id'] + df['country_id']

def get_feature_os_id_hour(df:pd.DataFrame) -> pd.DataFrame:
    return df['os_id'] + df['hour']

def get_feature_os_id_hour_country_id(df:pd.DataFrame) -> pd.DataFrame:
    return df['os_id'] + df['hour'] + df['country_id']

def get_feature_os_id_zone_id_country_id(df:pd.DataFrame) -> pd.DataFrame:
    return df['os_id'] + df['zone_id'] + df['country_id']

def get_feature_hour_zone_id_country_id(df:pd.DataFrame) -> pd.DataFrame:
    return df['zone_id'] + df['hour'] + df['country_id']

def get_feature_banner_id_country_id(df:pd.DataFrame) -> pd.DataFrame:
    return df['banner_id'] + df['country_id']

def get_feature_banner_id_zone_id(df:pd.DataFrame) -> pd.DataFrame:
    return df['banner_id'] + df['zone_id']

def get_feature_banner_id_os_id(df:pd.DataFrame) -> pd.DataFrame:
    return df['banner_id'] + df['os_id']

def get_feature_banner_id_hour(df:pd.DataFrame) -> pd.DataFrame:
    return df['banner_id'] + df['hour']

def get_hour(df: pd.DataFrame) -> pd.DataFrame:
    return df.date_time.dt.hour.astype(str)

def feature_preparation_pipeline(df: pd.DataFrame) -> pd.DataFrame:
    df['hour'] = get_hour(df)
    df['zone_id_country_id'] = get_feature_zone_id_os_id(df)
    df['zone_id_os_id'] = get_feature_zone_id_country_id(df)
    df['os_id_country_id'] = get_feature_os_id_country_id(df)
    df['os_id_hour'] = get_feature_os_id_hour(df)
    df['os_id_hour_country_id'] = get_feature_os_id_hour_country_id(df)
    df['os_id_zone_id_country_id'] = get_feature_os_id_zone_id_country_id(df)
    df['hour_zone_id_country_id'] = get_feature_hour_zone_id_country_id(df)
    df['banner_id_country_id'] = get_feature_banner_id_country_id(df)
    df['banner_id_zone_id'] = get_feature_banner_id_zone_id(df)
    df['banner_id_os_id'] = get_feature_banner_id_os_id(df)
    df['banner_id_os_hour'] = get_feature_banner_id_hour(df)
    return df


In [33]:
%%time
data = feature_preparation_pipeline(data)
data2 = feature_preparation_pipeline(data2)

CPU times: user 14.2 s, sys: 984 ms, total: 15.2 s
Wall time: 15.6 s


Index(['date_time', 'zone_id', 'banner_id', 'campaign_clicks', 'os_id',
       'country_id', 'banner_id0', 'rate0', 'g0', 'coeff_sum0', 'banner_id1',
       'rate1', 'g1', 'coeff_sum1', 'clicks', 'hour', 'zone_id_country_id',
       'zone_id_os_id', 'os_id_country_id', 'os_id_hour',
       'os_id_hour_country_id', 'os_id_zone_id_country_id',
       'hour_zone_id_country_id', 'banner_id_country_id', 'banner_id_zone_id',
       'banner_id_os_id', 'banner_id_os_hour', 'transform_campaign_clicks',
       'os_id_transform_campaign_clicks', 'zone_id_transform_campaign_clicks',
       'country_id_transform_campaign_clicks',
       'hour_transform_campaign_clicks'],
      dtype='object')

In [24]:
features = ['zone_id', 'os_id', 'banner_id',
           'country_id', 'zone_id_country_id', 'zone_id_os_id',
           'os_id_country_id', 'hour', 'os_id_hour',
           'os_id_hour_country_id', 'os_id_zone_id_country_id',
           'hour_zone_id_country_id', 'banner_id_country_id', 'banner_id_zone_id',
           'banner_id_os_id', 'banner_id_os_hour']

target_enc_features = ['zone_id', 'os_id', 'banner_id',
           'country_id', 'zone_id_country_id', 'zone_id_os_id',
           'os_id_country_id', 'hour', 'os_id_hour',
           'os_id_hour_country_id', 'os_id_zone_id_country_id',
           'hour_zone_id_country_id', 'banner_id_country_id', 'banner_id_zone_id',
           'banner_id_os_id', 'banner_id_os_hour']

info_features = [ 'banner_id0','rate0',
                  'g0','coeff_sum0',
                  'banner_id1','rate1',
                  'g1','coeff_sum1',]

In [159]:
def data_transformation(data: pd.DataFrame,
                        data2: pd.DataFrame,
                        train_date: object,
                        predict_date: object,
                        features: List,
                        target_enc_features: List) -> List:
    
    logging.info(f'start data trasformation')
    cat_transformer = ce.target_encoder.TargetEncoder(cols=target_enc_features)
    
    tr_mask= data.date_time.dt.date==train_date
    val_mask = data.date_time.dt.date==predict_date
    tr_ = data[tr_mask]
    tr_X = tr_[features + info_features]
    tr_y = tr_['clicks']

    logging.info(f'fit target encoder')
    tr_X = cat_transformer.fit_transform(tr_X, tr_y)
    logging.info(f'tr_X.shape is {tr_X.shape}')

    val1 = data[val_mask]
    val1_X = val1[features + info_features]
    val_y = val1['clicks']
    val1_X = cat_transformer.transform(val1_X)
    logging.info(f'val1_X.shape is {val1_X.shape}')

    val2 = data2[val_mask]
    val2_X = val2[features + info_features]
    val2_X = cat_transformer.transform(val2_X)
    logging.info(f'val2_X.shape is {val2_X.shape}')
    
    return {'train_x': tr_X, 'train_y': tr_y, 'val1_x': val1_X, 'val2_x': val2_X, 'val_y': val_y}


In [161]:
X_y_validation = data_transformation(data, data2, all_dates[-3], all_dates[-2], features, target_enc_features)

2021-12-11 18:28:06,346 start data trasformation
2021-12-11 18:28:55,188 fit target encoder
2021-12-11 18:30:25,157 tr_X.shape is (1640889, 30)
2021-12-11 18:30:57,587 val1_X.shape is (1442740, 30)
2021-12-11 18:31:27,150 val2_X.shape is (1442740, 30)


In [162]:
X_y_test = data_transformation(data, data2, all_dates[-2], all_dates[-1], features, target_enc_features)

2021-12-11 18:31:30,504 start data trasformation
2021-12-11 18:31:40,945 fit target encoder
2021-12-11 18:33:21,116 tr_X.shape is (1442740, 30)
2021-12-11 18:34:05,092 val1_X.shape is (1890562, 30)
2021-12-11 18:34:44,447 val2_X.shape is (1890562, 30)


# Train and eval models

In [166]:
class CIPS:
    """
    calculate cips
    """
    def __init__(self, df: pd.DataFrame, target_df: pd.Series, preds1: np.array, preds2: np.array, lmbda: int=10):
        self.df = df
        self.target_df = target_df
        self.preds1 = preds1
        self.preds2 = preds2
        self.lmbda = lmbda
        self.std = None
        self.pi_0 = None
        self.pi_1 = None

    def get_pi_0(self):
        np.random.seed(1)
        np.random.normal(self.df.coeff_sum0)
        new_mean = self.df.coeff_sum0 - self.df.coeff_sum1
        self.std = np.sqrt(self.df.g0**2 + self.df.g1**2)
        self.pi_0 = 1 - norm.cdf(new_mean / self.std)

    def get_pi_1(self):
        new_mean= self.preds1 - self.preds2
        self.pi_1 = 1 - norm.cdf(new_mean / self.std)

    def get_cips(self) -> float:
        return 1 / len(self.df) * (self.target_df.values * np.minimum(np.nan_to_num(self.pi_1 / self.pi_0), self.lmbda)).sum()


In [167]:
def find_best_cips_pipe(model: object,
                        params: list,
                        train_X: pd.DataFrame,
                        train_y: pd.Series, 
                        val_X: pd.DataFrame, 
                        val_X2: pd.DataFrame, 
                        val_y: pd.Series, 
                        features: list):
    result = {}
    for reg in params:
        model_ = model(C=reg)
        model_.fit(train_X[features], train_y)
        preds_val = logit(model_.predict_proba(val_X[features])[:, 1])
        preds_val2 = logit(model_.predict_proba(val_X2[features])[:, 1])

        val_cips = CIPS(val_X, val_y, preds_val, preds_val2)
        val_cips.get_pi_0()
        val_cips.get_pi_1()
        cips_res = val_cips.get_cips()
        result[reg] = cips_res
        logging.info(f"current C={reg} and cips={cips_res}")
    return result


In [168]:
model = LogisticRegression
model = partial(model, random_state=1, n_jobs=-1, max_iter=300)

params = np.arange(0.5, 1.4, 0.2)
len(params)

5

In [169]:
results = find_best_cips_pipe(model=model,
                              params=params,
                              train_X=X_y_validation['train_x'],
                              train_y=X_y_validation['train_y'], 
                              val_X=X_y_validation['val1_x'], 
                              val_X2=X_y_validation['val2_x'], 
                              val_y=X_y_validation['val_y'], 
                              features=features)

  return 1 / len(self.df) * (self.target_df.values * np.minimum(np.nan_to_num(self.pi_1 / self.pi_0), self.lmbda)).sum()
  return 1 / len(self.df) * (self.target_df.values * np.minimum(np.nan_to_num(self.pi_1 / self.pi_0), self.lmbda)).sum()
2021-12-11 18:42:06,562 current C=0.5 and cips=0.22150944434601125
  return 1 / len(self.df) * (self.target_df.values * np.minimum(np.nan_to_num(self.pi_1 / self.pi_0), self.lmbda)).sum()
  return 1 / len(self.df) * (self.target_df.values * np.minimum(np.nan_to_num(self.pi_1 / self.pi_0), self.lmbda)).sum()
2021-12-11 18:45:53,904 current C=0.7 and cips=0.22113744956420558
  return 1 / len(self.df) * (self.target_df.values * np.minimum(np.nan_to_num(self.pi_1 / self.pi_0), self.lmbda)).sum()
  return 1 / len(self.df) * (self.target_df.values * np.minimum(np.nan_to_num(self.pi_1 / self.pi_0), self.lmbda)).sum()
2021-12-11 18:49:44,687 current C=0.8999999999999999 and cips=0.2208085165486101
  return 1 / len(self.df) * (self.target_df.values * np.min

In [174]:
sorted(results.items(), key=lambda x: -x[1])[:10]

[(0.5, 0.22150944434601125),
 (0.7, 0.22113744956420558),
 (0.8999999999999999, 0.2208085165486101),
 (1.0999999999999999, 0.22068759264967383),
 (1.2999999999999998, 0.22043636590354704)]

In [175]:
best_c = sorted(results.items(), key=lambda x: -x[1])[0][0]
best_c

0.5

In [176]:
model_ = LogisticRegression(random_state=1, n_jobs=-1, max_iter=300, C=best_c)
model_.fit(X_y_test['train_x'][features], X_y_test['train_y'])
preds_test = logit(model_.predict_proba(X_y_test['val1_x'][features])[:, 1])
preds_test2 = logit(model_.predict_proba(X_y_test['val2_x'][features])[:, 1])

test_cips = CIPS(X_y_test['val1_x'], X_y_test['val_y'], preds_test, preds_test2)
test_cips.get_pi_0()
test_cips.get_pi_1()
test_cips.get_cips()

  return 1 / len(self.df) * (self.target_df.values * np.minimum(np.nan_to_num(self.pi_1 / self.pi_0), self.lmbda)).sum()
  return 1 / len(self.df) * (self.target_df.values * np.minimum(np.nan_to_num(self.pi_1 / self.pi_0), self.lmbda)).sum()


0.19616456588430053