In [2]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Columns present in the raw log that this notebook deliberately leaves out.
not_to_use = ['campaign_clicks']

df = pd.read_csv('../../data/data.csv')
df = df.drop(columns=not_to_use)
# Parse the timestamp text with an explicit format (fractional seconds included).
df['date_time'] = pd.to_datetime(df['date_time'], format='%Y-%m-%d %H:%M:%S.%f')

df.head()

Unnamed: 0,date_time,zone_id,banner_id,oaid_hash,os_id,country_id,banner_id0,rate0,g0,coeff_sum0,banner_id1,rate1,g1,coeff_sum1,impressions,clicks
0,2021-09-27 00:01:30,0,0,5664530014561852622,0,0,1240,0.067,0.035016,-7.268846,0,0.01,0.049516,-5.369901,1,1
1,2021-09-26 22:54:49,1,1,5186611064559013950,0,1,1,0.002,0.054298,-2.657477,269,0.004,0.031942,-4.44922,1,1
2,2021-09-26 23:57:20,2,2,2215519569292448030,0,0,2,0.014,0.014096,-3.824875,21,0.014,0.014906,-3.939309,1,1
3,2021-09-27 00:04:30,3,3,6262169206735077204,1,1,3,0.012,0.015232,-3.461357,99,0.006,0.050671,-3.418403,1,1
4,2021-09-27 00:06:21,4,4,4778985830203613115,1,0,4,0.019,0.051265,-4.009026,11464230,6.79,0.032005,-2.828797,1,1


In [3]:
# Build the time keys in a single chain: every new column is added through
# `assign`, which returns a fresh frame, instead of being written onto the
# result of a boolean `.loc` filter (the pattern that raises pandas'
# SettingWithCopyWarning).
df = (
    df
    .assign(date=lambda frame: frame['date_time'].dt.date)
    # Keep rows dated strictly after 2021-09-10 whose banner_id equals banner_id0.
    .loc[lambda frame: (
        (frame['date'] > datetime(2021, 9, 10).date())
        & (frame['banner_id'] == frame['banner_id0'])
    )]
    .assign(
        hour=lambda frame: frame['date_time'].dt.hour,
        # Lookup keys used to join historical aggregates onto each row.
        # NOTE(review): last_date is the calendar date 24h earlier while
        # last_hour is the hour-of-day 1h earlier; joining on both together
        # selects "yesterday at the previous hour", not the immediately
        # preceding hour -- confirm this is intended.
        last_date=lambda frame: (frame['date_time'] - timedelta(days=1)).dt.date,
        last_hour=lambda frame: (frame['date_time'] - timedelta(hours=1)).dt.hour,
    )
)

# features

Посчитаем те же фичи, что и в первом ДЗ, но теперь добавим аналогичные фичи для banner_id1 и уберём campaign_clicks.

In [4]:
def _aggregate_feature_stats(df, time_keys, feature, period):
    """Aggregate impressions/clicks per (time_keys..., feature) group.

    Returns one row per group with the time key columns renamed to
    ``last_<key>`` and two statistics columns:
    ``{feature}_last_{period}_actions`` (count of non-null ``impressions``)
    and ``{feature}_last_{period}_conversion`` (mean of ``clicks``).
    """
    renames = {key: f'last_{key}' for key in time_keys}
    renames['actions'] = f'{feature}_last_{period}_actions'
    renames['conversion'] = f'{feature}_last_{period}_conversion'
    return (
        df
        .groupby([*time_keys, feature], as_index=False)
        .agg(actions=('impressions', 'count'), conversion=('clicks', 'mean'))
        .rename(columns=renames)
    )


def _as_banner_id1_stats(stats):
    """Re-key banner_id statistics so they can be joined on ``banner_id1``."""
    renames = {
        column: f'{column}_banner_id1'
        for column in stats.columns
        if column.startswith('banner_id_last_')
    }
    renames['banner_id'] = 'banner_id1'
    return stats.rename(columns=renames)


def add_feature_by_time(df, feature, use_banner_id_1=False):
    """Attach historical per-day and per-hour statistics of ``feature`` to ``df``.

    Statistics are computed from ``df`` itself, grouped by ``date`` (and
    ``hour``), and then left-joined back using the ``last_date`` /
    ``last_hour`` columns of each row as the lookup key.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``date``, ``hour``, ``last_date``, ``last_hour``,
        ``impressions``, ``clicks`` and the ``feature`` column (plus
        ``banner_id1`` when ``use_banner_id_1`` is set).
    feature : str
        Column whose historical statistics are added.
    use_banner_id_1 : bool, default False
        When True, the statistics computed for ``banner_id`` are looked up by
        the value in ``banner_id1`` and the new columns get a
        ``_banner_id1`` suffix. Only valid with ``feature='banner_id'``.

    Returns
    -------
    pandas.DataFrame
        ``df`` with four extra columns; rows without history get NaN.

    Raises
    ------
    ValueError
        If ``use_banner_id_1`` is requested for a feature other than
        ``'banner_id'`` (previously this surfaced as an opaque merge KeyError).
    """
    if use_banner_id_1 and feature != 'banner_id':
        raise ValueError(
            "use_banner_id_1=True is only supported for feature='banner_id', "
            f"got feature={feature!r}"
        )

    daily_stats = _aggregate_feature_stats(df, ['date'], feature, 'date')
    hourly_stats = _aggregate_feature_stats(df, ['date', 'hour'], feature, 'hour')

    join_column = feature
    if use_banner_id_1:
        join_column = 'banner_id1'
        daily_stats = _as_banner_id1_stats(daily_stats)
        hourly_stats = _as_banner_id1_stats(hourly_stats)

    # Left joins keep every row of df; the aggregates are unique per key, so
    # the row count is unchanged.
    return (
        df
        .merge(daily_stats, on=['last_date', join_column], how='left')
        .merge(hourly_stats, on=['last_date', 'last_hour', join_column], how='left')
    )

# Banner statistics are attached twice: once keyed by the row's banner_id and
# once keyed by banner_id1 (columns suffixed with `_banner_id1`).
df_merged = add_feature_by_time(df, 'banner_id')
df_merged = add_feature_by_time(df_merged, 'banner_id', use_banner_id_1=True)

# The remaining categorical columns get the same historical statistics.
for feature_name in ('zone_id', 'os_id', 'country_id'):
    df_merged = add_feature_by_time(df_merged, feature_name)

In [5]:
# One-hot encode the two categorical id columns.
os_ids, country_ids = (
    pd.get_dummies(df_merged[column], prefix=column)
    for column in ('os_id', 'country_id')
)

# Missing values (rows with no history to join) are replaced by 0 before the
# dummy columns are appended side by side.
frames_to_join = [df_merged.fillna(0), os_ids, country_ids]
df_for_train = pd.concat(frames_to_join, axis=1)

In [6]:
# Persist the engineered feature table (without the index) and preview it.
df_for_train.to_csv(path_or_buf='data.csv', index=False)

df_for_train.head(n=5)

Unnamed: 0,date_time,zone_id,banner_id,oaid_hash,os_id,country_id,banner_id0,rate0,g0,coeff_sum0,...,country_id_7,country_id_8,country_id_9,country_id_10,country_id_11,country_id_12,country_id_13,country_id_14,country_id_15,country_id_16
0,2021-09-26 22:54:49,1,1,5186611064559013950,0,1,1,0.002,0.054298,-2.657477,...,0,0,0,0,0,0,0,0,0,0
1,2021-09-26 23:57:20,2,2,2215519569292448030,0,0,2,0.014,0.014096,-3.824875,...,0,0,0,0,0,0,0,0,0,0
2,2021-09-27 00:04:30,3,3,6262169206735077204,1,1,3,0.012,0.015232,-3.461357,...,0,0,0,0,0,0,0,0,0,0
3,2021-09-27 00:06:21,4,4,4778985830203613115,1,0,4,0.019,0.051265,-4.009026,...,0,0,0,0,0,0,0,0,0,0
4,2021-09-27 00:06:50,5,5,2377014068362699676,2,2,5,0.004,0.337634,-3.222757,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# Target, hour of day, and for every categorical feature the previous-day /
# previous-hour action counts and conversion rates.
train_cols = ['clicks', 'hour'] + [
    f'{feature}_last_{period}_{stat}'
    for feature in ('banner_id', 'zone_id', 'os_id', 'country_id')
    for period in ('date', 'hour')
    for stat in ('actions', 'conversion')
]

In [5]:
# Everything dated before 2021-10-02 is used for fitting.
is_train_row = df_for_train['date'] < datetime(2021, 10, 2).date()
train = df_for_train.loc[is_train_row, train_cols]

# model

In [6]:
from sklearn.linear_model import LogisticRegression

from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline

In [32]:
# Min-max scaling followed by an L2-penalised logistic regression with
# balanced class weights.
classifier = LogisticRegression(penalty='l2', class_weight='balanced', n_jobs=-1, max_iter=100)
model = Pipeline(steps=[
    ('scaler', MinMaxScaler()),
    ('predictor', classifier),
])

train_features = train.drop(columns='clicks')
train_target = train['clicks']
model.fit(train_features, train_target);

In [8]:
import joblib
# Serialise the fitted pipeline next to the notebook.
joblib.dump(value=model, filename='./model.pkl')

['./model.pkl']

# predictions

In [10]:
# Hold out 2021-10-02 for evaluation. `.copy()` makes `test` an independent
# frame: later cells add columns to it, and without the copy those writes go
# to a slice of df_for_train and trigger SettingWithCopyWarning.
test = df_for_train.loc[df_for_train['date'] == datetime(2021, 10, 2).date()].copy()

# Model inputs are the training columns minus the target, in the same order.
feature_cols = [col for col in train_cols if col != 'clicks']

test_banner_0 = test[feature_cols]

# For banner 1, read the `_banner_id1` variants of the banner statistics but
# present them under the column names the model was trained with.
banner_1_source = {
    (col + '_banner_id1' if col.startswith('banner_id') else col): col
    for col in feature_cols
}
test_banner_1 = test[list(banner_1_source)].rename(columns=banner_1_source)

# Positive-class (click) probability for each of the two candidate banners.
preds_0 = model.predict_proba(test_banner_0)[:, 1]
preds_1 = model.predict_proba(test_banner_1)[:, 1]

# cips

In [11]:
from scipy.stats import norm
from scipy.special import logit

In [30]:
# Derivation: https://stats.stackexchange.com/questions/50501/probability-of-one-random-variable-being-greater-than-another/431484
# NOTE(review): this treats coeff_sum0/coeff_sum1 as means and g0/g1 as
# standard deviations of two independent normals -- confirm against the data spec.
mu0 = test['coeff_sum0'] - test['coeff_sum1']
sigma = np.sqrt(test['g0'].pow(2) + test['g1'].pow(2))

# sf = 1 - cdf, so this is P(difference > 0) for a N(mu0, sigma^2) difference.
test['pi_0'] = norm.sf(-mu0 / sigma)

In [31]:
# Map the predicted click probabilities to log-odds.
for score_column, probabilities in (('preds_0', preds_0), ('preds_1', preds_1)):
    test[score_column] = logit(probabilities)

# Same "one normal exceeds the other" probability as for pi_0, with the
# model's log-odds as means and the previously computed sigma.
mu1 = test['preds_0'] - test['preds_1']

test['pi_1'] = norm.sf(-mu1 / sigma)

In [29]:
def cips(df, lambda_=10):
    """Clipped inverse-propensity-score estimate computed on ``df``.

    Each row's ``clicks`` value is weighted by ``pi_1 / pi_0`` capped at
    ``lambda_``; the weighted clicks are summed and divided by the number of
    rows in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``clicks``, ``pi_0`` and ``pi_1`` columns.
    lambda_ : float, default 10
        Upper bound applied to the importance weights.

    Returns
    -------
    float
        The clipped IPS value.
    """
    # Read the propensities from the frame that was passed in rather than from
    # the notebook-level `test` variable, so the function works on any frame.
    weights = np.minimum(df['pi_1'] / df['pi_0'], lambda_)
    return (df['clicks'] * weights).sum() / df.shape[0]

# Evaluate on the held-out day with the default clipping threshold spelled out.
cips(test, lambda_=10)

0.0797244271372369