In [86]:
import numpy as np
import pandas as pd

import os

import datetime

from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import log_loss

In [84]:
import warnings
warnings.filterwarnings("ignore")

In [89]:
df_ = pd.read_csv('../data.csv')

In [90]:
# убираем ненужные колонки

df_ = df_.drop(columns=['oaid_hash', 'banner_id0', 'rate0', 'g0', 'coeff_sum0',
       'banner_id1', 'rate1', 'g1', 'coeff_sum1'])

In [81]:
def prepare_features(df_, zone_id_dict):
    # убираем impressions - везде 1
    df_.drop(columns=['impressions'], inplace=True) 
    
    # date_time -> день недели, час, выходной ли, месяц
    df_['day_of_week']= pd.to_datetime(df_['date_time']).dt.dayofweek
    df_['is_weekend'] = (df_['day_of_week'] > 4).astype(int)
    df_['hour']= pd.to_datetime(df_['date_time']).dt.hour
    df_['month']= pd.to_datetime(df_['date_time']).dt.month - pd.to_datetime(df_['date_time']).dt.month.min() # чтобы были 0 и 1
    
    # campaign_clicks -> непрерывные значения к категориальному признаку с интервалами: 0, 1, 2-10, 11-20, 20-200, >200
    df_['campaign_clicks_'] = 0
    df_.loc[df_['campaign_clicks'] == 1, 'campaign_clicks_'] = 1
    df_.loc[df_['campaign_clicks'] > 1, 'campaign_clicks_'] = 2
    df_.loc[df_['campaign_clicks'] > 10, 'campaign_clicks_'] = 3
    df_.loc[df_['campaign_clicks'] > 20, 'campaign_clicks_'] = 4
    df_.loc[df_['campaign_clicks'] > 200, 'campaign_clicks_'] = 5  
    
    # zone_id -> 10 классов по доле кликов, если новый, то 11
    df_['zone_id_coded'] = df_['zone_id'].map(zone_id_dict)
    df_['zone_id_coded'] = df_['zone_id_coded'].fillna(11)
    
    # и для 9, и для 10 os_id - только негативные примеры, можно объединить в один класс
    df_.loc[df_['os_id'] == 10]['os_id'] = 9
    # убираем преобразованные фичи
    df_.drop(columns=['zone_id', 'campaign_clicks'], inplace=True)
    
    # для создания интеракций
    df_['banner_str'] = df_['banner_id'].astype(str)
    
    # интеракции между индексом баннера и фичами контекста/пользователя
    sep = '_'
    df_['id_weekend'] = df_['banner_str'] + sep + df_['is_weekend'].astype(str)
    df_['id_day'] = df_['banner_str'] + sep + df_['day_of_week'].astype(str)
    df_['id_clicks'] = df_['banner_str'] + sep + df_['campaign_clicks_'].astype(str)
    df_['id_os'] = df_['banner_str'] + sep + df_['os_id'].astype(str)
    df_['id_time'] = df_['banner_str'] + sep + df_['hour'].astype(str)
    df_['id_zone'] = df_['banner_str'] + sep + df_['zone_id_coded'].astype(str)
    df_['id_country'] = df_['banner_str'] + sep + df_['country_id'].astype(str)
    
    return df_

In [82]:
def cv(df):
    # семплируем по 10к событий для каждого дня
    df_['date'] = pd.to_datetime(df_['date_time']).dt.date
    sampled_df = df_[df_['date'] > datetime.date(2021, 9, 1)].groupby('date').sample(n=10000, random_state=0)
    
    # проверяем на предпоследнем дне
    train_sampled_df = sampled_df[sampled_df['date'] < datetime.date(2021, 10, 1)]
    test_sampled_df = sampled_df[sampled_df['date'] == datetime.date(2021, 10, 1)]
    
    # подготовка train и validate
    zone_id_dict_sampled = ((train_sampled_df.groupby('zone_id')['clicks'].mean() * 10 // 1).astype(int)).to_dict()
    
    columns = ['banner_id', 'os_id', 'country_id', 'day_of_week', 'hour', 'month', 'campaign_clicks_',
               'zone_id_coded', 'id_day', 'id_clicks', 'id_os', 'id_time', 'id_zone', 'id_country', 'id_weekend']
    train_sampled_df = prepare_features(train_sampled_df, zone_id_dict_sampled)
    test_sampled_df = prepare_features(test_sampled_df, zone_id_dict_sampled)

    test_X_ = test_sampled_df[columns]
    test_Y_ = test_sampled_df['clicks']

    train_X_ = train_sampled_df[columns]
    train_Y_ = train_sampled_df['clicks']
    
    encoder = OneHotEncoder(handle_unknown='ignore')
    train_sampled_enc = encoder.fit_transform(train_X_)
    test_sampled_enc = encoder.transform(test_X_)
    
    scores = []
    regularisation = [1, 0.75, 0.5, 0.25, 0.1, 0.01, 0.001, 0.0001, 0.00001]
    for C in regularisation:
        clf = LogisticRegression(random_state=0, penalty='l2', solver='liblinear', C=C).fit(train_sampled_enc, train_Y_)
        predicts = clf.predict_proba(test_sampled_enc) 
        res = log_loss(test_Y_, predicts)
        scores.append(res)
    return dict(zip(regularisation, scores))

In [85]:
# подбор регуляризации

cv(df_)

{1: 0.16970188060742192,
 0.75: 0.1672048576526487,
 0.5: 0.16451936893705066,
 0.25: 0.1616862473685213,
 0.1: 0.1600885796619021,
 0.01: 0.1605053563713203,
 0.001: 0.16306107136909306,
 0.0001: 0.20961572437057147,
 1e-05: 0.44383882755278575}

# Тест на 2021-10-02

In [92]:
def eval_on_last_day(df_, C):
    train_df = df_[df_['date_time'] < '2021-10-02 00:00:01.000000']
    test_df = df_[df_['date_time'] >= '2021-10-02 00:00:01.000000']

    zone_id_dict = ((train_df.groupby('zone_id')['clicks'].mean() * 10 // 1).astype(int)).to_dict()

    train_df = prepare_features(train_df, zone_id_dict)
    test_df = prepare_features(test_df, zone_id_dict)

    columns = ['banner_id', 'os_id', 'country_id', 'day_of_week', 'hour', 'month', 'campaign_clicks_',
                   'zone_id_coded', 'id_day', 'id_clicks', 'id_os', 'id_time', 'id_zone', 'id_country', 'id_weekend']

    train_X = train_df[columns]
    train_Y = train_df['clicks']

    test_X = test_df[columns]
    test_Y = test_df['clicks']

    enc = OneHotEncoder(handle_unknown='ignore')
    train_enc = enc.fit_transform(train_X)
    test_enc = enc.transform(test_X)

    clf = LogisticRegression(random_state=0, penalty='l2', solver='liblinear', C=C).fit(train_enc, train_Y)
    
    predicts = clf.predict_proba(test_enc)
    score = log_loss(test_Y, predicts)
    
    return predicts, score

# Результат:

In [93]:
predicts, score = eval_on_last_day(df_, C=0.01)

In [94]:
score

0.14342618853703984

### Другие результаты

In [None]:
# baseline (mean = 0.025334063951591646)-> 0.15486314749253863
# l2, c=1, мало интеракций -> 0.14843151382160424
# l2, c=1, много интеракций -> 0.15472858830794564
# l2, c=0.5, много интеракций -> 0.15041069018638425
# l2, c=0.1, много интеракций -> 0.14518818362859107
# l2, c=0.1, заменила day_of_week на is_weekend -> 0.1458