In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
not_to_use = 'oaid_hash, banner_id0, banner_id1, rate0, rate1, g0, g1, coeff_sum0, coeff_sum1'.split(', ')
df = pd.read_csv('../../data/data.csv').drop(not_to_use, axis=1)
df['date_time'] = pd.to_datetime(df['date_time'], format='%Y-%m-%d %H:%M:%S.%f')

df.head()

Unnamed: 0,date_time,zone_id,banner_id,campaign_clicks,os_id,country_id,impressions,clicks
0,2021-09-27 00:01:30,0,0,0,0,0,1,1
1,2021-09-26 22:54:49,1,1,0,0,1,1,1
2,2021-09-26 23:57:20,2,2,3,0,0,1,1
3,2021-09-27 00:04:30,3,3,0,1,1,1,1
4,2021-09-27 00:06:21,4,4,0,1,0,1,1


# analysis

Посмотрим на данные

In [3]:
# бесполезная фича
df.groupby('impressions').agg(num_actions=('clicks', 'count')).sort_index().head(10)

Unnamed: 0_level_0,num_actions
impressions,Unnamed: 1_level_1
1,15821472


In [4]:
(
    df.os_id.nunique(), 
    df.country_id.nunique(), 
    df.zone_id.nunique(),
    df.banner_id.nunique(),
)

(11, 17, 3444, 1633)

Так как уникальных значений для `os_id` и `country_id` не так много, можно сделать OHE.

Посмотрим на распределение по датам.

In [5]:
df['date'] = df['date_time'].dt.date

df.groupby('date').agg(num_actions=('clicks', 'count')).sort_index().head(10)

Unnamed: 0_level_0,num_actions
date,Unnamed: 1_level_1
2021-09-01,1
2021-09-26,3102610
2021-09-27,2367303
2021-09-28,2307355
2021-09-29,2420588
2021-09-30,1851189
2021-10-01,1643448
2021-10-02,2128978


Видно один аутлаер. Его можно выбросить.

In [6]:
df = df[df['date'] > datetime(2021, 9, 10).date()]

df.head()

Unnamed: 0,date_time,zone_id,banner_id,campaign_clicks,os_id,country_id,impressions,clicks,date
0,2021-09-27 00:01:30,0,0,0,0,0,1,1,2021-09-27
1,2021-09-26 22:54:49,1,1,0,0,1,1,1,2021-09-26
2,2021-09-26 23:57:20,2,2,3,0,0,1,1,2021-09-26
3,2021-09-27 00:04:30,3,3,0,1,1,1,1,2021-09-27
4,2021-09-27 00:06:21,4,4,0,1,0,1,1,2021-09-27


Посмотрим на распределение по часу в сутках.

In [7]:
df['hour'] = df['date_time'].dt.hour

df.groupby('hour').agg(num_actions=('clicks', 'count')).sort_index().head(25)

Unnamed: 0_level_0,num_actions
hour,Unnamed: 1_level_1
0,550787
1,498533
2,482018
3,444033
4,405813
5,474251
6,476654
7,481859
8,547525
9,516624


И на баланс классов.

In [8]:
df.groupby('clicks').agg(num_actions=('impressions', 'count')).sort_index().head(10)

Unnamed: 0_level_0,num_actions
clicks,Unnamed: 1_level_1
0,15399222
1,422249


In [9]:
df.groupby(['clicks', 'date']).agg(num_actions=('impressions', 'count')).sort_index().head(15)

Unnamed: 0_level_0,Unnamed: 1_level_0,num_actions
clicks,date,Unnamed: 2_level_1
0,2021-09-26,3041381
0,2021-09-27,2325714
0,2021-09-28,2262535
0,2021-09-29,2356509
0,2021-09-30,1784718
0,2021-10-01,1574749
0,2021-10-02,2053616
1,2021-09-26,61229
1,2021-09-27,41589
1,2021-09-28,44820


Положительных примеров намного меньше, нужно по-разному штрафовать за некорректный предикт.

# feature engineering

Для каждого из столбцов `os_id`, `zone_id`, `country_id`, `banner_id` сформируем следующие фичи: количество рекламных показов за последний день/час, конверсия в клик за последний день/час.

In [10]:
# вычтем из текущего timestamp для последующего джоина
df['last_date'] = (df['date_time'] - timedelta(days=1)).dt.date

df['last_hour'] = (df['date_time'] - timedelta(hours=1)).dt.hour

In [11]:
def add_feature_by_time(df, feature):
    feature_date = (
        df
        .groupby(['date', feature], as_index=False)
        .agg(
            feature_last_date_actions=('impressions', 'count'), 
            feature_last_date_conversion=('clicks', 'mean')
        )
        .rename(columns={
            'date': 'last_date',
            'feature_last_date_actions': f'{feature}_last_date_actions',
            'feature_last_date_conversion': f'{feature}_last_date_conversion'
        })
    )
    
    feature_date_hour = (
        df
        .groupby(['date', 'hour', feature], as_index=False)
        .agg(
            feature_last_hour_actions=('impressions', 'count'), 
            feature_last_hour_conversion=('clicks', 'mean')
        )
        .rename(columns={
            'date': 'last_date',
            'hour': 'last_hour',
            'feature_last_hour_actions': f'{feature}_last_hour_actions',
            'feature_last_hour_conversion': f'{feature}_last_hour_conversion'
        })
    )
    
    return (
        df
        .merge(
            feature_date,
            on=['last_date', feature],
            how='left'
        )
        .merge(
            feature_date_hour,
            on=['last_date', 'last_hour', feature],
            how='left'
        )
    )

df_merged = add_feature_by_time(df, 'banner_id')
df_merged = add_feature_by_time(df_merged, 'zone_id')
df_merged = add_feature_by_time(df_merged, 'os_id')
df_merged = add_feature_by_time(df_merged, 'country_id')

Посмотрим на получившуюся таблицу и список столбцов.

In [12]:
df_merged.head()

Unnamed: 0,date_time,zone_id,banner_id,campaign_clicks,os_id,country_id,impressions,clicks,date,hour,...,zone_id_last_hour_actions,zone_id_last_hour_conversion,os_id_last_date_actions,os_id_last_date_conversion,os_id_last_hour_actions,os_id_last_hour_conversion,country_id_last_date_actions,country_id_last_date_conversion,country_id_last_hour_actions,country_id_last_hour_conversion
0,2021-09-27 00:01:30,0,0,0,0,0,1,1,2021-09-27,0,...,8537.0,0.004685,703262.0,0.019747,27951.0,0.018032,1001263.0,0.01592,54894.0,0.011713
1,2021-09-26 22:54:49,1,1,0,0,1,1,1,2021-09-26,22,...,,,,,,,,,,
2,2021-09-26 23:57:20,2,2,3,0,0,1,1,2021-09-26,23,...,,,,,,,,,,
3,2021-09-27 00:04:30,3,3,0,1,1,1,1,2021-09-27,0,...,1234.0,0.014587,604068.0,0.01398,22547.0,0.008693,321213.0,0.023688,24937.0,0.016121
4,2021-09-27 00:06:21,4,4,0,1,0,1,1,2021-09-27,0,...,42.0,0.071429,604068.0,0.01398,22547.0,0.008693,1001263.0,0.01592,54894.0,0.011713


In [13]:
df_merged.columns

Index(['date_time', 'zone_id', 'banner_id', 'campaign_clicks', 'os_id',
       'country_id', 'impressions', 'clicks', 'date', 'hour', 'last_date',
       'last_hour', 'banner_id_last_date_actions',
       'banner_id_last_date_conversion', 'banner_id_last_hour_actions',
       'banner_id_last_hour_conversion', 'zone_id_last_date_actions',
       'zone_id_last_date_conversion', 'zone_id_last_hour_actions',
       'zone_id_last_hour_conversion', 'os_id_last_date_actions',
       'os_id_last_date_conversion', 'os_id_last_hour_actions',
       'os_id_last_hour_conversion', 'country_id_last_date_actions',
       'country_id_last_date_conversion', 'country_id_last_hour_actions',
       'country_id_last_hour_conversion'],
      dtype='object')

Добавим упоминавшиеся в начале ноутбука OHE и выбросим нечисловые столбцы. 

In [14]:
cols_for_drop = [
    'date_time',
    'zone_id',
    'banner_id',
    'os_id',
    'country_id',
    'impressions',
    'last_date',
    'last_hour',
    'date'
]

In [15]:
os_ids = pd.get_dummies(df.os_id, prefix='os_id')
country_ids = pd.get_dummies(df.country_id, prefix='country_id')

df_for_train = pd.concat([
    df_merged.fillna(0), 
    os_ids, 
    country_ids
], axis=1)

Возьмем по одному дню на валидацию и тест.

In [16]:
train = df_for_train[df_for_train['date'] < datetime(2021, 10, 2).date()].reset_index()

test = df_for_train[df_for_train['date'] == datetime(2021, 10, 2).date()].drop(cols_for_drop, axis=1)

In [17]:
train_test_split = [
    train[train['date'] < datetime(2021, 10, 1).date()].index, 
    train[train['date'] == datetime(2021, 10, 1).date()].index,
]

train = train.drop(cols_for_drop, axis=1)

# baseline

In [28]:
from sklearn.metrics import log_loss

dummy_predict = [train.clicks.mean()] * len(test)
baseline_score = log_loss(test.clicks.values, dummy_predict)

print(f'Baseline log-loss score: {baseline_score:.4f}')

Baseline log-loss score: 0.1549


# model

In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline

In [3]:
params = {
    'penalty': ['l2', 'none'], # l1 поддерживает только liblinear (не влезает в память) или saga (sgd-like) :(
    'C': [0.5, 0.75, 1, 1.5, 2],
    'class_weight': [None, 'balanced'],
    'n_jobs': [-1]
}

model = Pipeline([
    ('scaler', MinMaxScaler()),
    ('predictor', GridSearchCV(
        LogisticRegression(), 
        scoring='neg_log_loss', 
        param_grid=params,
        cv=[train_test_split]
    ))
])

model.fit(train.drop('clicks', axis=1), train.clicks);

In [4]:
log_loss(test.clicks, model.predict_proba(test.drop('clicks', axis=1).fillna(0))[:, 1])

0.1479