# Предсказание вероятности клика на банер

* Модель logreg с фичами взаимодействий и target encoding
* best_params = {'params': {'C': 0.5, 'max_iter': 300, 'random_state': 1}
* test log_loss with best params =  0.13998072945658324

In [1]:
import pandas as pd

In [37]:
from dateutil import relativedelta
from typing import List
from sklearn.metrics import log_loss

from sklearn.linear_model import LogisticRegression
import numpy as np
import category_encoders as ce
import logging
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import ParameterGrid
import matplotlib.pyplot as plt
from tqdm import tqdm
logging.basicConfig(format='%(asctime)s %(message)s',
                    level=logging.DEBUG)

In [4]:
columns = ['date_time',
 'zone_id',
 'campaign_clicks',
 'os_id',
 'banner_id',
 'impressions',
 'country_id',
 'clicks']

In [5]:
data = pd.read_csv('../data/data.csv',
                   parse_dates=['date_time']
                   , usecols=columns)
data.head()

Unnamed: 0,date_time,zone_id,banner_id,campaign_clicks,os_id,country_id,impressions,clicks
0,2021-09-27 00:01:30,0,0,0,0,0,1,1
1,2021-09-26 22:54:49,1,1,0,0,1,1,1
2,2021-09-26 23:57:20,2,2,3,0,0,1,1
3,2021-09-27 00:04:30,3,3,0,1,1,1,1
4,2021-09-27 00:06:21,4,4,0,1,0,1,1


# EDA

In [6]:
data['date_time'].min(), data['date_time'].max()

(Timestamp('2021-09-01 00:02:49'), Timestamp('2021-10-02 23:59:59'))

In [7]:
all_dates = sorted(data.date_time.dt.date.unique())

In [8]:
all_dates

[datetime.date(2021, 9, 1),
 datetime.date(2021, 9, 26),
 datetime.date(2021, 9, 27),
 datetime.date(2021, 9, 28),
 datetime.date(2021, 9, 29),
 datetime.date(2021, 9, 30),
 datetime.date(2021, 10, 1),
 datetime.date(2021, 10, 2)]

In [9]:
data[data['date_time'].dt.date==all_dates[0]]

Unnamed: 0,date_time,zone_id,banner_id,campaign_clicks,os_id,country_id,impressions,clicks
1390198,2021-09-01 00:02:49,30,596,0,0,7,1,0


In [10]:
data = data[data['date_time'].dt.date!=all_dates[0]]

In [11]:
data.describe()

Unnamed: 0,zone_id,banner_id,campaign_clicks,os_id,country_id,impressions,clicks
count,15821470.0,15821470.0,15821470.0,15821470.0,15821470.0,15821471.0,15821470.0
mean,81.5268,381.6483,0.623854,1.840605,4.346986,1.0,0.02668835
std,163.2448,395.9386,9.249152,1.530005,4.317701,0.0,0.161171
min,0.0,0.0,0.0,0.0,0.0,1.0,0.0
25%,14.0,52.0,0.0,1.0,0.0,1.0,0.0
50%,19.0,217.0,0.0,2.0,4.0,1.0,0.0
75%,60.0,611.0,0.0,3.0,7.0,1.0,0.0
max,3443.0,1632.0,829.0,10.0,16.0,1.0,1.0


In [12]:
data[['zone_id', 'campaign_clicks', 'os_id', 'country_id']].nunique()

zone_id            3444
campaign_clicks     822
os_id                11
country_id           17
dtype: int64

In [13]:
# bcs of const value
data.drop(columns=['impressions'], inplace=True)

In [14]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15821471 entries, 0 to 15821471
Data columns (total 7 columns):
 #   Column           Dtype         
---  ------           -----         
 0   date_time        datetime64[ns]
 1   zone_id          int64         
 2   banner_id        int64         
 3   campaign_clicks  int64         
 4   os_id            int64         
 5   country_id       int64         
 6   clicks           int64         
dtypes: datetime64[ns](1), int64(6)
memory usage: 965.7 MB


In [15]:
data['os_id'] = data['os_id'].astype(np.int8)
data['country_id'] = data['country_id'].astype(np.int8)
data['banner_id'] = data['banner_id'].astype(np.int16)
data['zone_id'] = data['zone_id'].astype(np.int16)
data['campaign_clicks'] = data['campaign_clicks'].astype(np.int16)
data['clicks'] = data['clicks'].astype(np.int8)

In [16]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15821471 entries, 0 to 15821471
Data columns (total 7 columns):
 #   Column           Dtype         
---  ------           -----         
 0   date_time        datetime64[ns]
 1   zone_id          int16         
 2   banner_id        int16         
 3   campaign_clicks  int16         
 4   os_id            int8          
 5   country_id       int8          
 6   clicks           int8          
dtypes: datetime64[ns](1), int16(3), int8(3)
memory usage: 377.2 MB


In [17]:
test_mask = all_dates[-1]
test = data[data.date_time.dt.date==test_mask]

train_mask = all_dates[-2]
train = data[data.date_time.dt.date==train_mask]

In [18]:
sorted(train.date_time.dt.date.unique())

[datetime.date(2021, 10, 1)]

In [19]:
test.date_time.min()

Timestamp('2021-10-02 00:00:00')

In [20]:
train.banner_id.nunique()

1080

In [21]:
# часть банеров из теста не присутствует в трейне, но 88% совпадает
len(set(test.banner_id).intersection(set(train.banner_id))), test.banner_id.nunique(), len(set(test.banner_id.unique()).intersection(set(train.banner_id.unique())))/test.banner_id.nunique()

(800, 914, 0.87527352297593)

# Process data

In [22]:
cat_cols = ['zone_id', 'banner_id', 'os_id', 'country_id']
for col in cat_cols:
    data[col] = data[col].astype(str)

In [23]:


def get_feature_zone_id_country_id(df:pd.DataFrame) -> pd.DataFrame:
    return df['zone_id'] + df['country_id']

def get_feature_zone_id_os_id(df:pd.DataFrame) -> pd.DataFrame:
    return df['zone_id'] + df['os_id']

def get_feature_os_id_country_id(df:pd.DataFrame) -> pd.DataFrame:
    return df['os_id'] + df['country_id']

def get_feature_os_id_hour(df:pd.DataFrame) -> pd.DataFrame:
    return df['os_id'] + df['hour']

def get_feature_os_id_hour_country_id(df:pd.DataFrame) -> pd.DataFrame:
    return df['os_id'] + df['hour'] + df['country_id']

def get_feature_os_id_zone_id_country_id(df:pd.DataFrame) -> pd.DataFrame:
    return df['os_id'] + df['zone_id'] + df['country_id']

def get_feature_hour_zone_id_country_id(df:pd.DataFrame) -> pd.DataFrame:
    return df['zone_id'] + df['hour'] + df['country_id']

def get_feature_banner_id_country_id(df:pd.DataFrame) -> pd.DataFrame:
    return df['banner_id'] + df['country_id']

def get_feature_banner_id_zone_id(df:pd.DataFrame) -> pd.DataFrame:
    return df['banner_id'] + df['zone_id']

def get_feature_banner_id_os_id(df:pd.DataFrame) -> pd.DataFrame:
    return df['banner_id'] + df['os_id']

def get_feature_banner_id_hour(df:pd.DataFrame) -> pd.DataFrame:
    return df['banner_id'] + df['hour']

def get_transform_campaign_clicks(df:pd.DataFrame) -> pd.DataFrame:
    return np.where(df.campaign_clicks==0, '0',
                 np.where(df.campaign_clicks>100, '>100',
                     np.where(df.campaign_clicks>50, '>50<=100',
                         np.where(df.campaign_clicks>10, '>10<=50', '>0<=10'))))

def get_feature_os_id_transform_campaign_clicks(df:pd.DataFrame) -> pd.DataFrame:
    return df['os_id'] + df['transform_campaign_clicks']

def get_feature_zone_id_transform_campaign_clicks(df:pd.DataFrame) -> pd.DataFrame:
    return df['zone_id'] + df['transform_campaign_clicks']

def get_feature_country_id_transform_campaign_clicks(df:pd.DataFrame) -> pd.DataFrame:
    return df['country_id'] + df['transform_campaign_clicks']

def get_feature_hour_transform_campaign_clicks(df:pd.DataFrame) -> pd.DataFrame:
    return df['hour'] + df['transform_campaign_clicks']

def get_hour(df: pd.DataFrame) -> pd.DataFrame:
    return df.date_time.dt.hour.astype(str)

def feature_preparation_pipeline(df: pd.DataFrame) -> pd.DataFrame:
    df['hour'] = get_hour(df)
    df['zone_id_country_id'] = get_feature_zone_id_os_id(df)
    df['zone_id_os_id'] = get_feature_zone_id_country_id(df)
    df['os_id_country_id'] = get_feature_os_id_country_id(df)
    df['os_id_hour'] = get_feature_os_id_hour(df)
    df['os_id_hour_country_id'] = get_feature_os_id_hour_country_id(df)
    df['os_id_zone_id_country_id'] = get_feature_os_id_zone_id_country_id(df)
    df['hour_zone_id_country_id'] = get_feature_hour_zone_id_country_id(df)
    df['banner_id_country_id'] = get_feature_banner_id_country_id(df)
    df['banner_id_zone_id'] = get_feature_banner_id_zone_id(df)
    df['banner_id_os_id'] = get_feature_banner_id_os_id(df)
    df['banner_id_os_hour'] = get_feature_banner_id_hour(df)
    df['transform_campaign_clicks'] = get_transform_campaign_clicks(df)
    df['os_id_transform_campaign_clicks'] = get_feature_os_id_transform_campaign_clicks(df)
    df['zone_id_transform_campaign_clicks'] = get_feature_zone_id_transform_campaign_clicks(df)
    df['country_id_transform_campaign_clicks'] = get_feature_country_id_transform_campaign_clicks(df)
    df['hour_transform_campaign_clicks'] = get_feature_hour_transform_campaign_clicks(df)
    return df


In [24]:
%%time
data = feature_preparation_pipeline(data)
data.columns

CPU times: user 43.9 s, sys: 27.1 s, total: 1min 10s
Wall time: 1min 27s


Index(['date_time', 'zone_id', 'banner_id', 'campaign_clicks', 'os_id',
       'country_id', 'clicks', 'hour', 'zone_id_country_id', 'zone_id_os_id',
       'os_id_country_id', 'os_id_hour', 'os_id_hour_country_id',
       'os_id_zone_id_country_id', 'hour_zone_id_country_id',
       'banner_id_country_id', 'banner_id_zone_id', 'banner_id_os_id',
       'banner_id_os_hour', 'transform_campaign_clicks',
       'os_id_transform_campaign_clicks', 'zone_id_transform_campaign_clicks',
       'country_id_transform_campaign_clicks',
       'hour_transform_campaign_clicks'],
      dtype='object')

In [25]:
all_dates

[datetime.date(2021, 9, 1),
 datetime.date(2021, 9, 26),
 datetime.date(2021, 9, 27),
 datetime.date(2021, 9, 28),
 datetime.date(2021, 9, 29),
 datetime.date(2021, 9, 30),
 datetime.date(2021, 10, 1),
 datetime.date(2021, 10, 2)]

In [26]:
features = ['zone_id', 'campaign_clicks', 'os_id', 'banner_id',
           'country_id', 'zone_id_country_id', 'zone_id_os_id',
           'os_id_country_id', 'hour', 'os_id_hour',
           'os_id_hour_country_id', 'os_id_zone_id_country_id',
           'hour_zone_id_country_id', 'banner_id_country_id', 'banner_id_zone_id',
           'banner_id_os_id', 'banner_id_os_hour', 'transform_campaign_clicks',
           'os_id_transform_campaign_clicks', 'zone_id_transform_campaign_clicks',
           'country_id_transform_campaign_clicks',
           'hour_transform_campaign_clicks']

target_enc_features = ['zone_id', 'os_id', 'banner_id',
           'country_id', 'zone_id_country_id', 'zone_id_os_id',
           'os_id_country_id', 'hour', 'os_id_hour',
           'os_id_hour_country_id', 'os_id_zone_id_country_id',
           'hour_zone_id_country_id', 'banner_id_country_id', 'banner_id_zone_id',
           'banner_id_os_id', 'banner_id_os_hour', 'transform_campaign_clicks',
           'os_id_transform_campaign_clicks', 'zone_id_transform_campaign_clicks',
           'country_id_transform_campaign_clicks',
           'hour_transform_campaign_clicks']


In [27]:
def data_transformation(data: pd.DataFrame,
                        train_date: object,
                        predict_date: object,
                        features: List,
                        target_enc_features: List) -> List:
    
    logging.info(f'start data trasformation')
    cat_transformer = ce.target_encoder.TargetEncoder(cols=target_enc_features)
    ss = StandardScaler()
    
    tr_mask= data.date_time.dt.date==train_date
    val_mask = data.date_time.dt.date==predict_date
    tr_ = data[tr_mask]
    tr_X = tr_[features]
    tr_y = tr_['clicks']

    logging.info(f'fit target encoder')
    tr_X = cat_transformer.fit_transform(tr_X, tr_y)
    tr_X['campaign_clicks'] = np.log(tr_X['campaign_clicks']+ 1)
    logging.info(f'fit standard scaller')
    tr_X['campaign_clicks'] = ss.fit_transform(tr_X)
    logging.info(f'tr_X.shape is {tr_X.shape}')

    val_ = data[val_mask]
    val_X = val_[features]
    val_y = val_['clicks']
    val_X = cat_transformer.transform(val_X)
    val_X['campaign_clicks'] = np.log(val_X['campaign_clicks'] + 1)
    val_X['campaign_clicks'] = ss.transform(val_X)
    logging.info(f'val_X.shape is {val_X.shape}')

    
    return [(tr_X, tr_y), (val_X, val_y)]


In [28]:
X_y_validation = data_transformation(data, all_dates[-3], all_dates[-2], features, target_enc_features)

2021-11-05 18:49:39,895 start data trasformation
2021-11-05 18:54:49,382 fit target encoder
2021-11-05 19:40:41,640 fit standard scaller
2021-11-05 19:40:44,414 tr_X.shape is (1851189, 22)
2021-11-05 19:52:58,926 val_X.shape is (1643448, 22)


In [29]:
X_y_test = data_transformation(data, all_dates[-2], all_dates[-1], features, target_enc_features)

2021-11-05 19:54:09,982 start data trasformation
2021-11-05 19:55:44,379 fit target encoder
2021-11-05 20:40:31,337 fit standard scaller
2021-11-05 20:40:33,539 tr_X.shape is (1643448, 22)
2021-11-05 20:53:33,338 val_X.shape is (2128978, 22)


# Train and eval models

In [72]:
def train_eval_model(x_y_data, model, params):
    logging.info(f"params={params}")
    model = model(**params)
    tr_X, tr_y = x_y_data[0]
    model.fit(tr_X, tr_y)
    val_X, val_y = x_y_data[1]
    tr_preds = model.predict_proba(tr_X)
    val_preds = model.predict_proba(val_X)
    preds_train = log_loss(tr_y, tr_preds)
    preds_val = log_loss(val_y, val_preds)
    logging.info(f"train log_loss={preds_train}")
    logging.info(f"val log_loss={preds_val}")
    importances = pd.Series(model.coef_.reshape(-1), index=tr_X.columns).sort_values(ascending=False)
#     logging.info(f"feature importances={importances}")
    return {"params": params, "train_score": preds_train, "val_score": preds_val}
    

In [62]:
params = {'C': np.arange(0.5, 6, 0.25),
          'random_state': [1],
          'max_iter': range(100, 1000, 200)}
params = ParameterGrid(params)

model = LogisticRegression

In [63]:
import gc
gc.collect()

244

In [64]:
results = []
for param in params:
    results.append(train_eval_model(X_y_validation, model, param))

2021-11-05 23:50:42,412 params={'C': 0.5, 'max_iter': 100, 'random_state': 1}
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
2021-11-05 23:51:05,089 train log_loss=0.13004898263909
2021-11-05 23:51:05,090 val log_loss=0.15907214016883073
2021-11-05 23:51:05,098 params={'C': 0.5, 'max_iter': 300, 'random_state': 1}
2021-11-05 23:52:02,229 train log_loss=0.13002862725272493
2021-11-05 23:52:02,230 val log_loss=0.15894319623049835
2021-11-05 23:52:02,235 params={'C': 0.5, 'max_iter': 500, 'random_state': 1}
2021-11-05 23:52:53,408 train log_loss=0.13002862725272493
2021-11-05 23:52:53,409 val log_loss=0.15894319623049835
2021-11-05 23:52:53,415 params={'C': 0.5, 'max

2021-11-06 00:14:06,368 train log_loss=0.12996854250971007
2021-11-06 00:14:06,369 val log_loss=0.15974284880003656
2021-11-06 00:14:06,375 params={'C': 1.75, 'max_iter': 300, 'random_state': 1}
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
2021-11-06 00:15:17,940 train log_loss=0.12992725935436547
2021-11-06 00:15:17,941 val log_loss=0.1595242045655199
2021-11-06 00:15:17,948 params={'C': 1.75, 'max_iter': 500, 'random_state': 1}
2021-11-06 00:16:39,657 train log_loss=0.12992485986488975
2021-11-06 00:16:39,658 val log_loss=0.15952255392498463
2021-11-06 00:16:39,664 params={'C': 1.75, 'max_iter': 700, 'random_state': 1}
2021-11-06 00:17:59,337 train log_loss=0.

2021-11-06 00:36:04,104 params={'C': 2.75, 'max_iter': 500, 'random_state': 1}
2021-11-06 00:37:24,330 train log_loss=0.12991577302177676
2021-11-06 00:37:24,331 val log_loss=0.15964533080675766
2021-11-06 00:37:24,337 params={'C': 2.75, 'max_iter': 700, 'random_state': 1}
2021-11-06 00:38:47,793 train log_loss=0.12991577302177676
2021-11-06 00:38:47,794 val log_loss=0.15964533080675766
2021-11-06 00:38:47,802 params={'C': 2.75, 'max_iter': 900, 'random_state': 1}
2021-11-06 00:40:10,131 train log_loss=0.12991577302177676
2021-11-06 00:40:10,132 val log_loss=0.15964533080675766
2021-11-06 00:40:10,136 params={'C': 3.0, 'max_iter': 100, 'random_state': 1}
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression

2021-11-06 01:01:40,134 train log_loss=0.12996862838437542
2021-11-06 01:01:40,135 val log_loss=0.15973692935928352
2021-11-06 01:01:40,145 params={'C': 4.0, 'max_iter': 300, 'random_state': 1}
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
2021-11-06 01:02:45,555 train log_loss=0.1299130030167316
2021-11-06 01:02:45,556 val log_loss=0.15967354927853863
2021-11-06 01:02:45,561 params={'C': 4.0, 'max_iter': 500, 'random_state': 1}
2021-11-06 01:04:04,771 train log_loss=0.12991072290520336
2021-11-06 01:04:04,771 val log_loss=0.1597099870417482
2021-11-06 01:04:04,776 params={'C': 4.0, 'max_iter': 700, 'random_state': 1}
2021-11-06 01:05:30,567 train log_loss=0.1299

2021-11-06 01:23:56,418 train log_loss=0.1299489990098209
2021-11-06 01:23:56,419 val log_loss=0.15977491614312084
2021-11-06 01:23:56,424 params={'C': 5.0, 'max_iter': 300, 'random_state': 1}
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
2021-11-06 01:25:09,374 train log_loss=0.12991137070919834
2021-11-06 01:25:09,375 val log_loss=0.1597363122252091
2021-11-06 01:25:09,380 params={'C': 5.0, 'max_iter': 500, 'random_state': 1}
2021-11-06 01:26:44,279 train log_loss=0.12990940159874226
2021-11-06 01:26:44,280 val log_loss=0.15975015247832788
2021-11-06 01:26:44,285 params={'C': 5.0, 'max_iter': 700, 'random_state': 1}
2021-11-06 01:28:15,102 train log_loss=0.1299

In [65]:
results_df = pd.DataFrame(results)

In [71]:
results_df.sort_values('val_score', ascending=True).head()

Unnamed: 0,params,train_score,val_score
1,"{'C': 0.5, 'max_iter': 300, 'random_state': 1}",0.130029,0.158943
2,"{'C': 0.5, 'max_iter': 500, 'random_state': 1}",0.130029,0.158943
3,"{'C': 0.5, 'max_iter': 700, 'random_state': 1}",0.130029,0.158943
4,"{'C': 0.5, 'max_iter': 900, 'random_state': 1}",0.130029,0.158943
0,"{'C': 0.5, 'max_iter': 100, 'random_state': 1}",0.130049,0.159072


In [69]:
best_params = {'C': 0.5, 'max_iter': 300, 'random_state': 1} 

In [73]:
train_eval_model(X_y_test, model, best_params)

2021-11-06 10:41:15,917 params={'C': 0.5, 'max_iter': 300, 'random_state': 1}
2021-11-06 10:41:50,650 train log_loss=0.1427875464240435
2021-11-06 10:41:50,651 val log_loss=0.13998072945658324


{'params': {'C': 0.5, 'max_iter': 300, 'random_state': 1},
 'train_score': 0.1427875464240435,
 'val_score': 0.13998072945658324}