In [None]:
!pip install category_encoders

In [1]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from category_encoders.one_hot import OneHotEncoder
from category_encoders.target_encoder import TargetEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss

In [2]:
# Load only the columns used in this notebook. The *_id columns are read as
# strings (they are fed to the categorical encoders further down) and
# date_time is parsed so it can serve as a time index.
id_dtypes = {col: str for col in ('zone_id', 'banner_id', 'os_id', 'country_id')}
data = pd.read_csv(
    'data.csv',
    usecols=['date_time', 'zone_id', 'banner_id', 'campaign_clicks',
             'os_id', 'country_id', 'clicks'],
    parse_dates=['date_time'],
    dtype={**id_dtypes, 'clicks': int},
)
data.head()

Unnamed: 0,date_time,zone_id,banner_id,campaign_clicks,os_id,country_id,clicks
0,2021-09-27 00:01:30,0,0,0,0,0,1
1,2021-09-26 22:54:49,1,1,0,0,1,1
2,2021-09-26 23:57:20,2,2,3,0,0,1
3,2021-09-27 00:04:30,3,3,0,1,1,1
4,2021-09-27 00:06:21,4,4,0,1,0,1


In [3]:
# Relative frequency of each value of the `clicks` target column.
data.clicks.value_counts(normalize=True)

0    0.973312
1    0.026688
Name: clicks, dtype: float64

In [4]:
# Index the rows by event time and put them in chronological order; the
# date-based selections with .loc further down operate on this index.
data = data.set_index('date_time')
data = data.sort_index()
# Distinct calendar days present in the data.
np.unique(data.index.date)

array([datetime.date(2021, 9, 1), datetime.date(2021, 9, 26),
       datetime.date(2021, 9, 27), datetime.date(2021, 9, 28),
       datetime.date(2021, 9, 29), datetime.date(2021, 9, 30),
       datetime.date(2021, 10, 1), datetime.date(2021, 10, 2)],
      dtype=object)

In [5]:
def split_transform_data(data: pd.DataFrame, smooth=1, *, train_end='2021-09-30',
                         valid_day='2021-10-01', test_day='2021-10-02'):
    """Split the data by date and encode the categorical features.

    Both encoders are fitted on the training slice only and then applied to
    all three slices, so validation and test rows never contribute to the
    learned encodings.

    Parameters
    ----------
    data : pd.DataFrame
        Frame with a datetime index and the columns ``zone_id``,
        ``banner_id``, ``os_id``, ``country_id`` and ``clicks``. The notebook
        sorts the index before calling; presumably the label slice used for
        the training split needs that ordering.
    smooth
        Forwarded to ``TargetEncoder(smoothing=...)``.
    train_end : str, keyword-only
        Upper bound of the training slice, used as ``data.loc[:train_end]``.
    valid_day, test_day : str, keyword-only
        Labels selecting the validation and test slices via ``data.loc[...]``.
        The defaults reproduce the original hard-coded split.

    Returns
    -------
    tuple of pd.DataFrame
        ``(train, valid, test)``. Each frame still contains ``clicks``, gains
        an ``hour`` column, has ``zone_id``/``banner_id`` replaced by their
        target encodings, and has ``os_id``/``country_id`` replaced by the
        one-hot encoder's output columns.
    """
    onehot_cols = ['os_id', 'country_id']
    target_cols = ['zone_id', 'banner_id']

    # Copies keep the in-place column assignments below from touching `data`.
    train_part = data.loc[:train_end].copy()
    valid_part = data.loc[valid_day].copy()
    test_part = data.loc[test_day].copy()

    # Fit on the training slice only.
    one_hot = OneHotEncoder().fit(train_part[onehot_cols])
    target = TargetEncoder(smoothing=smooth).fit(train_part[target_cols], train_part['clicks'])

    def feature_engineering(frame: pd.DataFrame) -> pd.DataFrame:
        """Add the hour feature and apply the fitted encoders to one slice."""
        frame['hour'] = frame.index.hour
        frame[target_cols] = target.transform(frame[target_cols])
        return pd.concat([frame.drop(columns=onehot_cols),
                          one_hot.transform(frame[onehot_cols])], axis=1)

    X_train = feature_engineering(train_part)
    X_valid = feature_engineering(valid_part)
    X_test = feature_engineering(test_part)

    return X_train, X_valid, X_test

In [6]:
def create_model(data: pd.DataFrame, smooth=1, **kwargs):
    """Build the encoded splits and fit a logistic regression on the training one.

    ``smooth`` is passed on to ``split_transform_data``; every other keyword
    argument goes to the ``LogisticRegression`` constructor.

    Returns the fitted model followed by the train, validation and test
    frames (each still containing the ``clicks`` column).
    """
    X_train, X_valid, X_test = split_transform_data(data, smooth)

    # Separate the feature matrix from the target before fitting.
    features = X_train.drop(columns='clicks').values
    labels = X_train.clicks.values

    model = LogisticRegression(**kwargs)
    model.fit(features, labels)
    return model, X_train, X_valid, X_test

In [7]:
def cv(data: pd.DataFrame, smooth=1, **kwargs):
    """Fit one configuration and print its log-loss on the validation split.

    Despite the name, this is a single hold-out evaluation: the model is
    trained once via ``create_model`` and scored on the validation frame it
    returns. The test frame is not scored here, since it plays no part in
    choosing hyperparameters.

    Parameters
    ----------
    data : pd.DataFrame
        Input frame, passed through to ``create_model``.
    smooth
        Passed through to ``create_model``.
    **kwargs
        Passed through to ``create_model``.

    Returns
    -------
    None
        The parameters and the score are printed.
    """
    print(f'Params: smooth={smooth} {" ".join(f"{key}={value}" for key, value in kwargs.items())}')
    model, _, X_valid, _ = create_model(data, smooth, **kwargs)
    valid_score = log_loss(X_valid.clicks.values, model.predict_proba(X_valid.drop(columns='clicks').values))
    print(f'validation score: {valid_score}')
    print('\n')
    
# Hand-picked configurations to compare on the validation split; each dict
# holds the keyword arguments for one cv() run, in the order they are tried.
search_space = (
    {},
    {'smooth': 3},
    {'smooth': 5},
    {'max_iter': 300},
    {'max_iter': 500},
    {'C': 3},
    {'C': 5},
    {'C': 3, 'max_iter': 300},
)
for hyperparams in search_space:
    cv(data, **hyperparams)

Params: smooth=1 
validation score: 0.16658134775772626


Params: smooth=3 
validation score: 0.16687538095334495


Params: smooth=5 
validation score: 0.1681042530650715


Params: smooth=1 max_iter=300
validation score: 0.1650812450711003


Params: smooth=1 max_iter=500
validation score: 0.1651759413651248


Params: smooth=1 C=3
validation score: 0.16535681950093653


Params: smooth=1 C=5
validation score: 0.16619088344426566


Params: smooth=1 C=3 max_iter=300
validation score: 0.16514746004543288




In [8]:
def test(data: pd.DataFrame, **kwargs):
    """Fit one configuration and print its log-loss on the test split.

    The model is trained once via ``create_model`` and scored only on the
    test frame it returns.

    Parameters
    ----------
    data : pd.DataFrame
        Input frame, passed through to ``create_model``.
    **kwargs
        Passed through to ``create_model``.

    Returns
    -------
    None
        The parameters and the score are printed.
    """
    print(f'Params: {" ".join(f"{key}={value}" for key, value in kwargs.items())}')
    model, _, _, X_test = create_model(data, **kwargs)
    test_score = log_loss(X_test.clicks.values, model.predict_proba(X_test.drop(columns='clicks').values))
    print(f'test score: {test_score}')
    
# Final evaluation on the test split with max_iter=300, the configuration
# that produced the lowest validation log-loss in the runs above.
test(data, max_iter=300)

Params: max_iter=300
test score: 0.14498839732100124


