In [1]:
import numpy as np
import pandas as pd
from itertools import combinations
from sklearn.metrics import log_loss
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit


def analysis(data: pd.DataFrame):
    print('clicks distribution:', data['clicks'].value_counts().to_dict())
    for col in ['zone_id', 'banner_id', 'os_id', 'country_id', 'impressions', 'clicks']:
        n_unique = data[col].nunique()
        print(f'{col}: \t {data[col].unique() if n_unique <= 20 else n_unique} unique values')


def feature_engineering(data: pd.DataFrame) -> pd.DataFrame:
    data['date_time'] = pd.to_datetime(data['date_time'])
    data = data.sort_values('date_time').reset_index(drop=True)

    seconds_in_day = 24 * 60 * 60
    seconds = (data['date_time'].astype(int) // 10 ** 9) % seconds_in_day
    data['sin_time'] = np.sin(2 * np.pi * seconds / seconds_in_day)
    data['cos_time'] = np.cos(2 * np.pi * seconds / seconds_in_day)

    # счетчики кликов для разных категориальных фичей
    id_columns = ['zone_id', 'banner_id', 'os_id', 'country_id']
    for col in id_columns:
        data[f'{col}_cum_clicks'] = data.groupby(col)['clicks'].cumsum()
        data[f'{col}_cum_impressions'] = data.groupby(col)['impressions'].cumsum()
        data[f'{col}_ctr'] = (data[f'{col}_cum_clicks'] - data['clicks']) / (
                data[f'{col}_cum_impressions'] - 1)
        data.loc[data[f'{col}_cum_impressions'] == 1, f'{col}_ctr'] = np.nan

    # кросс-счетчики кликов для разных категориальных фичей
    for col1, col2 in combinations(['zone_id', 'banner_id', 'os_id', 'country_id'], r=2):
        col = f'{col1}_{col2}'
        data[f'{col}_cum_clicks'] = data.groupby([col1, col2])['clicks'].cumsum()
        data[f'{col}_cum_impressions'] = data.groupby([col1, col2])['impressions'].cumsum()
        data[f'{col}_ctr'] = (data[f'{col}_cum_clicks'] - data['clicks']) / (
                data[f'{col}_cum_impressions'] - 1)
        data.loc[data[f'{col}_cum_impressions'] == 1, f'{col}_ctr'] = np.nan

    # заполняем ctr для первых в истории взаимодействий средним значением
    for col in data:
        data[col].fillna(data[col].mean(skipna=True), inplace=True)

    drop_columns = ['oaid_hash', 'banner_id0', 'banner_id1', 'rate0', 'rate1', 'g0', 'g1',
                    'coeff_sum0', 'coeff_sum1']
    return data.drop(drop_columns, axis=1)


def get_x_y(data: pd.DataFrame) -> tuple:
    drop_columns = ['date_time', 'zone_id', 'banner_id', 'os_id', 'country_id', 'impressions']
    return data.drop(drop_columns + ['clicks'], axis=1), data['clicks']


def cv(data: pd.DataFrame, sample_size: int = 100_000) -> dict:
    sample_data = pd.concat([
        data[data['clicks'] == 0].sample(n=sample_size),
        data[data['clicks'] == 1].sample(n=sample_size)]).sort_values('date_time').reset_index(drop=True)
    X_train, y_train = get_x_y(sample_data)

    param_grid = {'C': np.linspace(1, 50, 100), 'solver': ['newton-cg']}

    # в cv используем сплиты по времени
    grid = GridSearchCV(LogisticRegression(), param_grid, n_jobs=-1,
                        verbose=10, scoring="neg_log_loss", cv=TimeSeriesSplit(3))
    grid.fit(X_train, y_train)

    display(pd.DataFrame(grid.cv_results_).sort_values('rank_test_score').head())
    return grid.best_params_


def create_model(data: pd.DataFrame, best_params: dict) -> LogisticRegression:
    X_train, y_train = get_x_y(data)
    model = LogisticRegression(**best_params, max_iter=10, n_jobs=-1).fit(X_train, y_train)

    model_importances = sorted(list(zip(X_train.columns, model.coef_[0])), key=lambda x: -abs(x[1]))
    display(pd.DataFrame(model_importances, columns=['feature', 'importance']))

    return model


if __name__ == '__main__':
    data = pd.read_csv('../data/data.csv')
    data = feature_engineering(data)
    analysis(data)

    last_date = data.iloc[-1]['date_time'].normalize()
    test_data = data[data['date_time'].dt.normalize() == last_date]
    train_data = data[data['date_time'].dt.normalize() != last_date]
    print(f'train data size: {len(train_data)}')
    print(f'test  data size: {len(test_data)}')
    del data

    best_params = cv(train_data)
    print(f'best params from gridSearch: {best_params}')
    model = create_model(train_data, best_params)

    X_test, y_test = get_x_y(test_data)
    print(f'best cv model logloss: {log_loss(y_test, model.predict_proba(X_test)[:, 1])}')
    
    print('baseline logloss')
    print('-'*20)
    print(f"mean   logloss: {log_loss(y_test, np.full_like(y_test, train_data['clicks'].mean()))}")
    print(f"random logloss: {log_loss(y_test, np.random.random(y_test.shape))}")

clicks distribution: {0: 15399223, 1: 422249}
zone_id: 	 3444 unique values
banner_id: 	 1633 unique values
os_id: 	 [ 0  3  2  4  1  5  6  9  8  7 10] unique values
country_id: 	 [ 7  0 15  5 13  1 11  3 12  6  9 10  8  4 14  2 16] unique values
impressions: 	 [1] unique values
clicks: 	 [0 1] unique values
train data size: 13692494
test  data size: 2128978
Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 56 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   31.9s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   54.2s
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done  50 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done  69 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done 109 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done 130 tasks      | elapsed:  3.7min
[Parallel(n_jobs=-1)]: Done 153 tasks      | elapsed:  4.3min
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:  4.8min
[Parallel(n_jobs=-1)]: Done 220 out of 300 | elapsed:  5.7min remaining:  2.1min
[Parallel(n_jobs=-1)]: Done 251 out of 300 | elapsed:  6.5min remaining:  1.3min
[Parallel(n_jobs=-1)]: Done 282 out of 300 | elapsed:  7.1min remaining:   27.0s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  7.4mi

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_solver,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
69,81.140707,34.980214,0.075361,0.013205,35.151515,newton-cg,"{'C': 35.151515151515156, 'solver': 'newton-cg'}",-0.667164,-0.647167,-0.702997,-0.672443,0.023096,1
43,80.956703,31.721409,0.075074,0.008678,22.282828,newton-cg,"{'C': 22.282828282828284, 'solver': 'newton-cg'}",-0.663584,-0.635402,-0.732158,-0.677048,0.040631,2
19,66.632465,29.829828,0.061189,0.001846,10.40404,newton-cg,"{'C': 10.404040404040405, 'solver': 'newton-cg'}",-0.663573,-0.650905,-0.721405,-0.678628,0.030687,3
49,84.406377,34.411687,0.066748,0.003863,25.252525,newton-cg,"{'C': 25.252525252525253, 'solver': 'newton-cg'}",-0.666134,-0.646875,-0.722925,-0.678645,0.032283,4
5,93.747219,35.682037,0.084094,0.013359,3.474747,newton-cg,"{'C': 3.474747474747475, 'solver': 'newton-cg'}",-0.664471,-0.644661,-0.72684,-0.678657,0.035017,5


best params from gridSearch: {'C': 35.151515151515156, 'solver': 'newton-cg'}


Unnamed: 0,feature,importance
0,os_id_cum_clicks,1.592679e-05
1,zone_id_cum_clicks,1.397262e-05
2,country_id_cum_clicks,1.196824e-05
3,banner_id_cum_impressions,-1.019763e-05
4,banner_id_country_id_cum_impressions,-5.432586e-06
5,zone_id_country_id_cum_clicks,5.273583e-06
6,os_id_country_id_cum_impressions,5.21508e-06
7,zone_id_os_id_cum_clicks,5.141571e-06
8,os_id_country_id_cum_clicks,5.069006e-06
9,banner_id_country_id_cum_clicks,3.016862e-06


best cv model logloss: 0.21344911650083706
baseline logloss
--------------------
mean   logloss: 1.2226106923947875
random logloss: 0.9999078644339084
