In [2]:
import numpy as np
import pandas as pd
from itertools import combinations
from sklearn.metrics import log_loss
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit


def analysis(data: pd.DataFrame):
    print('clicks distribution:', data['clicks'].value_counts().to_dict())
    for col in ['zone_id', 'banner_id', 'os_id', 'country_id', 'impressions', 'clicks']:
        n_unique = data[col].nunique()
        print(f'{col}: \t {data[col].unique() if n_unique <= 20 else n_unique} unique values')


def feature_engineering(data: pd.DataFrame) -> pd.DataFrame:
    data['date_time'] = pd.to_datetime(data['date_time'])
    data = data.sort_values('date_time').reset_index(drop=True)

    seconds_in_day = 24 * 60 * 60
    seconds = (data['date_time'].astype(int) // 10 ** 9) % seconds_in_day
    data['sin_time'] = np.sin(2 * np.pi * seconds / seconds_in_day)
    data['cos_time'] = np.cos(2 * np.pi * seconds / seconds_in_day)

    # счетчики кликов для разных категориальных фичей
    id_columns = ['zone_id', 'banner_id', 'os_id', 'country_id']
    for col in id_columns:
        data[f'{col}_cum_clicks'] = data.groupby(col)['clicks'].cumsum()
        data[f'{col}_cum_impressions'] = data.groupby(col)['impressions'].cumsum()
        data[f'{col}_ctr'] = (data[f'{col}_cum_clicks'] - data['clicks']) / (
                data[f'{col}_cum_impressions'] - 1)
        data.loc[data[f'{col}_cum_impressions'] == 1, f'{col}_ctr'] = np.nan

    # кросс-счетчики кликов для разных категориальных фичей
    for col1, col2 in combinations(['zone_id', 'banner_id', 'os_id', 'country_id'], r=2):
        col = f'{col1}_{col2}'
        data[f'{col}_cum_clicks'] = data.groupby([col1, col2])['clicks'].cumsum()
        data[f'{col}_cum_impressions'] = data.groupby([col1, col2])['impressions'].cumsum()
        data[f'{col}_ctr'] = (data[f'{col}_cum_clicks'] - data['clicks']) / (
                data[f'{col}_cum_impressions'] - 1)
        data.loc[data[f'{col}_cum_impressions'] == 1, f'{col}_ctr'] = np.nan

    # заполняем ctr для первых в истории взаимодействий средним значением
    for col in data:
        data[col].fillna(data[col].mean(skipna=True), inplace=True)

    drop_columns = ['oaid_hash', 'banner_id0', 'banner_id1', 'rate0', 'rate1', 'g0', 'g1',
                    'coeff_sum0', 'coeff_sum1']
    return data.drop(drop_columns, axis=1)


def get_x_y(data: pd.DataFrame) -> tuple:
    drop_columns = ['date_time', 'zone_id', 'banner_id', 'os_id', 'country_id', 'impressions']
    return data.drop(drop_columns + ['clicks'], axis=1), data['clicks']


def cv(data: pd.DataFrame, sample_size: int = 100_000) -> dict:
    sample_data = pd.concat([
        data[data['clicks'] == 0].sample(n=sample_size),
        data[data['clicks'] == 1].sample(n=sample_size)]).sort_values('date_time').reset_index(drop=True)
    X_train, y_train = get_x_y(sample_data)

    param_grid = {'C': np.linspace(1, 50, 100), 'solver': ['newton-cg']}

    # в cv используем сплиты по времени
    grid = GridSearchCV(LogisticRegression(), param_grid, n_jobs=-1,
                        verbose=10, scoring="neg_log_loss", cv=TimeSeriesSplit(3))
    grid.fit(X_train, y_train)

    display(pd.DataFrame(grid.cv_results_).sort_values('rank_test_score').head())
    return grid.best_params_


def create_model(data: pd.DataFrame, best_params: dict) -> LogisticRegression:
    X_train, y_train = get_x_y(data)
    model = LogisticRegression(**best_params, max_iter=10, n_jobs=-1).fit(X_train, y_train)

    model_importances = sorted(list(zip(X_train.columns, model.coef_)), key=lambda x: -abs(x[1]))
    display(pd.DataFrame(model_importances, columns=['feature', 'importance']))

    return model


if __name__ == '__main__':
    data = pd.read_csv('../data/data.csv')
    data = feature_engineering(data)
    analysis(data)

    last_date = data.iloc[-1]['date_time'].normalize()
    test_data = data[data['date_time'].dt.normalize() == last_date]
    train_data = data[data['date_time'].dt.normalize() != last_date]
    print(f'train data size: {len(train_data)}')
    print(f'test  data size: {len(test_data)}')
    del data

    best_params = cv(train_data)
    print(f'best params from gridSearch: {best_params}')
    model = create_model(train_data, best_params)

    X_test, y_test = get_x_y(test_data)
    print(f'best cv model logloss: {log_loss(y_test, model.predict_proba(X_test)[:, 1])}')
    
    print('baseline logloss')
    print('-'*20)
    print(f"mean   logloss: {log_loss(y_test, np.full_like(y_test, train_data['clicks'].mean()))}")
    print(f"random logloss: {log_loss(y_test, np.random.random(y_test.shape))}")

clicks distribution: {0: 15399223, 1: 422249}
zone_id: 	 3444 unique values
banner_id: 	 1633 unique values
os_id: 	 [ 0  3  2  4  1  5  6  9  8  7 10] unique values
country_id: 	 [ 7  0 15  5 13  1 11  3 12  6  9 10  8  4 14  2 16] unique values
impressions: 	 [1] unique values
clicks: 	 [0 1] unique values
train data size: 13692494
test  data size: 2128978
Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 56 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   35.1s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   46.2s
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done  50 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done  69 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done 109 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 130 tasks      | elapsed:  3.7min
[Parallel(n_jobs=-1)]: Done 153 tasks      | elapsed:  4.2min
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:  4.8min
[Parallel(n_jobs=-1)]: Done 220 out of 300 | elapsed:  5.9min remaining:  2.1min
[Parallel(n_jobs=-1)]: Done 251 out of 300 | elapsed:  6.7min remaining:  1.3min
[Parallel(n_jobs=-1)]: Done 282 out of 300 | elapsed:  7.1min remaining:   27.4s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  7.4mi

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_solver,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
12,82.441536,42.716219,0.084454,0.012204,6.939394,newton-cg,"{'C': 6.9393939393939394, 'solver': 'newton-cg'}",-0.673569,-0.661097,-0.686592,-0.673752,0.010409,1
73,82.25421,36.936773,0.061301,0.001165,37.131313,newton-cg,"{'C': 37.131313131313135, 'solver': 'newton-cg'}",-0.660071,-0.652456,-0.711763,-0.674763,0.026347,2
27,79.970292,33.005676,0.069607,0.002687,14.363636,newton-cg,"{'C': 14.363636363636365, 'solver': 'newton-cg'}",-0.668144,-0.65173,-0.705477,-0.675117,0.022489,3
70,85.010461,35.330784,0.075378,0.007862,35.646465,newton-cg,"{'C': 35.64646464646465, 'solver': 'newton-cg'}",-0.66776,-0.658511,-0.702092,-0.676121,0.018748,4
36,82.01158,34.224191,0.067726,0.003475,18.818182,newton-cg,"{'C': 18.81818181818182, 'solver': 'newton-cg'}",-0.656255,-0.651712,-0.720558,-0.676175,0.031438,5


best params from gridSearch: {'C': 6.9393939393939394, 'solver': 'newton-cg'}


Unnamed: 0,feature,importance
0,campaign_clicks,"[-9.756596570235439e-09, -3.5269325283710806e-..."


best cv model logloss: 0.21344959037381933
baseline logloss
--------------------
mean   logloss: 1.2226106923947875
random logloss: 0.9994340211626352
