In [1]:
import random
import itertools

import pandas as pd
import numpy as np
import scipy.sparse
from scipy.sparse import csr_matrix, csc_matrix

In [2]:
data = pd.read_csv('../data/data.csv')
data.head()

Unnamed: 0,date_time,zone_id,banner_id,oaid_hash,campaign_clicks,os_id,country_id,banner_id0,rate0,g0,coeff_sum0,banner_id1,rate1,g1,coeff_sum1,impressions,clicks
0,2021-09-27 00:01:30.000000,0,0,5664530014561852622,0,0,0,1240,0.067,0.035016,-7.268846,0,0.01,0.049516,-5.369901,1,1
1,2021-09-26 22:54:49.000000,1,1,5186611064559013950,0,0,1,1,0.002,0.054298,-2.657477,269,0.004,0.031942,-4.44922,1,1
2,2021-09-26 23:57:20.000000,2,2,2215519569292448030,3,0,0,2,0.014,0.014096,-3.824875,21,0.014,0.014906,-3.939309,1,1
3,2021-09-27 00:04:30.000000,3,3,6262169206735077204,0,1,1,3,0.012,0.015232,-3.461357,99,0.006,0.050671,-3.418403,1,1
4,2021-09-27 00:06:21.000000,4,4,4778985830203613115,0,1,0,4,0.019,0.051265,-4.009026,11464230,6.79,0.032005,-2.828797,1,1


In [3]:
def preprocess_inplace(data: pd.DataFrame):
    # в описании указано что эти столбцы использовать не нужно
    data.drop(columns=['oaid_hash', 'banner_id0', 'banner_id1', 'rate0', 'rate1', 'g0', 'g1', 'coeff_sum0', 'coeff_sum1'], inplace=True)
    data['date_time'] = pd.to_datetime(data['date_time'])
    data.sort_values(by='date_time', inplace=True)
    
preprocess_inplace(data)
data.head()

Unnamed: 0,date_time,zone_id,banner_id,campaign_clicks,os_id,country_id,impressions,clicks
1390198,2021-09-01 00:02:49,30,596,0,0,7,1,0
5041415,2021-09-26 00:00:00,41,29,1,3,0,1,0
1442602,2021-09-26 00:00:00,1,188,2,2,15,1,0
7232498,2021-09-26 00:00:00,17,52,2,2,5,1,0
14938691,2021-09-26 00:00:00,47,73,1,4,13,1,0


In [4]:
from typing import List, Tuple, Union, Optional

Feature = Tuple[List[str], np.array]
CategoricalFeature = Tuple[List[str], csc_matrix]

def categorical_feature_from_series(series: pd.Series) -> CategoricalFeature:
    dummies = pd.get_dummies(series, prefix=series.name, sparse=True)
    return dummies.columns.tolist(), csc_matrix(dummies.sparse.to_coo())

def product_of_categorical_features(
    feature1: CategoricalFeature, feature2: CategoricalFeature
) -> CategoricalFeature:
    names1, feature1 = feature1
    names2, feature2 = feature2
    
    new_names = []
    new_columns = []
    for i, name1 in enumerate(names1):
        for j, name2 in enumerate(names2):
            new_names.append(f"{name1}__{name2}")
            new_columns.append(feature1[:, i].multiply(feature2[:, j]))
    
    return new_names, scipy.sparse.hstack(new_columns)

In [5]:
def feature_engineering(data: pd.DataFrame, feature_products: List[Tuple[str, str]] = []) -> Feature:
    cat_feature_names = ["zone_id", "banner_id", "os_id", "country_id"]
    features = [categorical_feature_from_series(data[name]) for name in cat_feature_names]
    
    cat_feature_dict = dict(zip(cat_feature_names, features))
    for name1, name2 in feature_products:
        product_feature = product_of_categorical_features(cat_feature_dict[name1], cat_feature_dict[name2])
        features.append(product_feature)
    
    features.append((["log_clicks"], csc_matrix(np.log(1 + data.campaign_clicks.values.reshape(-1, 1)))))
    
    # gather everything
    all_names, all_features = [], []
    for names, csc in features:
        all_names.extend(names)
        all_features.append(csc)
    all_features = csr_matrix(scipy.sparse.hstack(all_features))
    
    return all_names, all_features

In [6]:
%time basic_features = feature_engineering(data)
%time more_features = feature_engineering(data, feature_products=[["os_id", "country_id"]])
#%time lots_of_features = feature_engineering(data, feature_products=[["os_id", "country_id"], ["zone_id", "banner_id"]])

CPU times: user 13.9 s, sys: 823 ms, total: 14.7 s
Wall time: 14.7 s
CPU times: user 15.5 s, sys: 1.07 s, total: 16.5 s
Wall time: 16.5 s


In [7]:
answers = csr_matrix(data.clicks.values.reshape(-1, 1))

In [8]:
def last_day_eval_split(data, X, y):
    last_event = data.date_time.iloc[-1]
    day = last_event.day
    month = last_event.month
    year = last_event.year
    events_on_last_day = ((data.date_time.dt.day == day) & (data.date_time.dt.month == month) & (data.date_time.dt.year == year)).sum()
    
    X_train = X[:-events_on_last_day]
    y_train = y[:-events_on_last_day]
    
    X_eval = X[-events_on_last_day:]
    y_eval = y[-events_on_last_day:]
    
    return X_train, y_train, X_eval, y_eval
    
def train_val_split(X, y):
    n = X.shape[0]
    
    val = n // 10
    
    X_train = X[:-val]
    X_val = X[-val:]
    
    y_train = y[:-val]
    y_val = y[-val:]
    
    return X_train, y_train, X_val, y_val

In [9]:
X, y, X_eval, y_eval = last_day_eval_split(data, basic_features[-1], answers)
X_train, y_train, X_val, y_val = train_val_split(X, y)

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss

def create_model(X, y, C=1.0):
    y = y.astype(np.float64).todense().reshape((-1, 1))  # no idea why
    model = LogisticRegression(penalty='l2', C=C, solver='lbfgs')
    model.fit(X, y)
    return model

In [15]:
def cv(data_dict, answers):
    models = {}
    scores = {}
    
    C_grid = [0.00001, 0.0001, 0.001]  # larger do not converge
    for name, features in data_dict.items():
        X, y, X_eval, y_eval = last_day_eval_split(data, features[-1], answers)
        X_train, y_train, X_val, y_val = train_val_split(X, y)
        
        for C in C_grid:
            model = create_model(X_train, y_train, C)
            
            models[(name, C)] = model
            scores[(name, C)] = log_loss(y_val.todense(), model.predict_proba(X_val))
            
    best_score = 10e8
    best_model = None
    best_model_id = None
    for id_, score in scores.items():
        if score < best_score:
            best_score = score
            best_model = models[id_]
            best_model_id = id_
    
    return best_model, best_score, best_model_id

In [16]:
best_model, best_score, (features, C) = cv({
    "base": basic_features,
    "with_interactions": more_features
}, answers)
print(f"Best model chosen with {features} features and C={C}")

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Best model chosen with with_interactions features and C=0.001




In [21]:
X, y, X_eval, y_eval = last_day_eval_split(data, more_features[-1], answers)
logloss_eval = log_loss(y_eval.todense(), best_model.predict_proba(X_eval))

print(f"Log loss on last day data: {logloss_eval}")

Log loss on last day data: 0.1378303725707748


