In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the raw auction data and convert the `date` column to datetimes in one chain.
df = (
    pd.read_csv('/kaggle/input/real-time-advertisers-auction/Dataset.csv')
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
)
df.head()

In [None]:
# CPM = revenue / measurable impressions * 100 * 1000.
# Rows with zero measurable impressions would divide by zero, so their CPM is forced to 0.
no_measurable_impressions = df['measurable_impressions'] == 0
raw_cpm = df['total_revenue'] / df['measurable_impressions'] * 100 * 1000
df['CPM'] = raw_cpm.mask(no_measurable_impressions, 0)

In [None]:
# Number of distinct values taken by each of these two columns.
df.loc[:, ['revenue_share_percent', 'integration_type_id']].nunique()

In [None]:
# Rebind instead of mutating in place, and tolerate already-dropped columns so
# that re-running this cell does not raise KeyError.
df = df.drop(columns=['revenue_share_percent', 'integration_type_id'], errors='ignore')

In [None]:
# Time-based split: rows dated before 2019-06-22 form the train sample, the rest the test sample.
split_mask = df['date'] < pd.to_datetime('2019-06-22')
train_df, test_df = df[split_mask], df[~split_mask]

# Keep only rows with CPM >= 0.
train_df = train_df[train_df['CPM'] >= 0]
test_df = test_df[test_df['CPM'] >= 0]

# Trim the upper tail: each sample is cut at its own 95th CPM percentile.
train_df = train_df[train_df['CPM'] <= np.percentile(train_df['CPM'], 95)]
test_df = test_df[test_df['CPM'] <= np.percentile(test_df['CPM'], 95)]

# `assign` returns a new frame, so the label is not written into a filtered
# slice (plain column assignment here can trigger SettingWithCopyWarning).
train_df = train_df.assign(sample='train')
test_df = test_df.assign(sample='test')

df = pd.concat([train_df, test_df])

In [None]:
# Display the column index of the combined (train + test) frame.
df.columns

In [None]:
# Categorical columns.
cat_cols = [
    'site_id',
    'ad_type_id',
    'device_category_id',
    'line_item_type_id',
    'os_id',
    'monetization_channel_id',
]
# Identifier columns.
id_cols = ['geo_id', 'advertiser_id', 'order_id', 'ad_unit_id']
# Every discrete column: categorical first, then identifiers.
all_discrete_cols = [*cat_cols, *id_cols]

# Impression counters; `measurable_impressions` is the denominator of CPM,
# so it is left out of the "without targets" variant.
other_cols = ['total_impressions', 'viewable_impressions', 'measurable_impressions']
other_cols_wo_targets = [col for col in other_cols if col != 'measurable_impressions']

target_col = 'CPM'

In [None]:
# Cardinality of every discrete column in the train sample.
train_df.loc[:, all_discrete_cols].nunique()

In [None]:
from sklearn.metrics import mean_squared_error

In [None]:
# Trivial baseline: predict CPM = 0 for every test row.
zero_prediction = np.zeros(len(test_df))
mean_squared_error(test_df[target_col], zero_prediction)

In [None]:
# Constant baseline: predict the train-sample mean CPM for every test row.
train_mean_cpm = np.mean(train_df[target_col])
mean_prediction = np.full(len(test_df), train_mean_cpm)
mean_squared_error(test_df[target_col], mean_prediction)

## Baseline on OHE + LinReg

In [None]:
from sklearn.model_selection import TimeSeriesSplit, cross_validate
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.compose import ColumnTransformer

In [None]:
# Five-fold time-series splitter (only referenced by the commented-out cross-validation cells).
cv = TimeSeriesSplit(n_splits=5)
# Put the train rows in chronological order.
train_df = train_df.sort_values(by='date')

In [None]:
# Baseline: one-hot encode the inputs (categories unseen at fit time are ignored),
# then fit an ordinary least-squares regression.
baseline_steps = [
    ('ohe', OneHotEncoder(handle_unknown='ignore')),
    ('linreg', LinearRegression()),
]
baseline_model = Pipeline(steps=baseline_steps)

In [None]:
# baseline_cv = cross_validate(
#     baseline_model, 
#     train_df[all_discrete_cols], train_df[target_col],
#     cv=cv,
#     scoring='neg_mean_squared_error',
#     n_jobs=-1,
# )

# baseline_cv

In [None]:
# Fit the baseline on the discrete columns of the train sample.
baseline_model.fit(train_df[all_discrete_cols], train_df[target_col])
# Pass (y_true, y_pred) in the order the metric's signature defines, matching
# the other MSE calls in this notebook.
mean_squared_error(test_df[target_col], baseline_model.predict(test_df[all_discrete_cols]))

### MSE = 4569 :good-enough: :pepe-happy:

In [None]:
from matplotlib import pyplot as plt
# Side-by-side CPM histograms for the train and test samples.
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(20, 10))

for axis, frame, label in ((ax1, train_df, 'train'), (ax2, test_df, 'test')):
    frame['CPM'].hist(bins=40, ax=axis)
    axis.set_title(label)

## try to classify CPM == 0

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Classifier stage: one-hot encode the inputs, then logistic regression (SAG solver, up to 500 iterations).
divider_steps = [
    ('ohe', OneHotEncoder(handle_unknown='ignore')),
    ('logreg', LogisticRegression(solver='sag', max_iter=500)),
]
cpm_divider = Pipeline(steps=divider_steps)
# cv_logreg = cross_validate(
#     cpm_divider, 
#     train_df[all_discrete_cols], (train_df[target_col]>0), 
#     scoring=['roc_auc', 'precision', 'recall'],
# )
# cv_logreg

In [None]:
from sklearn.base import BaseEstimator
class CustomPredictor(BaseEstimator):
    """Two-stage regressor: a classifier gates the output of a regressor.

    ``divider`` is fitted to predict whether the target is strictly positive,
    and ``predictor`` is fitted only on the rows whose target is strictly
    positive. At prediction time the regressor output is replaced by 0
    wherever the classifier's positive-class probability is below
    ``threshold``.

    Parameters
    ----------
    divider : estimator
        Must provide ``fit(X, y)`` and ``predict_proba(X)``.
    predictor : estimator
        Must provide ``fit(X, y)`` and ``predict(X)``.
    threshold : float, default=0.5
        Minimum positive-class probability for the regressor output to be
        kept. Exposed as a constructor argument so that ``get_params`` /
        ``set_params`` / ``clone`` see it and it can be tuned.
    """

    def __init__(self, divider, predictor, threshold=0.5):
        self.divider = divider
        self.predictor = predictor
        self.threshold = threshold

    def fit(self, X, y):
        """Fit the classifier on ``y > 0`` and the regressor on the positive rows.

        Returns
        -------
        self
            Returned to follow the scikit-learn estimator convention and to
            allow call chaining.
        """
        cpm_over_zero = y > 0
        self.divider.fit(X, cpm_over_zero)
        self.predictor.fit(X[cpm_over_zero], y[cpm_over_zero])
        return self

    def predict(self, X):
        """Return regressor predictions, zeroed where the classifier score is below the threshold."""
        divider_score = self.divider.predict_proba(X)[:, 1]
        predicted_cpm = self.predictor.predict(X)
        # np.where builds a new array, so the regressor's output is not mutated in place.
        return np.where(divider_score < self.threshold, 0, predicted_cpm)

In [None]:
# Stage 1: classifier deciding whether CPM is positive.
cpm_divider = Pipeline(steps=[
    ('ohe', OneHotEncoder(handle_unknown='ignore')),
    ('logreg', LogisticRegression(solver='sag', max_iter=500)),
])
# Stage 2: regressor for the CPM value itself.
cpm_predictor = Pipeline(steps=[
    ('ohe', OneHotEncoder(handle_unknown='ignore')),
    ('linreg', LinearRegression()),
])

# Combine both stages into the two-step model.
custom = CustomPredictor(divider=cpm_divider, predictor=cpm_predictor)

In [None]:
# custom_cv = cross_validate(
#     custom, 
#     train_df[all_discrete_cols], train_df[target_col], 
#     scoring='neg_mean_squared_error',
# )
# custom_cv

In [None]:
# Fit the two-stage model on the train sample and score it on the test sample.
custom_train_features = train_df[all_discrete_cols]
custom_train_target = train_df[target_col]
custom.fit(custom_train_features, custom_train_target)

custom_test_prediction = custom.predict(test_df[all_discrete_cols])
mean_squared_error(test_df[target_col], custom_test_prediction)

### MSE = 4234 without tuning threshold for logreg

## Feature engineering and boosting

In [None]:
# Chronologically sorted working frame with one engineered ratio feature.
# `total_revenue` is removed because CPM (the target) is computed directly from it.
df_first = (
    df
    .sort_values('date')
    .assign(
        # Share of impressions that were viewable; defined as 0 when there were no impressions.
        view_to_total_impressions=lambda frame: (
            frame['viewable_impressions'] / frame['total_impressions']
        ).mask(frame['total_impressions'] == 0, 0)
    )
    .drop(columns='total_revenue')
)

### Simple boosting

In [None]:
from lightgbm import LGBMRegressor
from sklearn.model_selection import train_test_split

# clip irrelevant predictions
def clip(x):
    """Floor ``x`` at zero (no upper bound) and return the result."""
    lower_bound, upper_bound = 0, None
    return np.clip(x, a_min=lower_bound, a_max=upper_bound)


In [None]:
# Hyperparameters shared by every LightGBM model in this notebook.
params = dict(
    alpha=0.8,
    colsample_bytree=0.6000000000000001,
    learning_rate=0.01,
    n_estimators=1950,
    num_leaves=511,
    subsample=0.6,
    objective='mse',
)
# Fixed seed so that repeated runs build the same model.
model_base = LGBMRegressor(random_state=17, **params)

In [None]:
# Columns that must not be fed to the model: the target, the raw date and the sample label.
non_feature_cols = ['CPM', 'date', 'sample']

is_train = df_first['sample'].eq('train')
is_test = df_first['sample'].eq('test')

new_train = df_first[is_train]
new_test = df_first[is_test]
# Random 80/20 split of the train sample; the 20% part drives early stopping.
train_df, eval_df = train_test_split(new_train, test_size=0.2, random_state=17)

# NOTE(review): newer LightGBM releases (4.x) reportedly no longer accept
# `early_stopping_rounds` / `verbose` in fit() and expect callbacks instead —
# confirm against the installed version.
model_base.fit(
    train_df.drop(columns=non_feature_cols), train_df['CPM'],
    eval_set=(eval_df.drop(columns=non_feature_cols), eval_df['CPM']),
    early_stopping_rounds=20,
    verbose=80,
)
base_test_prediction = clip(model_base.predict(new_test.drop(columns=non_feature_cols)))
mean_squared_error(new_test['CPM'], base_test_prediction)

### MSE 2667

## Add historical features without CPM
### Compute rolling stats

In [None]:
# Numeric columns that the aggregation cells below operate on.
num_columns = ['viewable_impressions', 'measurable_impressions',
               'total_impressions', 'view_to_total_impressions']

In [None]:
# Rolling statistics of the numeric columns, one suffixed column set per statistic.
# NOTE(review): with a window of one row, each statistic is computed over a
# single value — confirm that `rolling(1)` is the intended window.
aggs = ['mean', 'sum', 'median']
agg_df = pd.concat(
    [
        df_first.rolling(1)[num_columns].agg(agg).add_suffix(f'_{agg}')
        for agg in aggs
    ],
    axis='columns',
)
agg_df.head()

In [None]:
from tqdm.auto import tqdm
from collections import defaultdict
from itertools import product

def compute_agg_by_cat_cols(df, cols_to_agg):
    """Build trailing-window statistics of ``cols_to_agg`` keyed by the ID columns.

    Despite its name, this function groups by the module-level ``id_cols``:
    first by all of them together, then by each one on its own.

    For every distinct date ``d`` in ``df['date']`` and every window size
    ``w`` in 1..7, the rows belonging to the ``w`` distinct dates immediately
    before ``d`` (``d`` itself excluded) are aggregated with ``'sum'``,
    ``'mean'`` and ``'median'``. A date with fewer than ``w`` earlier dates
    is skipped for that window size.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``'date'``, every column of ``id_cols`` and every column
        of ``cols_to_agg``.
    cols_to_agg : list of str
        Columns to aggregate.

    Returns
    -------
    defaultdict
        ``result[agg][window_days]`` is a list holding one DataFrame per
        processed date, in ascending date order. Each frame is indexed by the
        ``id_cols`` group keys, carries the suffixed statistic columns and a
        ``'date'`` column set to the date the window precedes.
    """
    # np.sort returns a sorted copy, so this does not rely on `unique()`
    # handing back an object that has an in-place `.sort()` method.
    dates = np.sort(df['date'].unique())
    agg_names = ['sum', 'mean', 'median']
    window_sizes = range(1, 8)

    group_stats_df = defaultdict(lambda: defaultdict(list))
    for (idx, date), window_days in tqdm(list(product(enumerate(dates), window_sizes))):
        # Not enough history. Skipping explicitly avoids a negative slice
        # start, which would wrap around to the end of `dates`.
        if idx < window_days:
            continue
        window_dates = dates[idx - window_days:idx]

        # The window rows do not depend on the aggregation, so filter once
        # and reuse the result for all three statistics.
        window_df = df[df['date'].isin(window_dates)]
        if window_df.shape[0] == 0:
            continue

        for agg in agg_names:
            # Statistics per full combination of ID columns.
            stat_df = (
                window_df
                .groupby(id_cols)[cols_to_agg]
                .agg(agg)
                .add_suffix(f'_win_{window_days}_{agg}')
            )
            # Statistics per single ID column, attached via the shared index level.
            for id_col in id_cols:
                grouped_by_col = (
                    window_df
                    .groupby(id_col)[cols_to_agg]
                    .agg(agg)
                    .add_suffix(f'_win_{window_days}_{agg}_{id_col}')
                )
                stat_df = stat_df.join(grouped_by_col)
            stat_df['date'] = date
            group_stats_df[agg][window_days].append(stat_df)
    return group_stats_df

def compute_agg_by_id_cols(df, cols_to_agg):
    """Build trailing-window statistics of ``cols_to_agg`` keyed by the categorical columns.

    Despite its name, this function groups by each of the module-level
    ``cat_cols`` separately.

    For every distinct date ``d`` in ``df['date']`` and every window size
    ``w`` in 1..7, the rows belonging to the ``w`` distinct dates immediately
    before ``d`` (``d`` itself excluded) are aggregated with ``'sum'``,
    ``'mean'`` and ``'median'``. A date with fewer than ``w`` earlier dates
    is skipped for that window size.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``'date'``, every column of ``cat_cols`` and every column
        of ``cols_to_agg``.
    cols_to_agg : list of str
        Columns to aggregate.

    Returns
    -------
    defaultdict
        ``result[agg][window_days]`` is a list holding one DataFrame per
        processed date, in ascending date order. Each frame has one row per
        distinct ``cat_cols`` combination seen in the window, the suffixed
        per-column statistics, and a ``'date'`` column set to the date the
        window precedes.
    """
    # np.sort returns a sorted copy, so this does not rely on `unique()`
    # handing back an object that has an in-place `.sort()` method.
    dates = np.sort(df['date'].unique())
    agg_names = ['sum', 'mean', 'median']
    window_sizes = range(1, 8)

    group_stats_df_cat = defaultdict(lambda: defaultdict(list))
    for (idx, date), window_days in tqdm(list(product(enumerate(dates), window_sizes))):
        # Not enough history. Skipping explicitly avoids a negative slice
        # start, which would wrap around to the end of `dates`.
        if idx < window_days:
            continue
        window_dates = dates[idx - window_days:idx]

        # The window rows do not depend on the aggregation, so filter once
        # and reuse the result for all three statistics.
        window_df = df[df['date'].isin(window_dates)][cat_cols + cols_to_agg]
        if window_df.shape[0] == 0:
            continue
        # Distinct categorical combinations observed in the window.
        window_keys = window_df[cat_cols].drop_duplicates()

        for agg in agg_names:
            stat_df = window_keys.copy()
            for id_col in cat_cols:
                grouped_by_col = (
                    window_df
                    .groupby(id_col)[cols_to_agg]
                    .agg(agg)
                    .add_suffix(f'_win_{window_days}_{agg}_{id_col}')
                )
                stat_df = stat_df.join(grouped_by_col, on=id_col)
            stat_df['date'] = date
            group_stats_df_cat[agg][window_days].append(stat_df)
    return group_stats_df_cat

In [None]:
# Windowed statistics over the numeric columns only (no CPM).
cols_to_agg = num_columns

# Note the naming: `compute_agg_by_cat_cols` groups by `id_cols`,
# while `compute_agg_by_id_cols` groups by `cat_cols`.
group_stats_df = compute_agg_by_cat_cols(df=df_first, cols_to_agg=cols_to_agg)
group_stats_df_cat = compute_agg_by_id_cols(df=df_first, cols_to_agg=cols_to_agg)

In [None]:
from itertools import chain
full_concated = df_first

# Attach the 7-day mean statistics keyed by the ID columns and date.
aggs, win_days = ['mean'], [7]
join_cols = [*id_cols, 'date']
for agg in aggs:
    for win_day in win_days:
        stat_df = pd.concat(group_stats_df[agg][win_day]).reset_index().set_index(join_cols)
        full_concated = full_concated.join(stat_df, on=join_cols)

# Attach the 7-day mean statistics keyed by the categorical columns and date.
aggs, win_days = ['mean'], [7]
join_cols = [*cat_cols, 'date']
for agg in aggs:
    for win_day in win_days:
        stat_df = pd.concat(group_stats_df_cat[agg][win_day]).set_index(join_cols)
        full_concated = full_concated.join(stat_df, on=join_cols, how='left')

# Finally add the row-level rolling statistics, aligned on the row index.
full_concated = full_concated.join(agg_df)

In [None]:
# Recover the train / test samples from the enriched frame.
is_train = full_concated['sample'].eq('train')
is_test = full_concated['sample'].eq('test')

new_train = full_concated[is_train]
new_test = full_concated[is_test]
# Random 80/20 split of the train sample; the 20% part is used for early stopping.
train_df, eval_df = train_test_split(new_train, test_size=0.2, random_state=17)

In [None]:
# Fresh regressor with the same seed (17) and the shared `params` dictionary.
model_hist = LGBMRegressor(random_state=17, **params)

In [None]:
# Columns that must not be fed to the model: the target, the raw date and the sample label.
non_feature_cols = ['CPM', 'date', 'sample']

# NOTE(review): newer LightGBM releases (4.x) reportedly no longer accept
# `early_stopping_rounds` / `verbose` in fit() and expect callbacks instead —
# confirm against the installed version.
model_hist.fit(
    train_df.drop(columns=non_feature_cols), train_df['CPM'],
    eval_set=(eval_df.drop(columns=non_feature_cols), eval_df['CPM']),
    early_stopping_rounds=20,
    verbose=80,
)
hist_test_prediction = clip(model_hist.predict(new_test.drop(columns=non_feature_cols)))
mean_squared_error(new_test['CPM'], hist_test_prediction)

### MSE 2720 > 2667 for the model without historical features. Maybe we should add some smoothing or drop the statistics for some categories

## Add historical CPM from previous dates with different window size 
This is OK in a production setting, because on date[i] we already know the CPM for every date[j] with j < i

In [None]:
# Windowed statistics over the numeric columns plus the CPM of earlier dates.
cols_to_agg = [*num_columns, 'CPM']

# Note the naming: `compute_agg_by_cat_cols` groups by `id_cols`,
# while `compute_agg_by_id_cols` groups by `cat_cols`.
group_stats_cpm_df = compute_agg_by_cat_cols(df=df_first, cols_to_agg=cols_to_agg)
group_stats_cpm_cat_df = compute_agg_by_id_cols(df=df_first, cols_to_agg=cols_to_agg)

In [None]:
from itertools import chain
full_concated_cpm = df_first

# Attach the 1-, 2- and 7-day mean statistics keyed by the ID columns and date.
aggs, win_days = ['mean'], [1, 2, 7]
join_cols = [*id_cols, 'date']
for agg in aggs:
    for win_day in win_days:
        stat_df = pd.concat(group_stats_cpm_df[agg][win_day]).reset_index().set_index(join_cols)
        full_concated_cpm = full_concated_cpm.join(stat_df, on=join_cols)

# Attach the 1-, 2- and 7-day mean statistics keyed by the categorical columns and date.
aggs, win_days = ['mean'], [1, 2, 7]
join_cols = [*cat_cols, 'date']
for agg in aggs:
    for win_day in win_days:
        stat_df = pd.concat(group_stats_cpm_cat_df[agg][win_day]).set_index(join_cols)
        full_concated_cpm = full_concated_cpm.join(stat_df, on=join_cols, how='left')

# Finally add the row-level rolling statistics, aligned on the row index.
full_concated_cpm = full_concated_cpm.join(agg_df)

In [None]:
# Recover the train / test samples from the CPM-enriched frame.
is_train = full_concated_cpm['sample'].eq('train')
is_test = full_concated_cpm['sample'].eq('test')

new_train_cpm = full_concated_cpm[is_train]
new_test_cpm = full_concated_cpm[is_test]
# Random 80/20 split of the train sample; the 20% part is used for early stopping.
train_df, eval_df = train_test_split(new_train_cpm, test_size=0.2, random_state=17)

In [None]:
# Columns that must not be fed to the model: the target, the raw date and the sample label.
non_feature_cols = ['CPM', 'date', 'sample']

model_hist_cpm = LGBMRegressor(random_state=17, **params)
# NOTE(review): newer LightGBM releases (4.x) reportedly no longer accept
# `early_stopping_rounds` / `verbose` in fit() and expect callbacks instead —
# confirm against the installed version.
model_hist_cpm.fit(
    train_df.drop(columns=non_feature_cols), train_df['CPM'],
    eval_set=(eval_df.drop(columns=non_feature_cols), eval_df['CPM']),
    early_stopping_rounds=20,
    verbose=80,
)

In [None]:
# Test MSE of the model with historical CPM features; predictions are floored at 0.
hist_cpm_test_features = new_test_cpm.drop(columns=['CPM', 'date', 'sample'])
hist_cpm_test_prediction = clip(model_hist_cpm.predict(hist_cpm_test_features))
mean_squared_error(new_test_cpm['CPM'], hist_cpm_test_prediction)

### MSE 2485

## Add historical features based only on CPM

In [None]:
# Windowed statistics over the CPM of earlier dates only.
cols_to_agg = ['CPM']

# Note the naming: `compute_agg_by_cat_cols` groups by `id_cols`,
# while `compute_agg_by_id_cols` groups by `cat_cols`.
group_stats_only_cpm_df = compute_agg_by_cat_cols(df=df_first, cols_to_agg=cols_to_agg)
group_stats_only_cpm_df_cat = compute_agg_by_id_cols(df=df_first, cols_to_agg=cols_to_agg)

In [None]:
from itertools import chain
full_concated = df_first

# Attach the 1-, 5- and 7-day mean CPM statistics keyed by the ID columns and date.
aggs, win_days = ['mean'], [1, 5, 7]
join_cols = [*id_cols, 'date']
for agg in aggs:
    for win_day in win_days:
        stat_df = pd.concat(group_stats_only_cpm_df[agg][win_day]).reset_index().set_index(join_cols)
        full_concated = full_concated.join(stat_df, on=join_cols)

# Attach the 1-, 5- and 7-day mean CPM statistics keyed by the categorical columns and date.
aggs, win_days = ['mean'], [1, 5, 7]
join_cols = [*cat_cols, 'date']
for agg in aggs:
    for win_day in win_days:
        stat_df = pd.concat(group_stats_only_cpm_df_cat[agg][win_day]).set_index(join_cols)
        full_concated = full_concated.join(stat_df, on=join_cols, how='left')

# Finally add the row-level rolling statistics, aligned on the row index.
full_concated = full_concated.join(agg_df)

In [None]:
# Recover the train / test samples from the enriched frame.
is_train = full_concated['sample'].eq('train')
is_test = full_concated['sample'].eq('test')

new_train = full_concated[is_train]
new_test = full_concated[is_test]
# Random 80/20 split of the train sample; the 20% part is used for early stopping.
train_df, eval_df = train_test_split(new_train, test_size=0.2, random_state=17)

In [None]:
# Columns that must not be fed to the model: the target, the raw date and the sample label.
non_feature_cols = ['CPM', 'date', 'sample']

model = LGBMRegressor(random_state=17, **params)
# NOTE(review): newer LightGBM releases (4.x) reportedly no longer accept
# `early_stopping_rounds` / `verbose` in fit() and expect callbacks instead —
# confirm against the installed version.
model.fit(
    train_df.drop(columns=non_feature_cols), train_df['CPM'],
    eval_set=(eval_df.drop(columns=non_feature_cols), eval_df['CPM']),
    early_stopping_rounds=20,
    verbose=80,
)
only_cpm_test_prediction = clip(model.predict(new_test.drop(columns=non_feature_cols)))
mean_squared_error(new_test['CPM'], only_cpm_test_prediction)

## MSE 2457