In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor, Pool
from sklearn.metrics import mean_squared_error
from itertools import combinations

## read and prepare data

In [None]:
# Load the auction dataset; the "date" column is parsed into datetimes while reading.
dataset_path = '/kaggle/input/real-time-advertisers-auction/Dataset.csv'
df = pd.read_csv(dataset_path, parse_dates=['date'])

In [None]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

In [None]:
def weird_division(n, d):
    """Divide ``n`` by ``d``, falling back to 0 when ``d`` is falsy.

    Args:
        n: Numerator.
        d: Denominator. Any falsy value (0, 0.0, None, ...) short-circuits
            the division.

    Returns:
        ``n / d`` when ``d`` is truthy, otherwise the integer 0.
    """
    if not d:
        return 0
    return n / d

# CPM = (total_revenue * 100) / measurable_impressions * 1000, with rows that have
# zero measurable impressions set to 0 instead of dividing by zero.
# Computed column-wise rather than with a row-wise ``df.apply(..., axis=1)``:
# the arithmetic and its order are the same, but no Python function is called per row.
measurable = df['measurable_impressions']
revenue_ratio = (df['total_revenue'] * 100) / measurable
# ``where`` keeps the ratio for non-zero denominators and substitutes 0 elsewhere;
# a NaN denominator compares as "not equal to 0", so its NaN ratio is kept.
df['CPM'] = revenue_ratio.where(measurable != 0, 0) * 1000

## features

In [None]:
# Columns that must not be used as model features.
drop_cols = {
    "total_revenue",
    "date",
    "CPM",
    "total_impressions",
    "viewable_impressions",
    "revenue_share_percent",
    "measurable_impressions",
}
# Everything else currently in the frame is treated as a feature column.
used_cols = set(df.columns).difference(drop_cols)

## generate interactions

In [None]:
# Build one interaction feature per unordered pair of the current feature columns.
# Each new column is named "<col1>_<col2>" and holds df[col1] * PAIR_SCALE + df[col2].
#
# The pairs are drawn from sorted(used_cols) rather than from the set directly:
# iteration order of a set of strings can change between interpreter runs
# (string hash randomisation), and because the encoding is not symmetric in
# col1/col2 that would change both the generated column names and their values
# from one kernel restart to the next.
PAIR_SCALE = 100000
new_cols = set()
for col1, col2 in combinations(sorted(used_cols), 2):
    col = f"{col1}_{col2}"
    new_cols.add(col)
    # NOTE(review): distinct (col1, col2) value pairs only map to distinct codes
    # if df[col2] stays within [0, PAIR_SCALE) — confirm against the data.
    df[col] = df[col1] * PAIR_SCALE + df[col2]
# Register the interaction columns as features alongside the originals.
used_cols |= new_cols

In [None]:
# Number of feature columns (original features plus generated interactions).
len(used_cols)

## split data

In [None]:
def _select_split(frame, date_condition, upper_quantile=0.95):
    """Return the rows of ``frame`` for one split, with the top CPM tail removed.

    Args:
        frame: DataFrame with ``date`` and ``CPM`` columns.
        date_condition: ``DataFrame.query`` expression restricting ``date``.
        upper_quantile: Rows whose CPM is at or above this quantile of the
            selected rows are dropped.

    Returns:
        The rows matching ``date_condition`` with ``CPM >= 0`` and CPM strictly
        below the split's own ``upper_quantile`` CPM value.
    """
    subset = frame.query(f"({date_condition}) and (CPM >= 0)")
    cpm_cap = subset["CPM"].quantile(upper_quantile)
    return subset[subset["CPM"] < cpm_cap]


# Time-based split boundaries: train is before VAL_START, validation is
# [VAL_START, TEST_START), test is TEST_START onwards.
VAL_START = "2019-06-17"
TEST_START = "2019-06-22"

df = df.sort_values("date")
df_train = _select_split(df, f"date < '{VAL_START}'")
df_val = _select_split(df, f"(date >= '{VAL_START}') and (date < '{TEST_START}')")
df_test = _select_split(df, f"date >= '{TEST_START}'")

df_train.shape, df_val.shape, df_test.shape

## build model

In [None]:
# `used_cols` is a set. pandas does not accept a set as a column indexer
# (deprecated in 1.5, TypeError from 2.0), so turn it into a list first.
# list() keeps the set's own iteration order, so the column order matches what
# indexing with the set itself would have produced.
# NOTE(review): CatBoost's `cat_features` is documented as a list/array — a set
# may be rejected there as well; confirm against the installed version.
feature_cols = list(used_cols)

# Every feature column is declared categorical.
train_set = Pool(
    df_train[feature_cols],
    df_train["CPM"],
    cat_features=feature_cols
)
eval_set = Pool(
    df_val[feature_cols],
    df_val["CPM"],
    cat_features=feature_cols
)

model = CatBoostRegressor(
    iterations=100,
    loss_function='RMSE',
    verbose=False
)

# The validation pool is passed as eval_set; plot=True requests CatBoost's
# training plot in the notebook.
model.fit(
    train_set,
    eval_set=eval_set,
    plot=True
)

## score

In [None]:
# Index with a list, not the `used_cols` set itself: pandas rejects set indexers
# (deprecated in 1.5, TypeError from 2.0). list() keeps the set's iteration
# order, so the columns line up with how the set was iterated elsewhere.
y_pred = model.predict(df_test[list(used_cols)])
y_true = df_test["CPM"].values

In [None]:
# Mean squared error on the held-out test rows (MSE, not its square root).
mean_squared_error(y_true=y_true, y_pred=y_pred)