In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Import some useful libraries

In [None]:
import h2o
from h2o.automl import H2OAutoML
from sklearn.metrics import mean_squared_error

In [None]:
# Location of the raw CSV inside the Kaggle input mount.
DATASET_PATH = '/kaggle/input/real-time-advertisers-auction/Dataset.csv'
# Load the whole file into a pandas DataFrame; every later cell works on `dataset`.
dataset = pd.read_csv(DATASET_PATH)

## Slightly preprocess dataset

In [None]:
# Preview the first rows of the loaded frame to eyeball column names and values.
dataset.head()

In [None]:
# Print the DataFrame summary (column dtypes and non-null counts).
dataset.info()

In [None]:
# Parse the `date` column into datetimes so it can be compared against a
# timestamp when splitting train/test. Passing the whole Series to
# pd.to_datetime is vectorized; the previous per-row `apply(lambda ...)`
# invoked the parser once per row in Python.
dataset['date'] = pd.to_datetime(dataset['date'])

In [None]:
# Build the target column `cpm` from revenue and measurable impressions.
# Rows with zero impressions produce +inf (positive revenue) or NaN (0 / 0);
# both are mapped to 0 so the target stays finite.
cpm_unclean = dataset['total_revenue'] * 100 / dataset['measurable_impressions'] * 1000
dataset['cpm'] = cpm_unclean.replace(np.inf, 0).fillna(0)

In [None]:
# Keep only rows with a non-negative target ...
non_negative_rows = dataset.loc[dataset['cpm'] >= 0]
# ... then trim the upper tail: the 95th percentile is computed on the
# already-filtered rows and used as an exclusive upper bound.
cpm_upper_bound = non_negative_rows['cpm'].quantile(0.95)
dataset = non_negative_rows.loc[non_negative_rows['cpm'] < cpm_upper_bound]

In [None]:
# Summary statistics of the target after the filtering above (sanity check of its range).
dataset['cpm'].describe()

In [None]:
# `cpm` was derived directly from `total_revenue`, so keeping that column
# as a feature would leak the target into the model inputs.
dataset = dataset.drop(columns=['total_revenue'])

## Modeling

#### In this notebook I will use [h2o](https://h2o.ai) as a simple baseline
#### h2o is a very useful tool that shows what model performance you can get in "fit-predict-data-scientist" mode
#### Moreover, h2o is a very strong baseline that is not always easy to beat. So using h2o as a baseline helps you understand the real bar you have to clear to prove that you can do something cool

In [None]:
# Cut-off date for the time-based split: rows dated on or before DATE_X go to
# train, later rows go to test. The format is spelled out explicitly so the
# string is always read as day.month.year (21 June 2019) instead of relying on
# pandas' format inference.
DATE_X = pd.to_datetime('21.06.2019', format='%d.%m.%Y')

In [None]:
# Time-based split: everything up to and including the cut-off date is
# training data, everything strictly after it is held out for testing.
train = dataset.loc[dataset['date'].le(DATE_X)]
test = dataset.loc[dataset['date'].gt(DATE_X)]

In [None]:
# Report how many rows ended up on each side of the split.
n_train_rows, n_test_rows = train.shape[0], test.shape[0]
print(f"Train size = {n_train_rows}, test size = {n_test_rows}")

In [None]:
# Count distinct values per column of the training split.
# NOTE(review): presumably used to spot (near-)constant columns — the next
# cell drops two columns; confirm that is the reason.
train.nunique()

In [None]:
# Remove the same two columns from both splits so their schemas stay aligned.
columns_to_drop = ['revenue_share_percent', 'integration_type_id']
train = train.drop(columns=columns_to_drop)
test = test.drop(columns=columns_to_drop)

In [None]:
# ID-like columns (all named `*_id`) that are converted to H2O factors
# (categoricals) before training. Each column is listed exactly once; the
# original list repeated 'monetization_channel_id'.
cat_features = [
    'site_id', 'ad_type_id', 'geo_id', 'device_category_id',
    'advertiser_id', 'order_id', 'line_item_type_id', 'os_id',
    'monetization_channel_id',
]

In [None]:
# Connect to an H2O server, starting a local one if none is reachable.
# Must run before any H2OFrame is created or AutoML is trained.
h2o.init()

In [None]:
# Convert the pandas splits into H2OFrames, the data structure H2O AutoML trains on.
# Note that `train` / `test` are rebound from pandas DataFrames to H2OFrames here.
train = h2o.H2OFrame(train)
test = h2o.H2OFrame(test)
# Predictors = every column of the frame except the target.
# NOTE(review): `date` is never dropped, so it is still part of `x` — confirm that is intended.
x = train.columns
y = "cpm"
x.remove(y)
# Mark the ID-like columns as factors (categoricals) in both frames so H2O
# does not treat them as ordinary numeric features.
train[cat_features] = train[cat_features].asfactor()
test[cat_features] = test[cat_features].asfactor()

In [None]:
# Run H2O AutoML for at most 5 minutes.
# - `seed` pins the random state of the individual algorithms. With a time
#   budget (rather than `max_models`) the set of models that finish can still
#   vary between runs, so results are only approximately repeatable.
# - No `leaderboard_frame` is passed: the leaderboard is then ranked by
#   cross-validation metrics on the training data. Passing `test` there would
#   let AutoML pick its leader on the very hold-out used for the final score,
#   making the reported baseline optimistically biased.
aml = H2OAutoML(max_runtime_secs=5 * 60, seed=42)
aml.train(x=x, y=y, training_frame=train)

In [None]:
# Score the hold-out frame with the AutoML object and pull the `predict`
# column back into a plain NumPy array.
test_predictions = aml.predict(test).as_data_frame()
preds = test_predictions['predict'].values

In [None]:
# Mean squared error of the AutoML predictions on the hold-out split.
test_targets = test[y].as_data_frame().values
baseline_mse = mean_squared_error(test_targets, preds)
print(f"Baseline in this dataset is {baseline_mse}")

In [None]:
# Display the table of models AutoML trained, ranked by its sort metric.
aml.leaderboard