### Outline <a name = 'outline'></a>
* [Imports](#imports) 
* [Preprocessing](#dataset)
* [Log and train test split](#split) 
* [CatBoost](#catboost)

### Imports <a name = 'imports'></a>

In [None]:
import numpy as np
import pandas as pd

from sklearn import (
    model_selection,
    metrics,
)
from catboost import CatBoostRegressor  

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
def weird_division(n, d):
    """Return ``n / d``, or ``0`` when the divisor ``d`` is falsy (e.g. zero)."""
    if not d:
        return 0
    return n / d

### Preprocessing <a name = 'dataset'></a>

In [None]:
# Load the auction dataset from the Kaggle input directory into a DataFrame.
df = pd.read_csv('/kaggle/input/real-time-advertisers-auction/Dataset.csv')
# Preview the first rows as the cell output.
df.head()

In [None]:
# Print a summary of the frame: columns, dtypes and non-null counts.
df.info()

In [None]:
# Count distinct values per column; columns with a single unique value carry
# no information and are dropped in the preprocessing cell below.
df.nunique()

In [None]:
# Convert the date column to a datetime dtype (requested as "datetime64[s]") so
# it can be compared against a date string in the train/test split below.
df["date"] = df.date.astype("datetime64[s]")

In [None]:
# Derive the CPM target from revenue and measurable impressions. The arithmetic
# is done column-wise rather than with a row-wise ``DataFrame.apply``, which
# would invoke a Python function once per row.
measurable = df["measurable_impressions"]
cpm = df["total_revenue"] * 100 / measurable * 1000
# Rows with zero measurable impressions get CPM 0 instead of the inf/NaN that
# the division produces (the same convention as ``weird_division``).
df["CPM"] = cpm.where(measurable != 0, 0)

# Keep only non-negative CPM; the comparison also discards rows whose CPM is NaN.
# ``.copy()`` makes the filtered result an independent frame so that column
# assignments in later cells do not trigger pandas' SettingWithCopyWarning.
df = df[df["CPM"] >= 0].copy()

# "order_id" and "line_item_type_id" are intentionally kept: they are used as
# categorical features further down.
cols_for_delete = [
    "total_revenue",  # CPM is derived from it
    "measurable_impressions",  # CPM is derived from it
    "integration_type_id",  # one unique value
    "revenue_share_percent",  # one unique value
]
df = df.drop(columns=cols_for_delete)

### Log and train test split <a name = 'split'></a>

In [None]:
# Name of the regression target column.
target_name = "CPM"

# Count-like inputs (log-transformed before training).
numeric_features = ["total_impressions", "viewable_impressions"]

# Identifier columns handed to CatBoost as categorical features.
categorical_features = [
    "site_id", "ad_type_id", "geo_id", "device_category_id", "advertiser_id",
    "os_id", "monetization_channel_id", "ad_unit_id", "order_id",
    "line_item_type_id",
]

# Full model input: numeric columns first, then the categorical ones.
features = [*numeric_features, *categorical_features]

In [None]:
# Apply log1p in place to the numeric features and to the target; predictions
# are mapped back with expm1 when the model is evaluated.
log_columns = [*numeric_features, target_name]
for column in log_columns:
    df[column] = np.log1p(df[column])

In [None]:
# Chronological split: rows before the cutoff date train the model, rows on or
# after it form the test set.
split_date = "2019-06-22"
train_df = df[df["date"] < split_date]
test_df = df[df["date"] >= split_date]

# Trim the upper tail of the target: each subset keeps only the rows whose CPM
# is strictly below that subset's own 95th percentile.
train_cpm_cap = train_df["CPM"].quantile(0.95)
test_cpm_cap = test_df["CPM"].quantile(0.95)
train_df = train_df.loc[train_df["CPM"] < train_cpm_cap]
test_df = test_df.loc[test_df["CPM"] < test_cpm_cap]

In [None]:
# Feature matrix and (log1p-transformed) target for the training period.
X = train_df[features]
y = train_df[target_name]

# Hold out a random validation subset (scikit-learn's default split size).
# The split is seeded with the same value the CatBoost model uses, so the
# reported validation MSE is reproducible between runs.
X_train, X_val, y_train, y_val = model_selection.train_test_split(
    X, y, random_state=13
)

### CatBoost <a name = 'catboost'></a>

In [None]:
# Hyper-parameters are collected in one mapping and unpacked into the
# constructor; the fixed seed keeps training deterministic.
cbr_params = {
    "learning_rate": 0.5,
    "iterations": 1000,
    "random_seed": 13,
    "depth": 6,
}
cbr = CatBoostRegressor(**cbr_params)

In [None]:
# Fit on the training split, declaring the identifier columns as categorical
# features; verbose=200 sets how often CatBoost logs progress.
cbr.fit(X_train, y_train, cat_features=categorical_features, verbose=200)

In [None]:
# Predict on the held-out validation split.
y_pred_val = cbr.predict(X_val)

# The target was log1p-transformed before training, so both the true values and
# the predictions are mapped back with expm1 to report MSE on the original scale.
print(f"MSE on validation = {metrics.mean_squared_error(np.expm1(y_val), np.expm1(y_pred_val))}")

In [None]:
# Predict on the chronologically later test set.
# NOTE(review): test_df was trimmed at its own 95th CPM percentile above, so
# this score does not cover the highest-CPM rows.
y_pred_test = cbr.predict(test_df[features])

# Undo the log1p transform on targets and predictions before computing MSE.
print(f"MSE on test = {metrics.mean_squared_error(np.expm1(test_df.CPM), np.expm1(y_pred_test))}")