In [None]:
import numpy as np
import pandas as pd

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

In [None]:
import seaborn as sns
from matplotlib import pyplot as plt

from sklearn.metrics import mean_squared_error

from catboost import CatBoostRegressor, CatBoostClassifier, Pool

%matplotlib inline

### Loading data

In [None]:
# Read the raw auction CSV into a single DataFrame.
dataset = pd.read_csv(
    "/kaggle/input/real-time-advertisers-auction/Dataset.csv"
)

In [None]:
# Overview of the frame: column dtypes and non-null counts.
dataset.info()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
dataset.describe()

In [None]:
# Column names as a plain Python list.
dataset.columns.tolist()

In [None]:
# Number of distinct values in each column.
dataset.nunique()

In [None]:
# Count of missing values in each column.
dataset.isna().sum()

### Making target (CPM)

In [None]:
def special_division(n, d):
    """Return ``n / d``, or 0 when the denominator ``d`` is falsy (e.g. zero)."""
    if not d:
        return 0
    return n / d

# CPM = (total_revenue * 100 / measurable_impressions) * 1000, and 0 for rows
# with zero measurable impressions.
# Computed column-wise rather than with a row-by-row `apply(axis=1)`: same
# arithmetic in the same order, without running a Python lambda per row.
scaled_revenue = dataset["total_revenue"] * 100
impressions = dataset["measurable_impressions"]
# `where` keeps the ratio where impressions are non-zero and substitutes 0
# elsewhere, mirroring special_division's zero-denominator rule.
dataset["CPM"] = (scaled_revenue / impressions).where(impressions != 0, 0) * 1000

### Preparing data for modeling

In [None]:
# Remove fully duplicated rows (first occurrence is kept).
dataset = dataset.drop_duplicates()

In [None]:
# Remove the revenue_share_percent column from the modelling frame.
dataset = dataset.drop(columns="revenue_share_percent")

In [None]:
# total_revenue went into the CPM target, so it must not remain as a feature.
dataset = dataset.drop(columns="total_revenue")

In [None]:
# Parse the date column into datetimes so it can be compared against a cutoff.
dataset = dataset.assign(date=pd.to_datetime(dataset["date"]))

In [None]:
# Leave ~30% for test
# Chronological split: rows dated up to and including the cutoff go to train,
# rows dated after it are held out as test.
split_date = pd.to_datetime("2019-06-21")
train = dataset.loc[dataset["date"] <= split_date]
test = dataset.loc[dataset["date"] > split_date]
n_test, n_train = test.shape[0], train.shape[0]
# (test rows, train rows, test share of all rows)
n_test, n_train, n_test / (n_train + n_test)

In [None]:
# Excluding outliers
# Each split is trimmed independently: negative CPMs are removed first, then
# everything at or above the 95th percentile of what remains.
# NOTE(review): the test split is trimmed with its own quantile - confirm this is intended.
def _trim_cpm_outliers(frame):
    """Return the rows of ``frame`` with 0 <= CPM < 95th percentile of its non-negative CPMs."""
    non_negative = frame[frame["CPM"] >= 0]
    return non_negative[non_negative["CPM"] < non_negative["CPM"].quantile(0.95)]


test = _trim_cpm_outliers(test)
train = _trim_cpm_outliers(train)

### Some EDA

In [None]:
# Mean CPM of the training split (after outlier trimming).
train["CPM"].mean()

In [None]:
# Mean CPM of the test split (after outlier trimming), for comparison with train.
test["CPM"].mean()

In [None]:
# Train CPM counts per value bucket; the (-1, 0] bucket holds the zero CPMs.
fig, ax = plt.subplots(figsize=(12, 9))
# The binned series must be passed as `x=`: seaborn's only positional argument
# is `data`, so a positional series is not treated as the variable to count.
sns.countplot(x=pd.cut(train["CPM"], [-1, 0, 10, 100, 250, 500, 1000]), ax=ax)
ax.set(title="Train CPM distribution by bucket", xlabel="CPM bucket", ylabel="Number of rows");

In [None]:
# Test CPM counts per value bucket; the (-1, 0] bucket holds the zero CPMs.
fig, ax = plt.subplots(figsize=(12, 9))
# The binned series must be passed as `x=`: seaborn's only positional argument
# is `data`, so a positional series is not treated as the variable to count.
sns.countplot(x=pd.cut(test["CPM"], [-1, 0, 10, 100, 250, 500, 1000]), ax=ax)
ax.set(title="Test CPM distribution by bucket", xlabel="CPM bucket", ylabel="Number of rows");

In [None]:
# Train CPM split into two buckets: zero, (-1, 0], versus non-zero, (0, 1000].
fig, ax = plt.subplots(figsize=(6, 6))
# The binned series must be passed as `x=`: seaborn's only positional argument
# is `data`, so a positional series is not treated as the variable to count.
sns.countplot(x=pd.cut(train["CPM"], [-1, 0, 1000]), ax=ax)
ax.set(title="Train CPM: zero vs non-zero", xlabel="CPM bucket", ylabel="Number of rows");

So most CPM values are 0. We can use a classification task to separate zero from non-zero CPMs, and then model the non-zero values, which have a better distribution.

In [None]:
# Test CPM counts per bucket with zeros excluded: the first bin is (0, 10],
# so rows with CPM == 0 fall outside every bin and are not counted.
fig, ax = plt.subplots(figsize=(12, 9))
# The binned series must be passed as `x=`: seaborn's only positional argument
# is `data`, so a positional series is not treated as the variable to count.
sns.countplot(x=pd.cut(test["CPM"], [0, 10, 100, 250, 500, 1000]), ax=ax)
ax.set(title="Test CPM distribution by bucket (non-zero only)", xlabel="CPM bucket", ylabel="Number of rows");

### Modeling

In [None]:
# Columns currently in the training frame (still includes the CPM target and date).
train.columns

In [None]:
# Define categorical features: columns handed to CatBoost via `cat_features`.
cats = [
    "site_id",
    "ad_type_id",
    "geo_id",
    "device_category_id",
    "advertiser_id",
    "order_id",
    "line_item_type_id",
    "os_id",
    "monetization_channel_id",
    "ad_unit_id",
]

In [None]:
# Define target for classification
# Binary label: 0 where CPM is exactly zero, 1 otherwise.
target = (train["CPM"] != 0).astype("int64")

In [None]:
# The date column was only needed for the chronological split; remove it from
# both splits before modelling.
train = train.drop(columns="date")
test = test.drop(columns="date")

In [None]:
# Candidate CatBoost hyper-parameters (log-loss objective, accuracy as metric).
# NOTE(review): no later cell in this notebook reads `params` - confirm whether
# it was meant to be passed to the classifier.
params = dict(
    loss_function="Logloss",
    n_estimators=650,
    eval_metric="Accuracy",
    random_state=42,
)

In [None]:
# Training classifier
# Stage 1: predict whether CPM is zero (0) or non-zero (1) from every column but CPM.
# NOTE(review): the `params` dict defined earlier is not passed here, so only
# the arguments below differ from CatBoost's defaults - confirm this is intended.
model_cat_clf = CatBoostClassifier(
    cat_features=cats,
    random_state=42,
    verbose=100,
)
clf_features = train.drop(columns="CPM")
model_cat_clf.fit(clf_features, target)

In [None]:
# Classifier labels for the test split: 0 = zero CPM, 1 = non-zero CPM.
test_features = test.drop(columns="CPM")
preds_0 = model_cat_clf.predict(data=test_features)

In [None]:
# Separating 0 from other values
# Test rows are selected by the classifier's prediction; train rows by their
# true CPM, so the regressor is fitted on genuinely non-zero targets only.
predicted_nonzero = preds_0 == 1
test_rest = test[predicted_nonzero]
train_rest = train[train["CPM"].ne(0)]

In [None]:
# Training regression
# Stage 2: regress CPM on the training rows whose CPM is non-zero.
reg_features = train_rest.drop(columns="CPM")
reg_target = train_rest["CPM"]
model_cat = CatBoostRegressor(cat_features=cats, random_state=42)
model_cat.fit(reg_features, reg_target, verbose=100)

In [None]:
# Regressed CPM for the test rows the classifier marked as non-zero.
rest_features = test_rest.drop(columns="CPM")
preds = model_cat.predict(rest_features)

In [None]:
# Uniting all predictions
# Attach the regression output to the rows predicted as non-zero. `assign`
# builds a new frame, so nothing is written into a filtered slice of `test`
# (the direct column assignment raised SettingWithCopyWarning).
test_rest = test_rest.assign(preds=preds)
# Left-join on the index so every test row gets a `preds` column; dropping a
# stale `preds` first keeps this cell re-runnable (join refuses overlapping columns).
test = test.drop(columns="preds", errors="ignore").join(test_rest["preds"], how="left")
# Rows predicted as zero-CPM have no regression output: score them as 0.
# Plain assignment replaces the chained `fillna(..., inplace=True)`, which acts
# on a temporary and leaves the NaNs in place under pandas Copy-on-Write.
test["preds"] = test["preds"].fillna(0)

In [None]:
# MSE over the whole trimmed test split (predicted-zero rows are scored as 0).
mean_squared_error(y_true=test["CPM"].to_numpy(), y_pred=test["preds"])

#### MSE without outliers = 2835.566

This result could probably be improved by encoding the categorical features and tuning the models.