In [None]:
import pandas as pd

In [None]:
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error

In [None]:
# read the raw auction dataset from the Kaggle input directory
dataset_path = '/kaggle/input/real-time-advertisers-auction/Dataset.csv'
df = pd.read_csv(dataset_path)

In [None]:
# CPM is cost per thousand impressions
# actual formula is provided by Akshay Paliwal for comparison
def weird_division(n, d):
    """Return ``n / d``, or 0 when the denominator ``d`` is falsy (e.g. zero)."""
    if not d:
        return 0
    return n / d
# Vectorised CPM: (total_revenue * 100 / measurable_impressions) * 1000,
# computed on whole columns instead of a row-by-row Python callback.
# Rows with zero measurable impressions get CPM 0 (the same zero-denominator
# rule as weird_division) rather than the inf/NaN that plain division yields.
df['CPM'] = ((df['total_revenue'] * 100 / df['measurable_impressions'])
             .where(df['measurable_impressions'] != 0, 0) * 1000)

In [None]:
# remove the leak (CPM is derived from total_revenue) and the features
# with a single unique value
# Reassign instead of mutating with inplace=True, so the cell yields a new
# frame rather than silently altering the object other names may point to.
df = df.drop(columns = ['total_revenue',
                        'integration_type_id',
                        'revenue_share_percent'])

In [None]:
# keep only rows whose CPM is non-negative
non_negative_cpm = df['CPM'] >= 0
df = df[non_negative_cpm]

In [None]:
# chronological hold-out: rows dated before the cutoff go to train,
# rows dated on or after it go to test; 'date' is dropped from both
split_date = '2019-06-22'
before_split = df['date'] < split_date
from_split = df['date'] >= split_date
train = df[before_split].drop(columns = 'date')
test = df[from_split].drop(columns = 'date')

In [None]:
# trim the upper tail: keep rows at or below each split's own
# 95th-percentile CPM
# NOTE(review): the test cutoff is computed from the test split itself,
# which may flatter the reported error — confirm this is intended
train_cap = train['CPM'].quantile(.95)
train = train[train['CPM'] <= train_cap]
test_cap = test['CPM'].quantile(.95)
test = test[test['CPM'] <= test_cap]

In [None]:
# separate the CPM target from the feature columns in both splits
target = 'CPM'
x_train, y_train = train.drop(columns = target), train[target]
x_test, y_test = test.drop(columns = target), test[target]

In [None]:
# a deliberately small CatBoost model (few, shallow trees) to avoid overfitting
model_params = {'iterations': 100,
                'depth': 3,
                'random_seed': 42}
model = CatBoostRegressor(**model_params)

In [None]:
# train the regressor, declaring the categorical columns by position
# (categorical handling is the main reason CatBoost was chosen)
# NOTE(review): assumes the first 10 columns of x_train are the categorical
# ones — confirm against the actual column order
categorical_idx = list(range(10))
model.fit(x_train, y_train, cat_features = categorical_idx, verbose = False)

In [None]:
# score the held-out test split with mean squared error
test_predictions = model.predict(x_test)
test_mse = mean_squared_error(y_test.values, test_predictions)
print('MSE:', test_mse)