In [None]:
import pandas as pd
from catboost import CatBoostRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, make_scorer

In [None]:
# Last date (inclusive) of the training period: rows dated on or before it are
# used for training, later rows are held out as the test set.
TEST_TIMELINE = "2019-06-21"

## Reading data

In [None]:
# Load the dataset CSV and convert its `date` column to datetimes so that rows
# can later be compared against a date cutoff.
DATASET_PATH = '/kaggle/input/real-time-advertisers-auction/Dataset.csv'
df = pd.read_csv(DATASET_PATH)
df['date'] = pd.to_datetime(df['date'])

## Preprocessing

In [None]:
def weird_division(n, d):
    """Divide ``n`` by ``d``, returning 0 instead of failing on a falsy divisor.

    Args:
        n: Numerator.
        d: Denominator; any falsy value (0, 0.0, None, ...) short-circuits to 0.

    Returns:
        ``n / d`` when ``d`` is truthy, otherwise the integer 0.
    """
    if not d:
        return 0
    return n / d


# CPM target, computed column-wise instead of with a row-by-row `apply`
# (same arithmetic and operation order, without a Python call per row):
#     1000 * (100 * total_revenue) / measurable_impressions
# Rows with zero measurable impressions get a CPM of 0 rather than inf/NaN.
# NOTE(review): the factor of 100 is carried over unchanged from the original
# formula - presumably a unit conversion of total_revenue; confirm with the
# data source.
impressions = df['measurable_impressions']
cpm = 1000 * ((100 * df['total_revenue']) / impressions)
df['CPM'] = cpm.where(impressions != 0, 0.0)

# Outlier filter: keep rows whose CPM lies in [0, 95th percentile]
# (`between` is inclusive at both ends). Rows with a NaN CPM fail the
# comparison and are dropped as well.
df = df[df['CPM'].between(0, df['CPM'].quantile(.95))].copy()

In [None]:
# Build the split mask first: it needs `date`, which is dropped just below.
# Rows dated on or before TEST_TIMELINE go to train, all later rows to test.
train_mask = (df['date'] <= TEST_TIMELINE).values

# Remove columns that must not be used as model inputs. `total_revenue` and
# `measurable_impressions` are the quantities the CPM target is computed from.
df = df.drop(columns=[
    'integration_type_id',
    'revenue_share_percent',
    'total_revenue',
    'measurable_impressions',
    'date',
])

# Independent copies, so later changes to one split cannot affect `df` or the
# other split.
train = df.loc[train_mask].copy()
test = df.loc[~train_mask].copy()

# Feature frames (everything except the target) and target arrays.
X_train = train.drop(columns='CPM')
y_train = train['CPM'].values
X_test = test.drop(columns='CPM')
y_test = test['CPM'].values

## Validation

In [None]:
# Fixed seed so results are reproducible; verbose=0 silences training logs.
clf = CatBoostRegressor(random_seed=1, verbose=0)

# Cross-validate on the training period only (cv=5). The scorer wraps
# `mean_squared_error` directly, so the displayed values are plain MSE per fold.
mse_scorer = make_scorer(mean_squared_error)
cross_val_score(estimator=clf, X=X_train, y=y_train, scoring=mse_scorer, cv=5)

## Train & Inference

In [None]:
# Fit on the full training period, then predict the held-out rows.
clf.fit(X=X_train, y=y_train)
y_test_hat = clf.predict(X_test)

# Mean squared error on the test period (displayed as the cell output).
mean_squared_error(y_true=y_test, y_pred=y_test_hat)