In [None]:
import numpy as np
import pandas as pd
from catboost import CatBoostRegressor, cv, Pool
# `IPython.display` is the public home of these helpers; importing them from
# `IPython.core.display` is deprecated and fails on recent IPython releases.
from IPython.display import HTML, display

# Widen the notebook container so wide DataFrames are easier to read.
display(HTML("<style>.container { width:95% !important; }</style>"))
import seaborn as sns

In [None]:
# Location of the raw auction data inside the Kaggle input directory.
DATASET_CSV = '/kaggle/input/real-time-advertisers-auction/Dataset.csv'
df = pd.read_csv(DATASET_CSV)
# reference to Akshay Paliwal, many thanks to him
def weird_division(n, d):
    """Divide ``n`` by ``d``, returning 0 instead of failing on a falsy divisor.

    Args:
        n: Numerator.
        d: Denominator; any falsy value (0, 0.0, None, ...) short-circuits to 0.

    Returns:
        ``n / d`` when ``d`` is truthy, otherwise the integer 0.
    """
    if not d:
        return 0
    return n / d

# CPM target: (total_revenue * 100) / measurable_impressions * 1000.
# Vectorized replacement for the former row-wise `df.apply(..., axis=1)`:
# same arithmetic order, and rows with zero measurable impressions get a
# CPM of 0 rather than inf/NaN. Missing impressions still propagate as NaN.
_impressions = df['measurable_impressions']
_revenue_scaled = df['total_revenue'] * 100
df['CPM'] = (_revenue_scaled / _impressions).where(_impressions != 0, 0) * 1000

In [None]:
# Remove the two columns CPM was derived from (total_revenue and
# measurable_impressions) together with revenue_share_percent.
# order_id, line_item_type_id, total_impressions, integration_type_id and
# viewable_impressions stay in the frame (their drop entries were disabled).
# Each label is listed once and the result is re-bound instead of mutating
# the frame with inplace=True.
df = df.drop(columns=[
    'total_revenue',
    'measurable_impressions',
    'revenue_share_percent',
])

In [None]:
# Number of missing values per column.
df.isna().sum()

In [None]:
# Chronological hold-out: rows dated up to and including the split date are
# used for training, later rows for testing.
# The dates are parsed before comparing: a plain string comparison would
# place '2019-06-21 00:00:00' *after* '2019-06-21' and would silently
# mis-order any non-ISO date format. The 'date' column itself is untouched.
split_date = pd.Timestamp('2019-06-21')
row_dates = pd.to_datetime(df['date'])

train = df.loc[row_dates <= split_date]
test = df.loc[row_dates > split_date]

In [None]:
# Keep only rows whose CPM is non-negative and strictly below the 95th
# percentile of the same frame (each split uses its own cutoff).
# NOTE(review): the test cutoff is computed from the test labels themselves;
# consider whether the train cutoff should be reused instead.
test_cpm_cap = test['CPM'].quantile(.95)
test = test.loc[(test['CPM'] >= 0) & (test['CPM'] < test_cpm_cap)]

train_cpm_cap = train['CPM'].quantile(.95)
train = train.loc[(train['CPM'] >= 0) & (train['CPM'] < train_cpm_cap)]

In [None]:
# The date was only needed for the chronological split; remove it from both
# frames so it is not fed to the model as a feature.
train, test = train.drop('date', axis=1), test.drop('date', axis=1)

In [None]:
# Quick look at the first rows of the hold-out frame.
test.head(5)

In [None]:
# For every column, print the number of distinct non-null values as a
# percentage of the training row count.
n_train_rows = len(train)
for col in train.columns:
    distinct_values = train[col].nunique()
    print(col, distinct_values / n_train_rows * 100)

In [None]:
# Wrap both splits in CatBoost Pools. Features are every column except the
# CPM target; only the training pool carries a label.
# NOTE(review): cat_features is not passed (a `cat_features=[0..7]` argument
# was left disabled), so presumably all columns are handled as numeric --
# confirm this is intended for the id-like columns.
train_features = train.drop(columns=['CPM'])
test_features = test.drop(columns=['CPM'])

train_pool = Pool(data=train_features, label=train['CPM'])
test_pool = Pool(data=test_features)

In [None]:
# Base regressor whose hyper-parameters are tuned by the search below.
model = CatBoostRegressor(eval_metric="RMSE")

# Candidate values per hyper-parameter; the tree count is fixed at 100
# during the search.
grid = dict(
    learning_rate=[0.01, 0.05, 0.1, 0.5],
    n_estimators=[100],
    depth=[8, 10, 12],
    l2_leaf_reg=[1, 2, 3],
    bagging_temperature=[0, 0.5, 1],
    random_strength=[15, 20, 25, 50, 100],
)

# Try 100 randomly drawn combinations from the grid on the training pool.
# NOTE(review): with CatBoost's defaults the candidates appear to be compared
# on an internal train/test split, with `cv` used for the statistics of the
# best candidate -- confirm against the CatBoost docs.
random_search_result = model.randomized_search(
    grid,
    train_pool,
    cv=3,
    n_iter=100,
    verbose=False,
    plot=True,
)

In [None]:
# Build the final model from the best parameters found by the search, but
# train it with many more trees than the 100 used while searching.
# The search result is copied rather than edited in place, so
# `random_search_result['params']` keeps reporting what the search found.
# Every spelling of the tree-count parameter is removed before setting
# `iterations`, because CatBoost rejects a parameter set that contains more
# than one of these aliases.
_ITERATION_ALIASES = ('iterations', 'n_estimators', 'num_boost_round', 'num_trees')

final_params = {
    name: value
    for name, value in random_search_result['params'].items()
    if name not in _ITERATION_ALIASES
}
final_params['iterations'] = 2000

final_model = CatBoostRegressor(**final_params)

In [None]:
# Train the final regressor on the whole training pool, showing the
# interactive learning-curve plot instead of per-iteration log lines.
final_model.fit(
    train_pool,
    plot=True,
    verbose=False,
)

In [None]:
from sklearn.metrics import mean_squared_error

In [None]:
# CPM predictions for the hold-out rows.
preds = final_model.predict(data=test_pool)

In [None]:
# Mean squared error on the hold-out set (note: MSE, not the RMSE that was
# used as the training eval metric).
mean_squared_error(test['CPM'], preds)