In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# Raw dataset (read-only input) and the locations of the derived train/test CSVs.
DATA_FPATH = '/kaggle/input/real-time-advertisers-auction/Dataset.csv'
DATA_TRAIN_FPATH = '/kaggle/working/train.csv'
DATA_TEST_FPATH = '/kaggle/working/test.csv'

In [None]:
# Load the raw dataset, parse the dates and order the rows chronologically.
df = (
    pd.read_csv(DATA_FPATH)
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .sort_values('date')
)

# Time-based split: everything up to and including 2019-06-21 is training data,
# everything after it is test data. Two independent comparisons are used so that
# rows whose date compares False both ways land in neither file.
train_rows = df['date'] <= '2019-06-21'
test_rows = df['date'] > '2019-06-21'
df[train_rows].to_csv(DATA_TRAIN_FPATH, index=False)
df[test_rows].to_csv(DATA_TEST_FPATH, index=False)

In [None]:
import matplotlib.pyplot as plt # plotting
import numpy as np # linear algebra
import os # accessing directory structure
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
from sklearn.metrics import (
    mean_squared_error,
    mean_absolute_error,
    explained_variance_score
)
from catboost import CatBoostRegressor, CatBoostClassifier
from catboost import CatBoost, Pool, MetricVisualizer, cv

# Random seed used for the CatBoost model so that training runs are repeatable.
SEED = 42


def get_data(type='train', return_df = False):
    """Load one of the pre-split CSV files and derive the CPM target and features.

    Parameters
    ----------
    type : {'train', 'test'}
        Which split to read: ``DATA_TRAIN_FPATH`` or ``DATA_TEST_FPATH``.
        (The name shadows the builtin; it is kept so existing callers still work.)
    return_df : bool, default False
        When True, return the filtered frame (still containing ``date``,
        ``total_revenue``, ``CPM`` and ``View/measurable``) instead of ``(X, y)``.

    Returns
    -------
    pandas.DataFrame
        The filtered frame, when ``return_df`` is True.
    tuple of (pandas.DataFrame, pandas.Series)
        Otherwise ``X`` (features, with the three impression counts transformed
        by ``log(1 + x)``) and ``y`` (the untransformed CPM).

    Raises
    ------
    AssertionError
        If ``type`` is neither ``'train'`` nor ``'test'``.

    Notes
    -----
    The 95th-percentile CPM cut-off is computed on the split being loaded, so the
    train and test splits are trimmed with their own thresholds.
    """
    assert type in ('train', 'test'), "type must be 'train' or 'test'"

    data_path = DATA_TRAIN_FPATH if type == 'train' else DATA_TEST_FPATH
    df = pd.read_csv(data_path)
    df['date'] = pd.to_datetime(df['date'])

    # Identifier columns that are not used as features.
    df = df.drop(columns=['order_id', 'line_item_type_id', 'integration_type_id'])

    def safe_ratio(numerator, denominator):
        # Element-wise division that yields 0 wherever the denominator is 0.
        # pandas returns inf/NaN (without raising) for x / 0; those positions are
        # then overwritten by the mask. NaN denominators stay NaN.
        return (numerator / denominator).where(denominator != 0, 0.0)

    # CPM (the winning bid value per thousand measurable impressions):
    #   ((total_revenue * 100) / revenue_share_percent) / measurable_impressions * 1000
    # The revenue share must DIVIDE the publisher revenue to recover the gross bid;
    # the previous code multiplied by it, contradicting the documented formula.
    gross_revenue = safe_ratio(df['total_revenue'] * 100., df['revenue_share_percent'])
    df['CPM'] = 1e3 * safe_ratio(gross_revenue, df['measurable_impressions'])
    df = df.drop(columns=['revenue_share_percent'])
    df['View/measurable'] = safe_ratio(df['viewable_impressions'],
                                       df['measurable_impressions'])

    # CPM can't be less than zero; also filter out anomalously high values
    # (everything above the 95th percentile of this split).
    df = df[df['CPM'].between(0, df['CPM'].quantile(.95))]

    if return_df:
        return df

    # `date` is not a feature, and `total_revenue` is dropped alongside the target
    # because CPM is computed directly from it.
    X = df.drop(columns=['date', 'CPM', 'total_revenue'])
    for column in ('total_impressions', 'viewable_impressions', 'measurable_impressions'):
        # log(1 + x) compresses the heavy right tail of the raw counts.
        X[column] = np.log1p(X[column])

    y = df['CPM']
    return X, y

# CV
Uncomment the next cell if you want to run cross-validation (CV).

In [None]:
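# Disabled hyper-parameter search: randomly samples max_depth / learning_rate / l2_leaf_reg
# and runs 3-fold CatBoost `cv` for every combination. Uncomment the whole cell to run it.
# NOTE(review): results are pickled into `catboost_res/`, which presumably must exist beforehand,
# and the numpy/scipy sampling below is not seeded, so the sampled grid differs between runs.
# import scipy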
# params = {'loss_function': 'RMSE', 'custom_metric': ['RMSE'], 'verbose':True}
# max_depth = np.unique(np.random.randint(3, 10, size=5))
# learning_rate = np.random.uniform(0.001,1,5)
# l2_leaf_reg = scipy.stats.reciprocal.rvs(a=1e-2, b=1e1, size=5)

# X, y = get_data()

# df_cv = Pool(data=X, label=y, cat_features = ['ad_type_id', 
#                                               'site_id', 
#                                               'geo_id',
#                                               'device_category_id',
#                                               'advertiser_id',
#                                               'os_id',
#                                               'monetization_channel_id',
#                                               'ad_unit_id'
#                                              ])

# tuning_table = pd.DataFrame()
# print('start cv')
# for depth in max_depth:
#     for _lr in learning_rate:
#         for l2 in l2_leaf_reg:
#             params['learning_rate'] = _lr
#             params['l2_leaf_reg'] = l2
#             params['max_depth'] = depth
#             print(f"start training on params lr={params['learning_rate']};l2={params['l2_leaf_reg']};max_depth={params['max_depth']};")
#             tuning_table = cv(df_cv,
#                      params = params,
#                      plot=False,
#                      nfold=3,
#                      shuffle=True,
#                      iterations = 1000, 
#                      early_stopping_rounds=50,
#                      logging_level='Silent', 
#                      seed = SEED)
#             name = f"lr={params['learning_rate']};l2={params['l2_leaf_reg']};max_depth={params['max_depth']};iterations={tuning_table.iterations.max()}"
#             tuning_table['model'] = name
#             tuning_table.to_pickle(f'catboost_res/{name}.pkl')
#             display(tuning_table.groupby('model')[
#                 ['train-RMSE-mean', 'train-RMSE-std','test-RMSE-mean', 'test-RMSE-std']
#             ].min())

In [None]:
# Hyper-parameters of the final model.
params = {
    'learning_rate': 0.7,
    'l2_leaf_reg': 0.09,
    'max_depth': 3,
    'task_type': 'CPU',
    'random_seed': SEED,
}

cbr = CatBoostRegressor(objective='RMSE', iterations=400, **params)

# The model is fitted on the whole training split. A held-out validation pool
# (passed as `eval_set`) was sketched here earlier but is disabled.
X, y = get_data()
X_train, y_train = X, y

# Identifier columns that CatBoost has to treat as categorical features.
train_cat_features = [
    'ad_type_id',
    'site_id',
    'geo_id',
    'device_category_id',
    'advertiser_id',
    'os_id',
    'monetization_channel_id',
    'ad_unit_id',
]
X_train_pool = Pool(data=X_train, label=y_train, cat_features=train_cat_features)

cbr.fit(X_train_pool)

In [None]:
# CPM cannot be negative, so floor the in-sample predictions at zero.
train_raw = cbr.predict(X_train_pool)
preds = np.where(train_raw >= 0, train_raw, 0)
plt.hist(preds);

# Mean squared error of the floored predictions on the training split.
mean_squared_error(y_train, preds)

In [None]:
X_test, y_test = get_data('test')

# Same categorical columns as used for the training pool.
test_cat_features = [
    'ad_type_id',
    'site_id',
    'geo_id',
    'device_category_id',
    'advertiser_id',
    'os_id',
    'monetization_channel_id',
    'ad_unit_id',
]

# Select the test columns in the training column order so the feature layout
# matches what the model was fitted on.
X_test_pool = Pool(
    data=X_test[list(X_train.columns)],
    label=y_test,
    cat_features=test_cat_features,
)

# MSE on test set

In [None]:
# CPM cannot be negative, so floor the test predictions at zero, exactly as is
# done for the training-set predictions.
test_raw = cbr.predict(X_test_pool)
preds = np.where(test_raw >= 0, test_raw, 0)

plt.hist(preds);

# Score the same floored predictions that are plotted above. The previous code
# called `cbr.predict` a second time and scored the raw (possibly negative)
# output, so the test MSE was not comparable with the train MSE.
mean_squared_error(
    y_test,
    preds
)