In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from catboost import CatBoostRegressor

In [None]:
# Load the auction dataset, then convert the `date` column to datetime using
# the explicit '%Y-%m-%d' format. Converting after the read avoids
# `read_csv`'s `date_parser` argument, which is deprecated since pandas 2.0,
# while still working on older pandas versions.
df_full = pd.read_csv('../input/real-time-advertisers-auction/Dataset.csv')
df_full['date'] = pd.to_datetime(df_full['date'], format='%Y-%m-%d')

In [None]:
# Print the column dtypes and non-null counts of df_full so that missing
# values and unexpected dtypes are visible.
df_full.info()

In [None]:
# Preview the first five rows of df_full.
df_full.head(5)

In [None]:
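# Calculate CPM: the winning bid value that advertisers paid, for the month of June.
# Documented formula:
#   CPM = (((publisher revenue * 100) / revenue_share_percent) / measurable_impressions) * 1000
# NOTE: the code below does not divide by revenue_share_percent, so it matches
# this formula only if that column is always 1 -- TODO confirm.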

def weird_division(n, d):
    """Return ``n / d``, or ``0`` when the divisor ``d`` is falsy (e.g. zero)."""
    if not d:
        return 0
    return n / d

# Vectorised CPM: (total_revenue * 100) / measurable_impressions * 1000, with
# 0 wherever measurable_impressions is 0 (the same rule `weird_division`
# applies). Whole-column arithmetic replaces the row-wise
# `DataFrame.apply(axis=1)`, which called a Python lambda once per row.
df_full['CPM'] = np.where(
    df_full['measurable_impressions'] != 0,
    (df_full['total_revenue'] * 100) / df_full['measurable_impressions'],
    0,
) * 1000

In [None]:
# integration_type_id and revenue_share_percent are each described as holding
# one single value, so neither can help the model; drop both columns.
df_full = df_full.drop(columns=['integration_type_id', 'revenue_share_percent'])

In [None]:
# total_impressions is considered to carry the same information as
# measurable_impressions, so it is dropped as redundant.
df_full = df_full.drop(columns=['total_impressions'])

In [None]:
# CPM was computed from total_revenue and measurable_impressions; remove both
# source columns now that the CPM column exists, so the target's own inputs
# do not remain in the frame.
df_full = df_full.drop(columns=['measurable_impressions', 'total_revenue'])

In [None]:
# Preview the first five rows of df_full again, now with the CPM column
# added and the unused columns dropped.
df_full.head(5)

In [None]:
# Column groups used for modelling.

# Categorical identifier columns (also passed to CatBoost as cat_features).
cat_columns = [
    'site_id',
    'ad_type_id',
    'geo_id',
    'device_category_id',
    'advertiser_id',
    'order_id',
    'line_item_type_id',
    'os_id',
    'monetization_channel_id',
    'ad_unit_id',
]

# Numeric feature columns.
num_columns = ['viewable_impressions']

# Regression target, kept as a one-element list so that indexing a dataframe
# with it yields a single-column DataFrame.
y_feature = ['CPM']

In [None]:
# Print the distinct values of every categorical column in the full dataset,
# followed by a 100-character dashed separator line.
for column_name in cat_columns:
    distinct_values = df_full[column_name].unique()
    print(column_name, distinct_values)
    print('-' * 100)

In [None]:
# Keep only rows whose CPM is non-negative. Rows with a NaN CPM fail the
# comparison and are therefore removed as well.
df_full = df_full.loc[df_full['CPM'] >= 0]

In [None]:
# Time-based split: rows dated before the cutoff form the training set, rows
# dated on or after it form the test set.
split_date = "2019-06-22"
df_train = df_full[df_full['date'] < split_date]
df_test = df_full[df_full['date'] >= split_date]

In [None]:
# Trim high-CPM outliers: in each split, keep only rows whose CPM is strictly
# below that split's own 95th percentile.
# Note: re-running this cell trims the already-trimmed frames again.
train_cpm_cap = df_train['CPM'].quantile(0.95)
test_cpm_cap = df_test['CPM'].quantile(0.95)
df_train = df_train[df_train['CPM'] < train_cpm_cap]
df_test = df_test[df_test['CPM'] < test_cpm_cap]

In [None]:
# Training inputs (categorical + numeric feature columns) and training target.
X_train = df_train.loc[:, cat_columns + num_columns]
y_train = df_train.loc[:, y_feature]

In [None]:
# Test inputs (same feature columns as for training) and test target.
X_test = df_test.loc[:, cat_columns + num_columns]
y_test = df_test.loc[:, y_feature]

In [None]:
# Configure the CatBoost regressor: 400 iterations, a fixed random seed, and
# silent logging.
model = CatBoostRegressor(iterations=400, random_seed=42, logging_level='Silent')

# Train on the training split, telling CatBoost which columns are categorical.
model.fit(X_train, y_train, cat_features=cat_columns)

In [None]:
# Predict CPM for the test split and print the mean squared error against
# the true test targets.
test_predictions = model.predict(X_test)
print(mean_squared_error(y_test, test_predictions))