In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the raw auction dataset from the Kaggle input directory.
DATASET_PATH = '/kaggle/input/real-time-advertisers-auction/Dataset.csv'
df = pd.read_csv(DATASET_PATH)

In [None]:
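# Calculating CPM: the value that the advertisers bid for the month of June.
# CPM (the winning bid value) =
#   (((revenue of the publisher * 100) / revenue_share_percentage) / measurable_impressions) * 1000
# NOTE(review): the computation below does not divide by revenue_share_percentage -
# confirm this is intended (e.g. because the share is a constant 1).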

def weird_division(n, d):
    """Return ``n / d``, or ``0`` when the denominator ``d`` is falsy (e.g. zero).

    Used by the CPM computation so that rows whose denominator is zero yield 0
    instead of raising ``ZeroDivisionError``.
    """
    if not d:
        return 0
    return n / d

# Vectorised CPM: (total_revenue * 100 / measurable_impressions) * 1000.
# Rows with zero measurable impressions get a CPM of 0 - the same fallback
# weird_division applies to a zero denominator - while NaN inputs still
# propagate as NaN. Column-wise arithmetic avoids a slow row-by-row apply.
_impressions = df['measurable_impressions']
df['CPM'] = (
    (df['total_revenue'] * 100 / _impressions).where(_impressions != 0, 0) * 1000
)

In [None]:
# Clean the dataset a little: keep only rows with a non-negative CPM (rows
# whose CPM is NaN are dropped too, because NaN >= 0 is False), then drop
# total_revenue, since CPM was derived from it.
# Chaining returns a fresh frame instead of mutating a filtered selection in
# place, and errors='ignore' keeps the cell safe to re-run once the column
# has already been removed.
df = df.loc[df.CPM >= 0].drop(columns=['total_revenue'], errors='ignore')

### EDA:

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Print a concise summary of the cleaned frame (pandas DataFrame.info).
df.info()

In [None]:
# Parse the `date` column into pandas datetimes so it can be compared as dates.
df = df.assign(date=pd.to_datetime(df['date']))

In [None]:
# Display summary statistics for the frame (pandas DataFrame.describe).
df.describe()

In [None]:
# Pairwise plots of the first group of ID columns together with CPM.
pairplot_id_columns = [
    'site_id',
    'ad_type_id',
    'geo_id',
    'device_category_id',
    'advertiser_id',
    'order_id',
    'CPM',
]
sns.pairplot(data=df[pairplot_id_columns])

In [None]:
# Pairwise plots of the remaining ID columns and the impression counts together with CPM.
pairplot_impression_columns = [
    'line_item_type_id',
    'os_id',
    'monetization_channel_id',
    'ad_unit_id',
    'total_impressions',
    'viewable_impressions',
    'measurable_impressions',
    'CPM',
]
sns.pairplot(data=df[pairplot_impression_columns])

In [None]:
# Count distinct values per column; columns with a single value carry no information.
df.nunique()

In [None]:
# Drop revenue_share_percent and integration_type_id, as each has just one value.
# Reassigning (rather than inplace=True) with errors='ignore' makes the cell
# idempotent: re-running it after the columns are gone does not raise KeyError.
df = df.drop(columns=['revenue_share_percent', 'integration_type_id'], errors='ignore')

### Baseline - Random Forest Regression:

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

In [None]:
# Chronological train/test split on a single cutoff, so the two parts are
# always disjoint: a row is either strictly before the cutoff (train) or on/after
# it (test). Using two different boundary literals would let any value that
# falls between them land in both parts.
SPLIT_DATE = '2019-06-22'
train = df[df.date < SPLIT_DATE]
test = df[df.date >= SPLIT_DATE]

# Trim the top 5% of CPM values in each part to limit the influence of outliers.
# NOTE(review): the test threshold is computed from the test targets themselves -
# confirm this is the intended evaluation protocol.
train = train.loc[train['CPM'] < train['CPM'].quantile(0.95)]
test = test.loc[test['CPM'] < test['CPM'].quantile(0.95)]

In [None]:
# Columns fed to the models: the ID columns followed by the impression counts.
full_features_list = [
    'site_id',
    'ad_type_id',
    'geo_id',
    'device_category_id',
    'advertiser_id',
    'order_id',
    'line_item_type_id',
    'os_id',
    'monetization_channel_id',
    'ad_unit_id',
    'viewable_impressions',
    'measurable_impressions',
    'total_impressions',
]

In [None]:
# 3-fold cross-validated MSE on the training part. The scorer is
# 'neg_mean_squared_error', so the printed values are negated MSEs
# (closer to zero is better).
# A fixed random_state makes the forest, and therefore these scores and any
# later fit of rf_model, reproducible across runs.
rf_model = RandomForestRegressor(random_state=42)
rf_scores = cross_val_score(rf_model, train[full_features_list], train['CPM'],
                            cv=3, scoring='neg_mean_squared_error')
print(rf_scores)

In [None]:
# Fit the random forest on the whole training part, then score it on the test part.
rf_model.fit(train[full_features_list], train['CPM'])
rf_test_preds = rf_model.predict(test[full_features_list])

rf_test_mse = mean_squared_error(test['CPM'], rf_test_preds)
print('MSE on test = ', rf_test_mse)

### CatBoost Regression

In [None]:
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split

In [None]:
# Columns that CatBoost should treat as categorical (all of the ID columns).
cat_features_list = [
    'site_id',
    'ad_type_id',
    'geo_id',
    'device_category_id',
    'advertiser_id',
    'order_id',
    'line_item_type_id',
    'os_id',
    'monetization_channel_id',
    'ad_unit_id',
]

In [None]:
# Hold out a validation split from the training part and use log1p(CPM) as the
# target, so predictions must be mapped back with expm1 before scoring.
# A fixed random_state makes the split, and therefore the validation MSE,
# reproducible across runs.
X_train, x_val, y_train, y_val = train_test_split(
    train[full_features_list],
    np.log1p(train['CPM']),
    random_state=42,
)

In [None]:
# CatBoost regressor with the ID columns declared as categorical and a fixed seed.
model_cat = CatBoostRegressor(
    random_state=42,
    cat_features=cat_features_list,
)

In [None]:
# Train on the log-scale target, then predict the held-out validation rows.
model_cat.fit(X_train, y_train, verbose=200)
cat_preds_val = model_cat.predict(x_val)

# Undo the log1p transform on both the truth and the predictions before scoring.
val_mse = mean_squared_error(np.expm1(y_val), np.expm1(cat_preds_val))
print('MSE on validation = ', val_mse)

In [None]:
# Predict the test part (log scale) and score after mapping back with expm1.
cat_preds_test = model_cat.predict(test[full_features_list])

cat_test_mse = mean_squared_error(test['CPM'], np.expm1(cat_preds_test))
print('MSE on test with catboost = ', cat_test_mse)

In [None]:
# Side-by-side test MSE for both models; CatBoost predictions are mapped back
# from log space with expm1 before scoring.
test_predictions = (
    ('MSE on test with random forest = ', rf_test_preds),
    ('MSE on test with catboost = ', np.expm1(cat_preds_test)),
)
for label, preds in test_predictions:
    print(label, mean_squared_error(test['CPM'], preds))

The MSE on the test set is slightly better (lower) with the CatBoost regressor than with the random forest baseline.