In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.pipeline import Pipeline, make_pipeline, FeatureUnion
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from xgboost import XGBRegressor
import seaborn as sns
import matplotlib.pyplot as plt


About this file
The dataset contains data for several websites owned by the same company. The company is asking for your help in deciding how to approach setting reserve prices, and what range of reserve prices it should set for July. The data covers only the actual revenue generated; it is not at bid level. The dataset has the following columns:

1. date
1. site\_id : each id denotes a different website
1. ad\_type\_id : each id denotes a different ad_type. These can be display ads , video ads, text ads etc
1. geo\_id : each id denotes a different country. our maximum traffic is from english speaking countries
1. device\_category\_id : each id denotes a different device_category, such as desktop, mobile or tablet
1. advertiser\_id: each id denotes a different bidder in the auction
1. order\_id : can be ignored
1. line\_item\_type\_id : can be ignored
1. os\_id : each id denotes a different operating system, for the mobile device category only (android, ios, etc.). For all other device categories, os\_id will correspond to not_mobile
1. integration\_type\_id : it describes how the demand partner is setup within a publisher's ecosystem - can be adserver (running through the publisher adserver) or hardcoded
1. monetization\_channel\_id : it describes the mode through which demand partner integrates with a particular publisher - it can be header bidding (running via prebid.js), dynamic allocation, exchange bidding, direct etc
1. ad\_unit\_id - each id denotes a different ad unit (one page can have more than one ad units)
1. total\_impressions - measurement column measuring the impressions for the particular set of dimensions
1. total\_revenue - measurement column measuring the revenue for the particular set of dimensions
1. viewable\_impressions - Number of impressions on the site that were viewable out of all measurable impressions. A display ad is counted as viewable if at least 50% of its area was displayed on screen for at least one second
1. measurable\_impressions - Impressions that were measurable by Active View out of the total number of eligible impressions. This value should generally be close to 100%. For example, an impression that is rendering in a cross-domain iframe may not be measurable.
1. revenue\_share\_percent - not every advertiser gives all the revenue to the publisher. They charge a certain share for the services they provide. This captures the fraction of revenue that will actually reach the publisher's pocket.

In [None]:
# Load the raw auction dataset from the Kaggle input directory.
csv_path = "/kaggle/input/real-time-advertisers-auction/Dataset.csv"
df = pd.read_csv(csv_path)

In [None]:
# The dataset description marks these two columns as ignorable, so remove them.
ignored_columns = ['order_id', 'line_item_type_id']
df = df.drop(columns=ignored_columns)

In [None]:
# Move the current index into a regular 'index' column (a later cell drops it),
# then print the column dtypes and non-null counts.
df = df.reset_index()
df.info()

Calculate `CPM`

In [None]:
def weird_division(n, d):
    """Return ``n / d``, or ``0`` when the divisor ``d`` is falsy (e.g. zero)."""
    if not d:
        return 0
    return n / d

# CPM per row: (total_revenue * 100) / measurable_impressions * 1000.
# Rows with zero measurable impressions get a CPM of 0 instead of dividing by zero.
# The expression is evaluated column-wise; a row-wise ``df.apply(..., axis=1)`` makes one
# Python call per row and is much slower than the equivalent vectorised form.
_impressions = df['measurable_impressions']
_measurable = _impressions != 0
# Zero divisors are masked to NaN for the division, then filled with 0 afterwards.
_cpm = (df['total_revenue'] * 100) / _impressions.where(_measurable) * 1000
df['CPM'] = _cpm.where(_measurable, 0.0)

In [None]:
# Drop total_revenue now that CPM has been derived from it.
df = df.drop(columns=['total_revenue'])

Keep only the rows whose CPM is non-negative and below the 95th percentile

In [None]:
# Keep rows with 0 <= CPM < 95th percentile of CPM.
percetile_95 = df['CPM'].quantile(0.95)
tt = df['CPM'].ge(0.0) & df['CPM'].lt(percetile_95)
# reset_index() without drop=True turns the previous index into a column.
df = df.loc[tt].reset_index()

In [None]:
# Remove the helper columns left behind by the two earlier reset_index() calls.
df = df.drop(columns=['level_0', 'index'])

In [None]:
# Parse the date column once; it feeds both the calendar features and the split mask.
s = pd.to_datetime(df['date'])
date_split = pd.Timestamp("2019-06-21")
print(date_split)

# Rows dated on or before the split timestamp go to the training portion.
mask_to_split = s.le(date_split)

# Add day-of-month and day-of-week columns, then drop the raw date column.
df = df.assign(day=s.dt.day, weekday=s.dt.weekday).drop(columns=['date'])

## Split data to train and test by 2019-06-21

In [None]:
# Boolean-mask selection: masked rows form the training frame, the remainder the test frame.
df_train = df.loc[mask_to_split]
df_test = df.loc[~mask_to_split]


In [None]:
# Columns holding identifier / calendar codes.
categories = ['site_id', 'ad_type_id', 'geo_id', 'device_category_id',
       'advertiser_id', 'os_id', 'integration_type_id',
       'monetization_channel_id', 'ad_unit_id', 'day', 'weekday']

# Measurement columns. 'total_revenue' is intentionally absent: an earlier cell drops it
# from the frame after deriving CPM, so listing it here would make df[features] raise KeyError.
numericals = ['total_impressions',
       'viewable_impressions', 'measurable_impressions',
       'revenue_share_percent']

# Every candidate input column, categorical first.
features = categories + numericals

In [None]:
# Per-column flag: True where the training frame contains at least one missing value.
df_train.isnull().any(axis=0)

In [None]:
# Print the number of distinct values of every categorical column in the training frame.
for col in categories:
    n_unique = len(df_train[col].unique())
    print(f"{col} uniques :{n_unique}")

Skip `integration_type_id`: it takes only one value.

In [None]:
# Name of the regression target column.
target_col = 'CPM'
# Columns left out of the feature matrix.
skip_cols = ['integration_type_id']

In [None]:
# Hold out 20% of the training-period rows for validation; random_state fixes the shuffle.
_non_feature_cols = skip_cols + [target_col]
X_train, X_val, y_train, y_val = train_test_split(
    df_train.drop(columns=_non_feature_cols),
    df_train[target_col],
    test_size=0.2,
    random_state=42,
)

In [None]:
# Histogram of log1p(CPM) for the training targets above the -10.0 cutoff.
tt = y_train.gt(-10.0)
log_cpm = np.log1p(y_train[tt])
_ = plt.hist(log_cpm, bins=30)

In [None]:
# Fit the min-max scaler on the training features only, then scale those same features.
minmax = MinMaxScaler().fit(X_train)
X_train_ = minmax.transform(X_train)

In [None]:
# Gradient-boosted regression trees trained with the Tweedie objective.
regr = XGBRegressor(objective='reg:tweedie',
                    n_estimators=300,
                    learning_rate=0.1,
                    max_depth=8,
                    # XGBoost's keyword is `eval_metric`; `metric` is not an XGBoost
                    # parameter. The Tweedie metric name must carry its variance
                    # power as an '@rho' suffix (1.5 is the library default).
                    eval_metric='tweedie-nloglik@1.5',
                    n_jobs=-1,
                   )
# Row filter on the training target; only targets above -10.0 are used for fitting.
tt = y_train > -10.0

regr.fit(X_train_[tt], (y_train[tt]))

In [None]:
# Scale the validation features with the scaler fitted on the training features,
# then predict the target for the validation rows.
X_val_ = minmax.transform(X_val)
y_pred = regr.predict(X_val_)

In [None]:
# Validation MSE, with the arguments in the documented (y_true, y_pred) order.
mean_squared_error(y_val, y_pred)

Validate on the main test data

In [None]:
# The scaler and the model were fitted on X_train, so the test matrix must expose exactly
# the same columns in the same order. Deriving the list from X_train keeps the two in
# sync instead of relying on a hand-maintained copy of the column names.
use_cols = list(X_train.columns)

# Scale the held-out (post-split-date) rows and predict their target.
X_test_ = minmax.transform(df_test[use_cols])
y_pred = regr.predict(X_test_)

In [None]:
# Test-set MSE, with the arguments in the documented (y_true, y_pred) order.
mean_squared_error(df_test['CPM'], y_pred)

## Results

The final MSE is `4469.8`