In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Path of the auction dataset inside the Kaggle input directory.
DATASET_PATH = '/kaggle/input/real-time-advertisers-auction/Dataset.csv'
data = pd.read_csv(DATASET_PATH)

### Data exploration

In [None]:
# Preview the first rows of the freshly loaded frame.
data.head()

In [None]:
# Per-column count of missing values; the author observed no NaNs in this dataset.
data.isnull().sum()

In [None]:
# Draw a seaborn pair plot of `data` to eyeball pairwise relationships between columns.
sns.pairplot(data)

In [None]:
# List the column names available in the frame.
data.columns

`integration_type_id` and `revenue_share_percent` each have only one unique value, so they carry no information and we should drop them.


In [None]:
# Columns to be removed from the frame later on; more names are appended further down.
drop_cols = [
    'integration_type_id',
    'revenue_share_percent',
]

In [None]:
# Build the regression target `cpm` from revenue and measurable impressions.
# Rows with no measurable impressions get a CPM of 0 instead of a division result.
impressions = data['measurable_impressions']
cpm_per_row = data['total_revenue'] * 100 / impressions * 1000
data['cpm'] = np.where(impressions > 0, cpm_per_row, 0)

In [None]:
# Preview the frame again, now including the derived `cpm` column.
data.head()

In [None]:
# Remove the columns collected in `drop_cols` together with the two raw
# columns that `cpm` was computed from, so they cannot be used as features.
drop_cols.extend(['measurable_impressions', 'total_revenue'])
data.drop(labels=drop_cols, axis='columns', inplace=True)

In [None]:
# Preview the frame after the column drop.
data.head()

### Data Preparation

In [None]:
# Continue exploring the data: print the frame summary (columns, dtypes, non-null counts).
data.info()

In [None]:
# Treat every column with at most CAT_THRESHOLD distinct values as categorical.
CAT_THRESHOLD = 255
cat_features = {
    name
    for name in data.columns
    if data[name].nunique() <= CAT_THRESHOLD
}
print(cat_features)

In [None]:
# The raw date is not a categorical feature; calendar parts are derived from it instead.
# `discard` is used rather than `remove` so the cell still runs when `date` has more
# than CAT_THRESHOLD distinct values and was therefore never added to the set.
cat_features.discard('date')

In [None]:
# Parse the date column, then copy selected calendar attributes into their own columns.
data['date'] = pd.to_datetime(data['date'])
date_accessor = data['date'].dt
for part in ('day', 'month', 'dayofweek', 'dayofyear'):
    data[part] = getattr(date_accessor, part)

In [None]:
# Register some of the derived calendar columns as categorical features
# (`dayofyear` is deliberately not in this set).
cat_features.update({'dayofweek', 'month', 'day'})

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
from scipy import sparse

import datetime

In [None]:
# Keep an untouched copy in `data_raw`, then keep only rows whose CPM is strictly
# below the 95th percentile.
# NOTE(review): the percentile is computed before the train/test split below, and
# re-running this cell trims the already-trimmed frame again.
cpm_threshold = data['cpm'].quantile(0.95)
data_raw = data.copy()
data = data.loc[data['cpm'] < cpm_threshold]

In [None]:
# Chronological split: rows dated up to and including the cut-off go to train,
# later rows go to test.
date_th = pd.Timestamp('2019-06-21')
train = data.loc[data['date'] <= date_th]
test = data.loc[data['date'] > date_th]

In [None]:
# Separate the target from each split; the `cpm` column is removed from the
# feature frames in place so it cannot be picked up as a feature below.
y_train = train['cpm']
del train['cpm']
y_test = test['cpm']
del test['cpm']

In [None]:
# One-hot encode the categorical columns. The encoder is fitted on the training
# split only; categories that appear only in test are ignored (all-zero row block).
# `cat_features` is a set: indexing a DataFrame with a set is rejected by recent
# pandas versions, and set iteration order is not stable between kernels, so a
# sorted list is used to get a valid indexer and a reproducible column order.
cat_columns = sorted(cat_features)
ohe = OneHotEncoder(handle_unknown='ignore')
X_cat_train = ohe.fit_transform(train[cat_columns])
X_cat_test = ohe.transform(test[cat_columns])

In [None]:
# Let's take a look at the remaining features: the sorted names of the
# training columns that are not in `cat_features`
np.setdiff1d(train.columns.values, list(cat_features))

In [None]:
# Non-categorical training columns, minus the raw date, form the numeric feature list.
non_categorical = np.setdiff1d(train.columns.values, list(cat_features))
useful_features = non_categorical.tolist()
useful_features.remove('date')

In [None]:
# Display the final list of non-categorical feature names.
useful_features

In [None]:
# Slice the non-categorical feature columns out of both splits.
X_int_train, X_int_test = (
    split[useful_features] for split in (train, test)
)

In [None]:
# Concatenate the one-hot block and the remaining feature columns side by side
# into one sparse design matrix per split.
X_train, X_test = (
    sparse.hstack([cat_block, int_block])
    for cat_block, int_block in (
        (X_cat_train, X_int_train),
        (X_cat_test, X_int_test),
    )
)

In [None]:
# Hyper-parameters for the gradient-boosted regressor, collected in one place.
xgb_params = {
    'objective': 'reg:squarederror',
    'n_estimators': 850,  # number of estimators the author found to work best
    'verbosity': 1,
    'reg_alpha': 0.23,
    'reg_lambda': 0.1,
    'n_jobs': 6,
    'max_depth': 9,
    'eta': 0.25,
    'colsample_bytree': 0.7,
}
xgb_model = XGBRegressor(**xgb_params)

In [None]:
# Train on the training matrix; the test split is passed as the evaluation set
# for the fit.
holdout_eval = [(X_test, y_test)]
xgb_model.fit(X_train, y_train, eval_set=holdout_eval)

In [None]:
# Mean squared error of the fitted model on the training split.
y_pred_train = xgb_model.predict(X_train)
train_mse = mean_squared_error(y_train, y_pred_train)
print(f'Train MSE: {train_mse}')

In [None]:
# Mean squared error of the fitted model on the held-out test split.
y_pred_test = xgb_model.predict(X_test)
test_mse = mean_squared_error(y_test, y_pred_test)
print(f'Test MSE: {test_mse}')