In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found in it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## General

### Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from pandas.plotting import register_matplotlib_converters
import seaborn as sns
import datetime
import time
import os

In [None]:
# Register pandas' date/time converters with matplotlib.
register_matplotlib_converters()
# Use matplotlib's 'default' style sheet for every figure in the notebook.
plt.style.use('default')

In [None]:
# Limit how much of a DataFrame is rendered: at most 50 columns and 20 rows.
pd.options.display.max_columns = 50
pd.options.display.max_rows = 20

## Data load

### Dataset.csv

In [None]:
# Read the auction CSV, parsing the 'date' column as datetimes, and display the frame.
path = '/kaggle/input/real-time-advertisers-auction/Dataset.csv'
dataset = pd.read_csv(path, parse_dates=['date'])
dataset

In [None]:
# Print the frame's summary (columns, dtypes, non-null counts) as loaded.
dataset.info()

In [None]:
# Calculate CPM: the winning bid value that advertisers paid, for the month of June.
# CPM = (((publisher revenue * 100) / revenue_share_percentage)
#        / measurable_impressions) * 1000

def weird_division(n, d):
    """Divide ``n`` by ``d``, returning 0 instead of dividing when ``d`` is falsy.

    Args:
        n: Numerator.
        d: Denominator; any falsy value (such as 0) short-circuits to 0.

    Returns:
        ``n / d`` when ``d`` is truthy, otherwise the integer 0.
    """
    if not d:
        return 0
    return n / d

# CPM = total_revenue * 100 / measurable_impressions * 1000, computed column-wise
# instead of with a Python call per row. Rows with zero measurable impressions get
# a CPM of 0 rather than inf/NaN, matching the zero-denominator rule of weird_division.
# NOTE(review): the formula in the comment above also divides by
# revenue_share_percentage; this code does not -- confirm that is intended.
dataset['CPM'] = (
    (dataset['total_revenue'] * 100 / dataset['measurable_impressions'])
    .where(dataset['measurable_impressions'] != 0, 0)
    * 1000
)

In [None]:
# Remove the rows whose CPM is negative; `dataset` is modified in place.
mask = dataset['CPM'].lt(0)
dataset.drop(index=dataset.index[mask], inplace=True)

In [None]:
# Trim the upper tail: drop every row whose CPM is at or above the 95th percentile.
# The percentile is taken from the frame as it currently is, so running this cell
# again removes a further slice of rows.
cpm_cutoff = dataset['CPM'].quantile(q=0.95)
mask = dataset['CPM'].ge(cpm_cutoff)
dataset.drop(index=dataset.index[mask], inplace=True)

In [None]:
# Summary statistics of the CPM column.
dataset['CPM'].describe()

In [None]:
# CPM was computed from total_revenue, so the column is removed here
# (presumably so the target cannot leak into the features -- confirm).
dataset.drop(columns='total_revenue', inplace=True)

In [None]:
# Print the frame's summary again to check its columns at this point.
dataset.info()

### train_test_split

In [None]:
from sklearn.model_selection import train_test_split
from datetime import datetime

In [None]:
# Cut-off date for the chronological split: rows dated on or before it are used for
# training. (Despite its name, `train_data` holds a date, not data.)
train_data = datetime(2019, 6, 21)
# Name of the column to predict.
target_col = 'CPM'

In [None]:
# Chronological split: rows dated on or before the cut-off go to train, later rows
# to test. The target column is popped out of each part into its own Series.
mask = dataset['date'].le(train_data)
train, test = dataset.loc[mask], dataset.loc[~mask]
target, target_test = train.pop(target_col), test.pop(target_col)
train.shape, target.shape, test.shape, target_test.shape

## EDA

### Analysis

In [None]:
# Distribution of the training target: 50-bin histogram on an axes without a frame.
ax = target.hist(bins=50)
ax.set_title(target.name)
ax.set_frame_on(False)
plt.show()

In [None]:
# One 50-bin histogram per training column, each shown as its own frameless figure.
for col in train.columns:
    ax = train[col].hist(bins=50)
    ax.set_title(col)
    ax.set_frame_on(False)
    plt.show()

In [None]:
# Columns that are left out when the feature list is built from train.columns.
exluded_cols = ['date', 'integration_type_id', 'revenue_share_percent']

### Features

In [None]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

In [None]:
# Select the feature columns in the frame's own column order. A set difference has no
# guaranteed order, so the layout of the feature matrix could change from one kernel
# session to the next and make the seeded model results irreproducible.
cols = [name for name in train.columns if name not in exluded_cols]
# Standardise every selected column; columns not listed are dropped by the transformer.
features_pipe = ColumnTransformer([
    ('numeric', StandardScaler(), cols),
])

In [None]:
# Fit the feature transformer on the training frame and transform it in one step.
x_train = features_pipe.fit_transform(train)
x_train.shape

In [None]:
# Apply the already-fitted transformer to the test frame (transform only, no re-fit).
x_test = features_pipe.transform(test)
x_test.shape

## Modeling

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, ShuffleSplit
from sklearn.dummy import DummyRegressor
from scipy.stats import uniform
from sklearn.metrics import make_scorer

In [None]:
def show_search_results(search):
    """Summarise a fitted parameter search as a table ordered by rank.

    Args:
        search: Object exposing a ``cv_results_`` mapping that contains the keys
            'rank_test_score', 'params', 'mean_test_score' and 'std_test_score'.

    Returns:
        DataFrame indexed by 'rank_test_score' (sorted ascending) with the columns
        'params', 'mean_test_score' and 'std_test_score'.
    """
    results = search.cv_results_
    summary = pd.DataFrame({
        'rank_test_score': results['rank_test_score'],
        'params': results['params'],
        'mean_test_score': results['mean_test_score'],
        'std_test_score': results['std_test_score'],
    })
    return summary.set_index('rank_test_score').sort_index()
# show_search_results(search)

### Constant benchmark

#### Numerical search of optimal value

In [None]:
# Brute-force constant baseline: for every integer from 0 up to (but excluding)
# int(target.max()), predict that constant everywhere and record [train MSE, test MSE].
scores = []
for i in range(int(target.max())):
    score = [
        mean_squared_error(y_true, np.full(len(y_true), i))
        for y_true in (target, target_test)
    ]
    scores.append(score)
# scores

In [None]:
# Tabulate the per-constant errors (row label = constant) and show the rows with the
# five lowest train MSE; ties at the cut-off are all kept.
df = pd.DataFrame(data=scores, columns=['train', 'test'])
df.nsmallest(n=5, columns='train', keep='all')

In [None]:
# Train and test MSE for each candidate constant, drawn with a grid and no frame.
ax = df.plot()
ax.grid()
ax.set_frame_on(False)

#### GridSearchCV DummyRegressor

In [None]:
# Candidate constants for the grid search: every integer in [0, int(target.max())).
search_params = dict(constant=range(int(target.max())))

In [None]:
# Re-derive the training features so this cell does not depend on earlier cell state.
x_train = features_pipe.fit_transform(train)
# A single split that holds out 99.999% of the rows: a constant predictor ignores the
# rows it is fitted on, so each candidate is effectively scored on almost the whole
# training set. The split is seeded so the reported scores are reproducible and the
# splitter matches the one used for the randomized search.
ss1 = ShuffleSplit(n_splits=1, test_size=0.99999, random_state=42)
search = GridSearchCV(DummyRegressor(strategy='constant'), search_params,
                      scoring='neg_mean_squared_error',
                      cv=ss1, n_jobs=-1)
search.fit(x_train, target)

In [None]:
# Five best-ranked candidates of the search fitted above.
show_search_results(search).head(5)

#### RandomizedSearchCV DummyRegressor

In [None]:
# Single-step pipeline around a constant-prediction regressor; the step is named
# 'dummy' so its parameters are addressed as 'dummy__<param>'.
const_pipe = Pipeline(steps=[('dummy', DummyRegressor(strategy='constant'))])

In [None]:
# Sample the constant from a continuous uniform distribution on [0, int(target.max())].
search_params = dict(dummy__constant=uniform(loc=0, scale=int(target.max())))

In [None]:
# Seeded single split that holds out 99.999% of the rows for scoring.
ss1 = ShuffleSplit(n_splits=1, test_size=0.99999, random_state=42)
# Draw 100 candidate constants and rank them by negated MSE on the held-out rows.
search = RandomizedSearchCV(
    estimator=const_pipe,
    param_distributions=search_params,
    n_iter=100,
    scoring='neg_mean_squared_error',
    cv=ss1,
    n_jobs=-1,
    random_state=42,
)
search.fit(x_train, target)

In [None]:
# Five best-ranked candidates of the search fitted above.
show_search_results(search).head(5)

### XGBRegressor Fit

In [None]:
from xgboost import XGBRegressor

In [None]:
# Scorer wrapping mean_squared_error. With greater_is_better=False scikit-learn
# sign-flips the metric, so calling this scorer yields the negated MSE.
mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)

In [None]:
# Gradient-boosted regressor with fixed hyper-parameters and a fixed seed.
model = XGBRegressor(n_estimators=177, learning_rate=0.17, max_depth=9,
                     n_jobs=-1, objective='reg:squarederror', random_state=42)
model.fit(x_train, target,
    verbose=False,
)
# mse_scorer was created with greater_is_better=False, so it returns the negated MSE.
# `score` keeps that raw scorer value; the sign is flipped back for printing so the
# number shown matches its "MSE" label.
score = mse_scorer(model, x_test, target_test)
print('MSE score:', -score)