In [None]:
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os
import re
import pickle

from IPython.display import display
from collections import Counter
from functools import partial

import warnings
warnings.filterwarnings(action='ignore', category=DeprecationWarning)
warnings.filterwarnings(action='ignore', category=FutureWarning)
from pprint import pprint

In [None]:
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold, KFold
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from lightgbm import LGBMClassifier, LGBMRegressor

### reading data

In [None]:
# Mount Google Drive only when running on Google Colab. Outside Colab the
# `google.colab` package is not importable, and an unguarded import would abort
# a top-to-bottom run of the notebook.
try:
    from google.colab import drive
except ImportError:
    # Not on Colab: leave the data location to be configured elsewhere.
    pass
else:
    drive.mount('/content/gdrive/')

    # Folder on the mounted drive that holds the dataset files.
    path_to_files = '/content/gdrive/My Drive/_made/auctions/'

In [None]:
import os

# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(root, name) for name in names):
        print(full_path)

In [None]:
# Kaggle location of the dataset files.
kaggle_input_dir = '/kaggle/input/real-time-advertisers-auction/'

# Switch to the Kaggle path only when it exists, or when no data path has been
# configured yet, so that an already configured path is not overwritten with a
# directory that is absent in the current environment.
if os.path.isdir(kaggle_input_dir) or 'path_to_files' not in globals():
    path_to_files = kaggle_input_dir

### CPM calculation

In [None]:
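# Calculating CPM: the winning bid value that advertisers paid (the data covers June).
# Reference formula:
#   CPM = (((publisher revenue * 100) / revenue_share_percentage) / measurable_impressions) * 1000
# NOTE: the code in this notebook omits the division by revenue_share_percentage -
# presumably because that column is constant in this dataset; TODO confirm.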

def weird_division(n, d):
    """Divide ``n`` by ``d``, returning 0 instead of dividing when ``d`` is falsy.

    Args:
        n: numerator.
        d: denominator; any falsy value (e.g. 0, 0.0 or None) short-circuits to 0.

    Returns:
        ``n / d`` for a truthy ``d``, otherwise the integer 0.
    """
    if not d:
        return 0
    return n / d

# all_data['CPM'] = all_data.apply(lambda x: weird_division( x['total_revenue'] * 100 , x['measurable_impressions']) * 1000 , axis = 1)

### winsorization + train-test time split

В качестве train необходимо использовать данные до 21.06.2019 включительно, для test - все оставшиеся (важно: из test исключаются отрицательные значения, а также обрезаются значения справа по 95 перцентилю)

- не совсем однозначное условие, предполагаем, что квантиль берётся после исключения отрицательных значений и только по распределению на test-выборке

In [None]:
all_data = pd.read_csv(path_to_files + 'Dataset.csv')


def _row_cpm(row):
    """CPM of one row: total_revenue * 100 / measurable_impressions * 1000 (0 when impressions are falsy)."""
    return weird_division(row['total_revenue'] * 100, row['measurable_impressions']) * 1000


all_data['CPM'] = all_data.apply(_row_cpm, axis = 1)

# Sanity check: how many CPM values are negative, zero and positive.
cpm = all_data['CPM']
(cpm < 0).sum(), (cpm == 0).sum(), (cpm > 0).sum()

#### вроде бы такой вариант сплита и винзоризации самый адекватный

In [None]:
# Reload the raw data so that this cell can be re-run on its own.
all_data = pd.read_csv(path_to_files + 'Dataset.csv')
all_data['CPM'] = all_data.apply(
    lambda row: weird_division(row['total_revenue'] * 100, row['measurable_impressions']) * 1000,
    axis = 1,
)

# Keep only rows with a non-negative CPM.
all_data = all_data[all_data['CPM'] >= 0]

# Time split: everything before 2019-06-22 is train, the rest is test.
train_data = all_data[all_data['date'] < '2019-06-22']
test_data = all_data[all_data['date'] >= '2019-06-22']

In [None]:
 ' '.join(f"{np.quantile(frame['CPM'], q = 0.95):.4f}" for frame in (all_data, train_data, test_data))

In [None]:
# Inspect the upper tail of CPM: quantiles on a 0.01-step grid of levels,
# keeping the entries from index 90 onwards.
np.quantile(all_data['CPM'], q = np.arange(0,1.01,0.01))[90:]

In [None]:
# The cut-off is the 95th percentile of CPM on the test split; it is computed
# once (before either frame is filtered) and applied to both splits.
cpm_cap = np.quantile(test_data['CPM'], q = 0.95)

train_data = train_data.loc[train_data['CPM'] < cpm_cap].reset_index(drop = True)
test_data = test_data.loc[test_data['CPM'] < cpm_cap].reset_index(drop = True)

### target distribution

In [None]:
fig, ax = plt.subplots(figsize = (10, 6))

# Overlay the CPM distributions of both splits: train in magenta, test in green.
for cpm_values, colour in ((train_data['CPM'], 'magenta'), (test_data['CPM'], 'forestgreen')):
    sns.distplot(cpm_values, color = colour, ax = ax)
ax.grid(True)

#### логнормальненько так, хотя даже и не так (см. ниже) - что-то zero-inflated, но непрерывно, а не Poisson

In [None]:
# Offset added before taking the logarithm so that zero CPM values stay finite.
EPS = 1

fig, ax = plt.subplots(figsize = (10, 6))

# log(CPM + EPS) for both splits: train in magenta, test in green.
for frame, colour in ((train_data, 'magenta'), (test_data, 'forestgreen')):
    sns.distplot(np.log(frame['CPM'] + EPS), color = colour, ax = ax)
ax.grid(True)

In [None]:
EPS = 1  # not used by the plots in this cell

fig, ax = plt.subplots(figsize = (10, 6))

# log(CPM) restricted to strictly positive CPM values, for both splits.
for frame, colour in ((train_data, 'magenta'), (test_data, 'forestgreen')):
    positive_cpm = frame['CPM'][frame['CPM'] > 0]
    sns.distplot(np.log(positive_cpm), color = colour, ax = ax)
ax.grid(True)

### features distribution

In [None]:
# Columns excluded from the feature set (the target itself and the listed columns).
drops = ['CPM', 'date', 'total_revenue', 'integration_type_id', 'revenue_share_percent']
features = [column for column in train_data.columns if column not in drops]

# Features whose name ends with '_id' are treated as categorical, the rest as numeric.
cat_features = list(filter(lambda name: name.endswith('_id'), features))
num_features = [name for name in features if not name.endswith('_id')]

### определим неконсистентные с обучающей выборкой фичи на тестовой выборке

In [None]:
from tqdm.notebook import tqdm
# Maps a categorical feature name to the set of its category values that are
# missing from at least one side of the outer join of train and test stats.
cat_unions = {}
for f in tqdm(cat_features):
    # Per-category mean and count of CPM, computed separately on each split.
    train_gb = train_data.groupby(f)[['CPM']].agg(['mean', 'count'])
    test_gb = test_data.groupby(f)[['CPM']].agg(['mean', 'count'])
    # Outer join on the category value: a category absent from one split
    # produces NaN in that split's columns.
    inconsistent_categories = pd.merge(train_gb, test_gb, left_index = True, right_index = True, how = 'outer')
    if inconsistent_categories.isna().sum(axis = 0).sum() > 0:
        # Collect every category that has at least one NaN in its joined row.
        cat_unions[f] = set(inconsistent_categories.index[inconsistent_categories.isna().sum(axis = 1) > 0])
        # Report how many train rows fall into the collected categories.
        print(f'{len(train_data[train_data[f].isin(cat_unions[f])])} rows in train have categories in {f} which are not in the test')

In [None]:
# Remove train rows whose category value was collected in `cat_unions`.
for f in tqdm(cat_features):
    drop_criteria = cat_unions.get(f)
    if not drop_criteria:
        # Nothing was collected for this feature.
        continue
    rows_to_drop = train_data[f].isin(drop_criteria)
    print(f' dropping {rows_to_drop.sum()} rows from train cause of {f}')
    train_data = train_data[~rows_to_drop]
    print(f'{len(train_data)} rows left')

### feature exploration

In [None]:
# One scatter plot per feature: the feature on the x axis against CPM on the y axis (train split).
for f in tqdm(features):
    fig, ax = plt.subplots(figsize = (8, 8))
    _ = sns.scatterplot(x = f, y = 'CPM', data = train_data, ax = ax)

In [None]:
# For every feature: cast categorical ids to int on both splits (this modifies
# train_data and test_data), then overlay the train and test distributions.
for f in tqdm(features):
    if f in cat_features:
        train_data[f], test_data[f] = train_data[f].astype(int), test_data[f].astype(int)
    fig, ax = plt.subplots(figsize = (10, 6))
    for frame, colour in ((train_data, 'mediumorchid'), (test_data, 'gold')):
        sns.distplot(frame[f], color = colour, ax = ax)
    
#### что это такое??!

#### очень хвостато всё

### feature generation


в итоге позволяет лишь сильнее переобучиться под train - не используем, но для истории оставим

In [None]:
# from itertools import combinations
# from copy import deepcopy

# train_data['weekday'] = pd.to_datetime(train_data['date']).dt.weekday
# test_data['weekday'] = pd.to_datetime(test_data['date']).dt.weekday

# cat_features += ['weekday']

# exclude_features = []

# CAT_UPPER_BOUND = 200

# for c in tqdm(cat_features):
#     if len(train_data[c].unique()) > CAT_UPPER_BOUND:
#         exclude_features.append(c)
#     train_data[c] = train_data[c].astype(str)
#     test_data[c] = test_data[c].astype(str)
    
# cross_cat_features = []
# for c1, c2 in tqdm(list(combinations(cat_features, 2))):
#     if c1 != c2:
#         c = f'{c1}_{c2}'
#         train_data[c] = train_data[c1].values + '_' + train_data[c2].values
#         test_data[c] = test_data[c1].values + '_' + test_data[c2].values
#         if len(train_data[c].unique()) > CAT_UPPER_BOUND:
#             exclude_features.append(c)
#         cross_cat_features.append(c)

In [None]:
# from sklearn.preprocessing import LabelEncoder
# cat_encoders = {}
# new_cat_value = 'NAN'

# cat_features = [f for f in cat_features if f not in exclude_features] 
# cat_features_init = deepcopy(cat_features)
# cat_features += [f for f in cross_cat_features if f not in exclude_features] 

# for c in tqdm(cat_features):
#     encoder = LabelEncoder()
#     encoder.fit(train_data[c].fillna(new_cat_value))
#     le = dict(zip(encoder.classes_, encoder.transform(encoder.classes_)))
#     if new_cat_value not in le:
#         le[new_cat_value] = -1
#     cat_encoders[c] = le
#     train_data[c] = train_data[c].fillna(new_cat_value).apply(lambda x: x if x in le else new_cat_value).replace(le)
#     test_data[c] = test_data[c].fillna(new_cat_value).apply(lambda x: x if x in le else new_cat_value).replace(le)

# from sklearn.preprocessing import PolynomialFeatures
# pf = PolynomialFeatures(2)
# train_pf = pd.DataFrame(pf.fit_transform(train_data[num_features]))
# test_pf = pd.DataFrame(pf.transform(test_data[num_features]))
# poly_features = [f'pf_{i}' for i in range(train_pf.shape[1])]
# train_pf.columns = poly_features
# test_pf.columns = poly_features

# train_data = pd.concat([train_data, train_pf], axis = 1)
# test_data = pd.concat([test_data, test_pf], axis = 1)

In [None]:
# with open(path_to_files + 'poly_features.pkl', 'wb') as f:
#     pickle.dump(poly_features, f)
# with open(path_to_files + 'cat_features_init.pkl', 'wb') as f:
#     pickle.dump(cat_features_init, f)
# with open(path_to_files + 'cat_features.pkl', 'wb') as f:
#     pickle.dump(cat_features, f)
# with open(path_to_files + 'cross_cat_features.pkl', 'wb') as f:
#     pickle.dump(cross_cat_features, f)
# with open(path_to_files + 'num_features.pkl', 'wb') as f:
#     pickle.dump(num_features, f)
# with open(path_to_files + 'cat_encoders.pkl', 'wb') as f:
#     pickle.dump(cat_encoders, f)

In [None]:
# train_data.to_csv(path_to_files + 'train.csv', index = False)
# test_data.to_csv(path_to_files + 'test.csv', index = False)

### metrics & optimizations

In [None]:
from sklearn.metrics import make_scorer
import scipy.optimize

def mean_absolute_percentage_error(y_true, y_pred, eps = np.finfo(np.float64).eps):
    """Mean absolute percentage error, expressed in percent.

    The denominator is ``max(|y_true|, eps)`` so that targets equal to zero do
    not produce a division by zero (which would turn the mean into inf or NaN).
    For non-zero targets the result equals ``mean(|(y_true - y_pred) / y_true|) * 100``.

    Args:
        y_true: array-like of true target values.
        y_pred: array-like of predictions, same shape as ``y_true``.
        eps: lower bound applied to ``|y_true|`` in the denominator.

    Returns:
        The error as a float, in percent.
    """
    # Convert up front so that plain lists are accepted as well as arrays/Series.
    y_true = np.asarray(y_true, dtype = float)
    y_pred = np.asarray(y_pred, dtype = float)
    denominator = np.maximum(np.abs(y_true), eps)
    return np.mean(np.abs(y_true - y_pred) / denominator) * 100

def mean_absolute_error(y_true, y_pred):
    """Return the mean of the absolute differences between ``y_true`` and ``y_pred``."""
    residuals = y_true - y_pred
    absolute_errors = np.abs(residuals)
    return np.mean(absolute_errors)

def mean_squared_error(y_true, y_pred):
    """Return the mean of the squared differences between ``y_true`` and ``y_pred``."""
    residuals = y_true - y_pred
    return np.mean(np.square(residuals))

def get_importances(lgb):
    """Build a table of gain-based feature importances, most important first.

    Args:
        lgb: fitted model exposing ``booster_`` with ``feature_name()`` and
            ``feature_importance(kind)``.

    Returns:
        DataFrame with columns ``feature`` and ``importance``, sorted by
        ``importance`` in descending order.
    """
    names = lgb.booster_.feature_name()
    gains = lgb.booster_.feature_importance('gain')
    table = pd.DataFrame({'feature': names, 'importance': gains})
    return table.sort_values(by = 'importance', ascending = False)

def calc_optimal_shift(y_true, y_predicted):
    """Find the additive shift of the predictions that minimises the MSE.

    The search is a Nelder-Mead minimisation that starts from the mean residual
    ``(y_true - y_predicted).mean(axis=0)``.

    Args:
        y_true: true target values; must support subtraction with
            ``y_predicted`` and ``.mean(axis=0)`` on the result.
        y_predicted: predictions aligned with ``y_true``.

    Returns:
        ``res.x`` of the optimiser result, i.e. the optimal shift as an array.
    """
    start_point = (y_true - y_predicted).mean(axis = 0)

    def f(x):
        return mean_squared_error(y_true, y_predicted + x)

    # `xatol` is the documented Nelder-Mead tolerance on the parameters;
    # the previously used `xtol` is a deprecated alias of it.
    res = scipy.optimize.minimize(f, start_point, method='nelder-mead', options={'xatol': 1e-4, 'disp': True})

    return res.x

def calc_optimal_alpha(y_true, y_predicted):
    """Find the multiplicative factor of the predictions that minimises the MSE.

    The search is a Nelder-Mead minimisation that starts from a factor of 1.

    Args:
        y_true: true target values.
        y_predicted: predictions aligned with ``y_true``; must support
            multiplication by the candidate factor.

    Returns:
        ``res.x`` of the optimiser result, i.e. the optimal factor as an array.
    """
    alpha = 1

    def f(alpha):
        return mean_squared_error(y_true, alpha * y_predicted)

    # `xatol` is the documented Nelder-Mead tolerance on the parameters;
    # the previously used `xtol` is a deprecated alias of it.
    res = scipy.optimize.minimize(f, alpha, method='nelder-mead', options={'xatol': 1e-4, 'disp': True})

    return res.x

# Wrap each metric as a scorer; greater_is_better=False marks them as losses.
mape_scorer, mae_scorer, mse_scorer = (
    make_scorer(metric, greater_is_better = False)
    for metric in (mean_absolute_percentage_error, mean_absolute_error, mean_squared_error)
)

### grid settings

In [None]:
# Fixed baseline hyper-parameters.
lgb_params = {'max_depth': 3, 'num_leaves': 16, 'n_estimators': 800, 'min_child_samples': 100}

# Candidate values sampled by the randomized hyper-parameter search.
lgb_grid = {
    'max_depth': np.arange(2, 10),
    'num_leaves': 2**np.arange(3, 10),
    'reg_alpha': 10.0**np.arange(-2, 3),
    'reg_lambda': 10.0**np.arange(-2, 3),
    # linspace instead of arange: with a float step, np.arange(0.7, 1.1, 0.1)
    # produces a fifth value (~1.1) because of rounding, and a fraction above
    # 1.0 is not a valid feature_fraction.
    'feature_fraction': np.linspace(0.7, 1.0, 4),
    'n_estimators': np.linspace(50, 1000, 100, dtype = int),
    'min_child_samples': np.linspace(50, 1000, 100, dtype = int),
}

### direct mse optimization through stochastic grid search





In [None]:
# Feature matrices and CPM targets: train split for fitting, test split for validation.
target = 'CPM'
X_train, X_val = train_data[features], test_data[features]
y_train, y_val = train_data[target], test_data[target]

# X_train, y_train = train_data[cat_features_init + num_features + poly_features], train_data['CPM']
# X_val, y_val = test_data[cat_features_init + num_features + poly_features], test_data['CPM']

In [None]:
%%time
# Base regressor optimised for MSE; the search below only scores candidates (refit=False).
lgb = LGBMRegressor(random_state = 8, metric = None, n_jobs = -1, objective = 'mse')

# KFold without shuffling: `random_state` has no effect when shuffle is False,
# and current scikit-learn raises a ValueError for that combination, so it is
# dropped here. The folds are the same contiguous, unshuffled halves as before.
grid = RandomizedSearchCV(lgb, param_distributions = lgb_grid, cv = KFold(n_splits = 2),
                          n_jobs= -1, refit = False,
                          n_iter = 10, scoring = mse_scorer, verbose = True, random_state = 8)

# NOTE(review): newer LightGBM releases reportedly replaced the
# `early_stopping_rounds` fit argument with callbacks - confirm against the
# installed version.
grid.fit(X_train, y_train,
         eval_metric = 'mse', early_stopping_rounds = 50, eval_set = (X_val, y_val),
         **{'categorical_feature': cat_features}
         )

grid.best_params_, grid.best_score_

In [None]:
best_params = grid.best_params_
pprint(best_params)

# Reuse the searched estimator object with the best parameters, then override
# the number of trees (early stopping decides the actual count) and the objective.
final_params = dict(best_params)
final_params.update({'n_estimators': 5000, 'objective': 'mse'})
lgb_best = lgb.set_params(**final_params)

lgb_best.fit(X_train, y_train,
             eval_set = (X_val, y_val), eval_metric = 'mse', early_stopping_rounds = 50,
             categorical_feature = cat_features)

### посмотрим на базовые значимости

In [None]:
# Bar chart of the gain importances, labelled with the feature names.
imps = get_importances(lgb_best)
fig, ax = plt.subplots(figsize = (20, 6))
imps[['importance']].plot.bar(ax = ax)
_ = ax.set_xticklabels(imps['feature'])

### scoring

In [None]:
# Predict on both splits, then report (train MSE, validation MSE).
y_train_pred, y_val_pred = (lgb_best.predict(matrix) for matrix in (X_train, X_val))
mean_squared_error(y_train, y_train_pred), mean_squared_error(y_val, y_val_pred)

### кажется, вполне неплохо

да, мы немного использовали тест выборку при отборе значений категориальных фичей и при early stopping, но не то, чтобы тюнились под нее сильно явно

это не best practice, но здесь-то задача стоит именно на ней оптимизировать mse, так что..

### посмотрим на распределения

In [None]:
fig, ax = plt.subplots(figsize = (10, 6))

# True vs predicted CPM distributions, train first and then validation.
series_and_colours = (
    (y_train, 'mediumorchid'),
    (y_train_pred, 'crimson'),
    (y_val, 'forestgreen'),
    (y_val_pred, 'gold'),
)
for values, colour in series_and_colours:
    sns.distplot(values, color = colour, ax = ax)
ax.grid(True)

### бустинг в регрессиях в силу своей природы построения иногда залезает в отрицательную область, даже если true target неотрицателен - подкорректируем


а раз уж решили оптимизироваться под test, то ещё и линейный сдвиг к прогнозу найдём

In [None]:
def clip_correction(y_predicted, low = 0, high = np.inf):
    """Clip predictions to the closed interval [low, high].

    Args:
        y_predicted: predictions to clip.
        low: lower bound (0 by default, which removes negative predictions).
        high: upper bound (unbounded by default).

    Returns:
        The result of ``np.clip`` applied to ``y_predicted``.
    """
    clipped = np.clip(y_predicted, low, high)
    return clipped

def calc_optimal_transformation(y_true, y_predicted):
    """Find the linear correction ``a + b * y_predicted`` that minimises the MSE.

    The search is a Nelder-Mead minimisation over ``[a, b]`` that starts from ``[1, 1]``.

    Args:
        y_true: true target values.
        y_predicted: predictions aligned with ``y_true``.

    Returns:
        ``res.x`` of the optimiser result: an array ``[a, b]`` holding the
        intercept and the slope.
    """
    x = np.array([1,1])

    def f(x):
        return mean_squared_error(y_true, x[0] + x[1] * y_predicted)

    # `xatol` is the documented Nelder-Mead tolerance on the parameters;
    # the previously used `xtol` is a deprecated alias of it.
    res = scipy.optimize.minimize(f, x, method='nelder-mead', options={'xatol': 1e-4, 'disp': True})

    return res.x

In [None]:
# (train MSE, validation MSE) after clipping the predictions at zero.
tuple(
    mean_squared_error(actual, clip_correction(predicted))
    for actual, predicted in ((y_train, y_train_pred), (y_val, y_val_pred))
)

In [None]:
# Optimal linear correction coefficients on the clipped predictions: train first, then validation.
tuple(
    calc_optimal_transformation(actual, clip_correction(predicted))
    for actual, predicted in ((y_train, y_train_pred), (y_val, y_val_pred))
)

In [None]:
# Coefficients of the linear correction, fitted on the clipped validation predictions.
y_val_pred_clipped = clip_correction(y_val_pred)
linear_correction_coefs = calc_optimal_transformation(y_val, y_val_pred_clipped)

### сохраним модель

In [None]:
# Persist the fitted model next to the data files.
# NOTE(review): if path_to_files points at a read-only location this write will
# fail - confirm that the target directory is writable.
model_path = path_to_files+'lgb_final_model.pkl'
with open(model_path, 'wb') as model_file:
    pickle.dump(lgb_best, model_file)

### ещё раз убедимся, что не использовали ничего лишнего

In [None]:
# Show the first rows of both feature matrices to double-check the columns fed to the model.
for matrix in (X_train, X_val):
    display(matrix.head())

### Finally,

In [None]:
# Final forecast: clipped validation predictions passed through the fitted linear correction.
intercept, slope = linear_correction_coefs
final_forecast = intercept + slope * clip_correction(y_val_pred)

In [None]:
'Final MSE Value on Test :{:.4f}'.format(mean_squared_error(y_val, final_forecast))