**This notebook finished with an MSE score of 4626.235**

**IMPORTS**

In [None]:
import numpy as np
import pandas as pd
import re

from numpy import mean
from numpy import std
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, BaggingRegressor
from sklearn.metrics import mean_absolute_error, make_scorer
from sklearn.feature_selection import VarianceThreshold, RFE
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_regression, f_regression
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsRegressor, RadiusNeighborsRegressor
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold, RepeatedStratifiedKFold
from sklearn.metrics import mean_squared_error as mse
 
import matplotlib.pyplot as plt
from matplotlib import pyplot
import matplotlib.style as style
import seaborn as sns

import folium
from folium.plugins import FloatImage

from lightgbm import LGBMRegressor

import warnings
%matplotlib inline
warnings.filterwarnings("ignore")
pd.set_option('display.max_columns', 100)

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

**DATA PREPROCESSING AND ANALYSIS**

In [None]:
PATH = "../input/real-time-advertisers-auction"

In [None]:
df = pd.read_csv(f'{PATH}/Dataset.csv')

In [None]:
df.sample(20)

In [None]:
df['date'] = pd.to_datetime(df['date'])

In [None]:
# Formula to calculate CPM. Imported from the original notebook
def weird_division(n, d):
    """Divide ``n`` by ``d``, yielding 0 whenever ``d`` is falsy (e.g. zero)."""
    if not d:
        return 0
    return n / d

# CPM per row: (total_revenue * 100) divided by measurable_impressions, times 1000.
# weird_division yields 0 for rows whose measurable_impressions is zero.
df['CPM'] = df.apply(
    lambda row: weird_division(row['total_revenue'] * 100,
                               row['measurable_impressions']) * 1000,
    axis=1,
)
# Discard rows with a negative CPM
df = df.loc[df['CPM'] >= 0]

In [None]:
# Chronological split: rows dated up to and including 2019-06-21 form the train part,
# later rows form the test part. Each part then keeps only the rows whose CPM lies
# below that part's own 95th percentile.
train = (
    df.loc[df['date'] <= "2019-06-21"]
      .loc[lambda part: part['CPM'] < part['CPM'].quantile(0.95)]
)
test = (
    df.loc[df['date'] > "2019-06-21"]
      .loc[lambda part: part['CPM'] < part['CPM'].quantile(0.95)]
)

# `y` is the training label; `target` is the test-period CPM, which is kept aside
# and only used for the final evaluation.
y = train['CPM']
target = test['CPM']

In [None]:
train.sample(10)

In [None]:
# Correlation heatmap over the numeric columns only. Non-numeric columns (such as the
# datetime 'date') are excluded explicitly: pandas 2.0+ defaults DataFrame.corr() to
# numeric_only=False and no longer drops them silently.
corr = train.select_dtypes(include='number').corr()
plt.figure(figsize=(14,10))
# The colour scale spans 0..1, so negative correlations all render at the lowest colour.
sns.heatmap(data=corr, vmin=0, vmax=1, cmap="YlGnBu", square=True, annot=True)
plt.show()

In [None]:
# 'CPM' is the target and 'total_revenue' feeds directly into the CPM formula,
# so both are removed from the feature frames.
train = train.drop(columns=['total_revenue', 'CPM'])
test = test.drop(columns=['total_revenue', 'CPM'])

In [None]:
# Remove the columns judged to be uncorrelated with 'CPM'
train = train.drop(columns=['integration_type_id', 'revenue_share_percent'])
test = test.drop(columns=['integration_type_id', 'revenue_share_percent'])

**We try to extract some information by looking closely at our features**

In [None]:
train.os_id.unique()

In [None]:
train.site_id.unique()

In [None]:
train.ad_type_id.unique()

In [None]:
train.ad_unit_id.unique()

In [None]:
train.geo_id.unique()

In [None]:
train.order_id.unique()

In [None]:
train.monetization_channel_id.unique()

**We fill the empty cells using different variants.**

In [None]:
# Replace missing values with 0. The test frame gets the same treatment so that
# train and test go through identical preprocessing before encoding and scaling.
train = train.fillna(0)
test = test.fillna(0)

**First, let's try building a model using only those features that are numeric but could easily be represented as categorical.
For that purpose, one-hot encoding with dummy variables is used.**

**We select columns for further work. We consider the least diverse features as categorical.**

In [None]:
train.columns

In [None]:
# Only a subset of the available columns is used on purpose.
# Id columns treated as categorical (they are one-hot encoded later on):
cat_columns = [
    'site_id',
    'ad_type_id',
    'device_category_id',
    'advertiser_id',
    'order_id',
    'line_item_type_id',
    'os_id',
    'monetization_channel_id',
]

# Columns treated as numeric. 'total_impressions' and 'viewable_impressions'
# are further candidates that are deliberately left out for now.
num_columns = ['geo_id', 'ad_unit_id']


# First-guess feature frame: just the selected columns of the train frame
fg_df = train.loc[:, cat_columns + num_columns]

In [None]:
fg_df

In [None]:
def transform_data(df, num_cols, cat_cols, scaler=None):
    """One-hot encode the categorical columns and log/scale the numeric ones.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is copied, never modified.
    num_cols : list of str
        Columns that get ``log(x + 1)`` applied and are then scaled.
    cat_cols : list of str
        Columns replaced by their dummy (one-hot) columns, named ``<col>_<value>``.
        The dummy columns are appended after the remaining columns, in the
        order given by ``cat_cols``.
    scaler : scaler object with ``fit_transform`` / ``transform``, optional
        Scaler applied to ``num_cols``. When omitted, a fresh ``MinMaxScaler``
        is fitted on ``df`` (the original behaviour). Pass the same scaler
        instance for the train frame first and the test frame afterwards to
        scale both with the statistics learned on train: an unfitted scaler is
        fitted here, an already fitted one (it has a ``scale_`` attribute) is
        only applied.

    Returns
    -------
    pandas.DataFrame
        The transformed copy of ``df``.
    """
    transformed_df = df.copy()
    cat_cols = list(cat_cols)
    num_cols = list(num_cols)

    # Build every dummy block first and concatenate once, instead of rebuilding
    # the whole frame for each categorical column.
    dummy_frames = [
        pd.get_dummies(transformed_df[col].astype('category'), prefix=col)
        for col in cat_cols
    ]
    transformed_df = pd.concat(
        [transformed_df.drop(columns=cat_cols)] + dummy_frames, axis=1)

    # Nothing to log-transform or scale; scalers reject zero-column input.
    if not num_cols:
        return transformed_df

    transformed_df[num_cols] = np.log(transformed_df[num_cols] + 1)

    if scaler is None:
        scaler = MinMaxScaler()
    if hasattr(scaler, 'scale_'):
        # Already fitted (e.g. on the train frame): reuse its statistics.
        transformed_df[num_cols] = scaler.transform(transformed_df[num_cols])
    else:
        transformed_df[num_cols] = scaler.fit_transform(transformed_df[num_cols])

    return transformed_df
transformed_df = transform_data(fg_df, num_columns, cat_columns)

In [None]:
transformed_df

In [None]:
def train_and_test(df, y, model, test_size=0.2):
    """Fit ``model`` on a random split of the data and report its hold-out MSE.

    ``df`` (features) and ``y`` (labels) are split with ``train_test_split``
    using ``test_size`` and a fixed ``random_state`` of 42. The model is fitted
    on the training part, the MSE on the held-out part is printed, and a
    predicted-vs-actual scatter plot is shown. The fitted ``model`` is returned.
    """
    X_fit, X_holdout, y_fit, y_holdout = train_test_split(
        df, y, test_size=test_size, random_state=42)

    model.fit(X_fit, y_fit)
    holdout_pred = model.predict(X_holdout)
    print('MSE test: %.3f' % (mse(y_holdout, holdout_pred)))

    # Both axes are shifted by +1 and rounded to one decimal for plotting
    actual_rounded = np.round(y_holdout + 1, 1)
    pred_rounded = np.round(holdout_pred + 1, 1)

    plt.scatter(actual_rounded.to_numpy(), pred_rounded)
    plt.title('Predicted vs. Actual', fontsize=18, fontweight='bold')
    plt.xlabel('Actual')
    plt.ylabel('Predicted')
    plt.show()

    return model

In [None]:
# LGBMRegressor is used with its out-of-the-box defaults, which is enough for this case.
# Tuning options kept for reference, with e.g. n = 10:
#   boosting_type="dart", n_estimators=60, learning_rate=0.2, max_depth=n, num_leaves=2 ** n
model = LGBMRegressor()
trained_model_LGBMR = train_and_test(df=transformed_df, y=y, model=model)

In [None]:
# The 30 largest feature importances reported by the fitted model,
# kept for further analytics (maybe for the RnD department)
feat_imp = pd.Series(data=trained_model_LGBMR.feature_importances_,
                     index=transformed_df.columns)
feat_imp.nlargest(30).plot.barh(figsize=(10, 6))
plt.xlabel('Relative Importance')
plt.title("Feature importances", fontsize=18, fontweight='bold')
plt.show()

In [None]:
# Encode the test frame once. The dummy columns depend on the values present in each
# frame, so train and test can end up with different column sets.
# NOTE(review): transform_data fits its MinMaxScaler on whatever frame it receives, so
# the numeric test columns are scaled with test min/max rather than train's — confirm
# this is acceptable.
transformed_test = transform_data(test, num_columns, cat_columns)

train_cols = transformed_df.columns.tolist()
test_cols = transformed_test.columns.tolist()

# Columns present in both frames, in train order
intersection = transformed_df.columns[transformed_df.columns.isin(test_cols)].tolist()

# Restrict both frames to the shared columns so they line up for prediction
transformed_test = transformed_test[intersection]
transformed_df = transformed_df[intersection]

**Cross-validation**

In [None]:
# Cross-validation set-up: 10% of the rows are split off (fixed seed),
# and the remaining 90% are what the cross-validation runs on.
features = transformed_df.copy()

X_train, X_test, y_train, y_test = train_test_split(
    features,
    y,
    test_size=0.1,
    random_state=42,
)

In [None]:
scorer = make_scorer(mse)

In [None]:
# 5-fold cross-validation; `scorer` wraps the MSE metric, so each entry is a fold's MSE
scores = cross_val_score(
    estimator=trained_model_LGBMR,
    X=X_train,
    y=y_train,
    scoring=scorer,
    cv=5,
)
scores

**As we can see, the results meet our expectations, so we proceed**

**Final train on the whole dataset**

In [None]:
# Final fit on (almost) the whole training set. test_size is the fraction of rows held
# OUT of training: 0.01 fits on 99% of the rows and keeps 1% for the printed
# sanity-check MSE, whereas 0.99 would fit the model on just 1% of the rows.
# Tuning options kept for reference:
#   boosting_type="dart", n_estimators=60, learning_rate=0.2, max_depth=n, num_leaves=2 ** n
model = LGBMRegressor()
trained_model_LGBMR = train_and_test(transformed_df, y, model, 0.01)

**Still not bad**

**And now we get down to business**

In [None]:
prediction = trained_model_LGBMR.predict(transformed_test)

In [None]:
print('MSE test: %.3f' % (mse(target, prediction)))

In [None]:
# I want a credit for the auction theory.
# Thank you in advance!