In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

import os
# Walk the Kaggle input directory and print the full path of every file in it
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

In [None]:
%%time
# Read the auction dataset CSV from the Kaggle input directory into a DataFrame
df = pd.read_csv('/kaggle/input/real-time-advertisers-auction/Dataset.csv')

# Calculating CPM (the value which was the winning bid value)
def weird_division(n, d):
    """Return n / d, or 0 when the divisor d is falsy (for example zero)."""
    if not d:
        return 0
    return n / d

# CPM = (total_revenue * 100 / measurable_impressions) * 1000, computed column-wise
# rather than with a row-by-row df.apply, which is far slower on a large frame.
# Rows with zero measurable impressions get a CPM of 0 instead of inf/NaN.
impressions = df['measurable_impressions']
df['CPM'] = (df['total_revenue'] * 100 / impressions).where(impressions != 0, 0) * 1000

# Keep only rows with a non-negative CPM, then remove the raw revenue column that
# CPM was derived from.
# The column is named via `columns=`: passing the axis positionally (drop(label, 1))
# is rejected by pandas >= 2.0. Reassigning instead of inplace=True also avoids
# mutating the filtered slice in place.
df = df[df['CPM'] >= 0].drop(columns='total_revenue')

In [None]:
# train / test split: rows whose 'date' value compares below the cutoff go to train,
# every remaining row of df goes to test
cutoff_date = '2019-06-22'
train = df[df['date'] < cutoff_date]
test = df[~df.index.isin(train.index)]

# Trim the top of the CPM distribution, using each split's own 95th percentile as the cap
train_cap = train['CPM'].quantile(.95)
test_cap = test['CPM'].quantile(.95)
train = train[train['CPM'] < train_cap]
test = test[test['CPM'] < test_cap]

# Remove the 'date' column from both splits.
# `columns=` replaces the positional axis argument, which pandas >= 2.0 no longer
# accepts; reassigning avoids an in-place drop on frames obtained by filtering.
train = train.drop(columns='date')
test = test.drop(columns='date')

In [None]:
# Split the training columns by name: anything containing '_id' is treated as an
# identifier column, everything else goes to num_cols
id_cols = [name for name in train.columns if '_id' in name]
num_cols = [name for name in train.columns if name not in id_cols]

### EDA

In [None]:
# Summary statistics for the non-id columns of the training split
train[num_cols].describe()

In [None]:
# One histogram per non-id column; the trailing ';' suppresses the textual repr
train[num_cols].hist(figsize=(8,8));

In [None]:
def multi_collinearity_heatmap(df, figsize=(11,9)):
    """
    Draw a heatmap of the pairwise correlations between the numeric columns of df.

    Parameters
    ----------
    df : pandas.DataFrame
        Data whose columns are correlated against each other. Non-numeric
        columns are left out of the correlation matrix.
    figsize : tuple, optional
        (width, height) passed to plt.subplots; (11, 9) by default.

    Returns
    -------
    None
        The heatmap is drawn on a newly created matplotlib figure.
    """
    # Set the style of the visualization (note: this changes the global seaborn style)
    sns.set(style="white")

    # Create a correlation matrix. The numeric (and boolean) columns are selected
    # explicitly so the call also works on pandas versions where corr() raises on
    # non-numeric data instead of silently skipping it.
    corr = df.select_dtypes(include=['number', 'bool']).corr()

    # Set up the matplotlib figure; only the axes object is needed
    _, ax = plt.subplots(figsize=figsize)

    # Generate a custom diverging colormap
    cmap = sns.diverging_palette(220, 10, as_cmap=True)

    # Draw the heatmap with a square aspect ratio. vmax is the largest correlation
    # that is not exactly 1.0, so the diagonal does not flatten the colour scale.
    sns.heatmap(corr, cmap=cmap, center=0, square=True, linewidths=.5,
                cbar_kws={"shrink": .5}, vmax=corr[corr != 1.0].max().max(), ax=ax)

In [None]:
# Correlation heatmap of the non-id training columns
multi_collinearity_heatmap(train[num_cols], figsize=(10,10))

In [None]:
# Summary statistics for the columns whose name contains '_id'
train[id_cols].describe()

In [None]:
# Number of distinct values in each '_id' column
train[id_cols].nunique()

In [None]:
# Distribution of measurable_impressions in the training split.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases in favour of
# histplot/displot — confirm the installed seaborn version before migrating.
sns.distplot(train['measurable_impressions']);

We could possibly benefit from log-transforming this feature:

In [None]:
# Same distribution after a log(x + 1) transform; the +1 keeps zero values finite.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases in favour of
# histplot/displot — confirm the installed seaborn version before migrating.
sns.distplot(np.log(train['measurable_impressions'] + 1));

### A little preprocessing

In [None]:
# drop highly correlated and useless columns
# `columns=` replaces the positional axis argument (drop(labels, 1)), which pandas >= 2.0
# no longer accepts; reassigning avoids in-place edits on frames obtained by filtering.
dropped_cols = ['revenue_share_percent', 'total_impressions', 'viewable_impressions',
                'integration_type_id']
train = train.drop(columns=dropped_cols)
# Recompute the id columns now that integration_type_id is gone
id_cols = [col for col in train.columns if '_id' in col]

# log(x + 1) transform of measurable_impressions in both splits; assign() returns a
# new frame, so no value is written into a slice of the original data
train = train.assign(measurable_impressions=np.log(train['measurable_impressions'] + 1))
test = test.assign(measurable_impressions=np.log(test['measurable_impressions'] + 1))

# Give test exactly the same columns, in the same order, as train
test = test[train.columns]

### Modeling, Cross-Validate on train

In [None]:
# ## for logarithmic target (didn't work out)
# from sklearn.metrics import make_scorer
# def exp_mse(y_true, y_pred, **kwargs): 
#     y_true = np.exp(y_true) + 1
#     y_pred = np.exp(y_pred) + 1
#     return mean_squared_error(y_true, y_pred)
# mse_scorer = make_scorer(exp_mse)

In [None]:
# Model used for cross-validation: 100 trees, RMSE as both the training loss and the
# evaluation metric, with the '_id' columns declared as categorical features
cb_model = CatBoostRegressor(
    n_estimators=100,
    loss_function='RMSE',
    eval_metric='RMSE',
    cat_features=id_cols,
)

In [None]:
%%time
scores = cross_val_score(cb_model, train.drop('CPM',1), train['CPM'], scoring='neg_mean_squared_error',
                         cv=3, n_jobs=-1)
scores

### Evaluate on test

In [None]:
# Model used for the final fit: same settings as the cross-validated one but with 300 trees
cb_model = CatBoostRegressor(
    n_estimators=300,
    loss_function='RMSE',
    eval_metric='RMSE',
    cat_features=id_cols,
)

In [None]:
%%time
cb_model.fit(train.drop('CPM',1), train['CPM'], verbose=False, plot=True);

In [None]:
# Predict CPM for the held-out split and report the mean squared error.
# drop(columns=...) replaces drop('CPM', 1), which pandas >= 2.0 no longer accepts.
test_preds = cb_model.predict(test.drop(columns='CPM'))
mean_squared_error(test['CPM'], test_preds)

#### Resulting feature importances:

In [None]:
# Feature importances of the fitted model, indexed by feature name and sorted
# from most to least important.
# drop(columns=...) replaces drop('CPM', 1), which pandas >= 2.0 no longer accepts;
# the sort is reassigned rather than done in place.
weights = pd.DataFrame(cb_model.feature_importances_, columns=['weight'],
                       index=train.drop(columns='CPM').columns)
weights = weights.sort_values('weight', ascending=False)
weights