In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from catboost import CatBoostRegressor  
import matplotlib.pyplot as plt
import seaborn as sns
import json
from datetime import datetime

%matplotlib inline

In [None]:
# Read the raw auction dataset from the Kaggle input directory.
data = pd.read_csv(
    filepath_or_buffer='/kaggle/input/real-time-advertisers-auction/Dataset.csv'
)
# Preview the first five rows, transposed so that every column shows up as a row.
data.head(n=5).transpose()

# Descriptive data analysis and preprocessing

In [None]:
def data_stats_table(data, data_example=True, nlargest_num=1, stats_describe=True):
    """Build a per-column summary table for a DataFrame.

    The result has one row per column of ``data`` plus an extra ``rows_count``
    summary row.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to summarise.
    data_example : bool, default True
        If true (and ``data`` has at least one row), add a ``data[0]`` column
        holding the first row of ``data``.
    nlargest_num : int, default 1
        Number of most frequent values reported per column.
    stats_describe : bool, default True
        If true, append the ``data.describe()`` statistics (without ``count``),
        rounded to two decimals.

    Returns
    -------
    pandas.DataFrame
        Columns: optional ``data[0]``, ``dtypes``, ``dupl`` (number of
        duplicated rows, repeated on every line), ``NaNs``, ``unique``,
        ``top_freq`` / ``top_freq_value`` (JSON-encoded lists), followed by the
        optional ``describe()`` columns. Missing cells are replaced by ``""``.
    """
    data_stats = pd.DataFrame()

    if data_example and len(data) > 0:
        # Positional access: the first row is wanted whatever the index labels
        # are (``.loc[0]`` raises KeyError once label 0 has been filtered out).
        data_stats['data[0]'] = data.iloc[0]

    data_stats['dtypes'] = data.dtypes
    # NOTE(review): despite the row label, this cell stores the number of
    # columns (len(data.dtypes)) - confirm that this is what is intended.
    data_stats.loc['rows_count', 'dtypes'] = len(data.dtypes)
    data_stats['dupl'] = int(data.duplicated().sum())

    # NaNs per column; the 'rows_count' line holds the number of rows that
    # contain at least one NaN.
    data_stats['NaNs'] = data.isnull().sum()
    data_stats.loc['rows_count', 'NaNs'] = (
        data.isnull().sum(axis=1) != 0).sum()
    data_stats['NaNs'] = data_stats['NaNs'].astype('int')

    for name in data.columns:
        data_stats.loc[name, 'unique'] = data[name].nunique()
        # Relative frequency of the ``nlargest_num`` most common values.
        top_freq = data[name].value_counts(
            normalize=True).nlargest(nlargest_num).round(2)
        data_stats.loc[name, 'top_freq'] = json.dumps(list(top_freq))
        # default=str keeps values that JSON cannot encode natively
        # (e.g. Timestamps) from raising TypeError.
        data_stats.loc[name, 'top_freq_value'] = json.dumps(
            list(top_freq.index), default=str)

    if stats_describe:
        df_des = round(data.describe().T.drop(columns=['count']), 2)
        data_stats = pd.concat([data_stats, df_des], axis=1, sort=False)

    # Blank out missing cells so the table displays cleanly.
    return data_stats.fillna("")

# Per-column overview of the raw frame: first row, dtypes, NaN counts, unique values,
# most frequent value and describe() statistics.
data_stats_table(data)

In [None]:
def weird_division(n, d):
    """Return ``n / d``, or 0 when the divisor ``d`` is falsy (e.g. zero)."""
    if not d:
        return 0
    return n / d

# CPM = 1000 * (total_revenue * 100) / measurable_impressions, computed column-wise
# instead of with a row-by-row apply. Rows with zero measurable impressions get
# CPM = 0, as the scalar helper weird_division() would return.
measurable = data['measurable_impressions']
cpm = 1000 * ((data['total_revenue'] * 100) / measurable)
data['CPM'] = cpm.where(measurable != 0, 0)

# Keep rows with non-negative CPM (NaN CPM is dropped by the comparison as well) and
# remove columns that are no longer needed. The non-inplace drop returns a new frame,
# so the column assignments below do not write into a filtered slice
# (which triggers SettingWithCopyWarning).
data = data[data['CPM'] >= 0].drop(
    columns=['integration_type_id', 'revenue_share_percent', 'total_revenue'])
# Column list captured before the helper 'sample' column is added.
data_cols = data.columns

data['date'] = pd.to_datetime(data['date'])
# 1 = rows dated before 22 June 2019, 0 = rows from that day on.
# ISO notation avoids the day-first ambiguity of '22.06.2019'.
data["sample"] = (data['date'] < pd.Timestamp('2019-06-22')).astype("int")

In [None]:
sns.set(font_scale=1)
fig, ax = plt.subplots(figsize=(17, 10))
# Correlate the numeric columns only (the frame also holds a datetime 'date' column)
# and compute the matrix once instead of twice.
corr = data.select_dtypes(include=[np.number]).corr()
# Boolean mask for the lower triangle and the diagonal: only the strict upper triangle
# is drawn. Building it from the shape (not from the correlation values) keeps a
# correlation of exactly 0 in the lower triangle masked too.
lower_triangle = np.tril(np.ones_like(corr, dtype=bool))
sns.heatmap(corr, square=True, annot=True, fmt=".2f", cmap="coolwarm", vmin=-1, vmax=1, center=0,
            linewidths=1, linecolor='white', mask=lower_triangle, ax=ax);

In [None]:
def plot_train_test_hist(data_, col_names, col_number=2, figsize_=(18, 8), bins_=10):
    """Draw overlaid density histograms of the two ``sample`` groups, one panel per column.

    Parameters
    ----------
    data_ : pandas.DataFrame
        Frame containing the columns in ``col_names`` and a ``sample`` column
        (1 is plotted as 'learn_data', 0 as 'predict_data').
    col_names : sequence of str
        Columns to plot, one subplot each.
    col_number : int, default 2
        Number of subplot columns in the grid.
    figsize_ : tuple, default (18, 8)
        Figure size passed to ``plt.subplots``.
    bins_ : int or sequence, default 10
        ``bins`` argument passed to every histogram.

    Returns
    -------
    None
        The figure is drawn on the current matplotlib backend.
    """
    n_rows = -(-len(col_names) // col_number)  # ceiling division
    # squeeze=False keeps ``axes_`` two-dimensional even when the grid has a
    # single row or column, so the [row, col] indexing below always works.
    _, axes_ = plt.subplots(n_rows, col_number, figsize=figsize_, squeeze=False)

    learn_rows = data_[data_['sample'] == 1]
    predict_rows = data_[data_['sample'] == 0]

    for counter_, col_name in enumerate(col_names):
        ax = axes_[counter_ // col_number, counter_ % col_number]

        # Fully transparent histogram of the whole column - presumably kept so
        # the axis limits cover the full value range (TODO confirm).
        ax.hist(data_[col_name], rwidth=0.95,
                alpha=0, color='green', bins=bins_, density=True)

        ax.hist(learn_rows[col_name], rwidth=0.95,
                alpha=0.65, label='learn_data', color='red', bins=bins_, density=True)

        ax.hist(predict_rows[col_name], rwidth=0.95,
                alpha=0.65, label='predict_data', color='blue', bins=bins_, density=True)

        ax.set_title(col_name)
        ax.legend(loc=1)

    # Hide the grid cells left over when len(col_names) is not a multiple of col_number.
    for unused_ax in axes_.ravel()[len(col_names):]:
        unused_ax.set_visible(False)

# Histograms of the 'sample' == 1 and 'sample' == 0 rows for every column listed in data_cols.
plot_train_test_hist(data, data_cols, col_number=4, figsize_=(16, 16), bins_=20)

# Taking the logarithm of features and CPM

In [None]:
# Columns replaced by log(1 + x): the impression counts and the CPM target.
num_logs = ["total_impressions", "viewable_impressions", "measurable_impressions", "CPM"]
# np.log1p works on the whole column at once (no per-element Python lambda) and is
# more accurate than log(x + 1) for values close to 0.
# NOTE: this cell overwrites the columns in place, so running it twice applies the
# transform twice.
for item in num_logs:
    data[item] = np.log1p(data[item])

In [None]:
sns.set(font_scale=1)
fig, ax = plt.subplots(figsize=(17, 10))
# Correlate the numeric columns only (the frame also holds a datetime 'date' column)
# and compute the matrix once instead of twice.
corr = data.select_dtypes(include=[np.number]).corr()
# Boolean mask for the lower triangle and the diagonal: only the strict upper triangle
# is drawn. Building it from the shape (not from the correlation values) keeps a
# correlation of exactly 0 in the lower triangle masked too.
lower_triangle = np.tril(np.ones_like(corr, dtype=bool))
sns.heatmap(corr, square=True, annot=True, fmt=".2f", cmap="coolwarm", vmin=-1, vmax=1, center=0,
            linewidths=1, linecolor='white', mask=lower_triangle, ax=ax);

In [None]:
# Histograms of the 'sample' == 1 and 'sample' == 0 rows for every column listed in data_cols.
plot_train_test_hist(data, data_cols, col_number=4, figsize_=(16, 16), bins_=20)

# Train/test split

In [None]:
# Chronological split: rows dated before 22 June 2019 are used for training, rows from
# that day on for testing. The cut-off is parsed once, in ISO notation, which avoids
# the day-first ambiguity of '22.06.2019'.
split_date = pd.Timestamp('2019-06-22')

data_train = data[data['date'] < split_date]
# Keep training rows whose CPM is below the 95th percentile of the training CPM.
data_train = data_train[data_train["CPM"] < data_train["CPM"].quantile(0.95)]
data_train = data_train.drop_duplicates()

data_test = data[data['date'] >= split_date]
# NOTE(review): the test rows are trimmed with the 95th percentile of their own target,
# so the reported error excludes the highest-CPM test rows - confirm this is intended.
data_test = data_test[data_test["CPM"] < data_test["CPM"].quantile(0.95)]


# Model inputs: ten *_id columns followed by the three impression counts.
train_cols = ['site_id', 'ad_type_id', 'geo_id', 'device_category_id', 'advertiser_id', 'order_id',
              'line_item_type_id', 'os_id', 'monetization_channel_id', 'ad_unit_id',
              'total_impressions', 'viewable_impressions', 'measurable_impressions']

X_train = data_train[train_cols]
y_train = data_train["CPM"]

X_test = data_test[train_cols]
y_test = data_test["CPM"]

# CatBoostRegressor Model for CPM

In [None]:
def log_y_mse_metrics(y, y_pred):
    """Mean squared error between ``exp(y) - 1`` and ``exp(y_pred) - 1``.

    Both arguments are mapped through ``exp(v) - 1`` (the inverse of
    ``log(x + 1)``) before being handed to ``metrics.mean_squared_error``.
    """
    actual = np.exp(y) - 1
    predicted = np.exp(y_pred) - 1
    return metrics.mean_squared_error(actual, predicted)

In [None]:
%%time
catb_params = {
    'random_seed': 0, 
    'learning_rate': 0.5,
    'iterations': 1000, 
    'depth': 6, 
    'l2_leaf_reg': 10, 
    'subsample' : 0.75, 
    'random_strength': 0.06, 
    'od_type': "Iter", 
    'od_wait': 100, 
    'verbose': False, 
}
cat_features = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
ctb_model = CatBoostRegressor(**catb_params)
ctb_model.fit(X_train, y_train, cat_features = cat_features) 

y_pred_test = ctb_model.predict(X_test)
print(log_y_mse_metrics(y_test, y_pred_test))