In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import time
from pathlib import Path

from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import train_test_split

import xgboost as xgb
from xgboost import plot_importance
import lightgbm as lgb
from catboost import CatBoostRegressor as cbr

%matplotlib inline

# Load Data

In [None]:
# Location of the auction dataset inside the Kaggle input directory.
dataset_path = r'/kaggle/input/real-time-advertisers-auction/Dataset.csv'

# Read the CSV, parsing the `date` column into datetimes at load time.
df = pd.read_csv(
    filepath_or_buffer=dataset_path,
    parse_dates=['date'],
)
df.head()

In [None]:
# Calculate CPM: the winning bid value that advertisers paid (data for the month of June).
# Reference formula:
#   CPM = (((publisher revenue * 100) / revenue_share_percentage) / measurable_impressions) * 1000
# NOTE: the computation below does not divide by the revenue share percentage - confirm this is intended.

def weird_division(n, d):
    """Return n / d, or 0 when the denominator d is falsy (zero, None, ...)."""
    if not d:
        return 0
    return n / d

# Vectorised CPM: (total_revenue * 100) / measurable_impressions * 1000.
# This replaces a row-wise df.apply(axis=1), which calls a Python lambda once
# per row; the arithmetic and its order of operations are unchanged.
impressions = df['measurable_impressions']
cpm = (df['total_revenue'] * 100) / impressions * 1000

# Rows with zero measurable impressions get a CPM of 0 (instead of inf/NaN),
# the same "zero denominator -> 0" rule that weird_division implements.
df['CPM'] = cpm.where(impressions != 0, 0)

# total_revenue was only needed to derive CPM; remove it from the frame.
df = df.drop(columns='total_revenue')

In [None]:
# Preview the frame again now that CPM has been added and total_revenue removed.
df.head()

# EDA

In [None]:
def multi_collinearity_heatmap(df, figsize=(11,9)):
    """
    Draw a heatmap of the pairwise correlations between the numeric features of df.

    Only the lower triangle of the correlation matrix is drawn; the diagonal
    and the upper triangle are masked out.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose numeric (and boolean) columns are correlated. Other
        columns, e.g. datetimes or strings, are left out.
    figsize : tuple, optional
        Width and height of the matplotlib figure, passed to plt.subplots.

    Returns
    -------
    None
        The heatmap is drawn on a newly created figure.
    """

    # Set the style of the visualization (this changes seaborn's global theme)
    sns.set(style="white")

    # Correlation matrix. Restricting to numeric/bool columns explicitly keeps
    # this working on pandas >= 2.0, where corr() no longer silently drops
    # non-numeric columns.
    corr = df.select_dtypes(include=["number", "bool"]).corr()

    # Boolean mask covering the upper triangle and the diagonal. The builtin
    # bool is used because the np.bool alias was removed in NumPy 1.24.
    mask = np.triu(np.ones_like(corr, dtype=bool))

    # Set up the matplotlib figure
    _, ax = plt.subplots(figsize=figsize)

    # Generate a custom diverging colormap
    cmap = sns.diverging_palette(220, 10, as_cmap=True)

    # Cap the colour scale at the largest correlation that is not exactly 1.0,
    # so the self-correlations on the diagonal do not set the scale.
    strongest = corr[corr != 1.0].max().max()

    # Draw the heatmap with the mask and correct aspect ratio
    sns.heatmap(
        corr, mask=mask, cmap=cmap, center=0, square=True,
        linewidths=.5, cbar_kws={"shrink": .5},
        vmax=strongest, ax=ax
    )

In [None]:
# Names of all int64 / float64 columns, in their original column order.
cols_num = [
    name
    for name, dtype in df.dtypes.items()
    if dtype in ('int64', 'float64')
]


In [None]:
# One violin plot per numeric column.
# violinplot has no `title` parameter (depending on the seaborn version the
# keyword is silently ignored or rejected), so the column name is set as the
# title on the Axes that the plot is drawn on.
for col in cols_num:
    fig, ax = plt.subplots()
    sns.violinplot(x=df[col], ax=ax)
    ax.set_title(col)
    plt.show()

In [None]:
# Remove these columns from the working frame (df is modified in place).
cols_to_drop = ['integration_type_id', 'ad_type_id', 'revenue_share_percent']
df.drop(columns=cols_to_drop, inplace=True)

In [None]:
# Log-transform the impression counts. Exact zeros are replaced with 0.01
# first so that the logarithm stays finite.
cols_to_log = ['total_impressions', 'measurable_impressions', 'viewable_impressions']
df[cols_to_log] = np.log(
    df[cols_to_log].astype('float64').replace(0.0, 0.01)
)

In [None]:
# Plot the feature correlations of the current frame on a 10x10 figure.
multi_collinearity_heatmap(df, figsize=(10, 10))

In [None]:
# Remove the other two impression columns; only total_impressions is kept
# (df is modified in place).
cols_to_drop = ['measurable_impressions', 'viewable_impressions']
df.drop(columns=cols_to_drop, inplace=True)

# Split for train/val/test data

In [None]:
def x_y_split(X, target='CPM'):
    """
    Split a frame into features and target.

    Returns a tuple (features, labels): X without the `target` column, and
    the `target` column itself. X is not modified.
    """
    features = X.drop(columns=target)
    labels = X[target]
    return features, labels

# Rows dated on or after this day form the hold-out test set.
AFTER_DATE = '2019-06-22'

# Discard rows with a negative CPM.
df = df[df['CPM'] >= 0]

# Training period: rows strictly before AFTER_DATE, keeping only CPM values
# below the 95th percentile of that period.
df_train = df[df['date'] < AFTER_DATE].drop(columns='date')
train_cpm_cap = df_train['CPM'].quantile(.95)
df_train = df_train[df_train['CPM'] < train_cpm_cap]

# Test period: rows on/after AFTER_DATE, trimmed by the test period's own
# 95th percentile.
# NOTE(review): the test set is filtered using a statistic of its own target,
# which may make the test error optimistic - confirm this is intended.
df_test = df[df['date'] >= AFTER_DATE].drop(columns='date')
test_cpm_cap = df_test['CPM'].quantile(.95)
df_test = df_test[df_test['CPM'] < test_cpm_cap]

# Hold out 10% of the training period as a validation set.
X_train_all, y_train_all = x_y_split(df_train)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_all,
    y_train_all,
    test_size=0.1,
    random_state=242,
)

X_test, y_test = x_y_split(df_test)


In [None]:
# Passed to do_prediction: when True, models are fit on log(y) and their
# predictions are mapped back with exp before scoring.
log_target = False

In [None]:
# Histogram of the training target values.
y_train.hist()

# Model

In [None]:
def do_prediction(model, X_train, y_train, X_val, y_val, validate=True, print_time=True, log_target=False):
    """
    Fit `model` on the training data, predict, and print the resulting MSE.

    Parameters
    ----------
    model : estimator exposing fit(X, y) and predict(X)
        Fitted in place.
    X_train, y_train : training features and target.
    X_val, y_val : validation features and target; only used when `validate`
        is True.
    validate : bool, default True
        Also predict on the validation data and report its MSE.
    print_time : bool, default True
        Print the wall-clock time spent on fitting and predicting.
    log_target : bool, default False
        Fit on np.log(y_train) and map predictions back with np.exp, so the
        reported MSE is always on the original target scale.
        NOTE: np.log yields -inf for targets equal to 0.

    Returns
    -------
    tuple
        (y_train_pred, y_val_pred). y_val_pred is None when `validate` is
        False; returning it unconditionally used to raise UnboundLocalError
        in that case.
    """
    _start = time.time()

    fit_target = np.log(y_train) if log_target else y_train
    model.fit(X_train, fit_target)

    def _predict(features):
        # Undo the log transform so predictions are on the original scale.
        raw = model.predict(features)
        return np.exp(raw) if log_target else raw

    y_train_pred = _predict(X_train)
    y_val_pred = _predict(X_val) if validate else None

    _end = time.time()
    if print_time:
        print(f"Time taken to run: {round((_end - _start)/60,1)} minutes")

    mse_train = mse(y_train, y_train_pred)
    if validate:
        mse_val = mse(y_val, y_val_pred)
        print(f"MSE train: {mse_train:.4f}\tMSE val: {mse_val}\tdelta: {abs(mse_train - mse_val):.4f}")
    else:
        print(f"MSE train: {mse_train:.4f}")

    return y_train_pred, y_val_pred

In [None]:
def show_pred_vs_true(y_train, y_train_pred, y_val, y_val_pred):
    """
    Scatter predicted against actual values, validation left and train right.

    Each panel also gets a dashed y = x reference line running from 0 up to
    the smaller of the actual and predicted maxima (truncated to an int).

    Parameters
    ----------
    y_train, y_train_pred : actual and predicted training targets.
    y_val, y_val_pred : actual and predicted validation targets.
        All four must support .max().
    """
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))
    fig.suptitle('Predicted vs. actual values', fontsize=14, y=1)
    plt.subplots_adjust(top=0.93, wspace=0)

    # The left panel shows the validation split, so it is titled accordingly
    # (it was previously mislabelled 'Test set').
    panels = (
        (ax1, y_val, y_val_pred, 'Validation set'),
        (ax2, y_train, y_train_pred, 'Train set'),
    )
    for ax, actual, predicted, title in panels:
        ax.scatter(actual, predicted, s=2, alpha=0.7)
        # Points for the y = x reference line, computed once per panel.
        diagonal = list(range(0, int(min(actual.max(), predicted.max()))))
        ax.plot(diagonal, diagonal, color='black', linestyle='--')
        ax.set_title(title)
        ax.set_xlabel('Actual values')

    ax1.set_ylabel('Predicted values')
    ax2.set_ylabel('')
    # Hide the right panel's y tick labels; tick_params avoids matplotlib's
    # fixed-formatter warning that set_yticklabels('') triggers.
    ax2.tick_params(labelleft=False)

    plt.show()

In [None]:
# LightGBM regressor with 41 leaves per tree and 200 estimators.
model_lgb = lgb.LGBMRegressor(n_estimators=200, num_leaves=41)

# Fit, print train/validation MSE, then plot predicted vs. actual values.
y_train_pred, y_val_pred = do_prediction(
    model_lgb, X_train, y_train, X_val, y_val,
    log_target=log_target,
)
show_pred_vs_true(y_train, y_train_pred, y_val, y_val_pred)

In [None]:
# XGBoost regressor with a squared-error objective; other settings are defaults.
model_xgb = xgb.XGBRegressor(objective='reg:squarederror')

# Fit, print train/validation MSE, then plot predicted vs. actual values.
y_train_pred, y_val_pred = do_prediction(
    model_xgb, X_train, y_train, X_val, y_val,
    log_target=log_target,
)
show_pred_vs_true(y_train, y_train_pred, y_val, y_val_pred)

In [None]:
# CatBoost regressor with a fixed seed and silent training.
# NOTE(review): do_prediction calls fit(X, y) without an evaluation set, so
# early_stopping_rounds presumably never triggers - confirm.
model_cbr = cbr(
    random_seed=242,
    verbose=0,
    early_stopping_rounds=10,
)

# Fit, print train/validation MSE, then plot predicted vs. actual values.
y_train_pred, y_val_pred = do_prediction(
    model_cbr, X_train, y_train, X_val, y_val,
    log_target=log_target,
)
show_pred_vs_true(y_train, y_train_pred, y_val, y_val_pred)

# Choose and train best model

In [None]:
# Use the CatBoost model and refit it on the full training period
# (train + validation rows). best_model is the same object as model_cbr,
# so this refit replaces the earlier fit.
# NOTE(review): log_target is not applied here; the model is always fit on
# the raw target - confirm this is intended if log_target is ever enabled.
best_model = model_cbr
best_model.fit(
    X_train_all,
    y_train_all,
)

# Calculate on test

In [None]:
# Score the refit model on the hold-out period; the cell displays the test MSE.
y_pred = best_model.predict(X_test)
mse(y_true=y_test, y_pred=y_pred)