In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import tqdm
import catboost
import warnings
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides library deprecation warnings — consider narrowing the filter.
warnings.filterwarnings("ignore")

In [None]:
# Widen pandas' display limits so long / wide frames are rendered without truncation.
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 1000)
pd.set_option('display.width', 1000)
pd.set_option('display.expand_frame_repr', False)
# None means "no column-width limit". The old sentinel -1 was deprecated in
# pandas 1.0 and is rejected by current releases.
pd.set_option('display.max_colwidth', None)

# Load the data

In [None]:
# Read the auction dataset from the Kaggle input directory and preview it.
dataset_path = '/kaggle/input/real-time-advertisers-auction/Dataset.csv'
df = pd.read_csv(dataset_path)
df.head()

In [None]:
# (rows, columns) of the frame as loaded from the CSV
df.shape

# Create the target variable (CPM)

In [None]:
def weird_division(n, d):
    """Return ``n / d``, or ``0`` when the denominator ``d`` is falsy (e.g. zero)."""
    if not d:
        # Guard against division by zero: a falsy denominator yields 0.
        return 0
    return n / d

In [None]:
# CPM = (total_revenue * 100 / measurable_impressions) * 1000, computed for the
# whole column at once instead of row by row. Rows with zero measurable
# impressions get a CPM of 0 rather than a division-by-zero result.
df['CPM'] = np.where(df['measurable_impressions'] != 0,
                     df['total_revenue'] * 100 / df['measurable_impressions'],
                     0,
                    ) * 1000

In [None]:
# Preview the first rows, now including the computed 'CPM' column
df.head()

In [None]:
# Inspect the rows whose computed CPM is negative
df[df['CPM'] < 0]

In [None]:
# Keep only rows with a non-negative CPM (the comparison also drops rows whose CPM is NaN).
# .copy() gives an independent frame, so adding columns to it later does not
# trigger pandas' SettingWithCopyWarning.
df = df[df['CPM'] >= 0].copy()
df.shape

# NaN check

In [None]:
# Number of missing values in each column
df.isna().sum()

# EDA

In [None]:
# Scatter every column of df against CPM. The grid is sized from the number of
# columns, so adding or removing a column can neither overflow the axes array
# nor leave empty panels on screen.
n_grid_cols = 3
n_grid_rows = -(-len(df.columns) // n_grid_cols)  # ceiling division

fig, axs = plt.subplots(n_grid_rows,
                        n_grid_cols,
                        figsize=(20, n_grid_rows * 25 / 6),  # (20, 25) for a 6-row grid
                        facecolor='w',
                        edgecolor='k',
                        squeeze=False,  # always a 2-D array, even for a single row
                       )

plt.subplots_adjust(hspace=1.5)

axs = axs.ravel()

for ax, column in zip(axs, df.columns):
    ax.set_title('CPM - ' + column)
    ax.scatter(df[column], df['CPM'])
    ax.set_ylabel('CPM')
    ax.set_xlabel(column)
    ax.tick_params(axis='x', rotation=90)

# Hide the panels left over when the column count is not a multiple of the grid width.
for ax in axs[len(df.columns):]:
    ax.set_axis_off()

# Create features

In [None]:
# Share of measurable impressions that were viewable, computed for the whole
# column at once. Rows with zero measurable impressions get a ratio of 0.
df['View/measurable'] = np.where(df['measurable_impressions'] != 0,
                                 df['viewable_impressions'] / df['measurable_impressions'],
                                 0,
                                )

# Drop features

In [None]:
# Columns removed before modelling, each with the reason for removing it.
columns_to_drop = [
    'revenue_share_percent',   # 1 unique value
    'integration_type_id',     # 1 unique value
    'total_revenue',           # CPM is derived from it
    'measurable_impressions',  # CPM and 'View/measurable' are derived from it
    'viewable_impressions',    # 'View/measurable' is derived from it
]
df_droped = df.drop(columns=columns_to_drop)

In [None]:
# Preview the frame after the column drop
df_droped.head()

In [None]:
# (rows, columns) after the column drop
df_droped.shape

# Create new features

In [None]:
# Parse the 'date' column into datetime values
df_droped['date'] = pd.to_datetime(df_droped['date'])

# Day of the month (.dt.day), not the position of the day within the week
df_droped['day_number'] = df_droped['date'].dt.day

# Name of the weekday, e.g. 'Monday'
df_droped['day_week'] = df_droped['date'].dt.day_name()

# Weekend flag: 1 for Saturday / Sunday, 0 otherwise (vectorised membership test)
df_droped['is_weekend'] = df_droped['day_week'].isin(['Saturday', 'Sunday']).astype('int64')

In [None]:
# Preview the frame with the date-derived columns added
df_droped.head()

# Train-test split

In [None]:
# Sorted array of the distinct dates present in the data
np.sort(df_droped['date'].unique())

In [None]:
def _trim_and_extract(frame, outlier_quantile):
    """Drop high-CPM rows from ``frame`` and return ``(X, y)`` as NumPy arrays."""
    # Strict '<': rows whose CPM equals the quantile value are dropped as well.
    trimmed = frame[frame['CPM'] < frame['CPM'].quantile(outlier_quantile)]
    features = trimmed.drop(['CPM', 'date'], axis=1).values
    target = trimmed['CPM'].values
    return features, target


def data_splitter(df,
                  train_valid_range=('2019-06-01 00:00:00', '2019-06-21 00:00:00'),
                  valid_size=0.1,
                  test_start='2019-06-22 00:00:00',
                  outlier_quantile=0.95):
    """Split ``df`` by date into train, validation and test arrays.

    Rows dated inside ``train_valid_range`` (both ends inclusive) form the
    train/validation pool, which is shuffled and split with a fixed seed (42).
    Rows dated on or after ``test_start`` form the test set. In each of the two
    subsets, rows whose 'CPM' is not strictly below that subset's own
    ``outlier_quantile`` quantile are removed before the arrays are built.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'date' column comparable with the given date strings
        and a 'CPM' column; every other column is used as a feature.
    train_valid_range : sequence of two date strings
        First and last date of the train/validation pool.
    valid_size : float
        Passed to ``train_test_split`` as ``test_size``.
    test_start : str
        First date of the test set.
    outlier_quantile : float
        Quantile of 'CPM' used as the exclusive upper bound in each subset.

    Returns
    -------
    tuple
        ``(X_train, X_valid, X_test, y_train, y_valid, y_test)`` as NumPy arrays.

    NOTE(review): the test set is trimmed with its own CPM quantile, so test
    metrics are computed on outlier-free data — confirm this is intended.
    """
    # Train/validation pool: rows inside the inclusive date range
    in_train_valid = (df['date'] >= train_valid_range[0]) & (df['date'] <= train_valid_range[1])
    X_train_, y_train_ = _trim_and_extract(df[in_train_valid], outlier_quantile)
    X_train, X_valid, y_train, y_valid = train_test_split(X_train_,
                                                          y_train_,
                                                          test_size=valid_size,
                                                          random_state=42,
                                                          shuffle=True,
                                                         )
    # Test set: every row dated on or after test_start
    X_test, y_test = _trim_and_extract(df[df['date'] >= test_start], outlier_quantile)

    return X_train, X_valid, X_test, y_train, y_valid, y_test

In [None]:
# Build the train / validation / test arrays using data_splitter's default settings
X_train, X_valid, X_test, y_train, y_valid, y_test = data_splitter(df_droped)

In [None]:
# Sanity check: the three feature matrices must have the same number of columns
assert X_train.shape[1] == X_valid.shape[1] == X_test.shape[1]

In [None]:
# Shapes of every split, features first and targets second
X_train.shape, X_valid.shape, X_test.shape, y_train.shape, y_valid.shape, y_test.shape 

# Create the model

In [None]:
# Hyper-parameters of the CatBoost regressor, gathered in one mapping.
ctb_params = {
    'iterations': 1000,
    'learning_rate': 0.1,
    'random_seed': 42,
    'depth': 10,
    'task_type': "CPU",
    'loss_function': 'RMSE',
    'l2_leaf_reg': 5,
    'use_best_model': True,
    'bagging_temperature': 1000,
    'border_count': 255,
}
ctb = catboost.CatBoostRegressor(**ctb_params)

# Fit model on train data and validate on valid data

In [None]:
# Fit on the training split; the validation split is the eval set that drives
# early stopping (training stops after 10 rounds without improvement).
# NOTE(review): cat_features=[13] is a positional column index into X_train —
# presumably the 'day_week' column; confirm against the column order left after
# 'CPM' and 'date' are dropped, since any change to the feature set shifts it.
ctb.fit(X_train,
        y_train,
        eval_set=(X_valid, y_valid),
        plot=True,
        early_stopping_rounds=10,
        cat_features=[13],
        verbose=False,
       )

# Results of training

In [None]:
# best_iteration_ is a zero-based iteration index, so the number of trees kept
# up to and including the best iteration is one more than it.
print('Number of trees: {}'.format(ctb.best_iteration_ + 1))
# best_score_ stores RMSE; squaring it reports the value as MSE.
print('Best MSE on train: {}, on validation: {}'.format(ctb.best_score_['learn']['RMSE']**2,
                                                        ctb.best_score_['validation']['RMSE']**2,
                                                        ),
     )

In [None]:
# Fallback plot in case the interactive CatBoost training plot is not displayed in Kaggle notebooks

plt.figure(figsize=(8, 6))

# best_iteration_ is a zero-based index: take one more element so that the
# curves include the best iteration itself.
n_trees = ctb.best_iteration_ + 1

# evals_result_ stores RMSE per iteration; squaring converts it to MSE.
results = pd.DataFrame({'train_MSE': np.power(ctb.evals_result_['learn']['RMSE'], 2)[:n_trees],
                        'validation_MSE': np.power(ctb.evals_result_['validation']['RMSE'], 2)[:n_trees],
                        'trees_number': np.arange(1, n_trees + 1, 1),
                       },
                      )

# Each curve carries its own label, so the legend cannot be mismatched with the lines.
g = sns.lineplot(data=results,
                 x='trees_number',
                 y='train_MSE',
                 color="g",
                 linewidth=2,
                 label='MSE on train data',
                );
g = sns.lineplot(data=results,
                 x='trees_number',
                 y='validation_MSE',
                 color="y",
                 linewidth=2,
                 label='MSE on validation data',
                );
g.set(xlabel='Number of trees',
      ylabel='MSE',
      title='Gradient boosting train and validation MSE',
     );
g.grid();
g.legend(loc='best');

# Predictions

In [None]:
# Predict the target for each split with the fitted model
train_preds = ctb.predict(X_train)
valid_preds = ctb.predict(X_valid)
test_preds = ctb.predict(X_test)

# Metrics

In [None]:
# Report MSE and R2 (as a percentage) for every split, in train / valid / test order.
scored_splits = (
    ('Train', y_train, train_preds),
    ('Valid', y_valid, valid_preds),
    ('Test', y_test, test_preds),
)
rule = '--------------------'

print(rule)
print('MSE score:')
print(rule)
for split_name, actual, predicted in scored_splits:
    print('{} MSE: {:.3f}'.format(split_name, mean_squared_error(actual, predicted)))

print('\n' + rule)
print('R2 score:')
print(rule)
for split_name, actual, predicted in scored_splits:
    print('{} R2: {:.3f} %'.format(split_name, 100 * r2_score(actual, predicted)))