In [None]:
import os

import numpy as np
import pandas as pd
from catboost import CatBoostRegressor
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import TimeSeriesSplit, cross_val_score, ShuffleSplit
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import VotingRegressor
from sklearn.feature_selection import SelectKBest, f_classif, VarianceThreshold
from sklearn.feature_selection import f_regression

import matplotlib.pyplot as plt
import seaborn as sns

# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

In [None]:
# First date of the test period: rows dated on or after CUT_DATE form the test
# split, rows dated before it form the training split.
CUT_DATE = "2019-06-22"

In [None]:
# Load the raw dataset from the Kaggle input directory.
df = pd.read_csv('/kaggle/input/real-time-advertisers-auction/Dataset.csv')

In [None]:
# Parse the `date` column into pandas datetimes so it can be compared and decomposed.
df['date'] = pd.to_datetime(df['date'])

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# CPM calculation

def weird_division(n, d):
    """Return ``n / d``, or ``0`` when the denominator ``d`` is falsy (e.g. zero)."""
    if not d:
        return 0
    return n / d

# CPM = (total_revenue * 100) / measurable_impressions * 1000, defined as 0
# wherever measurable_impressions is 0.
# Computed column-wise instead of with a row-by-row `apply`: pandas yields
# inf/NaN for a division by zero rather than raising, and `where` then
# replaces exactly those rows with 0.
measurable = df['measurable_impressions']
df['CPM'] = (df['total_revenue'] * 100 / measurable * 1000).where(measurable != 0, 0)

# Keep only rows with a non-negative CPM.
df = df[df['CPM'] >= 0].reset_index(drop=True)

# Remove the variables that enter the CPM formula.
df = df.drop(columns=['total_revenue', 'measurable_impressions'])

In [None]:
# Day-of-month component of each row's date.
df['day'] = df['date'].dt.day

In [None]:
# Number of distinct values in each column.
df.nunique()

In [None]:
# Drop the variables that take only one value.
df = df.drop(columns=['revenue_share_percent', 'integration_type_id'])

# Train and test preparation

In [None]:
# Test split: rows dated on or after CUT_DATE, keeping only those whose CPM is
# strictly below the split's own 95th percentile.
cutoff = pd.to_datetime(CUT_DATE)
test = df.loc[pd.to_datetime(df["date"]).ge(cutoff)]
test_quantile_val = test['CPM'].quantile(.95)
test = test.loc[test['CPM'].lt(test_quantile_val)].reset_index(drop=True)

In [None]:
# Training split: rows dated before CUT_DATE, keeping only those whose CPM is
# strictly below the split's own 95th percentile.
cutoff = pd.to_datetime(CUT_DATE)
train = df.loc[pd.to_datetime(df['date']).lt(cutoff)]
train_quantile_val = train['CPM'].quantile(.95)
train = train.loc[train['CPM'].lt(train_quantile_val)].reset_index(drop=True)

In [None]:
# Preview the first rows of the training split.
train.head()

In [None]:
# Summary statistics of the training split.
train.describe()

In [None]:
# Summary statistics of the test split, for comparison with the training split.
test.describe()

# Feature selection

In [None]:
# Correlation heatmap of the numeric columns.
# The frame still contains the datetime `date` column; recent pandas versions
# no longer drop non-numeric columns silently in `DataFrame.corr`, so the
# input is restricted to numeric dtypes explicitly.
fig, ax = plt.subplots(figsize=(20, 10))
cor = df.select_dtypes(include='number').corr()
sns.heatmap(cor, annot=True, cmap=plt.cm.Reds, ax=ax)
ax.set_title('Correlation between numeric columns')
plt.show()

In [None]:
# List the columns available in the training split.
train.columns

In [None]:
# Candidate feature columns considered during feature selection.
column_names = [
    'site_id',
    'ad_type_id',
    'geo_id',
    'device_category_id',
    'advertiser_id',
    'order_id',
    'line_item_type_id',
    'os_id',
    'monetization_channel_id',
    'ad_unit_id',
    'total_impressions',
    'viewable_impressions',
    'day',
]

# Univariate selection of the 5 best-scoring candidate features.
# CPM is a continuous target, so features are scored with the regression
# F-test (`f_regression`); `f_classif` is an ANOVA test for class labels.
# The target is passed as a 1-D Series, the shape scikit-learn expects for y.
x_data_best = SelectKBest(f_regression, k=5).fit_transform(train[column_names], train['CPM'])

# Variance filter: fit on the candidate features and keep the columns whose
# variance exceeds 0.9.
var_thresh = VarianceThreshold(.9).fit(train[column_names])
x_data_modified = var_thresh.transform(train[column_names])

In [None]:
# Boolean mask over `column_names`: True for the columns kept by the variance threshold.
var_thresh.get_support()

In [None]:
# So, device_category_id is not important: keep every other candidate column.
selected_columns = [
    'site_id',
    'ad_type_id',
    'geo_id',
    'advertiser_id',
    'order_id',
    'line_item_type_id',
    'os_id',
    'monetization_channel_id',
    'ad_unit_id',
    'total_impressions',
    'viewable_impressions',
    'day',
]

In [None]:
# Columns treated as categorical by the models.
categorical_features = [
    'site_id',
    'ad_type_id',
    'geo_id',
    'advertiser_id',
    'order_id',
    'line_item_type_id',
    'os_id',
    'monetization_channel_id',
    'ad_unit_id',
    'day',
]

# Columns treated as numerical by the models.
numerical_features = [
    'total_impressions',
    'viewable_impressions',
]

In [None]:
# Model inputs: categorical columns first, then numerical columns.
train_features = train[categorical_features + numerical_features]

# Regression target as a 1-D Series. A one-column DataFrame makes scikit-learn
# regressors emit a "column-vector y was passed" conversion warning.
target = train['CPM']

# Experiments

#### CatBoost

In [None]:
# `scaler` was referenced without ever being defined, which raises NameError
# on a fresh kernel; define it explicitly before building the transformer.
scaler = StandardScaler()

# Scales the numerical features with `scaler`.
# NOTE(review): the pipelines in this notebook contain only a regressor step,
# so this preprocessor is built but not applied to the data.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', scaler, numerical_features)])

In [None]:
# CatBoost regressor with an RMSE loss, the categorical column list passed as
# `cat_features`, and a fixed random seed.
catboost_params = {
    'loss_function': 'RMSE',
    'cat_features': categorical_features,
    'random_seed': 83,
}
model = CatBoostRegressor(**catboost_params)

In [None]:
# Single-step pipeline wrapping the CatBoost model.
catboost_steps = [('regressor', model)]
pipe = Pipeline(steps=catboost_steps)

In [None]:
# Fit the CatBoost pipeline on the training features and the CPM target.
pipe.fit(train_features, target)

In [None]:
# Predict CPM for the test split using the same feature columns as in training.
test_features = test[categorical_features + numerical_features]
predictions = pipe.predict(test_features)

In [None]:
# Mean squared error of the CatBoost predictions on the test split.
mean_squared_error(y_true=test['CPM'], y_pred=predictions)

## Sklearn regressors

In [None]:
# Averaging ensemble of three scikit-learn regressors.
# Gradient boosting and random forest are stochastic, so they are seeded with
# the same value the CatBoost model uses (83); without a fixed seed the score
# changes from run to run and the model comparison is not reproducible.
reg1 = GradientBoostingRegressor(random_state=83)
reg2 = RandomForestRegressor(random_state=83)
reg3 = LinearRegression()
ereg = VotingRegressor(estimators=[('gb', reg1), ('rf', reg2), ('lr', reg3)])

In [None]:
# Single-step pipeline wrapping the voting ensemble (rebinds `pipe`).
voting_steps = [('regressor', ereg)]
pipe = Pipeline(steps=voting_steps)

In [None]:
# Fit the voting-ensemble pipeline on the training features and the CPM target.
pipe.fit(train_features, target)

In [None]:
# Predict CPM for the test split with the voting ensemble (rebinds `predictions`).
test_features = test[categorical_features + numerical_features]
predictions = pipe.predict(test_features)

In [None]:
# Mean squared error of the voting-ensemble predictions on the test split.
mean_squared_error(y_true=test['CPM'], y_pred=predictions)

CatBoost gives the better result, with a test MSE of **3664.08**.