In [None]:
import numpy as np
import pandas as pd
pd.options.display.max_columns = 200
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from catboost import CatBoostRegressor, Pool, cv

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

In [None]:
# Load the raw auction data from the Kaggle input directory.
DATASET_PATH = "/kaggle/input/real-time-advertisers-auction/Dataset.csv"
df = pd.read_csv(DATASET_PATH)

In [None]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

In [None]:
# Making label

def weird_division(n, d):
    """Return ``n / d``, or ``0`` when the divisor ``d`` is falsy (zero, ``None``, ...)."""
    if not d:
        return 0
    return n / d

# Build the target column:
#     CPM = (total_revenue * 100) / measurable_impressions * 1000
# with CPM defined as 0 wherever measurable_impressions is 0.
# Computed column-wise rather than with a row-by-row `df.apply(..., axis=1)`,
# which calls a Python function once per row and is slow on large frames.
scaled_revenue = df['total_revenue'] * 100
impressions = df['measurable_impressions']
# pandas yields inf/NaN for a division by zero instead of raising;
# those rows are overwritten with 0 by `.where`.
df['CPM'] = (scaled_revenue / impressions).where(impressions != 0, 0) * 1000

In [None]:
# Number of distinct values in each column.
df.nunique(axis=0)

In [None]:
# Remove the two inputs of the CPM formula together with the columns judged useless.
columns_to_drop = [
    'total_revenue',
    'measurable_impressions',
    'integration_type_id',
    'revenue_share_percent',
]
df = df.drop(columns=columns_to_drop)

In [None]:
# Convert the `date` column to datetimes, then order the rows chronologically.
df = df.assign(date=pd.to_datetime(df['date'])).sort_values(by='date')

In [None]:
# Keep rows whose CPM is non-negative and strictly below the 95th percentile.
# The percentile is taken over all rows, i.e. before the non-negativity filter.
cpm_cap = df['CPM'].quantile(0.95)
df = df[(df['CPM'] < cpm_cap) & (df['CPM'] >= 0)]

In [None]:
# Distribution of the target after the outlier filter.
# Title and axis labels are set so the figure can be read on its own.
ax = df['CPM'].hist()
ax.set(title='CPM distribution', xlabel='CPM', ylabel='Number of rows');

In [None]:
# Pairwise correlations between the columns, rendered as a colour-graded table.
# Numeric/bool columns are selected explicitly: since pandas 2.0 `DataFrame.corr()`
# no longer drops non-numeric columns by default, and `df` still holds the
# datetime `date` column at this point.
numeric_df = df.select_dtypes(include=['number', 'bool'])
corr = numeric_df.corr()
corr.style.background_gradient(cmap='coolwarm')

In [None]:
# Make some features

# Day of the week (Monday=0 ... Sunday=6), taken from the vectorised `.dt`
# accessor instead of calling `Timestamp.weekday()` once per row.
# The column stays an integer dtype, so it is still picked up as non-float later.
df['weekday'] = df['date'].dt.weekday

In [None]:
# Lookup table mapping each distinct date to its 0-based chronological rank.
# The rank is stored as float so that it is not picked up as a categorical
# feature by the later "non-float column" check.
dates = df['date'].sort_values().unique()
date_rank = np.arange(len(dates), dtype=float)
date_df = pd.DataFrame({'date': dates, 'date_num': date_rank})

In [None]:
# Attach `date_num` to every row by joining on the shared `date` column.
df = pd.merge(df, date_df, on='date', how='inner')

In [None]:
# Cast the impression counts to float so they are handled as numeric
# (non-float columns are passed to CatBoost as categorical features later on).
impression_cols = ['total_impressions', 'viewable_impressions']
df[impression_cols] = df[impression_cols].astype(float)

In [None]:
# Build two feature sets to measure how much 'order_id' and 'line_item_type_id'
# improve the score (they may contain leaks):
#   data1 - without those two columns
#   data2 - with every column
leak_suspects = ['order_id', 'line_item_type_id']
data1 = pd.get_dummies(df.drop(columns=leak_suspects, errors='ignore'))
data2 = pd.get_dummies(df)

In [None]:
# Chronological split of dataset 1: rows dated up to and including 2019-06-21
# are used for fitting, later rows form the hold-out test set.
split_date = pd.Timestamp(2019, 6, 21)
is_before_split = data1['date'] <= split_date
is_after_split = data1['date'] > split_date

train = data1[is_before_split]
y_train = train.pop('CPM')
test = data1[is_after_split]
y_test = test.pop('CPM')

In [None]:
# The raw timestamp is only needed for the split; remove it from both feature frames.
train = train.drop(columns='date')
test = test.drop(columns='date')

In [None]:
# Hold out a random 30% of the training rows for validation (fixed seed for reproducibility).
X_train, X_val, y_train, y_val = train_test_split(
    train,
    y_train,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Positions of every column whose dtype is not float64; these are handed to
# CatBoost as categorical features.
# `np.float` was only an alias of the builtin `float`: it was deprecated in
# NumPy 1.20 and removed in 1.24 (AttributeError), so compare against `float`
# directly - the selection is identical.
cat_feat = np.where(X_train.dtypes != float)[0]
params = {
    'iterations': 300,
    # add 'logging_level': 'Silent' here to suppress the per-iteration log
    'loss_function': 'RMSE',
    # keep the iteration that scores best on the eval set
    'use_best_model': True,
}
train_pool = Pool(X_train, y_train, cat_features=cat_feat)
validate_pool = Pool(X_val, y_val, cat_features=cat_feat)

model = CatBoostRegressor(**params, cat_features=cat_feat)
model.fit(train_pool, eval_set=validate_pool)

In [None]:
# First 20 rows of the prettified feature-importance table of the fitted model.
importance_table = model.get_feature_importance(
    data=None,
    prettified=True,
    thread_count=-1,
    verbose=False,
)
importance_table.head(20)

In [None]:
# Turning negative CPM values to zeros

# Clamp with a single vectorised NumPy call instead of a per-element Python
# lambda on a temporary Series; the result keeps the dtype of the predictions.
y_pred = np.clip(model.predict(test), 0, None)

In [None]:
# scikit-learn's signature is (y_true, y_pred); pass both by name so the roles
# are explicit. MSE is symmetric, so the reported value is unchanged.
print(f'MSE = {mean_squared_error(y_true=y_test, y_pred=y_pred)}')

In [None]:
# Checking second dataset

In [None]:
# Same chronological split as before, this time on the dataset that keeps
# 'order_id' and 'line_item_type_id'.
split_date = pd.Timestamp(2019, 6, 21)

train = data2[data2['date'] <= split_date]
y_train = train.pop('CPM')
test = data2[data2['date'] > split_date]
y_test = test.pop('CPM')

# The raw timestamp is only needed for the split; remove it from both feature frames.
train = train.drop(columns='date')
test = test.drop(columns='date')

# Hold out a random 30% of the training rows for validation (fixed seed).
X_train, X_val, y_train, y_val = train_test_split(
    train,
    y_train,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Positions of every column whose dtype is not float64; these are handed to
# CatBoost as categorical features.
# `np.float` was only an alias of the builtin `float`: it was deprecated in
# NumPy 1.20 and removed in 1.24 (AttributeError), so compare against `float`
# directly - the selection is identical.
cat_feat = np.where(X_train.dtypes != float)[0]
params = {
    # 'iterations' is intentionally not set here, so the library default applies
    # add 'logging_level': 'Silent' here to suppress the per-iteration log
    'loss_function': 'RMSE',
    # keep the iteration that scores best on the eval set
    'use_best_model': True,
}
train_pool = Pool(X_train, y_train, cat_features=cat_feat)
validate_pool = Pool(X_val, y_val, cat_features=cat_feat)

model = CatBoostRegressor(**params, cat_features=cat_feat)
model.fit(train_pool, eval_set=validate_pool)

In [None]:
# First 20 rows of the prettified feature-importance table of the second model.
importance_table = model.get_feature_importance(
    data=None,
    prettified=True,
    thread_count=-1,
    verbose=False,
)
importance_table.head(20)

In [None]:
# Clamp negative CPM predictions to zero with a single vectorised NumPy call
# instead of a per-element Python lambda on a temporary Series.
y_pred = np.clip(model.predict(test), 0, None)

In [None]:
# scikit-learn's signature is (y_true, y_pred); pass both by name so the roles
# are explicit. MSE is symmetric, so the reported value is unchanged.
print(f'MSE = {mean_squared_error(y_true=y_test, y_pred=y_pred)}')

#### We see that in the second case the features under consideration (`order_id` and `line_item_type_id`) influence the model much more strongly than the others, so they may contain leaked information

#### But we still obtain an acceptable MSE (under 4850) with both approaches