In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the auction dataset, parsing "date" as datetime and treating the listed
# whitespace markers, NaN, "NA" and "N/A" as missing values.
# `np.nan` replaces the `np.NAN` / `np.NaN` aliases: both were removed in
# NumPy 2.0, where referencing them raises AttributeError. All three names
# referred to the same float NaN, so the parsed result is unchanged.
df = pd.read_csv(
    '/kaggle/input/real-time-advertisers-auction/Dataset.csv',
    parse_dates=["date"],
    na_values=[" ", " ", np.nan, "NA", "N/A"],
)

In [None]:
from datetime import datetime
import seaborn as sns
from sklearn.metrics import mean_squared_error

import matplotlib.pyplot as plt
%matplotlib inline

from catboost import CatBoostRegressor

In [None]:
# Print a concise summary of the loaded frame (columns, non-null counts, dtypes).
df.info()

In [None]:
# Count of missing values in each column.
df.isnull().sum()

In [None]:
# Summary statistics, with the datetime column summarised numerically.
# `datetime_is_numeric` exists only in pandas 1.1-1.5. pandas 2.0 removed the
# keyword (datetime columns are always treated numerically there) and raises
# TypeError if it is passed, so fall back to the plain call on newer versions.
try:
    describe_summary = df.describe(datetime_is_numeric=True)
except TypeError:
    describe_summary = df.describe()
describe_summary

In [None]:
# Discard two columns in place before any features are derived.
df.drop(columns=['integration_type_id', 'revenue_share_percent'], inplace=True)

In [None]:
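# Calculating CPM: the value advertisers bid (the winning bid value) for the month of June.
# Intended formula:
#   CPM = (((publisher revenue * 100) / revenue_share_percent) / measurable_impressions) * 1000
# NOTE(review): the computation below does not divide by revenue_share_percent
# (that column is dropped earlier in the notebook) - confirm this is intended.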

def weird_division(n, d):
    """Return ``n / d``, or ``0`` when the denominator ``d`` is falsy (e.g. zero)."""
    if not d:
        return 0
    return n / d

# CPM = total_revenue * 100 / measurable_impressions * 1000, forced to 0 where
# measurable_impressions is 0 (the same rule weird_division applies).
# Column-wise arithmetic replaces the row-wise df.apply(axis=1), which ran a
# Python-level call for every row; the resulting values are the same.
measurable = df['measurable_impressions']
df['CPM'] = ((df['total_revenue'] * 100) / measurable).where(measurable != 0, 0) * 1000
# The two source columns are removed once CPM has been derived from them.
df.drop(['total_revenue', 'measurable_impressions'], axis=1, inplace=True)

In [None]:
# Distribution summary of the derived CPM column.
df['CPM'].describe()

In [None]:
# Pairwise correlations over the numeric (and boolean) columns only.
# Selecting them explicitly keeps this working on every pandas version: from
# pandas 2.0 `DataFrame.corr` no longer drops non-numeric columns silently, so
# the datetime "date" column is expected to make a bare `df.corr()` fail.
df.select_dtypes(include=['number', 'bool']).corr()

In [None]:
# Histogram grid of every column except the CPM target; the trailing ';' hides the axes repr.
df.drop(columns='CPM').hist(bins=50, figsize=(20, 20));

In [None]:
# Number of distinct values in each column.
df.nunique()

In [None]:
# Correlation heatmap over the numeric (and boolean) columns.
# The columns are selected explicitly because, from pandas 2.0, `DataFrame.corr`
# no longer drops non-numeric columns silently, so the datetime "date" column is
# expected to make a bare `df.corr()` fail.
corr = df.select_dtypes(include=['number', 'bool']).corr()

fig, ax = plt.subplots(figsize=(14, 10))
# vmin=0 anchors the colour scale at zero, so all negative correlations share
# the lowest colour; the annotations still show their actual values.
sns.heatmap(data=corr, vmin=0, vmax=1, cmap="hot", square=True, annot=True, ax=ax)
ax.set_title("Feature correlation matrix")
plt.show()

In [None]:
# Share of impressions that were viewable, forced to 0 where total_impressions
# is 0 (the same rule weird_division applies).
# Column-wise arithmetic replaces the row-wise df.apply(axis=1), which ran a
# Python-level call for every row; the resulting values are the same.
total_impressions = df['total_impressions']
df['view/total'] = (df['viewable_impressions'] / total_impressions).where(total_impressions != 0, 0)
# The denominator column is removed after the ratio has been derived.
df.drop(['total_impressions'], axis = 1, inplace=True)

In [None]:
# Trim CPM outliers in place: negative values and everything at or above the
# 95th percentile. Both label sets are computed before any row is removed.
negative_cpm_rows = df.index[df['CPM'] < 0]
cpm_cap = df['CPM'].quantile(q=0.95)
capped_cpm_rows = df.index[df['CPM'] >= cpm_cap]
df.drop(list(negative_cpm_rows) + list(capped_cpm_rows), inplace=True)

In [None]:
# Distinct dates remaining in the data after outlier removal.
df['date'].unique()

In [None]:
# Calendar feature: weekday number of each row's date (Monday=0 ... Sunday=6).
df['day_of_week'] = df['date'].dt.weekday
df.head()

In [None]:
# Chronological hold-out: rows dated before the cutoff form the training set,
# rows dated on or after it form the test set.
split_date = datetime(2019, 6, 22)

before_split = df['date'] < split_date
from_split = df['date'] >= split_date
non_feature_columns = ['date', 'CPM']

# Features: everything except the date and the CPM target.
X_train = df.loc[before_split].drop(columns=non_feature_columns)
X_test = df.loc[from_split].drop(columns=non_feature_columns)

# Targets are kept as single-column DataFrames.
y_train = df.loc[before_split, ['CPM']]
y_test = df.loc[from_split, ['CPM']]

X_train.shape, y_train.shape, X_test.shape, y_test.shape

In [None]:
%%time

# Base regressor shared by every candidate in the search: GPU training,
# RMSE as both objective and reported metric, 3000 boosting iterations.
search_settings = {
    'task_type': "GPU",
    'devices': '0:1',
    'loss_function': 'RMSE',
    'eval_metric': 'RMSE',
    'iterations': 3000,
    'metric_period': 100,
}
cat = CatBoostRegressor(**search_settings)

# Hyper-parameter grid: 5 x 2 x 4 x 3 = 120 combinations.
grid = {
    'learning_rate': [0.1, 0.15, 0.2, 0.25, 0.3],
    'max_depth': [10, 12],
    'l2_leaf_reg': [3, 5, 7, 9],
    'border_count': [180, 200, 220],
}

# Exhaustive search over the grid, with train_size=0.8 passed to CatBoost for
# its internal train/validation split of the training data.
grid_search_result = cat.grid_search(
    grid,
    X=X_train.values,
    y=y_train.values,
    train_size=0.8,
    plot=True,
    verbose=50,
)

In [None]:
# Show the hyper-parameter combination stored under "params" in the grid-search result.
grid_search_result["params"]

In [None]:
%%time

best_params = grid_search_result["params"]

# Best-iteration selection (use_best_model=True) must not see the test set.
# Passing (X_test, y_test) as eval_set picks the iteration count on the very
# data that is scored afterwards, which makes the reported test error
# optimistic. Instead, hold out the most recent ~20% of the training period
# (by date) as a validation set, mirroring the chronological train/test split.
# Trade-off: the final model is fitted on the earlier part of the training
# period only.
train_dates = df.loc[X_train.index, 'date']
val_cutoff = train_dates.quantile(0.8)
fit_rows = (train_dates < val_cutoff).values
val_rows = ~fit_rows
assert fit_rows.any() and val_rows.any(), \
    "validation split is degenerate: training dates do not span the 80% cutoff"

X_fit, y_fit = X_train.values[fit_rows], y_train.values[fit_rows]
X_val, y_val = X_train.values[val_rows], y_train.values[val_rows]

# Final regressor: tuned hyper-parameters, a larger iteration budget, and the
# best iteration chosen on the validation slice.
cat_reg = CatBoostRegressor(**best_params,
    iterations=10000,
    metric_period=100,
    use_best_model=True,
    loss_function='RMSE',
    task_type="GPU",
    devices='0:1',
    )

cat_reg.fit(X_fit, y_fit, verbose=False, plot=False, eval_set=(X_val, y_val))

# Predictions for the whole training period and for the untouched test period.
model = cat_reg
train_preds = model.predict(X_train.values)
test_preds = model.predict(X_test.values)

In [None]:
# Hold-out error. This is the mean squared error, not the RMSE the model was trained with.
test_mse = mean_squared_error(y_test.values, test_preds)
print(f'MSE: {test_mse:.2f}')