In [None]:
import pandas as pd
import numpy as np
from matplotlib.pyplot import figure
import matplotlib as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from catboost import CatBoostRegressor

In [None]:
# Load the raw auction records from the Kaggle-mounted dataset directory.
DATASET_CSV = "/kaggle/input/real-time-advertisers-auction/Dataset.csv"
data = pd.read_csv(DATASET_CSV)

In [None]:
# Preview the first five rows to sanity-check the load.
data.head(n=5)

In [None]:
# Print a summary of the frame: column names, dtypes, non-null counts and memory usage.
data.info()

In [None]:
# Parse the `date` column into pandas datetimes.
# `pd.to_datetime` is used instead of `.astype('datetime64')` because pandas 2.x
# rejects the unit-less 'datetime64' dtype string with a TypeError.
data['date'] = pd.to_datetime(data['date'])

In [None]:
# calculate cpm
def weird_division(n, d):
    """Return ``n / d``, or ``0`` when the divisor ``d`` is falsy (e.g. ``0`` or ``None``)."""
    if not d:
        return 0
    return n / d

# CPM as used in this notebook: (total_revenue * 100) / measurable_impressions * 1000.
# Rows with zero measurable impressions get a CPM of 0 rather than inf/NaN.
# Computed column-wise: a row-by-row `apply(axis=1)` with a Python lambda produces the
# same values but is far slower on a large frame.
impressions = data['measurable_impressions']
revenue_per_impression = (data['total_revenue'] * 100) / impressions
data['cpm'] = revenue_per_impression.where(impressions != 0, 0) * 1000

In [None]:
# The two columns the target was derived from are removed so they cannot be used as features.
cpm_source_columns = ['total_revenue', 'measurable_impressions']
data = data.drop(columns=cpm_source_columns)

In [None]:
# Chronological hold-out: rows dated before 2019-06-22 form the training split,
# rows dated on or after it form the test split.
is_train_period = data['date'] < '2019-06-22'
is_test_period = data['date'] >= '2019-06-22'
train = data.loc[is_train_period]
test = data.loc[is_test_period]

In [None]:
# Remove outliers from the test split: keep rows whose CPM is non-negative and
# strictly below the split's own 95th-percentile CPM.
max_cpm = test['cpm'].quantile(q=.95)
test_cpm = test['cpm']
cpm_in_range = (test_cpm >= 0) & (test_cpm < max_cpm)
test = test.loc[cpm_in_range]

In [None]:
# Replace the raw date with a numeric feature: whole days elapsed since the earliest
# date in the full dataset.
# `Series.min()` is vectorised and skips NaT, unlike the builtin `min()` over a Series.
min_date = data['date'].min()
# `assign` returns a new frame, so no column is written onto a `.loc` slice of `data`
# (which would raise SettingWithCopyWarning).
train = train.assign(days=(train['date'] - min_date).dt.days).drop(columns='date')

In [None]:
# Same day-offset feature for the test split, measured from the same `min_date` origin.
# `assign` builds a new frame instead of writing a column onto a filtered slice,
# which avoids SettingWithCopyWarning.
test = test.assign(days=(test['date'] - min_date).dt.days).drop(columns='date')

In [None]:
# Drop training rows whose CPM reaches the outlier ceiling.
# NOTE(review): `max_cpm` is the 95th percentile of the *test* split's CPM and, unlike
# the test filter, no lower bound (cpm >= 0) is applied here - confirm both are intended.
train_below_ceiling = train['cpm'] < max_cpm
train = train[train_below_ceiling]

In [None]:
# Separate the regression target (`cpm`) from the feature columns of each split.
X_train, y_train = train.drop(columns='cpm'), train['cpm']
X_test, y_test = test.drop(columns='cpm'), test['cpm']

In [None]:
columns = X_train.columns
ss = StandardScaler()

# Standardise the features: statistics are fitted on the training split only and then
# reused unchanged for the test split.
# The scaler returns bare arrays, so the frames are rebuilt with their original row index.
# `y_train` / `y_test` still carry that index, and a fresh RangeIndex would leave features
# and targets misaligned for any label-based operation.
X_train = pd.DataFrame(ss.fit_transform(X_train), columns=columns, index=X_train.index)
X_test = pd.DataFrame(ss.transform(X_test), columns=columns, index=X_test.index)

In [None]:
# Preview the first five rows of the standardised training features.
X_train.head(n=5)

In [None]:
# Baseline: ordinary least-squares regression, scored by mean squared error on the test split.
lm = LinearRegression().fit(X_train, y_train)
predict = lm.predict(X_test)
MSE = mean_squared_error(y_true=y_test, y_pred=predict)
print(f"MSE for linear model: {MSE:.2f}")

In [None]:
# Gradient-boosted trees via CatBoost, optimised and monitored on RMSE.
# Not set (library defaults apply): depth, l2_leaf_reg, cat_features.
catboost_model = CatBoostRegressor(
    loss_function='RMSE',
    eval_metric='RMSE',
    learning_rate=0.05,
    n_estimators=2000,
)

In [None]:
# Best-iteration selection (`use_best_model=True`) must not see the test split: choosing
# the iteration on the test rows and then scoring on those same rows gives an
# optimistically biased test MSE. Validate on the most recent ~10% of *training* rows
# (ordered by the `days` feature) instead, which mirrors the chronological train/test split.
VALIDATION_FRACTION = 0.1
rows_by_day = np.argsort(X_train['days'].to_numpy(), kind='stable')
n_validation = max(1, int(len(rows_by_day) * VALIDATION_FRACTION))
fit_rows, validation_rows = rows_by_day[:-n_validation], rows_by_day[-n_validation:]

# Positional (`iloc`) selection keeps features and target paired row-for-row
# regardless of the index labels each object carries.
catboost_model.fit(
    X_train.iloc[fit_rows], y_train.iloc[fit_rows],
    use_best_model=True,
    eval_set=(X_train.iloc[validation_rows], y_train.iloc[validation_rows]),
    logging_level='Silent',
)

In [None]:
# Score the fitted CatBoost model on the test split with the same MSE metric as the baseline.
predict = catboost_model.predict(X_test)
MSE_catboost = mean_squared_error(y_true=y_test, y_pred=predict)
print(f"MSE for catboost model: {MSE_catboost:.2f}")