In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import PCA
from catboost import CatBoostRegressor

In [None]:
# Load the auction dataset; the 'date' column is parsed into datetimes at read time.
df = pd.read_csv(
    '../input/real-time-advertisers-auction/Dataset.csv',
    parse_dates=['date'],
)

In [None]:
# Preview the first five rows of df.
df.head(n=5)

In [None]:
def weird_division(n, d):
    """Divide ``n`` by ``d``, returning 0 instead of dividing when ``d`` is falsy.

    Args:
        n: Numerator.
        d: Denominator; any falsy value (0, 0.0, None, ...) short-circuits to 0.

    Returns:
        ``n / d`` when ``d`` is truthy, otherwise the integer 0.
    """
    if not d:
        return 0
    return n / d

# CPM = (total_revenue * 100) / measurable_impressions * 1000, defined as 0
# wherever measurable_impressions is 0.
# Computed column-wise rather than with a row-by-row df.apply(axis=1), which
# calls a Python function once per row and is far slower on large frames.
# The arithmetic order is kept, so the resulting values are unchanged.
scaled_revenue = df['total_revenue'] * 100
impressions = df['measurable_impressions']
df['CPM'] = (scaled_revenue / impressions * 1000).where(impressions != 0, 0)

In [None]:
# Discard columns that the rest of the notebook does not use as features.
dropped_columns = ['total_revenue', 'revenue_share_percent', 'integration_type_id']
df = df.drop(columns=dropped_columns)

In [None]:
# Order rows chronologically, then keep only rows whose CPM is non-negative.
df = df.sort_values(by='date')
df = df.loc[df['CPM'] >= 0]

In [None]:
# Time-based split: rows dated before 2019-06-22 form the training set,
# rows dated on or after it form the test set.
threshold = pd.Timestamp(year=2019, month=6, day=22)
df_train = df.loc[df['date'] < threshold]
df_test = df.loc[df['date'] >= threshold]

In [None]:
# Trim extreme CPM values from each split.
#   test : keep rows strictly below the split's own 95th percentile
#   train: keep rows strictly below mean + 3 standard deviations
# Each mask is built from the frame it filters. The original built the masks
# from the full `df`, which only worked because pandas silently re-aligned the
# mask to the subset's index (emitting a "Boolean Series key will be reindexed"
# UserWarning).
# NOTE(review): trimming the *test* set changes what the final metric is
# evaluated on - confirm this is intended.
test_cpm_cap = df_test.CPM.quantile(q=0.95)
train_cpm_cap = df_train.CPM.mean() + 3 * df_train.CPM.std()

# .copy() makes both frames independent objects, so the column assignments
# performed on them later do not operate on a slice of `df`
# (SettingWithCopyWarning).
df_test = df_test[df_test.CPM < test_cpm_cap].copy()
df_train = df_train[df_train.CPM < train_cpm_cap].copy()

In [None]:
# Columns that are treated as categorical and one-hot encoded.
categ_features_columns = ['site_id', 'ad_type_id', 'geo_id', 'device_category_id', 
                  'advertiser_id', 'order_id', 'line_item_type_id', 'os_id', 
                  'monetization_channel_id', 'ad_unit_id']

df_train[categ_features_columns] = df_train[categ_features_columns].astype('category')
df_test[categ_features_columns] = df_test[categ_features_columns].astype('category')

# scikit-learn renamed `sparse` to `sparse_output` in 1.2 and removed the old
# keyword in 1.4, where `sparse=False` raises TypeError. Prefer the new name
# and fall back to the old one only on versions that do not know it yet.
try:
    ohe = OneHotEncoder(categories='auto', handle_unknown='ignore', sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(categories='auto', handle_unknown='ignore', sparse=False)

# The encoder is fitted on the training split only; with
# handle_unknown='ignore', categories that appear only in the test split are
# encoded as all zeros instead of raising.
categ_train_features = ohe.fit_transform(df_train[categ_features_columns])
categ_test_features = ohe.transform(df_test[categ_features_columns])

In [None]:
# The raw categorical columns are no longer needed once encoded. Resetting to a
# fresh 0..n-1 index lets later column-wise concatenations line up row by row.
df_train = df_train.drop(columns=categ_features_columns).reset_index(drop=True)
df_test = df_test.drop(columns=categ_features_columns).reset_index(drop=True)

In [None]:
# Name the one-hot columns categ_0 .. categ_{k-1} and append them, as integers,
# to the right of each split.
columns = [f'categ_{i}' for i in range(categ_train_features.shape[1])]

categ_train_frame = pd.DataFrame(categ_train_features, columns=columns, dtype=int)
categ_test_frame = pd.DataFrame(categ_test_features, columns=columns, dtype=int)

df_train = pd.concat([df_train, categ_train_frame], axis=1)
df_test = pd.concat([df_test, categ_test_frame], axis=1)

In [None]:
# Numeric input features; the CPM target is log-transformed together with them
# but is not standardized.
numerical_features_columns = ['total_impressions', 'viewable_impressions', 'measurable_impressions']
log_columns = numerical_features_columns + ['CPM']

# log1p (log(1 + x)) is applied to the features and the target in both splits.
df_train[log_columns] = np.log1p(df_train[log_columns])
df_test[log_columns] = np.log1p(df_test[log_columns])

# Standardize the numeric features using statistics from the training split only.
scaler = StandardScaler()
numer_train_features = scaler.fit_transform(df_train[numerical_features_columns])
numer_test_features = scaler.transform(df_test[numerical_features_columns])

In [None]:
# Swap the unscaled numeric columns for their standardized versions,
# named num_0 .. num_{k-1}.
df_train = df_train.drop(columns=numerical_features_columns)
df_test = df_test.drop(columns=numerical_features_columns)

columns = [f'num_{i}' for i in range(numer_train_features.shape[1])]

numer_train_frame = pd.DataFrame(numer_train_features, columns=columns)
numer_test_frame = pd.DataFrame(numer_test_features, columns=columns)

df_train = pd.concat([df_train, numer_train_frame], axis=1)
df_test = pd.concat([df_test, numer_test_frame], axis=1)

In [None]:
# Separate the target (CPM) from the model inputs; 'date' is excluded from the
# inputs as well.
non_feature_columns = ['date', 'CPM']

y_train = df_train['CPM']
X_train = df_train.drop(columns=non_feature_columns)

y_test = df_test['CPM']
X_test = df_test.drop(columns=non_feature_columns)

In [None]:
# Project the feature matrix onto 26 principal components.
# random_state is pinned so that, when scikit-learn selects a randomized SVD
# solver, repeated runs of the notebook produce the same components.
# NOTE(review): 26 is an unexplained choice - consider documenting how it was
# picked (e.g. from the explained-variance curve).
pca = PCA(n_components=26, random_state=42)

# The projection is fitted on the training split only and reused for the test split.
F_train = pca.fit_transform(X_train)
F_test = pca.transform(X_test)

In [None]:
# Gradient-boosted regressor trained on the PCA-projected features.
catboost_params = dict(n_estimators=2000, depth=10)
catbreg = CatBoostRegressor(**catboost_params)
catbreg.fit(F_train, y_train)

In [None]:
y_pred = catbreg.predict(F_test)

# The target was transformed with log1p, so its exact inverse is expm1, not exp.
# For MSE specifically, exp() gave the same number because the constant +1
# offset cancels in (truth - prediction). Using expm1 keeps the values on the
# original CPM scale, so any other metric computed from them is also correct.
cpm_true = np.expm1(y_test)
cpm_pred = np.expm1(y_pred)
mse = mean_squared_error(cpm_true, cpm_pred)

print(f'Result Mean Squared Error: {mse}')