In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
from sklearn.metrics import mean_squared_error

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the auction dataset from the Kaggle input directory into a DataFrame.
dataset_path = '/kaggle/input/real-time-advertisers-auction/Dataset.csv'
df = pd.read_csv(dataset_path)

In [None]:
def weird_division(n, d):
    """Divide ``n`` by ``d``, returning ``0`` when ``d`` is falsy.

    Args:
        n: Numerator.
        d: Denominator. Any falsy value (``0``, ``0.0``, ``None``) short-circuits
            the division.

    Returns:
        ``n / d`` when ``d`` is truthy, otherwise the integer ``0``.
    """
    if not d:
        return 0
    return n / d

# Target column: CPM = (total_revenue * 100) / measurable_impressions * 1000.
# Computed on whole columns instead of a row-wise `apply`, which calls a Python
# function once per row. Rows with zero measurable impressions would divide by
# zero, so their CPM is set to 0 (same result as the row-wise safe division).
df['CPM'] = (
    (df['total_revenue'] * 100) / df['measurable_impressions'] * 1000
).where(df['measurable_impressions'] != 0, 0)

In [None]:
# Preview the first few rows of the dataset.
df.head()

In [None]:
# Count the missing values in each column.
df.isnull().sum()

In [None]:
# Summarise the frame: column names, dtypes, non-null counts and memory usage.
df.info()

In [None]:
# List the distinct values that occur in the integration_type_id column.
df['integration_type_id'].unique()

In [None]:
# Pairwise correlations between the numeric columns.
# The frame still contains the text `date` column; pandas >= 2.0 raises on
# non-numeric columns in `corr()` instead of silently dropping them, so the
# numeric/bool columns are selected explicitly first.
corr = df.select_dtypes(include=['number', 'bool']).corr()
plt.figure(figsize=(12,8))
# NOTE: with vmin=0, negative correlations are drawn in the same colour as 0.
sns.heatmap(data=corr,vmin=0, vmax=1, cmap="YlGnBu",  square=True)
plt.show()

In [None]:
# Columns removed before modelling:
# - 'integration_type_id' and 'revenue_share_percent': each holds a single value
# - 'site_id': highly correlated with 'ad_unit_id'
# - 'measurable_impressions' and 'total_revenue': used to build the target 'CPM'
# Reassign instead of using inplace=True so the frame is not mutated in place.
df = df.drop(columns=[
    'integration_type_id',
    'revenue_share_percent',
    'site_id',
    'measurable_impressions',
    'total_revenue',
])

In [None]:
# Annotated correlation heatmap of the numeric columns.
# The text `date` column is still part of the frame; pandas >= 2.0 raises on
# non-numeric columns in `corr()` instead of silently dropping them, so the
# numeric/bool columns are selected explicitly first.
corr = df.select_dtypes(include=['number', 'bool']).corr()
plt.figure(figsize=(14,10))
# NOTE: with vmin=0, negative correlations are drawn in the same colour as 0.
sns.heatmap(data=corr,vmin=0, vmax=1, cmap="YlGnBu",  square=True, annot= True)
plt.show()

In [None]:
# Remove outliers: keep only the rows whose CPM lies between the 5th and 95th
# percentiles of CPM (bounds inclusive).
cpm = df['CPM']
df = df[cpm.between(cpm.quantile(.05), cpm.quantile(.95))]

In [None]:
# Chronological split: rows dated up to and including the cutoff form the
# training set, later rows form the test set. The `date` column is compared
# against a string literal.
split_cutoff = '2019-06-21 00:00:00'
train = df.loc[df['date'] <= split_cutoff]
test = df.loc[df['date'] > split_cutoff]

# 'CPM' is the target and 'date' is only used for the split, so neither is a feature.
non_feature_cols = ['CPM', 'date']

X_train, y_train = train.drop(columns=non_feature_cols), train['CPM']

X_test, y_test = test.drop(columns=non_feature_cols), test['CPM']

In [None]:
# Gradient-boosted tree regressor fitted on the training features to predict CPM.
# Only estimator-level options are passed to the constructor:
# - `feval` / `maximize` are arguments of `xgb.train`, not of the scikit-learn
#   wrapper, so they are not passed here.
# - `n_jobs` and `verbosity` are the wrapper's names for the thread count and the
#   log level (replacing the older `nthread` and `silent`).
xg_reg = xgb.XGBRegressor(
    max_depth=10,
    colsample_bytree=0.7,
    learning_rate=0.03,
    min_child_weight=4,
    n_estimators=500,
    n_jobs=4,
    verbosity=0,
    subsample=0.7)

xg_reg.fit(X_train, y_train)

In [None]:
# Mean squared error of the fitted model's predictions on the test rows.
test_predictions = xg_reg.predict(X_test)
print(mean_squared_error(y_test, test_predictions))