In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the auction dataset from the Kaggle input directory into a DataFrame.
dataset_path = '/kaggle/input/real-time-advertisers-auction/Dataset.csv'
df = pd.read_csv(dataset_path)

In [None]:
# Cut-off value compared against the `date` column when the data is split
# into train (date <= split_date) and test (date > split_date) sets.
split_date = '2019-06-22'

In [None]:
# Give the frame a fresh 0..n-1 index. drop=True discards the old index
# directly instead of materialising it as an "index" column and then
# dropping that column in a second step.
df = df.reset_index(drop=True)
df.head()

In [None]:
# Calculating CPM: the value that the advertisers bid for the month of June.
# CPM (the value which was the winning bid value) =
# (((revenue of the publisher*100)/revenue_share_percentage)/measurable_impressions)*1000
# NOTE(review): the computation below does not divide by revenue_share_percentage — confirm which form is intended.

def weird_division(n, d):
    """Divide ``n`` by ``d``, falling back to 0 when ``d`` is falsy.

    Args:
        n: Numerator.
        d: Denominator; any falsy value (e.g. 0 or None) short-circuits
            the division.

    Returns:
        ``n / d`` when ``d`` is truthy, otherwise the integer 0.
    """
    if not d:
        return 0
    return n / d

# CPM = (total_revenue * 100 / measurable_impressions) * 1000, computed
# column-wise instead of with a per-row Python lambda.
# Rows whose measurable_impressions is 0 get a CPM of 0 (the division result
# for those rows is inf/NaN and is replaced before scaling), matching the
# zero-denominator fallback of weird_division.
impressions = df['measurable_impressions']
revenue_x100 = df['total_revenue'] * 100
df['CPM'] = (revenue_x100 / impressions).where(impressions != 0, 0) * 1000

In [None]:
# Remove the total_revenue column from the working frame.
df = df.drop(columns='total_revenue')

In [None]:
# Preview the first rows of the frame after the column changes above.
df.head()

## Remove negative values and values at or above the 0.95 quantile

In [None]:
# Keep only the rows whose CPM is zero or positive.
non_negative_cpm = df['CPM'] >= 0
df = df.loc[non_negative_cpm]

In [None]:
# Density estimate of CPM after negative values were removed.
df['CPM'].plot.density()

In [None]:
# Keep only the rows whose CPM is strictly below the 0.95 quantile of CPM.
cpm_cutoff = df['CPM'].quantile(0.95)
df = df.loc[df['CPM'] < cpm_cutoff]

In [None]:
# Density estimate of CPM after the upper-quantile rows were removed.
df['CPM'].plot.density()

In [None]:
import math
# LOG = log(1 + CPM), computed for the whole column at once with np.log1p.
# assign() returns a new frame, so the column is not written into the
# filtered frame produced by the boolean selections in the earlier cells.
df = df.assign(LOG=np.log1p(df['CPM']))

In [None]:
# Density estimate of the log-transformed CPM column.
df['LOG'].plot.density()

In [None]:
# Names of all numeric-dtype columns, in their order within the frame.
numeric_frame = df.select_dtypes(include="number")
numeric_cols = numeric_frame.columns.tolist()

In [None]:
# One boolean per numeric column: True for every column except "LOG".
is_not_log = np.array(numeric_cols) != "LOG"
heat_mask = list(is_not_log)

In [None]:
# matplotlib is imported here because this cell is the first one to use
# `plt`; without it the cell raises NameError on a fresh kernel.
import matplotlib.pyplot as plt
import seaborn as sns

# Correlate numeric columns only: recent pandas versions raise when corr()
# is called on a frame that still contains non-numeric columns.
corr = df.select_dtypes(include="number").corr()
plt.figure(figsize=(12,12))
# heat_mask (built in an earlier cell) has one entry per numeric column.
sns.heatmap(data=corr, vmin=0, vmax=1, cmap="YlGnBu", square=True, annot=True, mask=heat_mask)
plt.show()

In [None]:
# Split by date: rows up to and including split_date go to train, later rows to test.
# NOTE(review): the comparison uses whatever dtype `date` has; if it is a
# string column the comparison is lexicographic — confirm the boundary row placement.
date_values = df['date']
train = df.loc[date_values <= split_date]
test = df.loc[date_values > split_date]

In [None]:
import matplotlib.pyplot as plt
# Overlay the LOG density of the train and test sets on one set of axes.
fig, ax = plt.subplots()
train['LOG'].plot.density(ax=ax, label="train CPM");
test['LOG'].plot.density(ax=ax, label="test CPM");
ax.legend()

In [None]:
# Date span covered by each split: (train min, train max, test min, test max).
(
    train['date'].min(),
    train['date'].max(),
    test['date'].min(),
    test['date'].max(),
)

## Prepare data

In [None]:
# Feature columns: every numeric column except the target-derived CPM and LOG.
# A list comprehension keeps the order of numeric_cols, so the feature order
# is identical on every run (iterating a set of strings is not).
excluded_columns = {'CPM', 'LOG'}
useful_columns = [col for col in numeric_cols if col not in excluded_columns]

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

# Numeric preprocessing: fill missing values with the column median, then
# rescale each feature with MinMaxScaler.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('mm_scaler', MinMaxScaler()),
]
num_pipeline = Pipeline(numeric_steps)

# Apply the numeric pipeline to the selected feature columns only.
column_transformer = ColumnTransformer([
    ("num", num_pipeline, useful_columns),
])


In [None]:
# Training target (raw CPM) and feature matrix; the transformer is fitted
# on the training features and applied to them in one step.
Y_train = train['CPM']
X_train = train[useful_columns]
X_train_prepd = column_transformer.fit_transform(X_train)

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest with 30 trees and a fixed seed, fitted on the prepared training data.
forest_params = {"n_estimators": 30, "random_state": 42}
reg = RandomForestRegressor(**forest_params)
reg.fit(X_train_prepd, Y_train)

In [None]:
# Test target and feature matrix; the already-fitted transformer is only
# applied here (transform, not fit_transform).
Y_test = test['CPM']
X_test = test[useful_columns]
X_test_prepd = column_transformer.transform(X_test)

In [None]:
# mean_squared_error is imported here because no earlier cell brings it into
# scope; without the import this cell raises NameError.
from sklearn.metrics import mean_squared_error

# Predict CPM for the test set and report the mean squared error against the true values.
Y_test_pred = reg.predict(X_test_prepd)
mean_squared_error(Y_test, Y_test_pred)