In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    input_paths = [os.path.join(dirname, filename) for filename in filenames]
    for input_path in input_paths:
        print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## predict CPM using Random Forest Regressor

Author: Roman Novoksahnov

In [None]:
# Load the auction dataset; the 'date' column is parsed into datetimes on read.
dataset_path = '/kaggle/input/real-time-advertisers-auction/Dataset.csv'
df = pd.read_csv(dataset_path, parse_dates=['date'])

### mandatory step

In [None]:
# calculate the target value 'CPM' as it was done earlier

def weird_division(n, d):
    """Divide ``n`` by ``d``, returning 0 when ``d`` is falsy.

    A falsy denominator (for example zero) short-circuits to 0, so no
    division is attempted in that case.
    """
    if not d:
        return 0
    return n / d

# Derive the CPM target from revenue and measurable impressions.
# The expression is evaluated column-wise instead of row by row through
# ``DataFrame.apply(axis=1)``; it keeps the same operation order,
# ((revenue * 100) / impressions) * 1000. Rows whose impression count is zero
# get a CPM of 0, matching what ``weird_division`` returned for them.
impressions = df['measurable_impressions']
cpm = df['total_revenue'] * 100 / impressions * 1000
df['CPM'] = cpm.where(impressions != 0, 0)

# remove 'total_revenue' now that the target has been computed from it
# (reassign instead of mutating the frame in place)
df = df.drop(columns='total_revenue')

# train/test split: rows dated up to and including the cut-off go to train,
# strictly later rows are held out as the test set
split_date = '21-Jun-2019'
train = df.loc[df['date'] <= split_date]
test = df.loc[df['date'] > split_date]

# remove negative CPM values from test, then cut everything at or above the
# 95th percentile (the percentile is computed after the negatives are removed)
test = test.loc[test['CPM'] >= 0]
test = test.loc[test['CPM'] < test['CPM'].quantile(0.95)]
test_labels = test['CPM']
# build a new frame without the target rather than dropping in place on a
# filtered slice
test = test.drop(columns='CPM')

### let's massage the data a bit

In [None]:
# remove negative CPM values from train, then cut everything at or above the
# 95th percentile (the percentile is computed after the negatives are removed)
train = train.loc[train['CPM'] >= 0]
train = train.loc[train['CPM'] < train['CPM'].quantile(0.95)]
train_labels = train['CPM']
# build a new frame without the target rather than dropping in place on a
# filtered slice
train = train.drop(columns='CPM')

# log-transform the train target, log(1 + y), with the stated aim of avoiding
# negative predictions; np.log1p is the numerically accurate form of
# np.log(1 + y) for values close to zero
train_labels = np.log1p(train_labels)

def get_reverse(x):
    """Invert the ``log(1 + y)`` target transform, i.e. return ``exp(x) - 1``.

    ``np.expm1`` is used rather than ``np.exp(x) - 1`` because it stays
    accurate when ``x`` is close to zero, where the explicit subtraction
    loses precision.

    Parameters
    ----------
    x : scalar or array-like
        Value(s) on the log scale, e.g. model predictions.

    Returns
    -------
    The value(s) mapped back to the original scale.
    """
    return np.expm1(x)

### cross-validate, fit & predict

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

def display_scores(scores):
    """Print the raw scores followed by their mean and standard deviation.

    ``scores`` must provide ``mean()`` and ``std()`` methods (for example a
    NumPy array). Nothing is returned.
    """
    # Each statistic is computed only when its line is about to be printed,
    # so the lines appear in a fixed order: scores, mean, std dev.
    report = (
        ('Scores total = ', lambda values: values),
        ('Mean = ', lambda values: values.mean()),
        ('Std dev = ', lambda values: values.std()),
    )
    for label, compute in report:
        print(label, compute(scores))
    
# Hold out the most recent part of the training history for validation
# (approx 80/20): rows up to and including the threshold date are used for
# fitting, strictly later rows for validation.
date_threshold = '18-Jun-2019'
fit_rows = train['date'] <= date_threshold
holdout_rows = train['date'] > date_threshold
X_train = train.loc[fit_rows]
X_test = train.loc[holdout_rows]
# pick the labels that belong to each part via the row index
y_train = train_labels.loc[X_train.index]
y_test = train_labels.loc[X_test.index]

In [None]:
# let's do cross-validation
from sklearn.ensemble import RandomForestRegressor

# Columns used as model inputs. The spelling 'featues_list' is kept as-is
# because later cells refer to this name.
featues_list = ['site_id', 'ad_type_id', 'geo_id', 'device_category_id',
       'advertiser_id', 'order_id', 'line_item_type_id', 'os_id',
       'integration_type_id', 'monetization_channel_id', 'ad_unit_id',
       'total_impressions', 'viewable_impressions', 'measurable_impressions', 'revenue_share_percent', ]

# A random forest is a stochastic estimator; fixing random_state makes the
# fitted model and every score below reproducible from run to run.
reg = RandomForestRegressor(n_jobs=-1, random_state=42)

model = reg.fit(X_train[featues_list], y_train)
# 5-fold cross-validation on the fitting part of the history. The targets
# passed in are y_train, so these MSE values are on the same scale as y_train
# (the log scale when the labels were log-transformed earlier).
scores = cross_val_score(model, X_train[featues_list], y_train, scoring='neg_mean_squared_error', cv=5, n_jobs=-1)
# scoring returns negated MSE, so flip the sign before displaying
display_scores(-scores)
# score of the fitted model on the same data it was trained on
print(model.score(X_train[featues_list], y_train))
# validation error: both predictions and targets are mapped back with
# get_reverse before computing the MSE
predictions = get_reverse(reg.predict(X_test[featues_list]))
print('mse = ', mean_squared_error(get_reverse(y_test), predictions))

It looks like the default RF regressor does well, so there is no need to run a grid search over its parameters.

### final predict

In [None]:
# Evaluate the final model on the held-out test set.

# Refit the regressor on the complete training set (fit and validation parts).
final_inputs = train[featues_list]
model = reg.fit(final_inputs, train_labels)

# Predict for the test rows, then map the predictions back with get_reverse.
raw_predictions = reg.predict(test[featues_list])
predictions = get_reverse(raw_predictions)

# Final MSE of the predictions against the test labels.
final_mse = mean_squared_error(test_labels, predictions)
print('mse = ', final_mse)