In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = (os.path.join(dirname, filename) for filename in filenames)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Location of the auction dataset CSV, spelled out as <input root>/<dataset>/<file>.
fn = (
    '/kaggle/input/'
    'real-time-advertisers-auction/'
    'Dataset.csv'
)

In [None]:
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

#### Description

The dataset provided to you contains data for several websites owned by the same company. They are asking for your help in deciding what approach they should take to setting reserve prices, and what range of reserve prices they should set for July. The data covers only the actual revenue generated; it is not at bid level. The dataset has the following columns:

    Date
    site_id : each id denotes a different website
    adtypeid : each id denotes a different ad_type. These can be display ads, video ads, text ads, etc.
    geo_id : each id denotes a different country. Most of our traffic comes from English-speaking countries
    devicecategoryid : each id denotes a different device_category, such as desktop, mobile, or tablet
    advertiser_id: each id denotes a different bidder in the auction
    order_id : can be ignored
    lineitemtype_id : can be ignored
    osid : each id denotes a different operating system, for the mobile device category only (Android, iOS, etc.). For all other device categories, osid will correspond to not_mobile
    integrationtypeid : it describes how the demand partner is setup within a publisher's ecosystem - can be adserver (running through the publisher adserver) or hardcoded
    monetizationchannelid : it describes the mode through which demand partner integrates with a particular publisher - it can be header bidding (running via prebid.js), dynamic allocation, exchange bidding, direct etc
    adunitid - each id denotes a different ad unit (one page can have more than one ad units)
    total_impressions - measurement column measuring the impressions for the particular set of dimensions
    total_revenue - measurement column measuring the revenue for the particular set of dimensions
    viewable_impressions - Number of impressions on the site that were viewable out of all measurable impressions. A display ad is counted as viewable if at least 50% of its area was displayed on screen for at least one second
    measurable_impressions - Impressions that were measurable by Active View out of the total number of eligible impressions. This value should generally be close to 100%. For example, an impression that is rendering in a cross-domain iframe may not be measurable.
    Revenuesharepercent - not every advertiser passes all of the revenue on to the publisher; they charge a certain share for the services they provide. This column captures the fraction of revenue that actually reaches the publisher's pocket.


#### DATA

In [None]:
# Read the CSV at `fn` into a DataFrame, then print its column/dtype summary.
data = pd.read_csv(filepath_or_buffer=fn)
data.info()

In [None]:
# Show the first five rows as the cell's rich output.
data.head(n=5)

#### CPM

In [None]:
def weird_division(n, d):
    """Return ``n / d``, or ``0`` when the denominator ``d`` is falsy (e.g. zero).

    A NaN denominator is truthy, so it is divided through and yields NaN.
    """
    if not d:
        return 0
    return n / d
def _vectorized_cpm(total_revenue, measurable_impressions):
    """Compute ``total_revenue * 100 / measurable_impressions * 1000`` column-wise.

    Rows whose impression count is zero get 0 instead of a division by zero.
    NaN inputs propagate as NaN, exactly as a per-row ``n / d if d else 0``
    would behave (NaN is truthy, so it is divided through).

    Parameters
    ----------
    total_revenue, measurable_impressions : pandas.Series
        Numeric columns sharing the same index.

    Returns
    -------
    pandas.Series
        CPM values aligned to the input index.
    """
    has_impressions = measurable_impressions != 0
    # Swap zero denominators for a dummy 1 so no division by zero is ever
    # evaluated; those rows are overwritten with 0 by the final `.where`.
    safe_impressions = measurable_impressions.where(has_impressions, 1)
    cpm = total_revenue * 100 / safe_impressions * 1000
    return cpm.where(has_impressions, 0.0)


# Whole-column arithmetic replaces a row-by-row DataFrame.apply with a Python
# lambda, which is far slower on a large frame and gives the same values.
data['CPM'] = _vectorized_cpm(data['total_revenue'], data['measurable_impressions'])

# Name of the regression target column.
label = 'CPM'

#### FEATURES

In [None]:
# Columns that must not be fed to the model: the target itself, the date,
# and the measurement columns listed here.
excluded_columns = {
    label,
    'date',
    'CPM',
    'total_revenue',
    'measurable_impressions',
    'viewable_impressions',
    'revenue_share_percent',
    'total_impressions',
}
# Keep the original column order; only membership in the exclusion set matters.
features = [column for column in data.columns if column not in excluded_columns]
features

#### SPLIT and CLEAR

In [None]:
import datetime as DT

# First day of the hold-out period: rows dated on or after it go to `test`,
# earlier rows go to `train`.
textEnd = "2019-06-22 00:00:00"
dateEnd = DT.datetime.strptime(textEnd, '%Y-%m-%d  %H:%M:%S').date()

# Parse the whole column in one vectorized call rather than one strptime call
# per row. The result is still an object array of datetime.date values, so the
# comparisons against `dateEnd` below work exactly as before.
dates = pd.to_datetime(data['date']).dt.date.to_numpy()

in_train_period = dates < dateEnd
in_test_period = dates >= dateEnd

# Cap each period at its own 95th-percentile CPM to drop extreme outliers.
max_train_cpm = data.loc[in_train_period, 'CPM'].quantile(.95)
print(f"Train: value 95 quantile = {max_train_cpm}")
# NOTE(review): unlike the test filter below, the train filter does not drop
# negative CPM rows -- confirm whether that asymmetry is intended.
train_idx = in_train_period & (data['CPM'] <= max_train_cpm)

max_test_cpm = data.loc[in_test_period, 'CPM'].quantile(.95)
print(f"Test: value 95 quantile = {max_test_cpm}")
test_idx = in_test_period & (data['CPM'] >= 0) & (data['CPM'] <= max_test_cpm)

train = data[train_idx]
test = data[test_idx]
print(f"negative CPM data={(data['CPM']<0).sum()}")

print(f"size(train)={train.shape}, size(test)={test.shape}")

#### FIT-PREDICT simple LGBMRegressor

In [None]:
n_estimators=500
max_depth=9
# Fixed seed so the random train/validation split (and therefore the
# validation score) is reproducible across Restart & Run All.
SEED = 42

# X_test / y_test here are a random 30% *validation* slice of the training
# period; the chronological hold-out lives in `test`.
X_train, X_test, y_train, y_test = train_test_split(
    train[features], train[label], test_size=0.3, random_state=SEED
)
model = LGBMRegressor(n_estimators=n_estimators,max_depth=max_depth)
model.fit(X_train,y_train)

# Mean squared error on the validation slice.
pred = model.predict(X_test)
val_score = mean_squared_error(y_test,pred)

# Mean squared error on the chronological hold-out period.
pred = model.predict(test[features])
test_score = mean_squared_error(test[label],pred)
print(f"Score: val = {val_score}, test = {test_score}")

In [None]:
# Refit with the same hyper-parameters on every filtered training row (no
# validation split), then score on the hold-out period with mean squared error.
model = LGBMRegressor(max_depth=max_depth, n_estimators=n_estimators).fit(
    train[features], train[label]
)
pred = model.predict(test[features])
test_score = mean_squared_error(y_true=test[label], y_pred=pred)
print(f"Score: test = {test_score}")