In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from itertools import combinations
import lightgbm as lgb
from sklearn.metrics import mean_squared_error

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_tree = os.walk('/kaggle/input')
for dirname, _, filenames in input_tree:
    for filename in filenames:
        file_path = os.path.join(dirname, filename)
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the auction dataset; the "date" column is parsed into datetimes on read.
# Both `df` and `d_full` are bound to the same DataFrame object.
df = pd.read_csv(
    '/kaggle/input/real-time-advertisers-auction/Dataset.csv',
    parse_dates=["date"],
)
d_full = df


# 1. General information

In [None]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
d_full.shape

In [None]:
# Per-column dtype and non-null count, plus overall memory usage.
d_full.info()

In [None]:
# Descriptive summary statistics of the frame's columns (pandas defaults).
d_full.describe()

In [None]:
# Scan every column of d_full and flag the ones that carry little or no signal:
#   feats_empty - no non-null values at all
#   feats_const - exactly one distinct non-null value
#   feats_rare  - the most frequent value occurs in fewer than 5% of the rows
# The lists are not mutually exclusive: "rare" is checked independently of the
# other two, so an empty or constant column can also be reported as rare.
feats_const = []
feats_empty = []
feats_rare = []
thr = int(d_full.shape[0]*.05)
for col in d_full:
    # value_counts() ignores NaN by default, so an all-null column yields an
    # empty result; that case must be handled before asking for a maximum.
    v_counts = d_full[col].value_counts()
    n_unique = len(v_counts)
    v_max = v_counts.max() if n_unique else 0
    if n_unique == 0:
        feats_empty.append(col)
    elif n_unique == 1:
        feats_const.append(col)
    if v_max < thr:
        feats_rare.append(col)
    print(f"{col}: {n_unique}")
print(f"rare={feats_rare}")
print(f"empty={feats_empty}")
print(f"const={feats_const}")


In [None]:
# Preview the first five rows of the frame.
d_full.head(5)

# 2. Data preparation

In [None]:
# Target computation: `target` names the column used as the regression label.
target = "CPM"

# CPM is the value the advertisers bid (the winning bid value) for the month of June.
# Reference formula:
#   CPM = (((publisher revenue * 100) / revenue_share_percentage)
#          / measurable_impressions) * 1000
# NOTE(review): the CPM computation in this notebook does not divide by
# revenue_share_percentage - presumably that share is constant in this
# dataset; confirm before relying on the absolute CPM values.

def weird_division(n, d):
    """Divide ``n`` by ``d``, returning ``0`` when ``d`` is falsy.

    Parameters:
        n: numerator.
        d: denominator; any falsy value (``0``, ``0.0``, ``None``, ...) is
           treated as "no denominator".

    Returns:
        ``n / d`` for a truthy ``d``, otherwise the integer ``0``.
    """
    if not d:
        return 0
    return n / d

# CPM per row: (total_revenue * 100) / measurable_impressions * 1000.
# Rows with zero measurable impressions get a CPM of 0 instead of inf/NaN.
# This is computed column-wise; a row-wise apply(axis=1) would call a Python
# lambda once per row, which is far slower on a large frame.
revenue_scaled = d_full['total_revenue'] * 100
impressions = d_full['measurable_impressions']
# where() keeps the ratio where the denominator is non-zero and puts 0 elsewhere.
d_full['CPM'] = (revenue_scaled / impressions).where(impressions != 0, 0) * 1000

# Keep only rows with a non-negative CPM. The comparison is False for NaN, so
# rows whose CPM could not be computed are dropped as well.
d_full = d_full[d_full.CPM >= 0].reset_index(drop=True)


In [None]:
# Day-of-week feature derived from the parsed "date" column.
event_dates = d_full["date"]
d_full["wday"] = event_dates.dt.weekday

In [None]:
# Base categorical features: the raw id columns plus the derived weekday.
feats_cat = [
    'ad_type_id',
    'ad_unit_id',
    'advertiser_id',
    'device_category_id',
    'geo_id',
    'integration_type_id',
    'line_item_type_id',
    'os_id',
    'site_id',
    "wday"
]
# Base numeric features: the raw impression counts.
feats_num = [
    'measurable_impressions', 
    'total_impressions', 
    'viewable_impressions',            
]

# Interaction features are computed from d_full itself. The train/test frames
# are only created in a later cell, so reading from them here would fail on a
# fresh kernel, or silently misalign rows when a stale frame is still around.
# The loops iterate over snapshots of the base names, so that extending
# feats_num / feats_cat inside the loop bodies cannot feed back into the pairs.
base_num = tuple(feats_num)
base_cat = tuple(feats_cat)

# Numeric pairs: one ratio and one product per pair.
for arg1, arg2 in combinations(base_num, 2):
    col_trg = f"{arg1}/{arg2}"
    # A zero denominator yields inf/NaN here; nothing in this cell cleans it up.
    d_full[col_trg] = d_full[arg1]/d_full[arg2]
    feats_num.append(col_trg)
    
    col_trg = f"{arg1}*{arg2}"
    d_full[col_trg] = d_full[arg1]*d_full[arg2]
    feats_num.append(col_trg)

# Categorical pairs: the product of the two codes is used as a combined code.
# NOTE(review): different code pairs can share the same product (2*6 == 3*4),
# so this is not a unique encoding of the pair - confirm that is acceptable.
for num, den in combinations(base_cat, 2):
    col_trg = f"{num}*{den}"
    d_full[col_trg] = d_full[num]*d_full[den]
    feats_cat.append(col_trg)


# Full feature list: categorical columns first, then numeric ones.
feats = feats_cat + feats_num

In [None]:
# Time-based hold-out: rows dated before the split date form the training set,
# rows on or after it form the test set.
split_date = '2019-06-22'
row_dates = d_full["date"]
d_train = d_full.loc[row_dates < split_date].reset_index(drop=True)
d_test = d_full.loc[row_dates >= split_date].reset_index(drop=True)

# Within each split, keep only rows whose CPM lies strictly below that split's
# own 95th percentile.
train_cap = d_train["CPM"].quantile(.95)
test_cap = d_test["CPM"].quantile(.95)
d_train = d_train.loc[d_train["CPM"] < train_cap].reset_index(drop=True)
d_test = d_test.loc[d_test["CPM"] < test_cap].reset_index(drop=True)


# 3. Prediction

In [None]:
# LightGBM hyper-parameters: tree/boosting settings, L1/L2 regularisation and
# a Tweedie objective.
prm_lgb = dict(
    n_estimators=300,
    learning_rate=0.07,
    num_leaves=60,
    reg_alpha=0.5,
    reg_lambda=0.5,
    objective='tweedie',
    tweedie_variance_power=1.25,
)

est = lgb.LGBMRegressor(**prm_lgb)

# Fit on the training split; columns named in feats_cat are passed as categorical.
X_train = d_train[feats]
y_train = d_train[target]
est.fit(X_train, y_train, eval_metric=['mse'], categorical_feature=feats_cat)

In [None]:
# Mean squared error of the fitted model on the test split.
y_test = d_test[target]
y_pred = est.predict(d_test[feats])
mean_squared_error(y_true=y_test, y_pred=y_pred)
