In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import lightgbm as lgb

In [None]:
# Load the dataset CSV, parse `date` into datetimes and derive the day-of-week
# name (via `.dt.day_name()`) as a new `week_day` column.
df = (
    pd.read_csv('/kaggle/input/real-time-advertisers-auction/Dataset.csv')
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .assign(week_day=lambda frame: frame['date'].dt.day_name())
)
print(df.shape)
df.head()

In [None]:
def weird_division(n, d):
    """Return ``n / d``, or ``0`` when the denominator ``d`` is falsy (e.g. zero)."""
    if not d:
        # Guard against division by zero: a falsy denominator yields 0.
        return 0
    return n / d

# CPM per row: (total_revenue * 100) / measurable_impressions * 1000, where rows
# with zero measurable impressions get 0 instead of a division by zero.
# Computed column-wise: a row-wise `DataFrame.apply(axis=1)` invokes a Python
# function once per row and is far slower for the same result.
_measurable = df['measurable_impressions']
df['CPM'] = (
    (df['total_revenue'] * 100)
    .div(_measurable.where(_measurable != 0))  # zero denominators -> NaN, so no inf is produced
    .mul(1000)
    .mask(_measurable == 0, 0)                 # ...and exactly those rows are then set to 0
)

In [None]:
# Columns cast to pandas `category` dtype before training.
cat_feat = [
    'site_id',
    'ad_type_id',
    'geo_id',
    'device_category_id',
    'advertiser_id',
    'order_id',
    'line_item_type_id',
    'os_id',
    'integration_type_id',
    'monetization_channel_id',
    'ad_unit_id',
    'week_day',
]

# Numeric columns used directly and as inputs to the ratio features.
num_feat = [
    'total_impressions',
    'viewable_impressions',
]

In [None]:
# Build pairwise ratio features between *distinct* numeric columns.
# Only pairs with the denominator listed after the numerator are used: pairing
# a column with itself would produce a column that is 1 everywhere (NaN where
# the value is 0), which carries no information for the model.
# Zero denominators yield inf (or NaN for 0/0); those values are left as-is.
new_feats = []
for i, numerator in enumerate(num_feat):
    for denominator in num_feat[i + 1:]:
        new_feat = f'{numerator}_{denominator}'
        df[new_feat] = df[numerator] / df[denominator]
        new_feats.append(new_feat)

In [None]:
# Name of the regression target column.
target = 'CPM'
# Model inputs, in order: raw numeric columns, categorical columns, ratio features.
features = [*num_feat, *cat_feat, *new_feats]

In [None]:
# Keep only rows with a non-negative CPM (rows whose CPM is NaN fail the
# comparison and are dropped as well), then renumber the index from 0.
df = df.loc[df['CPM'] >= 0].reset_index(drop=True)

In [None]:
# Hold-out set: rows dated 2019-06-22 or later, restricted to rows whose CPM is
# strictly below the 95th percentile of the hold-out CPM values.
test = (
    df.loc[df['date'] >= pd.to_datetime('2019-06-22')]
    .loc[lambda holdout: holdout['CPM'] < holdout['CPM'].quantile(.95)]
    .reset_index(drop=True)
)

In [None]:
# Training set: rows dated before 2019-06-22, with CPM capped at the 95th
# percentile. The percentile is taken over the training window only (not the
# full frame), so no information about the hold-out period's target values
# leaks into the training filter; this also mirrors how the hold-out set
# derives its own cap from its own rows.
df_train = (
    df.loc[df['date'] < pd.to_datetime('2019-06-22')]
    .loc[lambda window: window['CPM'] <= window['CPM'].quantile(.95)]
    .reset_index(drop=True)
)

In [None]:
# Cast every categorical feature to pandas `category` dtype in both the
# training frame and the hold-out frame.
for frame in (df_train, test):
    for column in cat_feat:
        frame[column] = frame[column].astype('category')

In [None]:
# Shuffled 80/20 split of the training frame into fit and validation parts,
# with a fixed seed so the split is reproducible.
train, valid = train_test_split(
    df_train,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)

In [None]:
# LightGBM training parameters, grouped by purpose.
params = dict(
    # --- task / objective ---
    task='train',
    boosting_type='gbdt',
    objective='mse',
    metric='mse',
    # --- tree shape ---
    max_depth=11,
    num_leaves=31,
    min_child_samples=100,
    # --- learning / sampling ---
    learning_rate=0.03,
    colsample_bytree=0.7,
    bagging_fraction=0.8,
    bagging_freq=5,
    # --- runtime ---
    # NOTE(review): hard-coded thread count; confirm it suits the host machine.
    nthread=30,
    verbose=-1,
    seed=42,
    # Options tried and currently disabled:
    #   feature_fraction=0.7, lambda_l1=0.06, lambda_l2=0.1, min_gain_to_split=0.5
)

In [None]:
# Wrap the fit/validation splits as LightGBM datasets. The validation set names
# the training set as its `reference`, as the LightGBM Dataset API documents
# for validation data.
lgb_train = lgb.Dataset(train[features], train[target])
lgb_valid = lgb.Dataset(valid[features], valid[target], reference=lgb_train)

# Train for up to 15000 rounds, stopping once the validation metric has not
# improved for 100 rounds and logging the metrics every 50 rounds.
# Early stopping and logging are passed as callbacks: the
# `early_stopping_rounds=` / `verbose_eval=` keyword arguments were removed
# from `lgb.train` in LightGBM 4.0 and raise TypeError there.
gbm = lgb.train(
    params,
    lgb_train,
    num_boost_round=15000,
    valid_sets=[lgb_train, lgb_valid],
    callbacks=[
        lgb.early_stopping(stopping_rounds=100),
        lgb.log_evaluation(period=50),
    ],
)

In [None]:
# Evaluate on the hold-out set: mean squared error between the actual CPM
# values and the model's predictions (shown as the cell output).
test_true = test['CPM']
test_pred = gbm.predict(test[features])
mean_squared_error(y_true=test_true, y_pred=test_pred)