In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

import lightgbm as lgb

In [None]:
# Load the auction dataset; the `date` column is parsed into datetimes at read time.
DATA_PATH = '/kaggle/input/real-time-advertisers-auction/Dataset.csv'
df = pd.read_csv(DATA_PATH, parse_dates=['date'])

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

In [None]:
# Derive calendar features from the parsed `date` column:
# day of the month and day of the week.
date_parts = df['date'].dt
df['day'] = date_parts.day
df['dayofweek'] = date_parts.dayofweek

In [None]:
# Calculating CPM: the value that the advertisers bid for the month of June.
# CPM (the value which was the winning bid value) =
# (((revenue of the publisher*100)/revenue_share_percentage)/measurable_impressions)*1000
# NOTE(review): the computation below does not divide by revenue_share_percentage -- confirm this is intended.

def weird_division(n, d):
    """Divide ``n`` by ``d``, returning 0 instead of dividing when ``d`` is falsy.

    Args:
        n: Numerator.
        d: Denominator; any falsy value (e.g. 0, 0.0, None) short-circuits to 0.

    Returns:
        ``n / d`` when ``d`` is truthy, otherwise the integer 0.
    """
    if not d:
        return 0
    return n / d

# CPM = ((total_revenue * 100) / measurable_impressions) * 1000, computed
# column-wise instead of with a row-by-row df.apply (one Python call per row).
# Rows with zero measurable impressions get CPM 0: the zero mask is applied
# after the division, overwriting the inf/NaN that dividing by zero produces.
# Missing impression counts still propagate as NaN.
measurable_impressions = df['measurable_impressions']
df['CPM'] = (
    (df['total_revenue'] * 100)
    .div(measurable_impressions)
    .where(measurable_impressions != 0, 0)
    * 1000
)

In [None]:
# CPM was derived from total_revenue, so the raw revenue column is removed
# before the frame is used for modelling.
df = df.drop(columns=['total_revenue'])

In [None]:
# Time-based hold-out: rows dated on/after the cutoff form the test set,
# everything strictly before it is used for training.
print(df.shape)
split_date = '2019-06-22'
on_or_after_split = df['date'] >= split_date
before_split = df['date'] < split_date
test = df[on_or_after_split]
train = df[before_split]
train.shape, test.shape

In [None]:
# Trim the test set: drop negative CPM rows, then drop everything at or above
# the 95th percentile of the remaining CPM values.
# The printed quantile is taken before the negative rows are removed; the
# cut-off used for filtering is taken after.
print(test.shape)
print(test.CPM.quantile(.95))
test = test.loc[test['CPM'] >= 0]
test_cpm_cap = test['CPM'].quantile(.95)
test = test.loc[test['CPM'] < test_cpm_cap]
print(test.shape)

In [None]:
# Trim the training set the same way: drop negative CPM rows, then drop
# everything at or above the 95th percentile of the remaining CPM values.
# The printed quantile is taken before the negative rows are removed; the
# cut-off used for filtering is taken after.
print(train.shape)
print(train.CPM.quantile(.95))
train = train.loc[train['CPM'] >= 0]
train_cpm_cap = train['CPM'].quantile(.95)
train = train.loc[train['CPM'] < train_cpm_cap]
print(train.shape)

In [None]:
# Preview the training rows that remain after the CPM filtering.
train.head()

In [None]:
# Identifier columns that are passed to LightGBM as categorical features.
cat_features = [
    'site_id',
    'ad_type_id',
    'geo_id',
    'device_category_id',
    'advertiser_id',
    'line_item_type_id',
    'os_id',
    'integration_type_id',
    'monetization_channel_id',
    'ad_unit_id',
    'order_id',
]

In [None]:
# order_id seems to leak the target, but only the final score matters here.
# Hold out 15% of the training rows for validation; `date` and the target
# `CPM` are excluded from the feature matrix.
features = train.drop(columns=['date', 'CPM'])
target = train['CPM']
X_tr, X_va, y_tr, y_va = train_test_split(features, target, test_size=0.15, random_state=2020)

X_tr.shape, X_va.shape

In [None]:
#feature selection
# Fit a quick LightGBM baseline on the training split and rank the features by
# permutation importance measured on the validation split. Missing values are
# replaced by -1 for both fitting and scoring.

import eli5
from eli5.sklearn import PermutationImportance

reg = lgb.LGBMRegressor(n_estimators=256).fit(X_tr.fillna(-1), y_tr)
# Seed the column shuffling (same seed as the train/validation split) so the
# importance weights are reproducible from one run to the next.
perm = PermutationImportance(reg, random_state=2020).fit(X_va.fillna(-1), y_va)
eli5.show_weights(perm, feature_names = X_va.columns.tolist())

In [None]:
# Turn the permutation importances into a DataFrame and keep only the features
# whose importance weight exceeds 0.001.
selected = eli5.explain_weights_df(perm, feature_names=list(X_va.columns))
is_important = selected['weight'] > 0.001
features_selected = selected.loc[is_important, 'feature'].values
len(features_selected), features_selected

In [None]:
# Restrict the categorical feature list to the columns that survived feature
# selection, preserving the original order.
cat_features_fixed = [c for c in cat_features if c in features_selected]

cat_features = cat_features_fixed

In [None]:
# Wrap the selected features in LightGBM datasets; the validation set is tied
# to the training set through `reference`.
tr_data = lgb.Dataset(X_tr[features_selected], label=y_tr, categorical_feature=cat_features)
va_data = lgb.Dataset(X_va[features_selected], label=y_va, categorical_feature=cat_features, reference=tr_data)

parameters = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'mse',
    'feature_fraction': 0.8,
    # NOTE(review): per the LightGBM docs bagging only takes effect when
    # bagging_fraction < 1 is also set -- confirm whether bagging was intended.
    'bagging_freq': 5,
    'verbose': 50,
}

# Early stopping and periodic logging are passed as callbacks: the
# `early_stopping_rounds` / `verbose_eval` keyword arguments of lgb.train were
# removed in LightGBM 4.0 and raise a TypeError there. Older releases name the
# logging callback `print_evaluation`, so fall back to it when
# `log_evaluation` is not available.
log_evaluation = getattr(lgb, 'log_evaluation', None) or lgb.print_evaluation

# Train for up to 10000 rounds, stopping once the validation metric has not
# improved for 100 rounds; the metric is logged every 50 rounds.
model = lgb.train(parameters,
                  tr_data,
                  valid_sets=[va_data],
                  num_boost_round=10000,
                  callbacks=[lgb.early_stopping(stopping_rounds=100),
                             log_evaluation(period=50)])

In [None]:
%matplotlib inline
lgb.plot_importance(model, importance_type='gain', figsize=(15,15));

In [None]:
# Display the raw (unclipped) model predictions for the hold-out test rows.
model.predict(test[features_selected])

In [None]:
# clip predictions below zero
# The test targets were filtered to CPM >= 0, so negative predictions are
# raised to 0. Series.clip does this in one vectorized pass instead of calling
# a Python lambda per element; the result is returned as a NumPy array.
y_te = pd.Series(model.predict(test[features_selected])).clip(lower=0).values

In [None]:
# Display the clipped test-set predictions.
y_te

In [None]:
# Mean squared error of the clipped predictions on the hold-out test set,
# printed together with whether it is below the 4850 threshold.
mse_threshold = 4850
score = mean_squared_error(y_true=test['CPM'], y_pred=y_te)
print(score, score < mse_threshold)