In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import TimeSeriesSplit
from lightgbm import LGBMRegressor

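Read the data and compute the CPM target: `total_revenue * 100 / measurable_impressions * 1000`, set to 0 when `measurable_impressions` is 0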

In [None]:
# Load the auction dataset from the Kaggle input directory
DATA_PATH = '/kaggle/input/real-time-advertisers-auction/Dataset.csv'
df = pd.read_csv(DATA_PATH)

def weird_division(n, d):
    """Divide ``n`` by ``d``, returning 0 instead of failing when ``d`` is falsy.

    Args:
        n: numerator.
        d: denominator; any falsy value (e.g. 0 or 0.0) short-circuits to 0.

    Returns:
        ``n / d`` when ``d`` is truthy, otherwise the integer 0.
    """
    if not d:
        return 0
    return n / d

# CPM target: (total_revenue * 100) / measurable_impressions * 1000.
# Computed on whole columns instead of a per-row Python lambda; rows with zero
# measurable impressions get CPM 0 (the raw quotient there is inf/NaN and is
# replaced by the mask), matching the "return 0 when the divisor is 0" rule.
df['CPM'] = (
    ((df['total_revenue'] * 100) / df['measurable_impressions'])
    .where(df['measurable_impressions'] != 0, 0)
    * 1000
)

Drop total_revenue and non-informative columns

In [None]:
# total_revenue is dropped because CPM was derived from it; the other two
# columns are dropped as non-informative.
df = df.drop(columns=['total_revenue', 'integration_type_id', 'revenue_share_percent'])

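Train/holdout split by date (the holdout starts on 2019-06-22); in each part keep only rows with non-negative CPM below that part's own 95th CPM percentile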

In [None]:
# Parse the dates and order the rows chronologically, so that positional
# (time-series) splits of the training rows later on follow date order.
df['date'] = pd.to_datetime(df['date'])
df = df.sort_values('date')

# First day of the holdout period. Written in ISO format so the day and month
# cannot be confused by the parser.
SPLIT_DATE = pd.to_datetime('2019-06-22')
# Rows at or above this CPM quantile are treated as outliers and removed.
CPM_UPPER_QUANTILE = .95


def filter_cpm(frame, upper_quantile=CPM_UPPER_QUANTILE):
    """Keep rows of ``frame`` with 0 <= CPM < the ``upper_quantile`` CPM quantile.

    The quantile is computed on ``frame`` itself, after rows with negative CPM
    have been removed.

    Args:
        frame: DataFrame with a numeric 'CPM' column.
        upper_quantile: quantile (0..1) used as the exclusive upper CPM bound.

    Returns:
        An independent copy of the filtered rows, so that columns can be added
        to it later without pandas' SettingWithCopyWarning.
    """
    non_negative = frame[frame['CPM'] >= 0]
    threshold = non_negative['CPM'].quantile(upper_quantile)
    return non_negative[non_negative['CPM'] < threshold].copy()


# Train on everything before the split date, hold out everything from it onwards.
df_train = filter_cpm(df[df['date'] < SPLIT_DATE])
df_test = filter_cpm(df[df['date'] >= SPLIT_DATE])

Target mean encoding of the categorical `*_id` features, using per-category CPM means computed on the training set only

In [None]:
# Select the categorical id columns up front. Matching on the '_id' suffix
# (rather than '_id' anywhere in the name) keeps the generated '<col>_mean'
# columns from being encoded again if this cell is re-run.
id_columns = [name for name in df_train.columns if name.endswith('_id')]

# NOTE(review): each training row's own CPM contributes to the mean it receives,
# and the means cover the whole training set, including rows later used for
# validation during the hyperparameter search.
for col in id_columns:
    # Mean CPM per category value, learned from the training rows only
    train_means = df_train.groupby(col)['CPM'].mean()
    df_train[col + '_mean'] = df_train[col].map(train_means)
    # Category values absent from the training set have no mean and map to NaN
    df_test[col + '_mean'] = df_test[col].map(train_means)

Preparing for fit/predict

In [None]:
# CPM is the target; it and the raw date are excluded from the feature matrices
NON_FEATURE_COLUMNS = ['CPM', 'date']

x, y = df_train.drop(columns=NON_FEATURE_COLUMNS), df_train['CPM']
holdout_x, holdout_y = df_test.drop(columns=NON_FEATURE_COLUMNS), df_test['CPM']

Default LGBM 

In [None]:
# Baseline: LightGBM regressor with library-default hyperparameters, fitted on
# the training rows and scored on the holdout rows.
model = LGBMRegressor()
model.fit(x, y)
holdout_pred = model.predict(holdout_x)
# scikit-learn metrics take the ground truth first: (y_true, y_pred)
print('Default LGBM MSE', mean_squared_error(holdout_y, holdout_pred))

Random search over hyperparameters (20 trials, validated on the last `TimeSeriesSplit` fold)

In [None]:
SEED = 42
N_TRIALS = 20
# Seed NumPy's global generator so the sampled hyperparameters are reproducible
np.random.seed(SEED)

# Only the last split produced by TimeSeriesSplit is used for validation; the
# training indices of a split always precede its validation indices.
tscv = TimeSeriesSplit(n_splits=2)
train_index, val_index = list(tscv.split(x))[-1]

x_train, x_val = x.iloc[train_index], x.iloc[val_index]
y_train, y_val = y.iloc[train_index], y.iloc[val_index]

# One record per trial: [trial, n_estimators, max_depth, num_leaves, validation MSE]
score = []

print(" N | steps | depth | leaves | MSE")
for j in range(N_TRIALS):
    # Draw order (n_estimators, num_leaves, max_depth) determines which values
    # each trial gets for a given seed, so it must not be rearranged.
    n_estimators = np.random.randint(100, 1500)
    num_leaves = np.random.randint(10, 500)
    max_depth = np.random.randint(2, 12)

    model = LGBMRegressor(
        max_depth=max_depth,
        n_estimators=n_estimators,
        num_leaves=num_leaves,
        random_state=SEED,
        n_jobs=-1,
    )
    model.fit(x_train, y_train)
    y_pred = model.predict(x_val)

    # Evaluate once and reuse the value for both the log line and the record;
    # scikit-learn metrics take (y_true, y_pred).
    mse = mean_squared_error(y_val, y_pred)
    print(f"{j}\t{n_estimators}\t{max_depth}\t{num_leaves}\t{mse:.2f}")
    score.append([j, n_estimators, max_depth, num_leaves, mse])

Best parameters

In [None]:
# Rank every trial by its MSE (position 4 of each record) and keep the five best
top_scores = sorted(score, key=lambda trial: trial[4])[:5]
# Positions 1..3 of the winning record hold n_estimators, max_depth and num_leaves
best_n_estimators, best_max_depth, best_num_leaves = top_scores[0][1:4]
top_scores

Model with best parameters    

In [None]:
# Refit on the full training set with the best hyperparameters found by the
# random search, then score on the holdout rows.
best_model = LGBMRegressor(
    n_estimators=best_n_estimators,
    max_depth=best_max_depth,
    num_leaves=best_num_leaves,
    random_state=SEED,
)
best_model.fit(x, y)
holdout_pred = best_model.predict(holdout_x)
# scikit-learn metrics take the ground truth first: (y_true, y_pred)
print('Tuned LGBM MSE:', mean_squared_error(holdout_y, holdout_pred))
# The holdout rows were filtered to CPM >= 0, so a negative prediction can only
# add error: multiply by the boolean mask to zero out negative predictions.
holdout_pred = holdout_pred * (holdout_pred >= 0)
print('MSE with negative predictions set to 0:', mean_squared_error(holdout_y, holdout_pred))

# Final MSE = 2600