In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, PoissonRegressor
from lightgbm import log_evaluation
from lightgbm.sklearn import LGBMRegressor

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the raw auction data; `date` is parsed to datetime for the time-based split below.
DATASET_CSV = '/kaggle/input/real-time-advertisers-auction/Dataset.csv'
df = pd.read_csv(DATASET_CSV, parse_dates=['date'])

def weird_division(n, d):
    """Return ``n / d``, or ``0`` when the denominator ``d`` is falsy (e.g. zero)."""
    if not d:
        return 0
    return n / d

# Target = total_revenue * 100 / measurable_impressions * 1000, with 0 wherever
# measurable_impressions is 0 (the same rule as `weird_division`).
# Computed column-wise instead of with a row-by-row `df.apply`, which is far slower
# on a large frame; the operation order is kept so the values are unchanged.
# NOTE(review): presumably a CPM-style metric; confirm the units of total_revenue.
has_impressions = df['measurable_impressions'] != 0
# Zero denominators are masked to NaN first so no division by zero is performed.
revenue_per_impression = (df['total_revenue'] * 100).div(
    df['measurable_impressions'].where(has_impressions)
)
df['target'] = revenue_per_impression.where(has_impressions, 0) * 1000

# Drop the columns the target is derived from so they cannot leak into the features.
df = df.drop(columns=[
    'total_revenue',
    'revenue_share_percent'
])

In [None]:
# Day of the week taken from the timestamp; `weekday` is the pandas alias of `dayofweek`.
df['dayofweek'] = df['date'].dt.weekday

In [None]:
# Preview the first rows of the frame after adding `target` and `dayofweek`.
df.head()

In [None]:
VAL_DATE = '2019-06-20'
TEST_DATE = '2019-06-22'
TARGET_UPPER_QUANTILE = 0.95


def _drop_target_outliers(features, target, upper_quantile=TARGET_UPPER_QUANTILE):
    """Keep only rows whose target lies in ``[0, target.quantile(upper_quantile)]``.

    The quantile is computed on the ``target`` that is passed in, i.e. separately
    for every split. Returns the filtered ``(features, target)`` pair.
    """
    keep = (target <= target.quantile(upper_quantile)) & (target >= 0)
    return features[keep], target[keep]


# Chronological split: train < VAL_DATE <= validation < TEST_DATE <= test.
train_mask = df['date'] < VAL_DATE
val_mask = (df['date'] >= VAL_DATE) & (df['date'] < TEST_DATE)
test_mask = df['date'] >= TEST_DATE

X, y = df.drop(columns=['target', 'date']), df['target']

X_train, y_train = _drop_target_outliers(X[train_mask], y[train_mask])
X_val, y_val = _drop_target_outliers(X[val_mask], y[val_mask])
# NOTE(review): the test split is trimmed by its own upper quantile as well, so the
# reported test MSE excludes the largest targets.
X_test, y_test = _drop_target_outliers(X[test_mask], y[test_mask])

# Train + validation rows, used to refit the final model on more data.
X_train_val = pd.concat([X_train, X_val])
y_train_val = pd.concat([y_train, y_val])

In [None]:
# Distribution of the trimmed training target; the trailing `;` hides the repr of the last call.
fig, ax = plt.subplots()
ax.hist(y_train, bins=50)
ax.set(title='Training target distribution', xlabel='target', ylabel='count');

## 1. LinearRegression (baseline)

In [None]:
# Columns that are standardized by the preprocessing step.
numeric_features = ['measurable_impressions', 'total_impressions', 'viewable_impressions']

# ID columns plus the day of week, all handled as categorical features.
categorical_features = [
    'site_id', 'ad_type_id', 'geo_id', 'device_category_id',
    'advertiser_id', 'os_id', 'monetization_channel_id', 'ad_unit_id',
    'order_id', 'line_item_type_id', 'integration_type_id', 'dayofweek',
]
# Numeric columns are standardized; categorical columns are one-hot encoded, and
# categories not seen during fit are ignored (handle_unknown='ignore').
numeric_transformer = Pipeline([('scaler', StandardScaler())])
categorical_transformer = Pipeline([('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# Baseline model: the shared preprocessing followed by ordinary least squares.
lr = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression()),
])

# Fit the baseline on the training split and report MSE on the test split.
lr.fit(X_train, y_train)
# scikit-learn's signature is (y_true, y_pred); MSE is symmetric, so the value is unchanged.
mean_squared_error(y_test, lr.predict(X_test))

## 2. DNN (not the best choice for tabular data... but why not)

In [None]:
# Fit the preprocessor on the training split only, then encode both splits with it.
# Note: this is the same `preprocessor` object that is a step of the `lr` pipeline.
preprocessor.fit(X_train)
X_train_norm = preprocessor.transform(X_train)
X_val_norm = preprocessor.transform(X_val)

In [None]:
class TabularDataset(Dataset):
    """In-memory dataset of ``(features, target)`` float32 tensor pairs.

    Parameters
    ----------
    X : scipy sparse matrix or array-like, shape (n_samples, n_features)
        Feature matrix. Sparse input is densified; dense input is used as is.
    y : pandas Series or array-like, shape (n_samples,)
        Targets; stored with a trailing singleton dimension, i.e. (n_samples, 1).
    """

    def __init__(self, X, y):
        # `.toarray()` gives a plain ndarray (unlike `.todense()`, which returns np.matrix);
        # inputs without that method are treated as already dense.
        dense = X.toarray() if hasattr(X, 'toarray') else np.asarray(X)
        self.X = torch.from_numpy(dense).float()
        # np.asarray accepts both a pandas Series and a plain array.
        self.y = torch.from_numpy(np.asarray(y)).float().unsqueeze(-1)

    def __getitem__(self, index):
        return self.X[index], self.y[index]

    def __len__(self):
        return len(self.y)
    
train_dataset = TabularDataset(X_train_norm, y_train)
val_dataset = TabularDataset(X_val_norm, y_val)

TRAIN_BATCH_SIZE = 256
VAL_BATCH_SIZE = 1024

train_dataloader = DataLoader(train_dataset, batch_size=TRAIN_BATCH_SIZE, shuffle=True)
# Without an explicit batch_size a DataLoader yields one sample per step, which makes
# every validation pass extremely slow. Evaluation does not need shuffling.
# Note: a mean of per-batch losses weights a smaller final batch slightly more than
# a per-sample mean would.
val_dataloader = DataLoader(val_dataset, batch_size=VAL_BATCH_SIZE, shuffle=False)

In [None]:
class Net(nn.Module):
    """Fully connected regressor: two hidden ReLU layers and a ReLU-clamped scalar output."""

    def __init__(self, input_dim, hidden_dim):
        super().__init__()
        # Attribute names (fc1, fc2, fc4) are kept because they define the state_dict keys.
        self.fc1 = nn.Linear(in_features=input_dim, out_features=hidden_dim)
        self.fc2 = nn.Linear(in_features=hidden_dim, out_features=hidden_dim)
        self.fc4 = nn.Linear(in_features=hidden_dim, out_features=1)

    def forward(self, x):
        hidden = torch.relu(self.fc1(x))
        hidden = torch.relu(self.fc2(hidden))
        # ReLU is kept on the output as well, because the target is >= 0.
        return torch.relu(self.fc4(hidden))

In [None]:
# Seed PyTorch's global RNG so that weight initialization (and anything drawing from the
# global generator afterwards) is reproducible on Restart & Run All.
SEED = 42
torch.manual_seed(SEED)

input_dim = X_train_norm.shape[1]  # number of columns produced by the preprocessor
model = Net(input_dim, 512)
opt = optim.Adam(model.parameters(), lr=3e-4)

In [None]:
N_EPOCHS = 10

for epoch in range(N_EPOCHS):
    # Accumulate sums of squared errors and divide by the number of samples at the end
    # of the epoch, so a smaller final batch is not over-weighted in the reported loss.
    train_sq_err = 0.0
    val_sq_err = 0.0

    model.train()
    for X_train_batch, y_train_batch in train_dataloader:
        y_pred = model(X_train_batch)
        loss = F.mse_loss(y_pred, y_train_batch)  # batch mean drives the gradient step

        opt.zero_grad()
        loss.backward()
        opt.step()

        # batch mean * number of elements = batch sum of squared errors
        train_sq_err += loss.item() * y_train_batch.numel()

    model.eval()
    with torch.no_grad():
        for X_val_batch, y_val_batch in val_dataloader:
            y_pred = model(X_val_batch)
            val_sq_err += F.mse_loss(y_pred, y_val_batch, reduction='sum').item()

    epoch_train_loss = train_sq_err / len(train_dataloader.dataset)
    epoch_val_loss = val_sq_err / len(val_dataloader.dataset)
    print(f'train: {epoch_train_loss:.2f}, val: {epoch_val_loss:.2f}')
        

In [None]:
# Encode the test split with the already fitted preprocessor; `.toarray()` returns a
# plain ndarray instead of the np.matrix produced by `.todense()`.
X_test_tensor = torch.from_numpy(preprocessor.transform(X_test).toarray()).float()

# Set eval mode explicitly so inference does not depend on the mode the model was left in.
model.eval()
with torch.no_grad():
    preds = model(X_test_tensor).squeeze(-1).numpy()

mean_squared_error(y_test, preds)

## 3. LightGBM

In [None]:
# Share of exact zeros in the training target.
y_train.eq(0).mean()

In [None]:
# Tweedie objective: the target distribution is skewed and contains many zeros.
params = dict(
    learning_rate=0.1,
    n_estimators=250,
    reg_lambda=0.1,
    num_leaves=63,
    objective='tweedie',
    tweedie_variance_power=1.5,
)

In [None]:
# Train on the training split and monitor MSE on the validation split.
lgbm = LGBMRegressor(**params)
lgbm.fit(
    X_train,
    y_train,
    eval_set=[(X_val, y_val)],
    eval_metric=['mse'],
    # The `verbose=` argument of fit() was removed in LightGBM 4.0; the logging callback
    # prints the evaluation metric every 20 iterations instead.
    callbacks=[log_evaluation(period=20)],
    categorical_feature=categorical_features,
)

In [None]:
# Test MSE; arguments follow scikit-learn's (y_true, y_pred) order (the value is unchanged).
mean_squared_error(y_test, lgbm.predict(X_test))

Retrain with the same hyperparameters, but add the data that was previously held out for validation to the training set.

In [None]:
# Same hyperparameters, fitted on the train + validation rows; fit() returns the estimator.
lgbm_all_train = LGBMRegressor(**params).fit(
    X_train_val,
    y_train_val,
    categorical_feature=categorical_features,
)

In [None]:
# Test MSE of the refitted model; arguments follow scikit-learn's (y_true, y_pred) order.
mean_squared_error(y_test, lgbm_all_train.predict(X_test))

# Best MSE: 2603.4