In [None]:
# Remove any preinstalled allennlp before installing this notebook's dependencies
# (presumably to avoid a dependency conflict -- TODO confirm the exact reason).
!pip uninstall -y allennlp

In [None]:
# Install the profiling and AutoML packages that the import cell below relies on.
# NOTE(review): versions are unpinned, so a re-run may resolve different releases.
!pip install pandas_profiling lightautoml

In [None]:
import logging
import os
import time
# Configure the root logger at INFO level with a "[timestamp] (LEVEL): message" layout.
# NOTE(review): this call sits ahead of the third-party imports below; basicConfig
# does nothing once the root logger already has handlers, so keep it this early.
logging.basicConfig(format='[%(asctime)s] (%(levelname)s): %(message)s', level=logging.INFO)

import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
import pandas_profiling
import torch

from lightautoml.automl.presets.tabular_presets import TabularAutoML
from lightautoml.dataset.roles import DatetimeRole
from lightautoml.tasks import Task

In [None]:
# Reproducibility / resource settings for the whole notebook.
RANDOM_SEED = 42     # seed for NumPy's global random generator
TORCH_THREADS = 8    # thread count passed to torch.set_num_threads

np.random.seed(RANDOM_SEED)
torch.set_num_threads(TORCH_THREADS)

### Data preparation and exploration

In [None]:
# Read every *_id column as a string instead of letting pandas infer a numeric dtype.
id_columns = [
    'site_id', 'ad_type_id', 'geo_id', 'device_category_id', 'advertiser_id', 'order_id',
    'line_item_type_id', 'os_id', 'integration_type_id', 'monetization_channel_id', 'ad_unit_id',
]

df = pd.read_csv(
    '/kaggle/input/real-time-advertisers-auction/Dataset.csv',
    dtype=dict.fromkeys(id_columns, str),
    parse_dates=[0],  # parse the first CSV column as datetimes (presumably `date`, used for the split later)
    dayfirst=True,
)
df.head()

In [None]:
def weird_division(n, d):
    """Divide ``n`` by ``d``, returning 0 instead of raising when ``d`` is falsy.

    Args:
        n: Numerator.
        d: Denominator; any falsy value (e.g. ``0`` or ``0.0``) short-circuits to 0.

    Returns:
        ``n / d`` when ``d`` is truthy, otherwise the integer ``0``.
    """
    if not d:
        return 0
    return n / d

# Derive the CPM target column: (total_revenue * 100) / measurable_impressions * 1000.
# Computed on whole columns instead of a row-wise `apply`, which calls a Python
# lambda once per row and is very slow on large frames. The operation order is
# unchanged, so the values match the row-wise version.
impressions = df['measurable_impressions']
cpm = ((df['total_revenue'] * 100) / impressions) * 1000
# Rows with zero measurable impressions get CPM = 0 (instead of inf/NaN from the division).
df['CPM'] = cpm.where(impressions != 0, 0)

In [None]:
# Number of rows in the loaded dataset.
df.shape[0]

In [None]:
# Build the exploratory profiling report for the full frame.
# `profile_report` is presumably the DataFrame method that `import pandas_profiling`
# registers -- TODO confirm against the installed version.
# NOTE(review): the report covers every row and column, so it can be slow on large data.
df.profile_report()

In [None]:
# Drop columns that must not be used as features. `total_revenue` is the numerator
# of the CPM target computed earlier, so keeping it would leak the target.
# Reassigning (rather than `inplace=True`) keeps the step an explicit transformation.
df = df.drop(columns=['total_revenue', 'integration_type_id', 'revenue_share_percent'])

# Keep only rows with a non-negative CPM. The result has to be assigned back:
# a bare `df[...]` expression builds a filtered copy and then throws it away,
# leaving `df` unfiltered.
# NOTE(review): if the data contains negative/NaN CPM rows, applying this filter
# changes the downstream row counts and scores compared with earlier runs.
df = df[df['CPM'] >= 0]

# Trim the upper tail: keep rows at or below the 95th percentile of the remaining CPM values.
df = df[df['CPM'] <= np.percentile(df['CPM'], 95)]

In [None]:
# Time-based hold-out: rows dated before the split date are used for training,
# rows on or after it form the test set.
split_date = pd.to_datetime("2019-06-22")
train_data = df[df['date'] < split_date]
test_data = df[df['date'] >= split_date]

In [None]:
# Row counts of the (train, test) partitions.
train_data.shape[0], test_data.shape[0]

In [None]:
# Preview the first five training rows.
train_data.head(n=5)

### Setup task and column metadata

In [None]:
# Regression task: MSE is used both as the training loss and as the reported metric.
task = Task(
    name='reg',
    loss='mse',
    metric='mse',
)

In [None]:
# Column roles for LightAutoML: `CPM` is the target, and `date` gets a datetime
# role configured as the base date with no seasonality and no base features.
date_role = DatetimeRole(base_date=True, seasonality=(), base_feats=False)
roles = {
    'target': 'CPM',
    date_role: 'date',
}

### Enjoy as the machine does all the work

In [None]:
%%time 

automl = TabularAutoML(task = task, 
                       timeout = 300,
                       general_params = {'nested_cv': False, 'use_algos': [['linear_l2', 'lgb', 'lgb_tuned']]},
                       reader_params = {'cv': 5, 'random_state': 42},
                       tuning_params = {'max_tuning_iter': 20, 'max_tuning_time': 50},
                       lgb_params = {'default_params': {'num_threads': 8}})
oof_pred = automl.fit_predict(train_data, roles = roles)
logging.info('oof_pred:\n{}\nShape = {}'.format(oof_pred, oof_pred.shape))

In [None]:
# Number of rows in the out-of-fold prediction array.
oof_pred.data.shape[0]

In [None]:
# Count missing values in the training target column.
train_data['CPM'].isna().sum()

In [None]:
%%time

test_pred = automl.predict(test_data)

print('TEST score: {}'.format(mean_squared_error(test_data['CPM'].values, test_pred.data[:, 0])))

### Test MSE: 2563.072643811232