In [1]:
import os
import numpy as np
import pandas as pd
import torch
import multiprocessing as mp¬
from lightautoml.automl.presets.tabular_presets import TabularAutoML
from lightautoml.tasks import Task

In [7]:
# --- Compute budget and reproducibility -------------------------------------
RANDOM_STATE = 42              # seed shared by numpy and the AutoML reader
N_THREADS = mp.cpu_count()     # use every CPU core reported by the OS
N_FOLDS = 10                   # number of cross-validation folds
TIMEOUT = 4_400                # AutoML time budget, in seconds

# --- Data location and target column ----------------------------------------
PATH = "data"
TARGET_NAME = "per_square_meter_price"

# --- Parameters of the deviation metric -------------------------------------
THRESHOLD = 0.15               # relative error tolerated without any penalty
NEGATIVE_WEIGHT = 1.1          # extra multiplier applied to under-predictions

In [3]:
# Seed every RNG this notebook configures so a fresh "Restart & Run All"
# starts from the same state. The original seeded only NumPy and left
# torch's generator unseeded.
np.random.seed(RANDOM_STATE)
torch.manual_seed(RANDOM_STATE)

# Let torch use as many CPU threads as there are cores.
torch.set_num_threads(N_THREADS)

In [11]:
# Read the three input files from PATH in a fixed order:
# training frame, test frame, and the submission template.
train_data, test_data, submission = (
    pd.read_csv(os.path.join(PATH, file_name))
    for file_name in ('train.csv.gz', 'test.csv.gz', 'test_submission.csv')
)

In [5]:
def deviation_metric_vec(y_true: np.ndarray, y_pred: np.ndarray, *,
                         threshold=None, negative_weight=None) -> float:
    """Mean asymmetric penalty on the relative prediction error.

    For every sample the relative deviation
    ``d = (y_pred - y_true) / max(1e-8, y_true)`` is mapped to a penalty:

    * ``|d| <= threshold``                    -> 0
    * ``d <= -4 * threshold``                 -> ``9 * negative_weight``
    * ``-4 * threshold < d < -threshold``     -> ``negative_weight * (d / threshold + 1) ** 2``
    * ``threshold < d < 4 * threshold``       -> ``(d / threshold - 1) ** 2``
    * otherwise (``d >= 4 * threshold``)      -> 9

    Args:
        y_true: Ground-truth values (array-like or scalar).
        y_pred: Predicted values, broadcastable against ``y_true``.
        threshold: Relative error tolerated without penalty. Defaults to the
            module-level ``THRESHOLD`` when omitted.
        negative_weight: Multiplier for under-predictions. Defaults to the
            module-level ``NEGATIVE_WEIGHT`` when omitted.

    Returns:
        The mean penalty over all samples (lower is better). NaN inputs
        propagate to a NaN result.
    """
    # Resolve defaults at call time so the notebook-level constants are still
    # honoured when the caller passes only (y_true, y_pred).
    if threshold is None:
        threshold = THRESHOLD
    if negative_weight is None:
        negative_weight = NEGATIVE_WEIGHT

    # Accept lists and scalars as well as arrays.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    deviation = np.atleast_1d((y_pred - y_true) / np.maximum(1e-8, y_true))

    # Start from the large over-prediction penalty. The previous
    # `deviation * 0.0 + 9` turned +inf deviations into NaN (inf * 0).
    metr = np.full(deviation.shape, 9.0)

    metr[np.abs(deviation) <= threshold] = 0.0

    metr[deviation <= -4 * threshold] = 9 * negative_weight

    mask = (-4 * threshold < deviation) & (deviation < -threshold)
    metr[mask] = negative_weight * ((deviation[mask] / threshold) + 1) ** 2

    mask = (deviation < 4 * threshold) & (deviation > threshold)
    metr[mask] = ((deviation[mask] / threshold) - 1) ** 2

    # Keep NaN inputs visible in the score instead of silently scoring them 9.
    metr[np.isnan(deviation)] = np.nan

    return float(metr.mean())

In [12]:
# Split the labelled data by price_type. Explicit copies are taken so the
# column assignment below writes to an independent frame instead of a
# filtered slice of train_data (which raises SettingWithCopyWarning).
# NOTE: `test` is the price_type == 1 part of train_data, not the frame
# loaded from test.csv.gz (that one is `test_data`).
train = train_data[train_data.price_type == 0].copy()
test = train_data[train_data.price_type == 1].copy()

# Scale the price_type == 0 targets down by 10%.
train[TARGET_NAME] = train[TARGET_NAME] * .9

# Keep only rows whose scaled target lies inside the inclusive bounds.
# NOTE(review): the bounds look like precomputed quantiles of the target; confirm.
train = train.loc[(train[TARGET_NAME] >= 1.095943e+03) & (train[TARGET_NAME] <= 1.790993e+06)]

In [6]:
# Regression task: optimise MAE, monitor the custom deviation metric
# (lower is better).
task = Task(
    'reg',
    loss='mae',
    metric=deviation_metric_vec,
    greater_is_better=False,
)

# Column roles passed to the AutoML reader: the target column, columns forced
# to be categorical, and columns excluded from modelling.
roles = dict(
    target=TARGET_NAME,
    category=['city', 'floor', 'region', 'street', 'realty_type'],
    drop=['id', 'date', 'lat', 'lng', 'price_type'],
)

sklearn doesn't support in general case mae and will not be used.


In [13]:
# Algorithms to use, one inner list per group: plain LightGBM + CatBoost,
# then their tuned variants.
algo_layers = [
    ['lgb', 'cb'],
    ['lgb_tuned', 'cb_tuned'],
]

# Reader settings: fold count, seed and parallelism come from the config cell.
reader_settings = {
    'cv': N_FOLDS,
    'random_state': RANDOM_STATE,
    'n_jobs': N_THREADS,
}

automl = TabularAutoML(
    task=task,
    timeout=TIMEOUT,
    cpu_limit=N_THREADS,
    memory_limit=8,
    verbose=2,
    general_params={'use_algos': algo_layers},
    reader_params=reader_settings,
)

In [14]:
# Fit the AutoML preset with the column roles defined above and keep the
# predictions it returns for the fitted rows as `oof_pred`.
# NOTE: `test` here is the price_type == 1 subset of train_data, not the
# `test_data` frame read from test.csv.gz.
oof_pred = automl.fit_predict(test, roles=roles)

Copying TaskTimer may affect the parent PipelineTimer, so copy will create new unlimited TaskTimer


Start automl preset with listed constraints:
- time: 4400 seconds
- cpus: 8 cores
- memory: 8 gb

Train data shape: (4493, 77)
Feats was rejected during automatic roles guess: []


Layer 1 ...
Train process start. Time left 4387.554087877274 secs
Start fitting Selector_LightGBM ...

===== Start working with fold 0 for Selector_LightGBM =====

Training until validation scores don't improve for 200 rounds
[100]	valid's l1: 24311.9	valid's Opt metric: 2.54167
[200]	valid's l1: 21889.4	valid's Opt metric: 2.2682
[300]	valid's l1: 20755.6	valid's Opt metric: 2.10656
[400]	valid's l1: 20203.7	valid's Opt metric: 1.97264
[500]	valid's l1: 19883.6	valid's Opt metric: 1.91758
[600]	valid's l1: 19653.8	valid's Opt metric: 1.88094
[700]	valid's l1: 19454	valid's Opt metric: 1.85186
[800]	valid's l1: 19332.1	valid's Opt metric: 1.84505
[900]	valid's l1: 19210.4	valid's Opt metric: 1.83539
[1000]	valid's l1: 19140.7	valid's Opt metric: 1.83224
[1100]	valid's l1: 19015.9	valid's Opt metric: 1.82032


In [None]:
# from Vladislav Evteev (https://github.com/BatyaZhizni/Raifhack-DS)
# Free-text floor descriptions mapped to a single numeric floor.
# The original cell chained one `.mask(...)` per value with no line
# continuation, which is a SyntaxError. Each mask compared against the
# *original* column, so a one-pass lookup table is equivalent.
FLOOR_REPLACEMENTS = {
    '-1.0': -1,
    '-2.0': -2,
    '-3.0': -3,
    'подвал, 1': 1,
    'подвал': -1,
    'цоколь, 1': 1,
    '1,2,антресоль': 1,
    'цоколь': 0,
    'тех.этаж (6)': 6,
    'Подвал': -1,
    'Цоколь': 0,
    'фактически на уровне 1 этажа': 1,
    '1,2,3': 1,
    '1, подвал': 1,
    '1,2,3,4': 1,
    '1,2': 1,
    '1,2,3,4,5': 1,
    '5, мансарда': 5,
    '1-й, подвал': 1,
    '1, подвал, антресоль': 1,
    'мезонин': 2,
    'подвал, 1-3': 1,
    '1 (Цокольный этаж)': 0,
    '3, Мансарда (4 эт)': 3,
    'подвал,1': 1,
    '1, антресоль': 1,
    '1-3': 1,
    'мансарда (4эт)': 4,
    '1, 2.': 1,
    'подвал , 1 ': 1,
    '1, 2': 1,
    'подвал, 1,2,3': 1,
    '1 + подвал (без отделки)': 1,
    'мансарда': 3,
    '2,3': 2,
    '4, 5': 4,
    '1-й, 2-й': 1,
    '1 этаж, подвал': 1,
    '1, цоколь': 1,
    'подвал, 1-7, техэтаж': 1,
    '3 (антресоль)': 3,
    '1, 2, 3': 1,
    'Цоколь, 1,2(мансарда)': 1,
    'подвал, 3. 4 этаж': 3,
    'подвал, 1-4 этаж': 1,
    'подва, 1.2 этаж': 1,
    '2, 3': 2,
    '7,8': 7,
    '1 этаж': 1,
    '1-й': 1,
    '3 этаж': 3,
    '4 этаж': 4,
    '5 этаж': 5,
    'подвал,1,2,3,4,5': 1,
    'подвал, цоколь, 1 этаж': 1,
    '3, мансарда': 3,
    'цоколь, 1, 2,3,4,5,6': 1,
    ' 1, 2, Антресоль': 1,
    '3 этаж, мансарда (4 этаж)': 3,
    'цокольный': 0,
    '1,2 ': 1,
    '3,4': 3,
    'подвал, 1 и 4 этаж': 1,
    '5(мансарда)': 5,
    'технический этаж,5,6': 5,
    ' 1-2, подвальный': 1,
    '1, 2, 3, мансардный': 1,
    'подвал, 1, 2, 3': 1,
    '1,2,3, антресоль, технический этаж': 1,
    '3, 4': 3,
    '1-3 этажи, цоколь (188,4 кв.м), подвал (104 кв.м)': 1,
    '1,2,3,4, подвал': 1,
    '2-й': 2,
    '1, 2 этаж': 1,
    'подвал, 1, 2': 1,
    '1-7': 1,
    '1 (по док-м цоколь)': 1,
    '1,2,подвал ': 1,
    'подвал, 2': 2,
    'подвал,1,2,3': 1,
    '1,2,3 этаж, подвал ': 1,
    '1,2,3 этаж, подвал': 1,
    '2, 3, 4, тех.этаж': 2,
    'цокольный, 1,2': 1,
    'Техническое подполье': -1,
    '1.2': 1,
}

floor_raw = test_data['floor']
floor_mapped = floor_raw.map(FLOOR_REPLACEMENTS)

# Use the table value where one exists and the original value otherwise,
# then convert to float. As in the original chain, a description missing from
# the table makes `.astype(float)` raise, so new dirty values are noticed.
# NOTE(review): no visible cell applies the same cleaning to the frame the
# model is fitted on, where 'floor' is declared a category; confirm this
# conversion is intended before predicting.
test_data['floor'] = floor_mapped.where(floor_mapped.notna(), floor_raw).astype(float)

In [20]:
# Run the fitted AutoML model on test_data (the frame read from test.csv.gz).
prediction = automl.predict(test_data)

In [41]:
# Take the first prediction column and shrink it by 5% before storing it as
# the target column of the test frame.
first_prediction_column = prediction.data[:, 0]
test_data[TARGET_NAME] = first_prediction_column * 0.95

In [None]:
# Write the id / predicted-target pairs to the submission file, without the
# DataFrame index.
submission_columns = ['id', TARGET_NAME]
test_data[submission_columns].to_csv('sample_submission.csv', index=False)