In [None]:
# Essential DS libraries
import numpy as np
import pandas as pd
import gc
from sklearn.metrics import roc_auc_score, mean_squared_error
from sklearn.model_selection import train_test_split

# LightAutoML presets, task and report generation
from lightautoml.automl.presets.tabular_presets import TabularAutoML
from lightautoml.tasks import Task

pd.set_option('display.max_columns', 50)

In [2]:
# Main train/test tables and the supplementary "_add" tables, all read from data/.
train, test = pd.read_csv('data/train.csv'), pd.read_csv('data/test.csv')
train_add, test_add = pd.read_csv('data/train_add.csv'), pd.read_csv('data/test_add.csv')

# Quick look at the raw training rows.
train.head()

Unnamed: 0,id,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
0,0,MINI,Cooper S Base,2007,213000,Gasoline,172.0HP 1.6L 4 Cylinder Engine Gasoline Fuel,A/T,Yellow,Gray,None reported,Yes,4200
1,1,Lincoln,LS V8,2002,143250,Gasoline,252.0HP 3.9L 8 Cylinder Engine Gasoline Fuel,A/T,Silver,Beige,At least 1 accident or damage reported,Yes,4999
2,2,Chevrolet,Silverado 2500 LT,2002,136731,E85 Flex Fuel,320.0HP 5.3L 8 Cylinder Engine Flex Fuel Capab...,A/T,Blue,Gray,None reported,Yes,13900
3,3,Genesis,G90 5.0 Ultimate,2017,19500,Gasoline,420.0HP 5.0L 8 Cylinder Engine Gasoline Fuel,Transmission w/Dual Shift Mode,Black,Black,None reported,Yes,45000
4,4,Mercedes-Benz,Metris Base,2021,7388,Gasoline,208.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,7-Speed A/T,Black,Beige,None reported,Yes,97500


In [3]:
# Stack the four frames row-wise (train, train_add, test, test_add - in that
# order) under a fresh 0..n-1 index, so features are engineered once for all
# of them. The order matters: the frames are split back by position later on.
all_data = pd.concat(
    [train, train_add, test, test_add],
    axis=0,
).reset_index(drop=True)
all_data.head()

def convert_engine(val):
    """Parse a free-text engine description into a dict of engine features.

    The text is split on single spaces after removing every '-' and scanned
    token by token. Recognised patterns:

    * ``<number>HP``                   -> ``engine_HP``
    * ``<number>L`` / ``<number> Liter`` -> ``engine_L``
    * ``<n> Cylinder Engine``          -> ``engine_cylinder``
    * ``<word> Fuel``                  -> ``engine_fuel_type``
    * ``V<n>`` / ``H<n>`` / ``I<n>``    -> ``engine_cylinder`` and ``engine_cylinder_type``
    * ``<n>V``                         -> ``engine_v``
    * ``DOHC`` / ``OHV``               -> ``engine_type`` (that token and everything
      after it) and ``engine_vtype`` (the tokens between the last ``<n>V`` - or,
      failing that, the last ``V<n>``-style token - and this one)

    Args:
        val: Engine description. Anything that is not a ``str`` (e.g. NaN for
            a missing value) yields the all-NaN result.

    Returns:
        dict with the eight ``engine_*`` keys; a key stays ``np.nan`` when its
        pattern is absent or its numeric part cannot be parsed.
    """
    d = {
        'engine_HP': np.nan,
        'engine_L': np.nan,
        'engine_cylinder': np.nan,
        'engine_cylinder_type': np.nan,
        'engine_fuel_type': np.nan,
        'engine_type': np.nan,
        'engine_v': np.nan,
        'engine_vtype': np.nan
    }
    # Missing values arrive as float NaN; there is nothing to parse.
    if not isinstance(val, str):
        return d

    def _to_float(text):
        # Best-effort numeric parse: None signals "not a number" so the caller
        # can leave the feature as NaN instead of aborting the whole map().
        try:
            return float(text)
        except ValueError:
            return None

    spl = val.replace('-', '').split(' ')
    pos_v1 = -1  # index of the last V<n>/H<n>/I<n> token
    pos_v2 = -1  # index of the last <n>V token
    for i, v in enumerate(spl):
        if v.endswith('HP'):
            number = _to_float(v[:-2])
            if number is not None:
                d['engine_HP'] = number
        elif v.endswith('L'):
            number = _to_float(v[:-1])
            if number is not None:
                d['engine_L'] = number
        # i >= 2 keeps spl[i-2] from wrapping around to the end of the list.
        elif v == 'Engine' and i >= 2 and spl[i-1] == 'Cylinder':
            number = _to_float(spl[i-2].replace('V', ''))
            if number is not None:
                d['engine_cylinder'] = abs(number)
        # i >= 1 keeps spl[i-1] from wrapping around to the last token.
        elif v == 'Fuel' and i >= 1:
            d['engine_fuel_type'] = spl[i - 1]
        elif v == 'Liter' and i >= 1:
            number = _to_float(spl[i-1])
            if number is not None:
                d['engine_L'] = number
        elif v.startswith('V') or v.startswith('H') or v.startswith('I'):
            # Ordinary words starting with V/H/I also land here and are
            # ignored when the remainder is not numeric.
            number = _to_float(v[1:])
            if number is not None:
                d['engine_cylinder'] = number
                d['engine_cylinder_type'] = v[0]
                pos_v1 = i
        elif v == 'DOHC' or v == 'OHV':
            d['engine_type'] = ' '.join(spl[i:])
            if pos_v2 != -1:
                d['engine_vtype'] = ' '.join(spl[pos_v2+1:i])
            elif pos_v1 != -1:
                d['engine_vtype'] = ' '.join(spl[pos_v1+1:i])
        elif v.endswith('V'):
            number = _to_float(v[:-1])
            if number is not None:
                d['engine_v'] = number
                pos_v2 = i
    return d

# Parse every engine description into a record of engine_* features and attach
# the resulting columns next to the existing ones. The column-wise concat
# aligns on the index; all_data carries a fresh 0..n-1 index at this point.
all_data = pd.concat(
    [
        all_data,
        pd.DataFrame.from_records(all_data['engine'].map(convert_engine).values),
    ],
    axis = 1,
)
def milage_signs(val):
    """Percentage of the characters of ``str(val)`` that are trailing zeros.

    For example ``213000`` has 3 trailing zeros out of 6 characters -> 50.0.
    When the text consists of zeros only, the first character is not counted
    as a trailing zero (so ``0`` gives 0.0).
    """
    text = str(val)
    width = len(text)
    # Position of the last non-zero character; falls back to position 0 when
    # every character is '0'.
    last_nonzero = max(len(text.rstrip('0')) - 1, 0)
    trailing_zeros = width - 1 - last_nonzero
    return trailing_zeros / width * 100


# Share of trailing zeros in the mileage figure, as a percentage.
all_data['milage_signif_signs_perc'] = all_data['milage'].map(milage_signs)

# Lower-case the free-text columns so later matching is case-insensitive
# (convert_transmission looks for lower-case keywords).
for col in ('transmission', 'ext_col', 'int_col'):
    all_data[col] = all_data[col].str.lower()

def convert_transmission(val):
    """Parse a transmission description into speed count and type.

    The text is split on single spaces after removing every '/' (so ``a/t``
    becomes ``at``). Keywords are matched as lower-case substrings, so the
    input is expected to be lower-cased already.

    * A token containing ``speed`` sets ``transmission_speeds_cnt``: from the
      part before the first '-' (``7-speed`` -> 7.0, ``single-speed`` -> 1.0),
      or from the preceding token when there is no '-' (``6 speed`` -> 6.0).
    * Otherwise a token containing ``manual`` or ``mt`` sets the type to
      ``'manual'``; failing that, one containing ``automatic`` or ``at`` sets
      it to ``'automatic'``. The last matching token wins.

    Args:
        val: Transmission description. Anything that is not a ``str`` (e.g.
            NaN for a missing value) yields the all-NaN result.

    Returns:
        dict with ``transmission_speeds_cnt`` (float or NaN) and
        ``transmission_type`` (``'manual'``, ``'automatic'`` or NaN).
    """
    d = {
        'transmission_speeds_cnt': np.nan,
        'transmission_type': np.nan
    }
    # Missing values arrive as float NaN; there is nothing to parse.
    if not isinstance(val, str):
        return d

    spl = val.replace('/', '').split(' ')
    for i, v in enumerate(spl):
        if 'speed' in v:
            tmp = v.split('-')
            if len(tmp) > 1:
                count_text = '1' if tmp[0] == 'single' else tmp[0]
            elif i > 0:
                count_text = spl[i-1]
            else:
                # 'speed' is the first token: there is no preceding count
                # (spl[i-1] would wrap around to the last token).
                count_text = None
            if count_text is not None:
                try:
                    d['transmission_speeds_cnt'] = float(count_text)
                except ValueError:
                    # Non-numeric count: leave the feature as NaN.
                    pass
        elif 'manual' in v or 'mt' in v:
            d['transmission_type'] = 'manual'
        elif 'automatic' in v or 'at' in v:
            d['transmission_type'] = 'automatic'
    return d

# Attach the parsed transmission_* columns next to the existing ones.
all_data = pd.concat(
    [
        all_data,
        pd.DataFrame.from_records(all_data['transmission'].map(convert_transmission).values),
    ],
    axis = 1,
)

# Cut the stacked frame back into its four parts by position, in the same
# order they were concatenated: train, train_add, test, test_add.
_end_train = len(train)
_end_train_add = _end_train + len(train_add)
_end_test = _end_train_add + len(test)
train2 = all_data.iloc[:_end_train]
train2_add = all_data.iloc[_end_train:_end_train_add]
test2 = all_data.iloc[_end_train_add:_end_test]
test2_add = all_data.iloc[_end_test:]


In [4]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from lightautoml.text.nn_model import TorchUniversalModel
from lightautoml.text.embed import PLREmbedding, WeightedCatEmbedding, pooling_by_name
from lightautoml.ml_algo.torch_based.fttransformer.fttransformer_utils import Transformer
from collections import OrderedDict
from typing import List, Tuple, Type
from typing import Optional
from typing import Union

import numpy as np
import torch
import torch.nn as nn

from lightautoml.text.nn_model import TorchUniversalModel
from lightautoml.text.embed import PLREmbedding, WeightedCatEmbedding, pooling_by_name
from lightautoml.ml_algo.torch_based.fttransformer.fttransformer_utils import Transformer
from collections import OrderedDict
from typing import List, Tuple, Type
from typing import Optional
from typing import Union

import numpy as np
import torch
import torch.nn as nn

from transformers import BertConfig, BertModel, BertForSequenceClassification
from peft import get_peft_config, get_peft_model, LoraConfig, TaskType

class BertEmbeddingsEmpty(nn.Module):
    """Pass-through stand-in for a BERT embeddings module.

    It has no parameters and builds no word/position/token-type embeddings:
    ``forward`` returns ``inputs_embeds`` unchanged. It is meant to replace
    ``BertModel.embeddings`` when the caller already supplies embedded tokens.
    """

    def __init__(self):
        super().__init__()

    def forward(self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None, **kwargs):
        """Return ``inputs_embeds`` as is.

        Args:
            input_ids: Ignored; accepted only for call compatibility.
            token_type_ids: Ignored.
            position_ids: Ignored.
            inputs_embeds: Already-embedded input; returned unchanged.
            **kwargs: Any further keyword arguments are ignored.

        Raises:
            ValueError: If ``inputs_embeds`` is None. This module cannot embed
                ``input_ids``, and returning None would only fail later in a
                less obvious place.
        """
        if inputs_embeds is None:
            raise ValueError(
                "BertEmbeddingsEmpty is a pass-through and requires inputs_embeds; "
                "it cannot build embeddings from input_ids."
            )
        return inputs_embeds
class FTTransformer2(nn.Module):
    """FT-Transformer-style head that uses a pretrained BERT encoder as backbone.

    The pretrained ``prajjwal1/bert-tiny`` model is loaded and its embedding
    layer is replaced by the pass-through ``BertEmbeddingsEmpty``, so the
    encoder runs directly on the already-embedded feature tokens. A learned
    CLS token is prepended, the encoder output is pooled, and a
    BatchNorm + Linear head produces the logits.

    Args:
        pooling: Key into ``pooling_by_name`` selecting the final pooling.
        n_out: Output dimension.
        embedding_size: Size of each input token embedding. Must equal the
            hidden size of the loaded BERT model.
        depth: Accepted for interface compatibility; unused.
        heads: Accepted for interface compatibility; unused.
        attn_dropout: Accepted for interface compatibility; unused.
        ff_dropout: Accepted for interface compatibility; unused.
        dim_head: Accepted for interface compatibility; unused.
        num_enc_layers: Accepted for interface compatibility; unused.
        device: Stored on the instance as ``self.device``. Tensors created in
            ``forward`` follow the device of the input instead.
        **kwargs: Ignored.

    Raises:
        ValueError: If ``embedding_size`` differs from the BERT hidden size.
    """

    def __init__(
        self,
        *,
        pooling: str = "mean",
        n_out: int = 1,
        embedding_size: int = 32,
        depth: int = 4,
        heads: int = 1,
        attn_dropout: float = 0.1,
        ff_dropout: float = 0.1,
        dim_head: int = 32,
        num_enc_layers: int = 2,
        device: Union[str, torch.device] = "cuda:0",
        **kwargs,
    ):
        super(FTTransformer2, self).__init__()
        self.device = device
        self.pooling = pooling_by_name[pooling]()

        # Configuration values for a tiny BERT. The pretrained checkpoint
        # below brings its own config; this dict is only read by
        # fixup_initialization().
        self.conf = {"hidden_size": 128, "hidden_act": "gelu", "initializer_range": 0.02, "vocab_size": 10, "hidden_dropout_prob": 0.1, "num_attention_heads": 2,
         "type_vocab_size": 2, "max_position_embeddings": 512, "num_hidden_layers": 2, "intermediate_size": 512, "attention_probs_dropout_prob": 0.1}

        # Pretrained encoder; its embedding layer is swapped for a pass-through
        # because the inputs are already embedded.
        self.transformer = BertModel.from_pretrained("prajjwal1/bert-tiny", output_attentions=True, output_hidden_states=True, cache_dir="./")
        self.transformer.embeddings = BertEmbeddingsEmpty()

        # The encoder consumes the token embeddings directly, so their width
        # must match its hidden size. Fail here with a clear message rather
        # than with a shape error inside the first attention layer.
        hidden_size = self.transformer.config.hidden_size
        if embedding_size != hidden_size:
            raise ValueError(
                f"embedding_size ({embedding_size}) must equal the BERT hidden size ({hidden_size})"
            )

        # Pooled representation -> logits.
        self.to_logits = nn.Sequential(nn.BatchNorm1d(embedding_size), nn.Linear(embedding_size, n_out))

        # Learned CLS token; forward() always looks up row 1 of this table.
        self.cls_token = nn.Embedding(2, embedding_size)

    def fixup_initialization(self):
        """Rescale weights in place, using the layer count from ``self.conf``.

        Not called anywhere in this class.
        NOTE(review): the parameter-name patterns below ("linear1.weight",
        "self_attn.out_proj.weight", ...) look like they target
        ``nn.TransformerEncoderLayer`` rather than the BERT encoder used here,
        and the ``elif`` compares full parameter names against a bare suffix -
        confirm before enabling this method.
        """
        temp_state_dic = {}
        en_layers = self.conf['num_hidden_layers']

        for name, param in self.named_parameters():
            if 'weight' in name and param.data.dim() == 2:
                param = (9 * en_layers) ** (- 1 / 4) * param
                temp_state_dic[name] = param
            if np.any([1 if i in name else 0 for i in ["linear1.weight",
                        "linear2.weight",
                        "self_attn.out_proj.weight",
                        ]]):
                temp_state_dic[name] = (0.67 * (en_layers) ** (- 1. / 4.)) * param
            elif name in ["self_attn.v_proj.weight",]:
                temp_state_dic[name] = (0.67 * (en_layers) ** (- 1. / 4.)) * (param * (2**0.5))

        # Keep every entry that was not rescaled, then load the full state back.
        for name in self.state_dict():
            if name not in temp_state_dic:
                temp_state_dic[name] = self.state_dict()[name]
        self.load_state_dict(temp_state_dic)

    def forward(self, embedded):
        """Compute logits from embedded feature tokens.

        Args:
            embedded: Embedded fields; the batch is on dim 0 and the tokens are
                concatenated along dim 1.

        Returns:
            torch.Tensor: Output of the BatchNorm + Linear head.
        """
        # Create helper tensors on the input's device so the module also works
        # when it does not live on the device named in the constructor.
        cls_index = torch.ones(embedded.shape[0], dtype=torch.int, device=embedded.device)
        cls_token = torch.unsqueeze(self.cls_token(cls_index), dim=1)
        x = torch.cat((cls_token, embedded), dim=1)

        x = self.transformer(inputs_embeds=x).last_hidden_state

        # All positions are valid: pool with an all-True mask.
        x_mask = torch.ones(x.shape, dtype=torch.bool, device=x.device)
        pool_tokens = self.pooling(x=x, x_mask=x_mask)
        logits = self.to_logits(pool_tokens)
        return logits


class FTT_plus(TorchUniversalModel):
    """Mixed-data wrapper that feeds embedded features to ``FTTransformer2``.

    Numeric columns go through ``PLREmbedding`` and categorical columns through
    ``WeightedCatEmbedding``; the two embedded blocks are concatenated along
    dim 1 and passed to an ``FTTransformer2``.

    Args:
        n_out: Number of output dimensions.
        cont_params: Keyword arguments for ``PLREmbedding``, or None when
            there are no numeric columns.
        cat_params: Keyword arguments for ``WeightedCatEmbedding``, or None
            when there are no categorical columns.
        **kwargs: Loss, task and other parameters. They are forwarded both to
            the parent constructor and to ``FTTransformer2``.
    """

    def __init__(
            self,
            n_out: int = 1,
            cont_params = None,
            cat_params = None,
            **kwargs,
    ):
        # Initialise the parent for its helper methods only; it gets no inner
        # model because the transformer is built below.
        super(FTT_plus, self).__init__(**{
                **kwargs,
                "cont_params": cont_params,
                "cat_params": cat_params,
                "torch_model": None,
        })

        n_in = 0

        # Numeric columns. Skipped when no parameters are given, because
        # unpacking None would raise a TypeError.
        self.cont_embedder = None
        if cont_params is not None:
            self.cont_embedder = PLREmbedding(**cont_params)
            n_in += self.cont_embedder.get_out_shape()

        # Categorical columns, same guard.
        self.cat_embedder = None
        if cat_params is not None:
            self.cat_embedder = WeightedCatEmbedding(**cat_params)
            n_in += self.cat_embedder.get_out_shape()

        self.torch_model = FTTransformer2(
                **{
                    **kwargs,
                    **{"n_in": n_in, "n_out": n_out},
                }
        )

    def get_logits(self, inp) -> torch.Tensor:
        """Embed the available feature groups and run the transformer.

        Args:
            inp: Batch passed unchanged to each embedder.

        Returns:
            torch.Tensor: Logits produced by ``self.torch_model``.

        Raises:
            ValueError: If neither a numeric nor a categorical embedder exists.
        """
        outputs = [
            embedder(inp)
            for embedder in (self.cont_embedder, self.cat_embedder)
            if embedder is not None
        ]
        if not outputs:
            raise ValueError("FTT_plus needs cont_params and/or cat_params to build its input")

        output = torch.cat(outputs, dim=1) if len(outputs) > 1 else outputs[0]
        return self.torch_model(output)

def myround(x, base=1000):
    """Round ``x`` to the nearest multiple of ``base``.

    ``x`` is converted to ``np.float32`` first. Exact halves follow
    ``np.round``, i.e. they go to the even multiple.
    """
    scaled = np.float32(x) / base
    return base * np.round(scaled)
        
# CatBoost settings. The training cell passes this dict as the frozen
# 'default_params' of the 'cb' algorithm.
cb_params = dict(
    # hardware / reproducibility
    task_type="GPU",
    random_seed=42,
    # row sampling
    bootstrap_type='Bernoulli',
    subsample=0.5,
    # tree structure
    grow_policy="SymmetricTree",
    max_depth=9,
    one_hot_max_size=2,
    fold_permutation_block=5,
    boosting_type="Ordered",
    boost_from_average=False,
    # overfitting detector
    od_type="Iter",
    od_wait=200,
    # feature quantisation and missing values
    max_bin=32,
    feature_border_type="GreedyLogSum",
    nan_mode="Min",
    # logging / output files
    verbose=100,
    allow_writing_files=False,
    # boosting budget and regularisation
    num_trees=50000,
    l2_leaf_reg=5.5,
)

# LightGBM settings. The training cell passes this dict as the frozen
# 'default_params' of the 'lgb' algorithm.
lgb_params = dict(
    objective="rmse",
    metric='rmse',
    num_trees=50000,
    max_depth=7,
    min_data_in_leaf=1000,
    bagging_fraction=0.5,
    reg_alpha=0,
    reg_lambda=2,
)

In [None]:
from sklearn.model_selection import KFold
from sklearn.metrics import log_loss
import joblib
import os 
res = {}

# Outer cross-validation layout shared by every model family below. Test
# predictions are averaged over exactly this many folds.
N_FOLDS = 10
CV_SEED = 42


def ll(x, y):
    """Multiclass log-loss with the label set fixed to 0..n_classes-1.

    The number of classes is taken from the number of columns of ``y``.
    Kept at module level (not inside a loop or closure) because fitted models
    that reference it are pickled with joblib.
    """
    return log_loss(x, y, labels=np.arange(y.shape[1]))


def _nn_automl_kwargs(algo, n_epochs, embedding_size, model_with_emb=False, debug=True):
    """Build fresh TabularAutoML keyword arguments for one neural-network model."""
    nn_params = {
        "n_epochs": n_epochs,
        "bs": 1024//2,
        "num_workers": 0,
        "path_to_save": None,
        "freeze_defaults": True,
        "cont_embedder": 'plr',
        'cat_embedder': 'weighted',
        'act_fun': 'GELU',
        "hidden_size": [64, 32],
        'stop_by_metric': True,
        'embedding_size': embedding_size,
        'verbose_bar': True,
        'input_bn': False,
        'opt_params': { 'lr': 0.0003 , 'weight_decay': 0 },
    }
    if model_with_emb:
        nn_params["model_with_emb"] = True
    kwargs = {
        'general_params': {"use_algos": [[algo]]},
        'nn_params': nn_params,
        'nn_pipeline_params': {"use_qnt": True, "use_te": False},
        'reader_params': {'n_jobs': 12, 'cv': 10, 'random_state': 42, 'advanced_roles': False},
    }
    # The denselight classifier is the only model built without debug=True.
    if debug:
        kwargs['debug'] = True
    return kwargs


def _gbm_automl_kwargs(algo, params_key, default_params):
    """Build fresh TabularAutoML keyword arguments for one boosting model."""
    return {
        'debug': True,
        'general_params': {"use_algos": [[algo]]},
        params_key: {'default_params': default_params, 'freeze_defaults': True},
        'reader_params': {'n_jobs': 12, 'cv': 10, 'random_state': 42, 'advanced_roles': True},
    }


def _prepare_cls_target(x_tr, x_vl):
    """Turn 'price' into class labels in place; return the prediction decoder.

    Prices are rounded with myround() (nearest 1000 by default). Every
    validation label is then snapped to the closest label present in the
    training part, so validation never contains a class unseen in training.
    """
    x_tr['price'] = x_tr['price'].apply(myround)
    x_vl['price'] = x_vl['price'].apply(myround)

    train_labels = np.array(sorted(np.unique(x_tr['price'])))
    valid_labels = np.array(sorted(np.unique(x_vl['price'])))
    nearest_train_label = {
        label: train_labels[np.argmin(np.abs(train_labels - label))]
        for label in valid_labels
    }
    x_vl['price'] = x_vl['price'].map(nearest_train_label)

    def decode(automl, raw):
        # Expected price: class probabilities times the class price values.
        # Assumes the keys of reader.class_mapping are the rounded prices in
        # class-index order - TODO confirm against LightAutoML.
        values = np.array(list(automl.reader.class_mapping.keys()))
        return (raw @ values.reshape(-1, 1)).flatten()

    return decode


def _prepare_scaled_target(x_tr, x_vl):
    """Robust-scale 'price' in place; return a decoder that undoes the scaling.

    The scaler is fitted on the training part only.
    """
    scaler = RobustScaler()
    scaler.fit(x_tr['price'].values.reshape(-1, 1))
    x_tr['price'] = scaler.transform(x_tr['price'].values.reshape(-1, 1))
    x_vl['price'] = scaler.transform(x_vl['price'].values.reshape(-1, 1))

    def decode(automl, raw):
        return scaler.inverse_transform(raw.flatten().reshape(-1, 1)).flatten()

    return decode


def _prepare_raw_target(x_tr, x_vl):
    """Leave 'price' untouched; predictions are only flattened."""
    def decode(automl, raw):
        return raw.flatten()

    return decode


def _run_cv(model_name, clip_name, data_name, clip, add_data, task, prepare_target, make_automl_kwargs):
    """Train one model family with K-fold CV and cache the result on disk.

    Nothing is trained when '<model_name>_<clip_name>_<data_name>.jbl' already
    exists. Otherwise, for each fold: the training part is optionally extended
    with ``add_data``, its price is clipped to [0, clip], the target is
    transformed by ``prepare_target``, a TabularAutoML model is fitted with
    the validation part as ``valid_data``, and predictions for the validation
    part, ``test2`` and ``test2_add`` are decoded back to prices. The
    validation price itself is never clipped.

    Returns:
        dict with 'oof', 'pred', 'pred_add' and 'models', or None when the
        cached file was found.
    """
    label = f'{model_name}_{clip_name}_{data_name}'
    cache_path = f'{label}.jbl'
    if os.path.isfile(cache_path):
        return None

    oof = np.zeros(len(train2))
    pred = 0
    pred_add = 0
    models = {}
    folds = KFold(n_splits=N_FOLDS, random_state=CV_SEED, shuffle=True)
    for fold, (tr_idx, te_idx) in enumerate(folds.split(train2['price'], train2['price'], None)):
        x_tr = train2.iloc[tr_idx].reset_index(drop=True)
        x_vl = train2.iloc[te_idx].reset_index(drop=True)
        if add_data is not None:
            x_tr = pd.concat([x_tr, add_data], axis=0).reset_index(drop=True)

        x_tr['price'] = np.clip(x_tr['price'], 0, clip)
        decode = prepare_target(x_tr, x_vl)

        automl = TabularAutoML(
            task = task,
            timeout = 600 * 3600,
            cpu_limit = 2,
            gpu_ids = '0',
            selection_params = {'mode': 0},
            **make_automl_kwargs())

        out_of_fold_predictions = automl.fit_predict(
            x_tr, valid_data = x_vl,
            roles = {
                'target': 'price',
                'drop': ['id'],
            },
            verbose = 1)

        oof[te_idx] = decode(automl, out_of_fold_predictions.data)
        pred += decode(automl, automl.predict(test2).data) / N_FOLDS
        pred_add += decode(automl, automl.predict(test2_add).data) / N_FOLDS
        models[fold] = automl

    result = {'oof': oof, 'pred': pred, 'pred_add': pred_add, 'models': models}
    joblib.dump(result, cache_path)
    # Out-of-fold RMSE against the original (unclipped, unrounded) prices.
    print(label, np.sqrt(mean_squared_error(train2['price'], oof)))
    return result


# (file/label prefix, task factory, target preparation, AutoML kwargs factory).
# Factories are used so that every run gets fresh Task and parameter objects.
MODEL_SPECS = [
    ('denselight_cls', lambda: Task('multiclass', metric=ll), _prepare_cls_target,
     lambda: _nn_automl_kwargs('denselight', n_epochs=20, embedding_size=37, debug=False)),
    ('ftt_cls', lambda: Task('multiclass', metric=ll), _prepare_cls_target,
     lambda: _nn_automl_kwargs(FTT_plus, n_epochs=20, embedding_size=128, model_with_emb=True)),
    ('denselight_reg', lambda: Task('reg'), _prepare_scaled_target,
     lambda: _nn_automl_kwargs('denselight', n_epochs=10, embedding_size=37)),
    ('ftt_reg', lambda: Task('reg'), _prepare_scaled_target,
     lambda: _nn_automl_kwargs(FTT_plus, n_epochs=10, embedding_size=128, model_with_emb=True)),
    ('catboost', lambda: Task('reg'), _prepare_raw_target,
     lambda: _gbm_automl_kwargs('cb', 'cb_params', cb_params)),
    ('lgb', lambda: Task('reg'), _prepare_raw_target,
     lambda: _gbm_automl_kwargs('lgb', 'lgb_params', lgb_params)),
]

# Every model family is trained for each combination of target clipping
# ('woclip' uses a bound of 999_999_999, 'clip' caps the training price at
# 500_000) and training data ('data' adds train2_add to each training fold,
# 'wodata' does not).
for clip_name, clip in zip(['woclip', 'clip'], [999_999_999, 500_000]):
    for data_name, add_data in zip(['data', 'wodata'], [train2_add, None]):
        print(clip_name, data_name)
        for model_name, make_task, prepare_target, make_automl_kwargs in MODEL_SPECS:
            _run_cv(
                model_name, clip_name, data_name, clip, add_data,
                task=make_task(),
                prepare_target=prepare_target,
                make_automl_kwargs=make_automl_kwargs,
            )


