# Step 0. Install LAMA

In [None]:
pip install lightautoml

# Step 0.1. Import necessary libraries 

In [None]:
# Standard python libraries
import os
import time

# Installed libraries
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import torch
import matplotlib.pyplot as plt
import pickle

# Imports from our package
from lightautoml.dataset.roles import DatetimeRole
from lightautoml.tasks import Task
from lightautoml.pipelines.features.base import EmptyFeaturePipeline
from lightautoml.pipelines.features.lgb_pipeline import LGBAdvancedPipeline, LGBSimpleFeatures
from lightautoml.automl.base import AutoML
from lightautoml.ml_algo.boost_lgbm import BoostLGBM
from lightautoml.ml_algo.tuning.optuna import OptunaTuner
from lightautoml.pipelines.ml.base import MLPipeline
from lightautoml.pipelines.selection.importance_based import ImportanceCutoffSelector, ModelBasedImportanceEstimator
from lightautoml.pipelines.selection.permutation_importance_based import NpPermutationImportanceEstimator, \
    NpIterativeFeatureSelector

from lightautoml.transformers.base import LAMLTransformer, SequentialTransformer, UnionTransformer, ColumnsSelector
from lightautoml.pipelines.utils import get_columns_by_role
from lightautoml.dataset.roles import NumericRole
from lightautoml.pipelines.features.base import FeaturesPipeline, TabularDataFeatures
from lightautoml.reader.base import PandasToPandasReader

# Step 0.2. Parameters 

In [None]:
# Run-wide settings shared by the cells below.
N_THREADS = 4 # number of threads given to torch and to the LightGBM models
N_FOLDS = 3 # number of cross-validation folds passed to the reader (cv=N_FOLDS)
RANDOM_STATE = 42 # seed used for numpy, the train/test split and the reader
TEST_SIZE = 0.2 # fraction of rows held out as the test part for the metric check
TIMEOUT = 300 # time budget in seconds; NOTE(review): not referenced by any cell visible here
TARGET_NAME = 'TARGET' # name of the target column in the data

# Step 0.3. Fix torch number of threads and numpy seed 

In [None]:
# Cap the number of threads torch may use.
torch.set_num_threads(N_THREADS)
# Seed numpy's global random generator so that runs are repeatable.
np.random.seed(RANDOM_STATE)

# Step 0.4. Example data load 

In [None]:
%%time

# Load the example CSV into a DataFrame and preview its first rows.
data = pd.read_csv('../input/lama-datasets/sampled_app_train.csv')
data.head()

# Step 0.5. (Optional) Some user feature preparation 

The cell below shows some user feature preparation that makes the task more difficult (this block can be omitted if you don't want to change the initial data):

In [None]:
%%time

# Turn the relative day offsets into absolute date strings counted from the
# fixed reference date 2018-01-01 (the same date stored in 'report_dt' below).
# pd.to_timedelta(..., unit='D') is used instead of .astype('timedelta64[D]')
# because pandas 2.x only accepts s/ms/us/ns resolutions in astype and raises
# on day resolution; to_timedelta gives the same result on old and new pandas.
data['BIRTH_DATE'] = (
    np.datetime64('2018-01-01') + pd.to_timedelta(data['DAYS_BIRTH'], unit='D')
).astype(str)
# Positive DAYS_EMPLOYED values are clipped to 0, so EMP_DATE never falls
# after the reference date.
data['EMP_DATE'] = (
    np.datetime64('2018-01-01')
    + pd.to_timedelta(np.clip(data['DAYS_EMPLOYED'], None, 0), unit='D')
).astype(str)

# Two deliberately uninformative columns: a constant and an all-NaN feature.
data['constant'] = 1
data['allnan'] = np.nan

# Report date, identical for every row.
data['report_dt'] = np.datetime64('2018-01-01')

# The raw day offsets are now encoded in the date columns above, so drop them.
data.drop(['DAYS_BIRTH', 'DAYS_EMPLOYED'], axis=1, inplace=True)

# Step 0.6. (Optional) Data splitting for train-test 

The block below can be omitted if you are only going to train a model or if you already have separate train and test files:

In [None]:
%%time

# Hold out TEST_SIZE of the rows, stratified on the target column, with a fixed seed.
train_data, test_data = train_test_split(
    data,
    test_size=TEST_SIZE,
    stratify=data[TARGET_NAME],
    random_state=RANDOM_STATE,
)
print(
    'Data splitted. Parts sizes: train_data = {}, test_data = {}'.format(
        train_data.shape, test_data.shape
    )
)

In [None]:
# Preview the first rows of the training part.
train_data.head()

#  ==== Custom AutoML pipeline ====


## Step 1. Create Task and Reader

In [None]:
%%time

# Binary task; the reader gets the shared fold count and random seed.
task = Task('binary')
reader = PandasToPandasReader(
    task,
    cv=N_FOLDS,
    random_state=RANDOM_STATE,
)

## Step 2. Setup columns roles

The roles set up here define the target column and the base date, which is used to calculate date differences:

In [None]:
%%time

# Role mapping handed to fit_predict: the 'target' key names the target column,
# and 'report_dt' is registered as a datetime column flagged as the base date
# (no seasonality features, no base features).
roles = {
    'target': TARGET_NAME,
    DatetimeRole(base_date=True, seasonality=(), base_feats=False): 'report_dt',
}

## Step 3. Create custom transformer and feature pipeline.

In [None]:

class GroupByTransformer(LAMLTransformer):
    """Create "difference from the category mean" features.

    At fit time, for every (category column, numeric column) pair in the
    dataset, the mean of the numeric column is stored per category value.
    At transform time each pair produces one numeric feature: the numeric
    value minus the stored mean of the row's category value.
    """

    _fit_checks = ()
    _transform_checks = ()
    _fname_prefix = 'grb'

    @property
    def features(self):
        """Features list."""
        return self._features

    def __init__(self):
        super().__init__()
        # feature name -> {'cat': column, 'num': column, 'values': {category value: mean}}
        self.dicts = {}

    def fit(self, dataset):
        """Learn per-category means for every category/numeric column pair.

        Args:
            dataset: Dataset holding columns with 'Category' and 'Numeric' roles.

        Returns:
            self.
        """
        for check_func in self._fit_checks:
            check_func(dataset)

        # convert to accepted dtype and get attributes
        dataset = dataset.to_pandas()
        df = dataset.data
        cat_cols = get_columns_by_role(dataset, 'Category')
        num_cols = get_columns_by_role(dataset, 'Numeric')

        # Collect into a fresh dict so that fitting again never keeps pairs
        # left over from a previous fit.
        dicts = {}
        for cat in cat_cols:
            # The grouping only depends on the category column, so build it once.
            grouped = df.groupby(cat)
            for num in num_cols:
                feature = f'{self._fname_prefix}__{cat}_delta_mean_{num}'
                dicts[feature] = {
                    'cat': cat,
                    'num': num,
                    'values': grouped[num].mean().to_dict(),
                }

        self.dicts = dicts
        self._features = list(dicts)
        return self

    def transform(self, dataset):
        """Compute the "value minus category mean" features.

        Category values that were not seen during fit have no stored mean,
        so ``Series.map`` yields NaN and the resulting feature is NaN.

        Args:
            dataset: Dataset containing the columns used during fit.

        Returns:
            Numpy dataset with one numeric column per fitted pair.
        """
        # Delegate to the base class so that its transform checks run;
        # the returned value is not needed.
        super().transform(dataset)
        # convert to accepted dtype and get attributes
        dataset = dataset.to_pandas()
        df = dataset.data

        role = NumericRole()
        outputs = []
        for feat, value in self.dicts.items():
            cat, num = value['cat'], value['num']
            new_arr = (df[num] - df[cat].map(value['values'])).values.reshape(-1, 1)
            output = dataset.empty().to_numpy()
            # Use the stored key so the output name always matches `features`.
            output.set_data(new_arr, [feat], role)
            outputs.append(output)
        # join the single-column datasets into the result
        return dataset.empty().to_numpy().concat(outputs)
    
    
class GroupByPipeline(FeaturesPipeline, TabularDataFeatures):
    """Feature pipeline that applies GroupByTransformer to selected columns.

    At most ``top_category`` category columns and ``top_numeric`` numeric
    columns are kept before the group-by features are built.
    """

    def __init__(self, feats_imp = None, top_category: int = 3, top_numeric: int = 3, **kwargs):
        """Store the selection limits.

        Args:
            feats_imp: Optional object exposing ``get_features_score()``;
                used to rank columns when there are more than the limits allow.
            top_category: Maximum number of category columns to use.
            top_numeric: Maximum number of numeric columns to use.
            **kwargs: Extra keyword arguments forwarded to the parent
                constructors (previously they were accepted but dropped).
        """
        super().__init__(feats_imp=feats_imp, **kwargs)
        self.top_category = top_category
        self.top_numeric = top_numeric

    def create_pipeline(self, train):
        """Build the transformer for the given training dataset.

        Args:
            train: Dataset whose 'Category' and 'Numeric' columns are inspected.

        Returns:
            UnionTransformer holding the column selector followed by
            GroupByTransformer, or an empty union when either column kind
            is missing.
        """
        categories = get_columns_by_role(train, 'Category')
        numerics = get_columns_by_role(train, 'Numeric')

        # Only rank columns when there are more of them than the limit allows.
        if len(categories) > self.top_category:
            cat_feats_to_select = self.get_top_categories(train, self.top_category)
        else:
            cat_feats_to_select = list(categories)

        if len(numerics) > self.top_numeric:
            num_feats_to_select = self.get_top_numeric(train, self.top_numeric)
        else:
            num_feats_to_select = list(numerics)

        transformer_list = []
        # Group-by features need at least one column of each kind.
        if cat_feats_to_select and num_feats_to_select:
            transformer_list.append(
                SequentialTransformer([
                    ColumnsSelector(keys=cat_feats_to_select + num_feats_to_select),
                    GroupByTransformer(),
                ])
            )

        return UnionTransformer(transformer_list)

    def get_top_numeric(self, train, top_n = 5):
        """Pick the ``top_n`` numeric columns.

        Columns are sorted by importance (descending, when ``feats_imp`` is
        set) and then by negated unique-value count, in the direction given
        by ``self.ascending_by_cardinality``.
        NOTE(review): ``ascending_by_cardinality`` and ``get_uniques_cnt``
        are not defined in this class; presumably they come from
        TabularDataFeatures - confirm.

        Args:
            train: Dataset to take the numeric columns from.
            top_n: Number of columns to return.

        Returns:
            List of selected column names (empty if there are no numeric columns).
        """
        nums = get_columns_by_role(train, 'Numeric')
        if not nums:
            return []

        df = pd.DataFrame({'importance': 0, 'cardinality': 0}, index=nums)
        # importance if defined
        if self.feats_imp is not None:
            feats_imp = pd.Series(self.feats_imp.get_features_score()).sort_values(ascending=False)
            df['importance'] = feats_imp[feats_imp.index.isin(nums)]
            # fillna returns a new Series, so the result must be assigned back;
            # columns without a score get the lowest possible importance.
            df['importance'] = df['importance'].fillna(-np.inf)

        # check for cardinality
        df['cardinality'] = -self.get_uniques_cnt(train, nums)
        # sort
        df = df.sort_values(by=['importance', 'cardinality'], ascending=[False, self.ascending_by_cardinality])
        # get top n
        return list(df.index[:top_n])

## Step 4. Create feature selector.

In [None]:
# Feature selector. Despite the former "post selection" label, it is passed as
# `pre_selection` to MLPipeline and as the importance source to GroupByPipeline.

model0 = BoostLGBM(
    default_params={
        'learning_rate': 0.05,
        'num_leaves': 64,
        'seed': RANDOM_STATE,  # same value (42) as before, now tied to the shared seed
        'num_threads': N_THREADS,
    }
)

pie = NpPermutationImportanceEstimator()
# NOTE(review): cutoff=-99999 is far below any realistic importance, so the
# selector presumably keeps every feature and mainly supplies importance scores - confirm.
selector = ImportanceCutoffSelector(LGBSimpleFeatures(), model0, pie, cutoff=-99999)

## Step 5. Create pipelines.

In [None]:
%%time

# Feature pipeline: LGBAdvancedPipeline with the custom GroupByPipeline appended
# (up to 5 category and 5 numeric columns, ranked via the selector).
pipe = LGBAdvancedPipeline(top_intersections=2).append(
    GroupByPipeline(selector, top_category=5, top_numeric=5)
)

# LightGBM model trained on the resulting features.
model = BoostLGBM(
    default_params={
        'learning_rate': 0.05,
        'num_leaves': 128,
        'seed': 1,
        'num_threads': N_THREADS,
    }
)

# ML pipeline with a single model, the selector as pre-selection and no post-selection.
pipeline = MLPipeline(
    [model],
    pre_selection=selector,
    features_pipeline=pipe,
    post_selection=None,
)

## Step 6. Create AutoML.

In [None]:
%%time 
# Wall-clock timer; the elapsed time below also includes the print call.
start = time.time()

# AutoML with one level that contains the single ML pipeline.
automl = AutoML(
    reader,
    [[pipeline]],
    skip_conn=False,
    verbose=0,
)

# Fit on the training part and collect the out-of-fold predictions.
oof_pred = automl.fit_predict(train_data, roles=roles)
print('oof_pred:\n{}\nShape = {}'.format(oof_pred, oof_pred.shape))
time_automl = time.time() - start

## Step 7. Predict on test data and check scores

In [None]:
%%time

# Predict on the held-out part.
test_pred = automl.predict(test_data)
print('Prediction for test data:\n{}\nShape = {}'.format(test_pred, test_pred.shape))

# ROC AUC on the out-of-fold and the test predictions (first prediction column).
print('Check scores...')
print(
    'OOF score: {}'.format(
        roc_auc_score(train_data[TARGET_NAME].values, oof_pred.data[:, 0])
    )
)
test_automl = roc_auc_score(
    test_data[TARGET_NAME].values,
    test_pred.data[:, 0],
)
print('TEST score: {}'.format(test_automl))
