# Import necessary libraries

In [3]:
# Standard python libraries
import logging
import os
import time
logging.basicConfig(format='[%(asctime)s] (%(levelname)s): %(message)s', level=logging.INFO)

# Installed libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import torch

# Imports from our package
from lightautoml.automl.presets.tabular_presets import TabularAutoML, TabularUtilizedAutoML
from lightautoml.dataset.roles import DatetimeRole
from lightautoml.tasks import Task
from lightautoml.utils.profiler import Profiler

from lightautoml.addons.uplift.base import AutoMLParamWrapper, AutoUplift, UpliftCandidateInfo
from lightautoml.addons.uplift import meta_learners
from lightautoml.addons.uplift.metrics import (_available_uplift_modes,
                                               calculate_graphic_uplift_curve,
                                               calculate_min_max_uplift_auc,
                                               calculate_uplift_auc,
                                               perfect_uplift_curve)
from lightautoml.addons.uplift.utils import create_linear_automl
from lightautoml.report.report_deco import ReportDecoUplift


%matplotlib inline

# Parameters

## Setting

In [4]:
# --- Compute budget ---
# Thread count for lgbm and linear models.
N_THREADS = 8
# Time in seconds for an automl run.
TIMEOUT = 300

# --- Validation scheme ---
# Number of folds for AutoML.
N_FOLDS = 5
# Test size for the metric check.
TEST_SIZE = 0.2
# Fixed random state so that runs are repeatable.
RANDOM_STATE = 42

# --- Column names ---
# Target column name.
TARGET_NAME = 'TARGET'
# Column passed as the 'treatment' entry of the roles mapping.
TREATMENT_NAME = 'CODE_GENDER'

## Fix torch number of threads and numpy seed

In [5]:
# Limit torch to the configured number of threads.
torch.set_num_threads(N_THREADS)
# Seed numpy's global random generator with the notebook-wide seed.
np.random.seed(RANDOM_STATE)

# Example data load

In [6]:
%%time

# Location of the sampled example table, relative to the notebook's working directory.
sample_csv_path = './example_data/test_data_files/sampled_app_train.csv'

# Read the table and preview its first rows as the cell output.
data = pd.read_csv(sample_csv_path)
data.head()

CPU times: user 72.2 ms, sys: 7.85 ms, total: 80 ms
Wall time: 79.6 ms


Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,313802,0,Cash loans,M,N,Y,0,270000.0,327024.0,15372.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,319656,0,Cash loans,F,N,N,0,108000.0,675000.0,19737.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,207678,0,Revolving loans,F,Y,Y,2,112500.0,270000.0,13500.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
3,381593,0,Cash loans,F,N,N,1,67500.0,142200.0,9630.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,4.0
4,258153,0,Cash loans,F,Y,Y,0,337500.0,1483231.5,46570.5,...,0,0,0,0,0.0,0.0,0.0,2.0,0.0,0.0


# (Optional) Some user feature preparation

In [7]:
%%time 

# Every derived date is an offset from this fixed reference date.
reference_date = np.datetime64('2018-01-01')
day_delta = np.dtype('timedelta64[D]')

# DAYS_BIRTH is interpreted as a whole-day offset from the reference date.
birth_offset = data['DAYS_BIRTH'].astype(day_delta)
data['BIRTH_DATE'] = (reference_date + birth_offset).astype(str)

# Positive DAYS_EMPLOYED values are capped at 0, so EMP_DATE never falls after the reference date.
employment_offset = data['DAYS_EMPLOYED'].clip(upper=0).astype(day_delta)
data['EMP_DATE'] = (reference_date + employment_offset).astype(str)

# Same timestamp on every row; referenced later as 'report_dt' in the roles mapping.
data['report_dt'] = reference_date

# Degenerate columns: one constant, one entirely missing.
data['constant'] = 1
data['allnan'] = np.nan

# The raw day-offset columns are replaced by the date strings created above.
data.drop(columns=['DAYS_BIRTH', 'DAYS_EMPLOYED'], inplace=True)

# Encode the treatment column as 1 for 'M' and 0 for any other value.
# NOTE: this cell is not re-runnable on the same `data` object (the dropped
# columns are gone and CODE_GENDER is already numeric); reload the CSV first.
data['CODE_GENDER'] = data['CODE_GENDER'].eq('M').astype(int)

CPU times: user 99.6 ms, sys: 8.28 ms, total: 108 ms
Wall time: 106 ms


# Data splitting for train-test

In [9]:
%%time


# Combine target and treatment into one label (target + 10 * treatment) so the
# split is stratified on both columns jointly.
stratify_value = data[TARGET_NAME] + 10 * data[TREATMENT_NAME]

# test_size=3000 is an absolute row count (the fractional TEST_SIZE constant is
# not used here). The seed comes from the shared RANDOM_STATE setting instead of
# a second hard-coded literal, so changing the config cell affects the split.
train_data, test_data = train_test_split(
    data,
    test_size=3000,
    stratify=stratify_value,
    random_state=RANDOM_STATE,
)

# Flat arrays of the test target / treatment, used by the metric cells below.
test_target = test_data[TARGET_NAME].values.ravel()
test_treatment = test_data[TREATMENT_NAME].values.ravel()

CPU times: user 12.6 ms, sys: 211 µs, total: 12.8 ms
Wall time: 12 ms


# Setup columns roles

In [None]:
%%time

# Role object attached to the 'report_dt' column.
# NOTE(review): presumably base_date=True marks it as the reference date for
# other datetime features; confirm against the DatetimeRole documentation.
report_date_role = DatetimeRole(base_date=True, seasonality=(), base_feats=False)

# Mapping handed to every fit() call: target column, treatment column,
# and the datetime role for the report date column.
roles = {
    'target': TARGET_NAME,
    'treatment': TREATMENT_NAME,
    report_date_role: 'report_dt',
}

# Uplift modeling

## AutoUplift (use predefined uplift methods)

### Fit autouplift

In [None]:
%%time

# Binary-outcome task handed to AutoUplift.
task = Task('binary')

autouplift = AutoUplift(task,
                        add_dd_candidates=True,
                        metric='adj_qini',
                        normed_metric=True,
                        test_size=0.2,
                        threshold_imbalance_treatment=0.0,
                        timeout=100)  # time budget in seconds for this demo run

# Fit on the training split. The split cell names it `train_data`; the bare
# name `train` is never defined in this notebook and raised NameError.
autouplift.fit(train_data, roles)

### Show rating of uplift methods (meta-learners)

In [None]:
%%time

# Fetch the meta-learner rating table from the fitted AutoUplift object and
# show it as the cell output.
# NOTE(review): the method name is spelled `get_metalearners_ranting`; confirm
# this matches the installed LightAutoML version.
rating_table = autouplift.get_metalearners_ranting()
rating_table

### Predict on test data and check metrics

In [None]:
%%time

# Score the held-out split. The split cell names it `test_data`; the bare name
# `test` is never defined in this notebook and raised NameError.
uplift_pred, treatment_pred, control_pred = autouplift.predict(test_data)
uplift_pred = uplift_pred.ravel()

# ROC AUC of each outcome prediction, measured only on the rows of its own group.
roc_auc_treatment = roc_auc_score(test_target[test_treatment == 1], treatment_pred[test_treatment == 1])
roc_auc_control = roc_auc_score(test_target[test_treatment == 0], control_pred[test_treatment == 0])

# Uplift AUC of the model (raw and normed) plus the baseline / perfect bounds.
uplift_auc_algo = calculate_uplift_auc(test_target, uplift_pred, test_treatment, normed=False)
uplift_auc_algo_normed = calculate_uplift_auc(test_target, uplift_pred, test_treatment, normed=True)
auc_base, auc_perfect = calculate_min_max_uplift_auc(test_target, test_treatment)

# NOTE(review): the label below says "OOF", but these values are computed on the test split.
logging.info('--- Check scores ---')
logging.info('OOF scores "ROC_AUC":')
logging.info('\tTreatment = %f', roc_auc_treatment)
logging.info('\tControl   = %f', roc_auc_control)
logging.info('Uplift score of test group (default="adj_qini"):')
logging.info('\tBaseline      = %f', auc_base)
logging.info('\tAlgo (Normed) = %f (%f)', uplift_auc_algo, uplift_auc_algo_normed)
logging.info('\tPerfect       = %f', auc_perfect)

### Get best metalearner with report functionality (should refit on train data for generating report)

In [None]:
%%time 

# Build the best-ranked meta-learner with report support and a 100-second timeout override.
best_metalearner_repo = autouplift.create_best_meta_learner(need_report=True, update_metalearner_params={'timeout': 100})

# Refit and predict on the actual splits. They are named `train_data` and
# `test_data`; the bare names `train` / `test` are never defined in this notebook.
best_metalearner_repo.fit(train_data, roles)
best_metalearner_repo.predict(test_data)

# Path to report: PATH_TO_CURRENT_NOTEBOOK/lama_report/lama_interactive_report.html

## AutoUplift (custom uplift methods)

### Fit autouplift

In [1]:
%%time

# Set uplift candidates for choosing the best of them
# !!!ATTENTION!!!
#    This is a demonstration of the possibilities,
#    you may use the default set of candidates instead

task = Task('binary')

uplift_candidates = [
    # T-learner with the library defaults.
    UpliftCandidateInfo(
        'TLearner__Default',
        meta_learners.TLearner,
        {'base_task': task}
    ),
    # T-learner with explicitly chosen treatment / control learners.
    UpliftCandidateInfo(
        'TLearner__Custom',
        meta_learners.TLearner,
        {
            'treatment_learner': AutoMLParamWrapper(TabularAutoML, {'task': task, 'timeout': 10}),
            'control_learner': create_linear_automl(base_task=Task('binary'))
        }
    ),
    # X-learner with custom outcome, effect and propensity learners.
    UpliftCandidateInfo(
        'XLearner__Custom',
        meta_learners.XLearner,
        {
            'outcome_learners': [
                TabularAutoML(task=task, timeout=10), # [sec] , Only speed up example, don't change it!
                create_linear_automl(base_task=Task('binary'))
            ],
            'effect_learners': [AutoMLParamWrapper(TabularAutoML, {'task': Task('reg'), 'timeout': 5})],
            'propensity_learner': create_linear_automl(base_task=Task('binary')),
        }
    )
]

autouplift = AutoUplift(task,
                        uplift_candidates=uplift_candidates,
                        add_dd_candidates=True,
                        metric='adj_qini',
                        normed_metric=True,
                        test_size=0.2,
                        threshold_imbalance_treatment=0.0,    # Doesn't affect, see warnings
                        timeout=600)                          # Doesn't affect, see warnings

# Fit on the training split. The split cell names it `train_data`; the bare
# name `train` is never defined in this notebook.
autouplift.fit(train_data, roles)

NameError: name 'Task' is not defined

### Show rating of uplift methods (meta-learners)

In [2]:
%%time

# Fetch the rating table for the custom candidate set and show it as the cell output.
# NOTE(review): the method name is spelled `get_metalearners_ranting`; confirm
# this matches the installed LightAutoML version.
rating_table = autouplift.get_metalearners_ranting()
rating_table

NameError: name 'autouplift' is not defined

### Predict on test data and check metrics

In [None]:
%%time

# Score the held-out split. The split cell names it `test_data`; the bare name
# `test` is never defined in this notebook.
uplift_pred, treatment_pred, control_pred = autouplift.predict(test_data)
uplift_pred = uplift_pred.ravel()

# ROC AUC of each outcome prediction, measured only on the rows of its own group.
roc_auc_treatment = roc_auc_score(test_target[test_treatment == 1], treatment_pred[test_treatment == 1])
roc_auc_control = roc_auc_score(test_target[test_treatment == 0], control_pred[test_treatment == 0])

# Uplift AUC of the model (raw and normed) plus the baseline / perfect bounds.
uplift_auc_algo = calculate_uplift_auc(test_target, uplift_pred, test_treatment, normed=False)
uplift_auc_algo_normed = calculate_uplift_auc(test_target, uplift_pred, test_treatment, normed=True)
auc_base, auc_perfect = calculate_min_max_uplift_auc(test_target, test_treatment)

# NOTE(review): the label below says "OOF", but these values are computed on the test split.
logging.info('--- Check scores ---')
logging.info('OOF scores "ROC_AUC":')
logging.info('\tTreatment = %f', roc_auc_treatment)
logging.info('\tControl   = %f', roc_auc_control)
logging.info('Uplift score of test group (default="adj_qini"):')
logging.info('\tBaseline      = %f', auc_base)
logging.info('\tAlgo (Normed) = %f (%f)', uplift_auc_algo, uplift_auc_algo_normed)
logging.info('\tPerfect       = %f', auc_perfect)

### Get best metalearner with report functionality (should refit on train data for generating report)

In [None]:
%%time 

# Build the best-ranked meta-learner with a 60-second timeout override.
# need_report=False is passed here, so no report support is requested for this refit.
best_metalearner_repo = autouplift.create_best_meta_learner(need_report=False, update_metalearner_params={'timeout': 60})

# Refit and predict on the actual splits. They are named `train_data` and
# `test_data`; the bare names `train` / `test` are never defined in this notebook.
best_metalearner_repo.fit(train_data, roles)
best_metalearner_repo.predict(test_data)

## MetaLearner

### TLearner

#### Fit on train data

In [None]:
%%time

# Default setting: a T-learner built from the binary task alone.
tlearner = meta_learners.TLearner(base_task=Task('binary'))

# Fit on the training split (`train_data`); the bare name `train` is never
# defined in this notebook.
tlearner.fit(train_data, roles)

#### Predict on test data and check metrics

In [None]:
%%time

# Score the held-out split with the T-learner. The split cell names it
# `test_data`; the bare name `test` is never defined in this notebook.
uplift_pred, treatment_pred, control_pred = tlearner.predict(test_data)
uplift_pred = uplift_pred.ravel()

# ROC AUC of each outcome prediction, measured only on the rows of its own group.
roc_auc_treatment = roc_auc_score(test_target[test_treatment == 1], treatment_pred[test_treatment == 1])
roc_auc_control = roc_auc_score(test_target[test_treatment == 0], control_pred[test_treatment == 0])

# Uplift AUC of the model (raw and normed) plus the baseline / perfect bounds.
uplift_auc_algo = calculate_uplift_auc(test_target, uplift_pred, test_treatment, normed=False)
uplift_auc_algo_normed = calculate_uplift_auc(test_target, uplift_pred, test_treatment, normed=True)
auc_base, auc_perfect = calculate_min_max_uplift_auc(test_target, test_treatment)

# NOTE(review): the label below says "OOF", but these values are computed on the test split.
logging.info('--- Check scores ---')
logging.info('OOF scores "ROC_AUC":')
logging.info('\tTreatment = %f', roc_auc_treatment)
logging.info('\tControl   = %f', roc_auc_control)
logging.info('Uplift score of test group (default="adj_qini"):')
logging.info('\tBaseline      = %f', auc_base)
logging.info('\tAlgo (Normed) = %f (%f)', uplift_auc_algo, uplift_auc_algo_normed)
logging.info('\tPerfect       = %f', auc_perfect)

### XLearner

#### Fit on train data

In [None]:
%%time

# Custom base algorithms: each sub-model is its own TabularAutoML instance with
# a 10-second timeout (binary tasks for propensity / outcome, regression for effect).
xlearner = meta_learners.XLearner(
    propensity_learner=TabularAutoML(task=Task('binary'), timeout=10),
    outcome_learners=[
        TabularAutoML(task=Task('binary'), timeout=10),
        TabularAutoML(task=Task('binary'), timeout=10)
    ],
    effect_learners=[
        TabularAutoML(task=Task('reg'), timeout=10),
        TabularAutoML(task=Task('reg'), timeout=10)
    ]
)

# Fit on the training split (`train_data`); the bare name `train` is never
# defined in this notebook.
xlearner.fit(train_data, roles)

#### Predict on test data and check metrics

In [None]:
%%time

# Score the held-out split with the X-learner. The split cell names it
# `test_data`; the bare name `test` is never defined in this notebook.
uplift_pred, treatment_pred, control_pred = xlearner.predict(test_data)
uplift_pred = uplift_pred.ravel()

# ROC AUC of each outcome prediction, measured only on the rows of its own group.
roc_auc_treatment = roc_auc_score(test_target[test_treatment == 1], treatment_pred[test_treatment == 1])
roc_auc_control = roc_auc_score(test_target[test_treatment == 0], control_pred[test_treatment == 0])

# Uplift AUC of the model (raw and normed) plus the baseline / perfect bounds.
uplift_auc_algo = calculate_uplift_auc(test_target, uplift_pred, test_treatment, normed=False)
uplift_auc_algo_normed = calculate_uplift_auc(test_target, uplift_pred, test_treatment, normed=True)
auc_base, auc_perfect = calculate_min_max_uplift_auc(test_target, test_treatment)

# NOTE(review): the label below says "OOF", but these values are computed on the test split.
logging.info('--- Check scores ---')
logging.info('OOF scores "ROC_AUC":')
logging.info('\tTreatment = %f', roc_auc_treatment)
logging.info('\tControl   = %f', roc_auc_control)
logging.info('Uplift score of test group (default="adj_qini"):')
logging.info('\tBaseline      = %f', auc_base)
logging.info('\tAlgo (Normed) = %f (%f)', uplift_auc_algo, uplift_auc_algo_normed)
logging.info('\tPerfect       = %f', auc_perfect)

# Uplift metrics and graphics (using xlearner predictions)

In [None]:
%%time 

# Metric mode passed to the curve calculations in the cells below.
UPLIFT_METRIC = 'adj_qini'

# Log the collection of modes exposed by the uplift metrics module.
logging.info("All available uplift metrics: %s", _available_uplift_modes)

## Algorithm uplift curve 

In [None]:
%%time

# Algorithm curve
# `uplift_pred` is reassigned by every predict cell above, so this curve reflects
# whichever of them ran last (the XLearner one when the notebook is run top to bottom).
xs_xlearner, ys_xlearner = calculate_graphic_uplift_curve(
    test_target, uplift_pred, test_treatment, mode=UPLIFT_METRIC
)

## Baseline, perfect curve

In [None]:
# Baseline curve: a straight line from the origin to the final value of the
# algorithm curve.
xs_base = [0, 1]
ys_base = [0, ys_xlearner[-1]]

# Perfect curve: first obtain the ideal uplift scores for the test split, then
# turn them into a curve with the same routine and mode as the algorithm curve.
perfect_uplift = perfect_uplift_curve(test_target, test_treatment)
xs_perfect, ys_perfect = calculate_graphic_uplift_curve(
    test_target,
    perfect_uplift,
    test_treatment,
    mode=UPLIFT_METRIC,
)

In [None]:
plt.figure(figsize=(10, 7))

# Draw the three curves in the same order as the legend entries below.
plt.plot(xs_base, ys_base, 'black')
plt.plot(xs_xlearner, ys_xlearner, 'red')
plt.plot(xs_perfect, ys_perfect, 'green')

# Shade the area under the algorithm curve.
plt.fill_between(xs_xlearner, ys_xlearner, alpha=0.5, color='orange')

plt.xlabel('Cumulative percentage of people in T/C groups')
# str.format only substitutes `{}` placeholders; the earlier `%s` was printed
# literally, so the metric name never appeared in the axis label.
plt.ylabel('Uplift metric ({})'.format(UPLIFT_METRIC))
plt.grid()
plt.legend(['Baseline', 'XLearner', 'Perfect']);

# Report

In [None]:
%%time

# Wrap a default T-learner in the uplift report decorator.
RDU = ReportDecoUplift()
tlearner_deco = RDU(meta_learners.TLearner(base_task=Task('binary')))

# Fit and predict on the actual splits. They are named `train_data` and
# `test_data`; the bare names `train` / `test` are never defined in this notebook.
tlearner_deco.fit(train_data, roles)
tlearner_deco.predict(test_data);

# Path to report: PATH_TO_CURRENT_NOTEBOOK/lama_report/lama_interactive_report.html