In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb

from collections import OrderedDict
from sklearn.metrics import roc_auc_score
from copy import deepcopy

# import sys
# sys.path.append('../')
# # sys.path.append('../../old/AutoMLWhitebox')
from autowoe import ReportDeco, AutoWoE

### Чтение выборок

In [2]:
# Load the full dataset from a CSV file; the path is relative to the working directory.
data = pd.read_csv("./data/data_cat.csv")

In [3]:
# Positional hold-out split: the first `n_train` rows form the training set,
# every remaining row goes to the test set.
n_train = 14000
train, test = data.iloc[:n_train], data.iloc[n_train:]

In [4]:
# Display the training frame as the cell output (the rendered view is truncated by pandas).
train

Unnamed: 0,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,...,V36,V37,V38,V39,V40,V41,V42,V43,V44,V45
0,0,86400,68.50,W,13926,,150.0,discover,142.0,credit,...,,,,,,,,,,
1,0,86401,29.00,W,2755,404.0,150.0,mastercard,102.0,credit,...,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0
2,0,86469,59.00,W,4663,490.0,150.0,visa,166.0,debit,...,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0
3,0,86499,50.00,W,18132,567.0,150.0,mastercard,117.0,debit,...,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0
4,0,86506,50.00,H,4497,514.0,150.0,mastercard,102.0,credit,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13995,0,404944,47.95,W,10112,360.0,150.0,visa,166.0,debit,...,,,,,,,,,,
13996,0,404946,150.00,R,3682,264.0,150.0,visa,162.0,credit,...,,,,,,,,,,
13997,0,404956,449.95,W,7474,583.0,150.0,visa,226.0,credit,...,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0
13998,0,404992,15.00,W,9500,321.0,150.0,visa,226.0,debit,...,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0


### Параметры модели

Для обучения модели рекомендуется указать типы признаков.
Для этого создается словарь features_type, в котором ключ -- имя признака, а значение -- его тип:

 "real" -- вещественный признак
 
 "cat" -- категориальный признак
 
 "date" -- признак-дата, задается в виде ("%Y%d%m", ("m", "d", "wd", "h", "min"))
 
 Для неразмеченных признаков типы будут определены автоматически. Такой вариант работает, но качество модели может заметно снизиться. __Однако в этот раз воспользуемся этой опцией и словарь features_type создавать не будем.__

In [5]:
# подробно параметры описаны в Example_1
auto_woe = AutoWoE(monotonic=False,
                     max_bin_count=5,
                     oof_woe=True,
                     regularized_refit=True,
                     p_val=0.05,
                     debug=False,
                     verbose=0,
                     cat_merge_to='to_maxp',
                     nan_merge_to='to_maxp'
        )
auto_woe = ReportDeco(auto_woe)

In [6]:
# Keyword arguments for `fit`: the training frame and the name of the target column.
autowoe_fit_params = dict(train=train, target_name="isFraud")

auto_woe.fit(**autowoe_fit_params)

[LightGBM] [Info] Number of positive: 298, number of negative: 10902
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5423
[LightGBM] [Info] Number of data points in the train set: 11200, number of used features: 98
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.026607 -> initscore=-3.599608
[LightGBM] [Info] Start training from score -3.599608


In [7]:
# Score the hold-out rows, then compare the scores with the true target values.
pred = auto_woe.predict_proba(test)
y_test = test[autowoe_fit_params["target_name"]]
roc_auc_score(y_true=y_test, y_score=pred)

0.8262185562804332

##### Замечание
`ReportDeco` -- обертка для построения отчета. Она не обязательна для обучения и применения модели, но обязательна для построения отчета (см. последнюю ячейку). Для доступа к атрибутам самой модели необходимо обратиться к атрибуту `auto_woe._ReportDeco__auto_woe` декоратора.

### Результат работы автотипизатора

In [8]:
# Show the feature-type mapping (feature name -> "real" / "cat") as the cell output.
auto_woe.private_features_type

{'TransactionDT': 'real',
 'TransactionAmt': 'real',
 'card1': 'real',
 'card2': 'real',
 'card3': 'real',
 'card5': 'real',
 'card6': 'cat',
 'addr1': 'real',
 'dist1': 'real',
 'dist2': 'real',
 'P_emaildomain': 'cat',
 'R_emaildomain': 'cat',
 'C1': 'real',
 'C2': 'real',
 'C4': 'real',
 'C5': 'real',
 'C6': 'real',
 'C7': 'real',
 'C8': 'real',
 'C9': 'real',
 'C10': 'real',
 'C11': 'real',
 'C12': 'real',
 'C13': 'real',
 'C14': 'real',
 'D1': 'real',
 'D2': 'real',
 'D3': 'real',
 'D4': 'real',
 'D5': 'real',
 'D6': 'real',
 'D8': 'real',
 'D9': 'real',
 'D10': 'real',
 'D11': 'real',
 'D12': 'real',
 'D13': 'real',
 'D14': 'real',
 'D15': 'real',
 'M1': 'cat',
 'M2': 'cat',
 'M3': 'cat',
 'M4': 'cat',
 'M5': 'cat',
 'M6': 'cat',
 'M7': 'cat',
 'M8': 'cat',
 'M9': 'cat',
 'V4': 'real',
 'V5': 'real',
 'V6': 'real',
 'V7': 'real',
 'V10': 'real',
 'V11': 'real',
 'V12': 'real',
 'V13': 'real',
 'V19': 'real',
 'V20': 'real',
 'V23': 'real',
 'V24': 'real',
 'V25': 'real',
 'V26': 

### Формирование отчета

In [9]:
# Settings handed to `generate_report`. The "___...___" values look like
# placeholders meant to be replaced with real report metadata.
report_params = dict(
    automl_date_column="report_month",  # date column, in the format of params['datetimeFormat']
    output_path="./AUTOWOE_REPORT_3",  # folder where the report and the files it needs are written
    report_name="___НАЗВАНИЕ ОТЧЕТА___",
    report_version_id=1,
    city="Воронеж",
    model_aim="___ЦЕЛЬ ПОСТРОЕНИЯ МОДЕЛИ___",
    model_name="___НАЗВАНИЕ МОДЕЛИ___",
    zakazchik="___ЗАКАЗЧИК___",
    high_level_department="___ПОДРАЗДЕЛЕНИЕ___",
    ds_name="___РАЗРАБОТЧИК МОДЕЛИ___",
    target_descr="___ОПИСАНИЕ ЦЕЛЕВОГО СОБЫТИЯ___",
    non_target_descr="___ОПИСАНИЕ НЕЦЕЛЕВОГО СОБЫТИЯ___",
)

auto_woe.generate_report(report_params)

No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.
No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.
No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.
No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.
No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.
No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.
No artists with labels found to put in legend.  Note that 