In [1]:
import pandas as pd
from sklearn.metrics import roc_auc_score

from autowoe import AutoWoE, ReportDeco

### Чтение и подготовка обучающей выборки

In [2]:
# Load the training sample: rows are indexed by line_id, and the
# datetime_0 / datetime_1 columns are parsed as dates.
train = pd.read_csv(
    "./data/train_demo.csv",
    index_col="line_id",
    parse_dates=[f"datetime_{i}" for i in range(2)],
    low_memory=False,
)

### Чтение и подготовка тестовой выборки

In [3]:
# Load the test sample with the same options as the training sample.
# low_memory=False mirrors the train read so that pandas infers each
# column's dtype from the whole file instead of chunk by chunk.
test = pd.read_csv(
    "./data/test_demo.csv",
    low_memory=False,
    index_col="line_id",
    parse_dates=["datetime_" + str(i) for i in range(2)],
)

# The target is stored in a separate file and attached by position (.values).
# NOTE(review): assumes both files list rows in the same order — confirm.
test_target = pd.read_csv("./data/test-target_demo.csv")["target"]
test["target"] = test_target.values

### Параметры модели

Для обучения модели рекомендуется явно указать типы признаков.
Для этого создается словарь features_type, в котором ключи — имена признаков, а значения — их типы:

 "real" -- вещественный признак
 
 "cat" --  категориальный.
 
 __"date" -- кортеж (формат даты, сезонности), например ("%Y%d%m", ("m", "d", "wd", "h", "min"))__
 
 Для признаков, которые не размечены, типы будут определены автоматически. Такой вариант будет работать, но качество заметно просядет.
 
__Попробуем указать даты с форматом None (автоопределение) и сезонностью - день месяца и день недели__

#### features_type

In [4]:
# Numeric features: every column whose name contains "numb" is marked "real".
num_col = [name for name in train.columns if "numb" in name]
num_feature_type = {name: "real" for name in num_col}

# Date features: every column whose name contains "datetime" gets a
# (format, seasonality) pair — format None with seasonalities "d" and "wd".
date_col = [name for name in train.columns if "datetime" in name]
date_feature_type = {name: (None, ("d", "wd")) for name in date_col}

In [5]:
# Merge the numeric and date type mappings into the single dict passed to fit().
# dict(**a, **b) raises TypeError if the same column name appears in both.
features_type = dict(**num_feature_type, **date_feature_type)

In [6]:
# подробно параметры описаны в Example_1
auto_woe = AutoWoE(
    monotonic=True, max_bin_count=4, oof_woe=False, regularized_refit=False, p_val=0.05, debug=False, verbose=0
)
auto_woe = ReportDeco(auto_woe)

In [7]:
# Fit on the numeric and date columns plus the target, with explicit feature types.
auto_woe.fit(
    train[[*num_col, *date_col, "target"]],
    target_name="target",
    features_type=features_type,
)

[LightGBM] [Info] Number of positive: 63, number of negative: 5537
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11532
[LightGBM] [Info] Number of data points in the train set: 5600, number of used features: 652
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.011250 -> initscore=-4.476073
[LightGBM] [Info] Start training from score -4.476073


In [8]:
# Score the test sample and report ROC AUC against the attached target.
pred = auto_woe.predict_proba(test)
roc_auc_score(y_true=test["target"], y_score=pred)

0.7911446119486321

##### Замечание
ReportDeco — обертка для построения отчета. Она не обязательна для обучения и применения модели, но обязательна для построения отчета (см. последнюю ячейку).

### Значения коэффициентов и p-values

При указании regularized_refit=False будет произведена оценка p-value для коэффициентов модели. Коэффициенты с p-value выше указанного порога не будут включены в модель.

In [9]:
# Coefficients of the features kept in the fitted model.
auto_woe.features_fit

number_254         -0.487530
number_10          -0.475665
number_345         -0.707849
number_759         -0.763258
number_761         -0.894294
number_706         -0.648337
number_1           -1.044868
number_368         -1.062441
datetime_1__F__d   -1.232442
dtype: float64

In [10]:
# Intercept of the fitted model.
auto_woe.intercept

-4.545016720125766

In [11]:
# p-values of the kept coefficients and of the intercept.
auto_woe.p_vals

number_254          0.013034
number_10           0.030010
number_345          0.004663
number_759          0.001166
number_761          0.000357
number_706          0.006792
number_1            0.001364
number_368          0.000006
datetime_1__F__d    0.003993
Intercept_          0.000000
dtype: float64

### Формирование отчета

In [12]:
# Settings for the generated report; the "___...___" values are placeholders
# to be filled in by the user.
report_params = {
    # date column, in the format given by params['datetimeFormat']
    "automl_date_column": "report_month",
    # folder where the report and its supporting files are written
    "output_path": "./AUTOWOE_REPORT_2",
    "report_name": "___НАЗВАНИЕ ОТЧЕТА___",
    "report_version_id": 1,
    "city": "Воронеж",
    "model_aim": "___ЦЕЛЬ ПОСТРОЕНИЯ МОДЕЛИ___",
    "model_name": "___НАЗВАНИЕ МОДЕЛИ___",
    "zakazchik": "___ЗАКАЗЧИК___",
    "high_level_department": "___ПОДРАЗДЕЛЕНИЕ___",
    "ds_name": "___РАЗРАБОТЧИК МОДЕЛИ___",
    "target_descr": "___ОПИСАНИЕ ЦЕЛЕВОГО СОБЫТИЯ___",
    "non_target_descr": "___ОПИСАНИЕ НЕЦЕЛЕВОГО СОБЫТИЯ___",
}

# Build the report; this relies on auto_woe being wrapped in ReportDeco.
auto_woe.generate_report(report_params)

No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.
No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.
No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.
No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.
No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.
No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.
No artists with labels found to put in legend.  Note that 