### Установка (при необходимости раскомментировать)

In [1]:
# Optional setup: uncomment the commands below to (re)install the package from source.
# # Uninstall the previous version of WhiteBox (the `autowoe` package)
# !pip3 uninstall --yes autowoe
# # Install requirements (if on Linux)
# # !pip3 install -r requirements.txt
# # Install WhiteBox
# !python3 setup.py install --user

In [2]:
import pandas as pd
import numpy as np
import lightgbm as lgb

from collections import OrderedDict
from sklearn.metrics import roc_auc_score
from tqdm import tqdm
from copy import deepcopy

from autowoe import ReportDeco, AutoWoE

### Чтение и подготовка обучающей выборки

In [3]:
# Load the training sample: rows are indexed by "line_id" and the columns
# datetime_0 and datetime_1 are parsed as dates.
train = pd.read_csv(
    "./train_demo.csv",
    index_col="line_id",
    parse_dates=[f"datetime_{i}" for i in range(2)],
    low_memory=False,
)

### Чтение и подготовка тестовой выборки

In [4]:
# Load the test sample with the same parsing options as the training sample
# (including low_memory=False) so that column dtypes are inferred from the
# whole file for both frames rather than chunk by chunk for only one of them.
test = pd.read_csv(
    "./test_demo.csv",
    low_memory=False,
    index_col="line_id",
    parse_dates=["datetime_" + str(i) for i in range(2)],
)

# The target for the test sample is stored in a separate file. It is attached
# positionally (.values), so its rows are assumed to be in the same order as
# the rows of test_demo.csv.
test_target = pd.read_csv("./test-target_demo.csv")["target"]
test["target"] = test_target.values

### Параметры модели

Для обучения модели рекомендуется явно указать типы признаков.
Для этого создается словарь features_type, в котором ключи -- имена признаков, а значения -- их типы:

 "real" -- вещественный признак
 
 "cat" --  категориальный.
 
 __"date" -- дата; задается кортежем (формат даты, сезонности), например ("%Y%d%m", ("m", "d", "wd", "h", "min"))__
 
 Для признаков, которые не размечены, типы будут определены автоматически. Такой вариант будет работать, но качество заметно снизится.
 
__Попробуем указать даты с форматом None (автоопределение) и сезонностью - день месяца и день недели__

#### features_type

In [5]:
# Columns are typed by naming convention: every column whose name contains
# "numb" is declared a real-valued feature.
num_col = [col for col in train.columns if "numb" in col]
num_feature_type = dict.fromkeys(num_col, "real")

# Every column whose name contains "datetime" is declared a date feature,
# described by a (format, seasonality) tuple: format None (auto-detection)
# with day-of-month ("d") and day-of-week ("wd") seasonality.
date_col = [col for col in train.columns if "datetime" in col]
date_feature_type = {col: (None, ("d", "wd")) for col in date_col}

In [6]:
# Merge both mappings into the single feature -> type dictionary passed to fit().
# Keyword unpacking raises TypeError if the same column appears in both mappings.
features_type = dict(**num_feature_type, **date_feature_type)

In [7]:
# подробно параметры описаны в Example_1
auto_woe = AutoWoE(monotonic=True,
                     max_bin_count=4,
                     oof_woe=False,
                     regularized_refit=False,
                     p_val=0.05,
                     debug=False
        )
auto_woe = ReportDeco(auto_woe)

In [8]:
# Fit on the typed feature columns plus the target column only.
auto_woe.fit(
    train[[*num_col, *date_col, "target"]],
    target_name="target",
    features_type=features_type,
)

 features ['number_16', 'number_17', 'number_18', 'number_19', 'number_20', 'number_21', 'number_22', 'number_23', 'number_24', 'number_25', 'number_26', 'number_27', 'number_28', 'number_30', 'number_31', 'number_32', 'number_34', 'number_35', 'number_36', 'number_37', 'number_40', 'number_41', 'number_42', 'number_96', 'number_97', 'number_99', 'number_100', 'number_101', 'number_105', 'number_113', 'number_115', 'number_116', 'number_117', 'number_118', 'number_119', 'number_120', 'number_121', 'number_122', 'number_123', 'number_124', 'number_125', 'number_126', 'number_127', 'number_128', 'number_129', 'number_131', 'number_132', 'number_133', 'number_134', 'number_135', 'number_141', 'number_142', 'number_143', 'number_144', 'number_145', 'number_147', 'number_149', 'number_150', 'number_151', 'number_152', 'number_208', 'number_209', 'number_210', 'number_211', 'number_212', 'number_213', 'number_214', 'number_215', 'number_216', 'number_217', 'number_218', 'number_219', 'number



Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[2]	val_set's auc: 0.623319
 features ['number_0', 'number_2', 'number_3', 'number_5', 'number_6', 'number_7', 'number_8', 'number_10', 'number_11', 'number_12', 'number_14', 'number_15', 'number_29', 'number_33', 'number_38', 'number_39', 'number_43', 'number_44', 'number_45', 'number_46', 'number_47', 'number_48', 'number_49', 'number_50', 'number_51', 'number_52', 'number_53', 'number_54', 'number_55', 'number_56', 'number_57', 'number_58', 'number_59', 'number_60', 'number_61', 'number_62', 'number_63', 'number_64', 'number_65', 'number_66', 'number_67', 'number_68', 'number_69', 'number_70', 'number_71', 'number_72', 'number_73', 'number_74', 'number_75', 'number_76', 'number_77', 'number_78', 'number_79', 'number_80', 'number_81', 'number_82', 'number_83', 'number_84', 'number_85', 'number_86', 'number_87', 'number_88', 'number_89', 'number_90', 'number_91', 'number_92', 'number_93', '

number_1 processing...
number_9 processing...
number_103 processing...
number_158 processing...
number_170 processing...
number_179 processing...
number_182 processing...
number_247 processing...
number_261 processing...
number_287 processing...
number_155 processing...
number_221 processing...
number_173 processing...
number_180 processing...
number_251 processing...
number_289 processing...
number_283 processing...
number_13 processing...
number_164 processing...
number_4 processing...
number_295 processing...
number_332 processing...
number_335 processing...
number_359 processing...
number_367 processing...
number_646 processing...
number_706 processing...
number_742 processing...
number_749 processing...
number_334 processing...
number_754 processing...
number_328 processing...
number_368 processing...
number_345 processing...
number_364 processing...
number_695 processing...
number_743 processing...
number_708 processing...
number_761 processing...
number_751 processing...
number_

In [9]:

# Score the test sample and measure ROC AUC against the known test target.
pred = auto_woe.predict_proba(test)
roc_auc_score(y_true=test["target"], y_score=pred)

0.7903629257398102

##### Замечание
ReportDeco -- обертка для построения отчета. Она не обязательна для обучения и применения модели, но обязательна для построения отчета (см. последнюю ячейку).

### Значения коэффициентов и p-values

При указании regularized_refit=False для коэффициентов модели будут оценены p-value. Коэффициенты с p-value выше указанного порога не будут включены в модель.

In [10]:
# Fitted coefficients of the features kept in the final model.
auto_woe.features_fit

number_9           -0.800556
number_345         -0.762075
number_761         -0.920001
number_706         -0.879256
number_1           -1.149317
number_368         -1.096183
datetime_0__F__d   -0.995278
number_646         -1.293710
number_221         -1.084248
dtype: float64

In [11]:
# Intercept of the fitted model.
auto_woe.intercept

-4.5442729794875625

In [12]:
# p-values of the fitted coefficients; the intercept is listed as "Intercept_".
auto_woe.p_vals

number_9            0.000009
number_345          0.002176
number_761          0.000484
number_706          0.000242
number_1            0.000348
number_368          0.000003
datetime_0__F__d    0.001416
number_646          0.005687
number_221          0.006115
Intercept_          0.000000
dtype: float64

### Формирование отчета

In [13]:
# Settings for the generated report; the "___...___" values are placeholders
# to be filled in for a real report.
report_params = dict(
    automl_date_column="report_month",  # date column, in the format given by params['datetimeFormat']
    output_path="../AUTOWOE_REPORT_2",  # folder where the report and its supporting files are written
    report_name="___НАЗВАНИЕ ОТЧЕТА___",
    report_version_id=1,
    city="Воронеж",
    model_aim="___ЦЕЛЬ ПОСТРОЕНИЯ МОДЕЛИ___",
    model_name="___НАЗВАНИЕ МОДЕЛИ___",
    zakazchik="___ЗАКАЗЧИК___",
    high_level_department="___ПОДРАЗДЕЛЕНИЕ___",
    ds_name="___РАЗРАБОТЧИК МОДЕЛИ___",
    target_descr="___ОПИСАНИЕ ЦЕЛЕВОГО СОБЫТИЯ___",
    non_target_descr="___ОПИСАНИЕ НЕЦЕЛЕВОГО СОБЫТИЯ___",
)

auto_woe.generate_report(report_params)

No handles with labels found to put in legend.
No handles with labels found to put in legend.
No handles with labels found to put in legend.
No handles with labels found to put in legend.
No handles with labels found to put in legend.
No handles with labels found to put in legend.
No handles with labels found to put in legend.
No handles with labels found to put in legend.
No handles with labels found to put in legend.
No handles with labels found to put in legend.
  grp = df.groupby(col)['pred', 'Target'].mean()
  grp = df.groupby(col)['pred', 'Target'].mean()
  grp = df.groupby(col)['pred', 'Target'].mean()
  grp = df.groupby(col)['pred', 'Target'].mean()
  grp = df.groupby(col)['pred', 'Target'].mean()
  grp = df.groupby(col)['pred', 'Target'].mean()
  grp = df.groupby(col)['pred', 'Target'].mean()
  grp = df.groupby(col)['pred', 'Target'].mean()
  grp = df.groupby(col)['pred', 'Target'].mean()


Successfully wrote ../AUTOWOE_REPORT_2/autowoe_report.html.
