In [41]:
import yaml
import sys 
import os
import logging
import joblib
import pandas as pd

# Log records are formatted as "<timestamp>;<level>;<message>" and emitted from INFO upwards.
logging.basicConfig(format="%(asctime)s;%(levelname)s;%(message)s",level=logging.INFO)

# Make the custom modules importable: the `modules` directory is resolved
# under the parent of sys.path[0].
# The membership check keeps the cell idempotent, so re-running it does not
# pile up duplicate entries on sys.path.
modules_dir = os.path.join(os.path.dirname(sys.path[0]),'modules')
if modules_dir not in sys.path:
    sys.path.append(modules_dir)

# Import custom modules
from binning import iv_varsel, woe_bins
from fit import model_config, mltrain_loop

In [42]:
# Load Config
# The context manager closes the file handle as soon as parsing is done
# (a bare open() inside the call left it open until garbage collection).
# NOTE(review): yaml.FullLoader can construct more than plain data types; if
# msft.yaml only holds plain mappings/lists/scalars, yaml.safe_load would be
# the stricter choice — confirm before switching.
with open('msft.yaml','r') as config_file:
    c = yaml.load(config_file,Loader=yaml.FullLoader)

# Input dataset, read from the parquet path stored under the config's 'data' key.
data = pd.read_parquet(c['data'])

In [43]:
#data = pd.read_parquet(c['dpath']+c['dvarsel'])

In [44]:
# Stage switches. rerun_varsel is used in this cell; rerun_binning and
# refit_models are read by the binning and model-fitting cells further down.
rerun_varsel = False
rerun_binning = False
refit_models = False

# Does the variable-selection output already exist on disk?
varsel_exists = os.path.isfile(c['dpath']+c['dvarsel'])

if not varsel_exists:
    logging.info('Variable Selection dataset not found. Start Variable Selection')
else:
    logging.info('Variable Selection dataset already exists')
    if rerun_varsel:
        logging.info('Variable Selection redone')

# Run variable selection when its output is missing or a rerun was requested.
if rerun_varsel or not varsel_exists:
    iv_varsel(data, c['bads'], c['target'], c['dpath'], c['dvarsel'])

2020-10-19 22:45:33,179;INFO;Variable Selection dataset already exists


In [45]:
# Does the WoE binning output already exist on disk?
bins_exist = os.path.isfile(c['dpath']+c['dbins'])

if bins_exist:
    logging.info('Woe Binning dataset already exists')
    if rerun_binning:
        logging.info('Woe Binning redone')
else:
    logging.info('Woe Binning dataset not found. Start Woe Binning')

# Run the binning when its output is missing or a rerun was requested.
if rerun_binning or not bins_exist:
    # Read the variable-selection output into its own name instead of rebinding
    # `data`: `data` is the frame the variable-selection cell passes to
    # iv_varsel, and overwriting it here only on some runs made that cell
    # behave differently when re-executed afterwards.
    data_varsel = pd.read_parquet(c['dpath']+c['dvarsel'])
    # NOTE(review): woe_bins receives (target, bads) while iv_varsel in the
    # variable-selection cell receives (bads, target) — confirm both orders
    # against the function signatures in the `binning` module.
    woe_bins(
        data_varsel,
        c['target'],
        c['bads'],
        c['dpath'],
        c['dbins'],
        c['ppath'],
    )

2020-10-19 22:45:36,314;INFO;Woe Binning dataset already exists


In [46]:
# Build the model collection from the 'models' section of the config.
models = model_config(m_dict=c['models'])

# Reload the variable-selection dataset and the stored bins from disk.
# NOTE(review): joblib.load unpickles the file, so it should only ever point
# at a bins file produced by this project.
data = pd.read_parquet(c['dpath']+c['dvarsel'])
bfile = c['dpath']+c['dbins']
bins = joblib.load(bfile)

print(len(models))

# Boundaries of the training / validation / test windows, converted from the
# config values to pandas Timestamps. The keys double as keyword names below.
split_keys = (
    'begin_training',
    'end_training',
    'begin_valid',
    'end_valid',
    'begin_test',
    'end_test',
)
split_bounds = {key: pd.Timestamp(c[key]) for key in split_keys}

# Hand everything to the training loop; refit_models is the flag set in the
# variable-selection cell.
mltrain_loop(
    models,
    data,
    bins,
    c['target'],
    offset_lst=[0],
    **split_bounds,
    refit_models=refit_models,
)

2020-10-19 22:45:40,764;INFO;Fitting 38 models: ['LogisticRegression_l1_ratio0_penaltyl2_solversaga', 'SVC_C0.5_gammaauto_kernellinear_max_iter1000_probabilityTrue', 'SVC_C1_gammascale_kernelrbf_max_iter1000_probabilityTrue', 'MLPClassifier_activationlogistic_alpha0.0001_hidden_layer_sizes(50, 50)_learning_rateconstant_max_iter1000_solveradam', 'DecisionTreeClassifier_criteriongini_max_depth8', 'MLPClassifier_activationlogistic_alpha0.0001_hidden_layer_sizes(60, 40)_learning_rateconstant_max_iter1000_solveradam', 'SVC_C1_gammaauto_kernellinear_max_iter1000_probabilityTrue', 'RandomForestClassifier_max_depth2_n_estimators500', 'SVC_C0.5_gammascale_kernelsigmoid_max_iter1000_probabilityTrue', 'RandomForestClassifier_max_depth10_n_estimators500', 'SVC_C1_gammaauto_kernelrbf_max_iter1000_probabilityTrue', 'DecisionTreeClassifier_criteriongini_max_depth10', 'MLPClassifier_activationlogistic_alpha0.0001_hidden_layer_sizes(70, 30)_learning_rateconstant_max_iter1000_solveradam', 'LogisticRegre

38
[INFO] converting into woe values ...
[INFO] converting into woe values ...
[INFO] converting into woe values ...


2020-10-19 22:45:41,626;INFO;Model LogisticRegression_l1_ratio0_penaltyl2_solversaga: Starting
2020-10-19 22:45:41,626;INFO;Model LogisticRegression_l1_ratio0_penaltyl2_solversaga: Fitting started
2020-10-19 22:45:41,646;INFO;Model LogisticRegression_l1_ratio0_penaltyl2_solversaga: Fitting ended
2020-10-19 22:45:41,674;INFO;Model LogisticRegression_l1_ratio0_penaltyl2_solversaga: Metrics Calculated
2020-10-19 22:45:41,676;INFO;Model LogisticRegression_l1_ratio0_penaltyl2_solversaga: Finished 1/38
2020-10-19 22:45:41,679;INFO;Model SVC_C0.5_gammaauto_kernellinear_max_iter1000_probabilityTrue: Starting
2020-10-19 22:45:41,681;INFO;Model SVC_C0.5_gammaauto_kernellinear_max_iter1000_probabilityTrue: Fitting started


2009-08-17 00:00:00 2017-12-29 00:00:00
2018-01-02 00:00:00 2018-12-31 00:00:00
2019-01-02 00:00:00 2019-08-16 00:00:00


2020-10-19 22:45:42,383;INFO;Model SVC_C0.5_gammaauto_kernellinear_max_iter1000_probabilityTrue: Fitting ended
2020-10-19 22:45:42,667;INFO;Model SVC_C0.5_gammaauto_kernellinear_max_iter1000_probabilityTrue: Metrics Calculated
2020-10-19 22:45:42,667;INFO;Model SVC_C0.5_gammaauto_kernellinear_max_iter1000_probabilityTrue: Finished 2/38
2020-10-19 22:45:42,668;INFO;Model SVC_C1_gammascale_kernelrbf_max_iter1000_probabilityTrue: Starting
2020-10-19 22:45:42,668;INFO;Model SVC_C1_gammascale_kernelrbf_max_iter1000_probabilityTrue: Fitting started
2020-10-19 22:45:43,864;INFO;Model SVC_C1_gammascale_kernelrbf_max_iter1000_probabilityTrue: Fitting ended
2020-10-19 22:45:44,295;INFO;Model SVC_C1_gammascale_kernelrbf_max_iter1000_probabilityTrue: Metrics Calculated
2020-10-19 22:45:44,295;INFO;Model SVC_C1_gammascale_kernelrbf_max_iter1000_probabilityTrue: Finished 3/38
2020-10-19 22:45:44,296;INFO;Model MLPClassifier_activationlogistic_alpha0.0001_hidden_layer_sizes(50, 50)_learning_rateconst