In [1]:
%load_ext autoreload
%autoreload 2
import matplotlib.pyplot as plt
import matplotlib
import time
import numpy as np
%matplotlib inline

import pandas as pd
pd.options.display.max_colwidth = 300
pd.options.display.max_columns = 100

from src.web.train_util import read_from



In [2]:
'''
Parameters. 
Parameters that are defined in this cell can be injected and overwritten by the machine learning platform.
'''

# MLP defined parameters 
training_runner = None
project_id = None
training_id = None
metrics_feedback_url = None
model_destination = None

# user defined parameters

# label keys
label = 'success'
# model file directory
work_dir = '/var/spark/ml_files/'
model_type = 'ML-BR'
start_date = '2020-01-01'
end_date = '2020-01-31'
# desc = '%s_%s_for_calendar_retry_attempt'.format(start_date, end_date)

# data
training_data = ''
# bin_profile_data =  work_dir + 'bin_profile_2019_01_to_2019_05.csv'
# payment_mid_bin_data = work_dir + 'payment_mid_bin_2019_01_to_05.csv'
decline_type_data = work_dir + 'Decline_Type.csv'

# features
input_features = {
            "billing_country": {
                "type": "string"
            },
            "bin": {
                "type": "string"
            },
            "card_brand": {
                "type": "string"
            },
            "card_category": {
                "type": "string"
            },
            "card_class": {
                "type": "string"
            },
            "card_usage": {
                "type": "string"
            },
            "cc_expiration_date": {
                "type": "string"
            },
            "day_of_month": {
                "type": "integer"
            },
            "failed_attempt_date": {
                "type": "string"
            },
            "failed_response_code": {
                "type": "string"
            },
            "failed_response_message": {
                "type": "string"
            },
            "funding_source": {
                "type": "string"
            },
            "issuer_country": {
                "type": "string"
            },
            "merchant_number": {
                "type": "string"
            },
            "payment_amount_usd": {
                "type": "number"
            },
            "payment_currency": {
                "type": "string"
            },
            "payment_method_id": {
                "type": "string"
            },
            "payment_service_id": {
                "type": "string"
            },
            "renew_att_num": {
                "type": "integer"
            },
            "site_id": {
                "type": "string"
            },
            "transaction_date_in_string": {
                "type": "string"
            },
            "renewal_window": {
                "type": "integer"
            },
            "duration": {
                "type": "integer"
            },
            "segment_num": {
                "type": "integer"
            },
            "sub_age": {
                "type": "integer"
            }, 
            "bank_name": {
                "type": "string"
            },
            "first_calendar_attempt_date": {
                "type": "string"
            }
        }

features_cat = [ 
    'failed_response_code', 
    'failed_decline_type',  
    'day_of_month', 
    'funding_source', 
    'payment_currency', 
    'days_between', 
    'billing_country', 
    'renewal_window',  
    'renew_att_num', 
    'card_brand']

features_float = [ 
    'bin',  
    'failed_response_code', 
    'date_increment', 
    'renewal_window'
]

features_num = [ 
    'duration', 
    'sub_age'
]
features_num_calculated = []
features_num_encoded = []   
features_num_bin_profile = []

features_cat_encoded = [
    'month', 
    'days_between', 
    'renew_att_num', 
    'day_of_week', 
    'num_of_days', 
    'payment_service_id', 
    'merchant_number', 
    'month', 
    'is_expired', 
    'segment_num_group', 
    'sub_duration_group',  
    'sub_age_group', 
    'card_brand'
]


features_grouped = [
    ['payment_service_id', 'merchant_number', 'billing_country'],
    ['payment_service_id', 'merchant_number', 'payment_currency'],
    ['payment_service_id', 'billing_country','payment_currency'],
    ['bin', 'is_expired'],
    ['bank_name', 'is_expired'],
    ['payment_service_id', 'is_expired'],
    ['days_between', 'failed_decline_type'],
    ['days_between', 'renewal_window'],
    ['days_between', 'funding_source'],
    ['card_brand', 'is_expired'],
    ['sub_age_group', 'sub_duration_group'],
    ['sub_duration_group', 'is_expired'],
    ['segment_num_group', 'is_expired'],
    ['day_of_week', 'billing_country'],
    ['issuer_country', 'is_expired'] 
]

features_encoded = features_cat_encoded + features_num_encoded

additional_fields = [
    'card_brand',  
    'segment_num', 
    'segment_num_group',
    'bank_name', 
    'duration', 
    'is_expired', 
    'renewal_window', 
    'payment_currency', 
    'funding_source', 
    'card_category', 
    'card_class', 
    'card_usage', 
    'renew_att_num', 
    'site_id', 
    'bin', 
    'merchant_number', 
    'billing_country', 
    'funding_source', 
    "payment_service_id", 
    'day_of_month', 
    'failed_decline_type',  
    'failed_day_of_month', 
    'failed_response_code', 
    'payment_amount_usd', 
    'issuer_country',  
    'failed_response_message',
    'days_between',  
    'transaction_date_in_string', 
    'cc_expiration_date', 
    'failed_attempt_date'
]

feature_candidates = ['card_brand', 'funding_source', 'card_category', 'card_class', 'card_usage', 'issuer_country', 
                 'day_of_month', 'site_id', 'failed_decline_type', 'merchant_number', 
                'payment_service_id', 'payment_method_id', 'bin', 'renew_att_num', 'failed_day_of_month', 
                'payment_currency', 'days_between', 'failed_response_code', 'payment_amount_usd', 'date_increment', 
                'transaction_hour', 'failed_response_messages_from_previous_cal', 'failed_response_codes_from_previous_cal', 
                'failed_decline_type_from_previous_cal', 'failed_response_messages_from_first_cal', 'failed_decline_type_from_first_cal', 'days_between_from_first_cal' ]

usecols = feature_candidates +  ['new_status','subscription_id', 'subsegment_id', 'success', 'cid' ,'bank_name','added_expiry_years', 'failed_response_message','date_increment', 'received_date', 'billing_country', 'transaction_date_in_string', 'cc_expiration_date', 'failed_attempt_date']

'''
data parameters
'''
excluded_processors = ['mes', 'paypalExpress']
subs_creation_date_files = [
     'subs_subscription_creation_date_2017_01_2017_12.csv', 
     'subs_subscription_creation_date_2018_01_2018_05.csv',
     'subs_subscription_creation_date_2018_06_2018_12.csv',
     'subs_subscription_creation_date_2019_01_2019_12.csv', 
     'subs_subscription_creation_date_2020_01_2020_03.csv',
     'subs_subscription_creation_date_2020_04.csv'
]

subs_files = [
    'subs_subscription_2018_12_to_2020_01.csv', 
    'subs_li_item_2020_02_to_2020_02_20.csv',  
    'subs_li_item_2020_03_2020_05.csv'
]

sub_seg_expire_files = [
    'sub_seg_expire_2019_all.csv', 
    'sub_seg_expire_2020_01_2020_02.csv', 
    'sub_seg_expire_2020_03_to_2020_04.csv', 
    'sub_seg_expire_2020_05_2020_08.csv'
]

#'dca_2019_06.csv', 'dca_2019_07.csv', 'dca_2019_08.csv', 'dca_2019_09.csv','dca_2019_10.csv'
# TO DO : dca_2019_11
training_files = ['dca_2019_12.csv', 
                  'dca_2020_01.csv', 'dca_2020_02.csv']
eval_files = [ 'dca_2020_03.csv']
test_files = [ 'dca_2020_04.csv']
bin_profile_per_date_month_path = 'bin_profile_per_date_month_2018_2020_03.csv'
bank_profile_per_date_month_path = 'bank_profile_per_date_month_2018_2020_03.csv'
bin_profile_per_day_of_month_path = 'bin_profile_per_date_month_2018_2020_03.csv'
bank_profile_per_day_of_month_path = 'bank_profile_per_day_of_month_2018_2020_03.csv'


'''
training hyperparameters
'''
scale_pos_weight = None

tuned_parameters = {}

best_parameters = {
              'depth': 5,
              'iterations': 1201,
              'random_seed': 7,
              'scale_pos_weight': scale_pos_weight,
              'subsample': 0.5,
              'bagging_temperature': 3.5,
              'rsm': 0.35,
              'eval_metric': 'BrierScore',
              'early_stopping_rounds': 500,
              'model_size_reg': 2.5,
              'l2_leaf_reg': 20.9,
              'random_strength': 5.0
              }



In [3]:

from src.web.utils import to_date
from src.web.utils import days_between
from src.web.utils import is_expired


def days_between_period(df):
    d1 = to_date(df['next_renewal_date'])
    d2 = to_date(df['grace_period_date'])
    return abs((d2 - d1).days)

def process_dca_data(dca_df, df_sub, df_sub_seg, df_creation_date):
    df = dca_df.copy()
    
    df_amex = df[(df.payment_service_id == 'mes') & (df.card_brand.isin(['American Express', 'Discover']))] 

    df = df
    df = df[~(df['new_status'] == 'Reversed')]
    
    df = df[~(df.payment_service_id.isin(excluded_processors))]
    df = df[~df['payment_amount_usd'].isna()]
    df = pd.merge(df, df_sub[['subsegment_id', 'renewal_window', 'grace_period_date', 'next_renewal_date']], left_on='subsegment_id', right_on='subsegment_id', how='left')

    df = pd.merge(df, df_sub_seg[['subsegment_id', 'duration', 'segment_num']], left_on='subsegment_id', right_on='subsegment_id', how='left')

    
    df['is_expired'] = df.apply(is_expired, axis=1)
    df.loc[~df['date_increment'].isna(), 'is_expired'] = True

    df = df[~(df.duration.isna())]
    df = df[~(df['bin'] == 'nan')]
    df = df[~(df['cc_expiration_date'] == 'nan')]
    
    df = pd.concat([df, df_amex])
    df = pd.merge(df, df_creation_date, left_on='subscription_id', right_on='subscription_id', how='left')
    df.subs_activation_date.fillna('2017-01-01 00:00:00', inplace=True)
    df['sub_age'] = df.apply(lambda x: days_between(x.transaction_date_in_string, x.subs_activation_date), axis=1)

    df = df.rename(columns={"next_renewal_date": "first_calendar_attempt_date"})
    
    df.failed_decline_type = df.failed_decline_type_from_previous_cal 
    return df

In [4]:

subs_creation_date =  pd.concat((read_from(file, s3_dir='ml_files') for file in subs_creation_date_files) , ignore_index=True)
subs_creation_date = subs_creation_date.rename(columns={"SUBSCRIPTION_ID": "subscription_id", "CREATION_DATE": "subs_activation_date"})
subs_creation_date.shape

(46751635, 2)

In [5]:

df_subs =  pd.concat((read_from(file, s3_dir='training_files') for file in subs_files) , ignore_index=True)

df_subs.head()

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


Unnamed: 0.1,Unnamed: 0,grace_period_date,line_item_type,next_renewal_date,renewal_window,segment_number,subsegment_id
0,,2018-12-26,RENEWED,2018-12-15,11,8.0,17904709400
1,,2019-01-19,RENEWED,2018-12-20,30,2.0,17958051000
2,,2019-01-03,RENEWED,2018-12-04,30,7.0,17795605900
3,,2018-12-19,RENEWED,2018-12-04,15,,14777236400
4,,2019-02-20,RENEWED,2019-01-27,24,,15218507800


In [6]:
df_sub_seg_expire =  pd.concat((read_from(file) for file in sub_seg_expire_files) , ignore_index=True)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


In [7]:

df_train = pd.concat((read_from(file, usecols=usecols, s3_dir='ml_files') for file in training_files) , ignore_index=True)
df_train = process_dca_data(df_train, df_subs, df_sub_seg_expire, subs_creation_date)

# df_train_2 = read_from('retry_success_2020_01_to_2020_02.csv', usecols=usecols, s3_dir='ml_files')
# df_train_2 = df_train_2[(df_train_2.received_date >= '2020-02-01')]
# df_train_2 = process_dca_data(df_train_2, df_subs, df_sub_seg_expire, subs_creation_date)
# df_train = pd.concat([df_train, df_train_2])

df_train.shape #(6543250, 42)

  copy=copy, sort=sort)


(2126462, 48)

In [8]:
df_val = pd.concat((read_from(file, usecols=usecols, s3_dir='ml_files') for file in eval_files) , ignore_index=True)

# df_val = read_from(eval_file, usecols=usecols, s3_dir='ml_files')
# df_val = df_val[(df_val.received_date >= '2020-03-01')]
df_val = process_dca_data(df_val, df_subs, df_sub_seg_expire, subs_creation_date)
df_val.shape

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




(671071, 48)

In [9]:

df_test = pd.concat((read_from(file, usecols=usecols, s3_dir='ml_files') for file in test_files) , ignore_index=True)

# df_test = read_from(test_file, usecols=usecols, s3_dir='ml_files')

# df_test = df_test[(df_test.received_date >= '2020-04-01')]
df_test = process_dca_data(df_test, df_subs, df_sub_seg_expire, subs_creation_date)
df_test.shape

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




(610289, 48)

In [10]:
#import for training
import numpy as np
from sklearn import cross_validation
from sklearn import metrics
from sklearn.model_selection import cross_val_score

from sklearn import cross_validation
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import GridSearchCV
# from spark_sklearn import GridSearchCV
from sklearn.preprocessing import label_binarize


# from src.web.utils import PreProcessing
from src.web.preprocessing import PreProcessing
from src.web.encoder import EnhancedLeaveOneOutEncoder


In [11]:
bin_profile_per_date_month = read_from(bin_profile_per_date_month_path)

bin_profile_per_date_month['bin'] = bin_profile_per_date_month['bin'].apply(str).str.replace('.0', '', regex=False)
max_per_date_month_dict = bin_profile_per_date_month.set_index(['bin', 'month', 'day_of_month'])['Max_99'].T.to_dict()

bin_profile_per_day_of_month = read_from(bin_profile_per_day_of_month_path)

bin_profile_per_day_of_month['bin'] = bin_profile_per_day_of_month['bin'].apply(str).str.replace('.0', '', regex=False)
max_per_day_of_month_dict = bin_profile_per_day_of_month.set_index(['bin', 'day_of_month'])['Max_99'].T.to_dict()

In [12]:
bank_profile_per_date_month = read_from(bank_profile_per_date_month_path)
max_per_bank_card_date_month_dict = bank_profile_per_date_month.set_index(['bank_name', 'card_category', 'month', 'day_of_month'])['Max_99'].T.to_dict()

bank_profile_per_day_of_month = read_from(bank_profile_per_day_of_month_path)
max_per_bank_card_day_of_month_dict = bank_profile_per_day_of_month.set_index(['bank_name', 'card_category', 'day_of_month'])['Max_99'].T.to_dict()


In [13]:
success_per_date_month_dict = bin_profile_per_date_month.set_index(['bin', 'month', 'day_of_month'])['count'].T.to_dict()
success_per_day_of_month_dict = bin_profile_per_day_of_month.set_index(['bin', 'day_of_month'])['count'].T.to_dict()


In [14]:
df_train = df_train.sample(frac=1).reset_index(drop=True)
df_train.shape

(2126462, 48)

In [15]:
original_size = len(df_train)
balanced_size = len(df_train)
fail_size = df_train[label].value_counts(normalize=True)[0.0]
success_size =  df_train[label].value_counts(normalize=True)[1.0]

In [16]:
segment_num_group = [0, 2, 3, 4, 5, 6, 7, 8, 15, 20, 25, 30, 40, 50, 70, 100, 150]

df_train['segment_num_group'] = pd.cut(df_train['segment_num'], segment_num_group).astype(str).str.replace('.0', '', regex=False)
df_val['segment_num_group'] = pd.cut(df_val['segment_num'], segment_num_group).astype(str).str.replace('.0', '', regex=False)
df_test['segment_num_group'] = pd.cut(df_test['segment_num'], segment_num_group).astype(str).str.replace('.0', '', regex=False)


In [17]:
df_train.loc[(df_train['duration'] == 28) | (df_train['duration'] == 29) | (df_train['duration'] == 31) , 'duration'] = 30
df_train.loc[(df_train['duration'] == 366) , 'duration'] = 365
df_train.loc[(df_train['duration'] == 731) , 'duration'] = 730

df_val.loc[(df_val['duration'] == 28) | (df_val['duration'] == 29) | (df_val['duration'] == 31) , 'duration'] = 30
df_val.loc[(df_val['duration'] == 366) , 'duration'] = 365
df_val.loc[(df_val['duration'] == 731) , 'duration'] = 730

df_test.loc[(df_test['duration'] == 28) | (df_test['duration'] == 29) | (df_test['duration'] == 31) , 'duration'] = 30
df_test.loc[df_test['duration'] == 366 , 'duration'] = 365
df_test.loc[(df_test['duration'] == 731) , 'duration'] = 730

In [18]:
duration_group = [0, 3, 6, 9, 13, 17, 20, 25, 27, 33, 39, 43, 62, 70, 80, 88, 94, 100, 118, 125, 130, 146, 155, 176, 184, 200, 213, 230, 263, 300, 363, 368, 373,729, 733, 1000, 2000]

df_train['sub_duration_group'] = pd.cut(df_train['duration'], duration_group).astype(str).str.replace('.0', '', regex=False) 
df_val['sub_duration_group'] = pd.cut(df_val['duration'], duration_group).astype(str).str.replace('.0', '', regex=False) 
df_test['sub_duration_group'] = pd.cut(df_test['duration'], duration_group).astype(str).str.replace('.0', '', regex=False) 

df_train['sub_age_group'] = pd.cut(df_train['sub_age'], duration_group).astype(str).str.replace('.0', '', regex=False) 
df_val['sub_age_group'] = pd.cut(df_val['sub_age'], duration_group).astype(str).str.replace('.0', '', regex=False) 
df_test['sub_age_group'] = pd.cut(df_test['sub_age'], duration_group).astype(str).str.replace('.0', '', regex=False)

In [19]:
from sklearn.model_selection import GridSearchCV
# from spark_sklearn import GridSearchCV

from src.web.preprocessing import PreProcessing
from src.web.preprocessing import make_pipeline
from sklearn.preprocessing import Imputer


# additional_fields = ['card_brand',  'segment_num', 'segment_num_group' ,'bank_name', 'duration', 'is_expired', 'renewal_window', 'payment_currency', 'funding_source', 'card_category', 'card_class', 'card_usage', 'renew_att_num', 'site_id', 'bin', 'merchant_number', 'billing_country', 'funding_source', "payment_service_id", 'day_of_month', 'failed_decline_type',  'failed_day_of_month', 'failed_response_code', 'payment_amount_usd', 'issuer_country',  'failed_response_message','days_between',  'transaction_date_in_string', 'cc_expiration_date', 'failed_attempt_date']
additional_fields = [x for x in additional_fields if x not in (features_cat+features_num)]
fields = features_cat + features_num + additional_fields



features_dict = {'LABEL': label, 'FIELDS': fields ,'FEATURES_CAT': features_cat, 'FEATURES_NUM':features_num, 'FEATURES_ENCODED':features_encoded, 'FEATURES_NUM_ENCODED':features_num_encoded, 'FEATURES_NUM_CALCULATED':features_num_calculated, 'FEATURES_FLOAT': features_float}
features_dict_key = 'preprocessing__features_dict'
features_dict['df_bin_profile'] = None 
features_dict['df_decline_type'] = None

features_dict['FEATURES_NUM_BIN_PROFILE'] = features_num_bin_profile
features_dict['FEATURES_GROUPED'] = features_grouped
features_dict['ADDITIONAL_FIELDS'] = additional_fields


features_dict['group_dict'] = {"max_per_date_month_dict": max_per_date_month_dict, "max_per_day_of_month_dict": max_per_day_of_month_dict, \
                               "max_per_bank_card_date_month_dict": max_per_bank_card_date_month_dict, "max_per_bank_card_day_of_month_dict": max_per_bank_card_day_of_month_dict, \
                              "success_per_date_month_dict": success_per_date_month_dict, "success_per_day_of_month_dict": success_per_day_of_month_dict}



In [27]:
'''Prepares training parameters'''

from catboost import CatBoostClassifier, Pool
import src.web.preprocessing
from src.web.preprocessing import PreProcessing
from src.web.train_util import *
from importlib import import_module
import sys

classifier = CatBoostClassifier

cat_features_len = len(features_cat) +  len (features_grouped)
input_data = df_train

features_dict['use_cat_encoder'] = False
_preProcessor = PreProcessing().fit(input_data, input_data['success'], features_dict=features_dict)

_df_val = df_val 
_df_test = df_test
scale_pos_weight = (_df_val[label].value_counts(normalize=True)[0.0] / _df_val[label].value_counts(normalize=True)[1.0] )
best_parameters['scale_pos_weight'] = scale_pos_weight

_x_eval = _preProcessor.transform(_df_val)
_y_eval = _df_val["success"]


alg_name = 'catboostclassifier'


model_file = ''


cat_features = list(range(0,cat_features_len))

fit_params = {
    f"{alg_name}__verbose": True,
    f"{alg_name}__cat_features": cat_features,
    f"{alg_name}__plot": True,
    f"{alg_name}__eval_set": Pool(_x_eval, _y_eval, cat_features)
}


features_dict['fit_params'] = fit_params


self.features_encoded: ['month', 'days_between', 'renew_att_num', 'day_of_week', 'num_of_days', 'payment_service_id', 'merchant_number', 'month', 'is_expired', 'segment_num_group', 'sub_duration_group', 'sub_age_group', 'card_brand']
# Finish handle_feat_encoded.
self.features_all:  None
In fit, self.features_cat: ['failed_response_code', 'failed_decline_type', 'day_of_month', 'funding_source', 'payment_currency', 'days_between', 'billing_country', 'renewal_window', 'renew_att_num', 'card_brand']
['failed_response_code', 'failed_decline_type', 'day_of_month', 'funding_source', 'payment_currency', 'days_between', 'billing_country', 'renewal_window', 'renew_att_num', 'card_brand']
# not using cat encoder
# Finish handle_feat_encoded.
# transform_time: 1.5084774494171143
  failed_response_code failed_decline_type day_of_month funding_source  \
0                  806                base            5         credit   
1                  806                base           12         credit   

In [28]:
'''
print out all parameters.
'''

print('training_runner = ', training_runner)
print('project_id =', project_id)
print('training_id =', training_id)
print('metrics_feedback_url =', metrics_feedback_url)
print('model_destination =', model_destination)
print('label =', label)
print('training_data =', training_data)

print('training_files =', training_files)
print('eval_files =', eval_files)
print('test_files =', test_files)
print('sub_seg_expire_files =', sub_seg_expire_files)
print('subs_files =', subs_files)
print('subs_creation_date_files =', subs_creation_date_files)
print('excluded_processors =', excluded_processors)

'''
print out manipulated and aggregated features.
'''
print('\n============== training parameters & features ================ ')
print('input_features =', input_features)
print('additional_fields =', additional_fields)
print('tuned_parameters =', tuned_parameters)
print('best_parameters =', best_parameters)
print('features_cat =', features_cat)
print('features_float =', features_float)
print('features_num =', features_num)
print('features_grouped =', features_grouped)

print('feature_num_encoded =', features_encoded)
print('features_encoded =', features_encoded)
print('features_num_calculated =', features_num_calculated)


training_runner =  None
project_id = None
training_id = None
metrics_feedback_url = None
model_destination = None
label = success
training_data = 
training_files = ['dca_2019_12.csv', 'dca_2020_01.csv', 'dca_2020_02.csv']
eval_files = ['dca_2020_03.csv']
test_files = ['dca_2020_04.csv']
sub_seg_expire_files = ['sub_seg_expire_2019_all.csv', 'sub_seg_expire_2020_01_2020_02.csv', 'sub_seg_expire_2020_03_to_2020_04.csv', 'sub_seg_expire_2020_05_2020_08.csv']
subs_files = ['subs_subscription_2018_12_to_2020_01.csv', 'subs_li_item_2020_02_to_2020_02_20.csv', 'subs_li_item_2020_03_2020_05.csv']
subs_creation_date_files = ['subs_subscription_creation_date_2017_01_2017_12.csv', 'subs_subscription_creation_date_2018_01_2018_05.csv', 'subs_subscription_creation_date_2018_06_2018_12.csv', 'subs_subscription_creation_date_2019_01_2019_12.csv', 'subs_subscription_creation_date_2020_01_2020_03.csv', 'subs_subscription_creation_date_2020_04.csv']
excluded_processors = ['mes', 'paypalExpress']

input_

In [None]:
""" Train the model"""



clf, result_d = build_and_train(
    input_data, 
    classifier, 
    tuned_parameters, 
    alg_name, 
    model_file, 
    best_param=best_parameters, 
    features_dict=features_dict, 
    test_data=_df_test,
    metrics_feedback_url=metrics_feedback_url)
                                   
print("result_dict: ", result_d)


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



# Using assigned test_data
--------------------
(2126462, 30)
(2126462,)
(610289, 30)
(610289,)
Best Retry preprocessing pipeline ... 
using fit_params ....... 
In EnhancedPipeline fit_predict ...
self.features_encoded: ['month', 'days_between', 'renew_att_num', 'day_of_week', 'num_of_days', 'payment_service_id', 'merchant_number', 'month', 'is_expired', 'segment_num_group', 'sub_duration_group', 'sub_age_group', 'card_brand']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  df[feat] = df[feat].fillna('').astype(str).str.replace('.0', '', regex=False)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  df[DAY_OF_WEEK] = df[TXN_DATE_IN_STR].apply(to_weekday)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  df[IS_WEEKEND] = df[DAY_OF_WEEK].apply(is_weekend)
A value is trying to be set on a copy of a slice from a DataFrame.
Try 

# Finish handle_feat_encoded.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  df[BIN] = pd.to_numeric(df[BIN], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  df[BIN] = df[BIN].astype(str).str.replace('.0', '', regex=False)


self.features_all:  None
In fit, self.features_cat: ['failed_response_code', 'failed_decline_type', 'day_of_month', 'funding_source', 'payment_currency', 'days_between', 'billing_country', 'renewal_window', 'renew_att_num', 'card_brand']
['failed_response_code', 'failed_decline_type', 'day_of_month', 'funding_source', 'payment_currency', 'days_between', 'billing_country', 'renewal_window', 'renew_att_num', 'card_brand']
# not using cat encoder


In [26]:
'''
output the model
'''
start_date = '2019-12-01'
end_date = '2020-02-01'
if training_runner is None:
    model_id = 'ML-BR-1'
    version = get_latest_version(model_id, model_type) + 1
    model_name = model_id + '.' + str(version)
    model_file, model_file_name = write_model(clf, model_name)
    
    preprocess_repo_path = handle_preprocessing_file(model_id, version)
    size_desc = str(", original size: %s (fail: %s, success: %s), balanced_size: %s" % (original_size, fail_size, success_size, original_size))
    desc = 'Using more specific failed_decline_type. Remove site_id.  Remove days_between-card_brand and funding_source. Add days_between-failed_decline_code. Include data with null first_cal_attempt. Add bank_name-is_expired, handle card_brand to be lower when grouping. Remove days_between_first_cal. With days_between-card_brand, reduce model_size_reg. Handle sub_duration_group in preprocessing. Minus segment_num. Using updated bank_profile, bin_profile to end of March. Use duration, sub_age, segment_num as numeric. Update sub_duration_group, days_between-card_brand,  duration(handle 28, 29,31 366 and 731), card_brand, renew_att_num,  sub_age.  Add days_between-failed_decline_type.  bank_card_max_per_date.  Add issuer_country-is_expired and bin-is_expired .is_expired to be True for all non na date increment. More individual features. Add more specific failed_decline_type. 2020_03 as val data and 2020_04 as test data  {}_{}_for_calendar retry model,  eval_metric= BrierScore, with no date_increment, no payment amount and bin profile). {}'.format(start_date, end_date, size_desc)

    hyper_params = result_d.pop('hyper_params', None)
    extended_att = {"preprocess_repo_path": preprocess_repo_path, "input_features": input_features}
    repo_path = upload_artifact(model_file_name)
    insert_model_info(model_id, version, repo_path, desc=desc, model_type=model_type,eval_metrics=json.dumps(result_d), 
                      hyper_parameter=json.dumps(hyper_params), extended_att=json.dumps(extended_att), features_dict=features_dict, algorithm='CatBoostClassifier')
    
else:
    model_file = joblib.dump(clf, model_destination)

print('model_file generated: ', model_file)

Model ML-BR-1 version 396 is inserted into model repo
model_file generated:  ['/var/spark/ml_files/models/ML-BR-1.396.pkl']
