In [None]:
# Reload imported project modules automatically before each cell execution,
# so edits to the src.* helpers are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inside the notebook output.
%matplotlib inline

import pandas as pd
# Widen the pandas display limits so long text columns and wide frames are
# shown in full when a DataFrame is displayed.
pd.options.display.max_colwidth = 300
pd.options.display.max_columns = 100

# NOTE(review): wildcard import. Later cells call read_from, PreProcessing,
# build_and_train, get_latest_version, write_model, handle_preprocessing_file,
# upload_artifact and insert_model_info, and also use `json` and `joblib`,
# without importing them anywhere else - presumably they all arrive through
# this line; confirm against src/web/train_util.py.
from src.web.train_util import *

In [None]:
'''
Variables.
Variables that are defined in this cell can be injected and overwritten by the machine learning platform.
'''

'''
MLP defined variables
'''
# --- Run identity -----------------------------------------------------------
# training_runner stays None for a manual run; the final cell uses that to
# decide between registering the model and just dumping it to model_file.
training_runner = None
project_id = None
training_id = None
metrics_feedback_url = None

# --- Artifacts --------------------------------------------------------------
# model_file is handed to build_and_train and is the joblib dump target when
# the notebook is driven by a training runner.
model_file = None
output_dir = 'out'

# --- Metric / report file names ---------------------------------------------
# Not referenced by the visible cells; presumably consumed by the platform or
# by the training helpers - confirm before renaming.
training_metrics_file = 'training.metrics'
cross_validation_metrics_file = 'cross_validation.metrics'
testing_metrics_file = 'testing.metrics'
feature_importance_file = 'feature.importance'


'''
feature variables
'''
# Schema of the raw inputs: field name -> {"type": <string|integer|number>}.
# The final cell stores this mapping in the model's extended attributes.
# Entry order is kept exactly as before because the dict is serialized to JSON.
input_features = {
    "billing_country": {"type": "string"},
    "bin": {"type": "string"},
    "card_brand": {"type": "string"},
    "card_category": {"type": "string"},
    "card_class": {"type": "string"},
    "card_usage": {"type": "string"},
    "cc_expiration_date": {"type": "string"},
    "day_of_month": {"type": "integer"},
    "failed_response_code": {"type": "string"},
    "failed_response_message": {"type": "string"},
    "funding_source": {"type": "string"},
    "issuer_country": {"type": "string"},
    "merchant_number": {"type": "string"},
    "payment_amount_usd": {"type": "number"},
    "payment_currency": {"type": "string"},
    "payment_method_id": {"type": "string"},
    "payment_service_id": {"type": "string"},
    "renew_att_num": {"type": "integer"},
    "site_id": {"type": "string"},
    "transaction_date_in_string": {"type": "string"},
    "failed_payment_service_id": {"type": "string"},
    "failed_merchant_number": {"type": "string"},
    "processor_att_num": {"type": "integer"},
    "transaction_hour": {"type": "integer"},
    "date_increment": {"type": "integer"},
}

# Categorical model features. Their count (plus the grouped features) decides
# how many leading columns are flagged as categorical for CatBoost.
# Currently disabled entries: "failed_merchant_number", "failed_payment_service_id".
features_cat = [
    "merchant_number",
    "payment_service_id",
    "payment_currency",
    "site_id",
    "failed_decline_type",
    "failed_response_code",
    "transaction_hour",
]

# Passed to the preprocessor as FEATURES_FLOAT.
features_float = ["bin", "issuer_country", "renew_att_num", "failed_response_code"]

# Numeric feature groups - all empty in this configuration.
features_num = []
features_num_calculated = []
features_num_encoded = []
features_num_bin_profile = []

# Categorical features that do not exist in the raw data under these names;
# they become part of FEATURES_ENCODED.
features_cat_encoded = ["week_of_month", "day_of_week", "is_expired"]

# Column combinations passed to the preprocessor as FEATURES_GROUPED; each
# inner list counts as one extra categorical feature downstream.
features_grouped = [
    ["payment_service_id", "merchant_number"],
    ["payment_service_id", "is_expired"],
    ["bin", "is_expired"],
    ["issuer_country", "is_expired"],
    ["day_of_month", "payment_service_id", "bin"],
]

# Extra columns the preprocessor needs besides the model features. Entries that
# already appear among the categorical/numeric features are filtered out later.
additional_fields = [
    "is_expired",
    "bin",
    "issuer_country",
    "day_of_month",
    "transaction_date_in_string",
    "cc_expiration_date",
    "billing_country",
]

# Feature columns to read from the input files; prepended to `usecols` once
# variable injection has happened.
feature_candidates = [
    "site_id",
    "card_category",
    "issuer_country",
    "merchant_number",
    "bank_name",
    "payment_service_id",
    "bin",
    "payment_currency",
    "day_of_month",
    "failed_response_code",
    "failed_decline_type",
    "transaction_hour",
    "date_increment",
    "payment_amount_usd",
]

# Non-feature columns that must also be read (label, ids, dates, ...).
# "bank_name" and "date_increment" are listed here as well as above.
usecols = [
    "subscription_id",
    "success",
    "cid",
    "bank_name",
    "added_expiry_years",
    "failed_response_message",
    "date_increment",
    "received_date",
    "billing_country",
    "transaction_date_in_string",
    "cc_expiration_date",
]


'''
data variables
'''
# Payment services whose rows should be left out of the training data.
excluded_processors = ["mes", "paypalExpress"]

# Subscription-related extracts. None of these three lists is referenced by
# the visible cells of this notebook.
subs_creation_date_files = [
    "subs_subscription_creation_date_2017_01_2017_12.csv",
    "subs_subscription_creation_date_2018_01_2018_05.csv",
    "subs_subscription_creation_date_2018_06_2018_12.csv",
    "subs_subscription_creation_date_2019_01_2019_12.csv",
    "subs_subscription_creation_date_2020_01_2020_03.csv",
    "subs_subscription_creation_date_2020_04.csv",
]
subs_files = [
    "subs_subscription_2018_12_to_2020_01.csv",
    "subs_li_item_2020_02_to_2020_02_20.csv",
    "subs_li_item_2020_03_2020_05.csv",
]
sub_seg_expire_files = [
    "sub_seg_expire_2019_all.csv",
    "sub_seg_expire_2020_01_2020_02.csv",
    "sub_seg_expire_2020_03_to_2020_04.csv",
    "sub_seg_expire_2020_05_2020_08.csv",
]

# Earlier training inputs, kept for reference:
#   'dca_2019_06.csv', 'dca_2019_07.csv', 'dca_2019_08.csv', 'dca_2019_09.csv', 'dca_2019_10.csv'
# TODO: dca_2019_11
training_files = [
    "processor_retry_2019_07_to_09.csv",
    "processor_retry_2019_10_to_12.csv",
]

# Evaluation and test data come from the same file; the loading cells split it
# by received_date.
eval_files = ["processor_retry_2020_01_to_02.csv"]
test_files = ["processor_retry_2020_01_to_02.csv"]

# Pre-aggregated BIN / bank profile extracts (not referenced by the visible cells).
bin_profile_per_date_month_path = "bin_profile_per_date_month_2018_2020_03.csv"
bank_profile_per_date_month_path = "bank_profile_per_date_month_2018_2020_03.csv"
bin_profile_per_day_of_month_path = "bin_profile_per_day_month_2018_2020_03.csv"
bank_profile_per_day_of_month_path = "bank_profile_per_day_of_month_2018_2020_03.csv"


'''
hyperparameters variables
'''
# Class weight for the positive label. None means "not supplied"; the value
# is copied into best_parameters after variable injection.
scale_pos_weight = None

# Search grid handed to build_and_train; empty here.
tuned_parameters = {}

# Fixed hyperparameters handed to build_and_train as `best_param`.
best_parameters = {
    # tree shape / training length
    "depth": 6,
    "iterations": 1501,
    "random_seed": 8,
    # placeholder - replaced by `scale_pos_weight` in later cells
    "scale_pos_weight": 0,
    # sampling / randomisation
    "subsample": 0.6,
    "bagging_temperature": 3.0,
    "rsm": 0.35,
    # evaluation and early stopping
    "eval_metric": "BrierScore",
    "early_stopping_rounds": 500,
    # regularisation
    "l2_leaf_reg": 37.7,
    "model_size_reg": 6.0,
    "random_strength": 3.0,
    "best_model_min_trees": 700,
}


'''
other variables
'''
# Name of the target column.
label = "success"
# Not referenced by the visible cells.
work_dir = "/var/spark/ml_files/"
# Model family used for the version lookup and the model registration.
model_type = "ML-BPR"
# Only interpolated into the free-text model description; the data loading
# cells use their own date literals.
start_date = "2020-01-01"
end_date = "2020-01-31"

In [None]:
'''
Variable manipulation after variable injection
'''

# Every encoded feature, categorical first, then numeric.
features_encoded = features_cat_encoded + features_num_encoded

# Columns to read = feature candidates followed by the bookkeeping columns.
# The two lists can overlap (by default both contain 'bank_name' and
# 'date_increment'), and re-running this cell used to prepend
# feature_candidates again each time. dict.fromkeys drops the repeats while
# keeping first-seen order, which also makes the cell safe to re-run.
usecols = list(dict.fromkeys(feature_candidates + usecols))

# Propagate the (possibly injected) class weight into the CatBoost parameters.
best_parameters['scale_pos_weight'] = scale_pos_weight

In [None]:

# Read every training file with the selected columns and stack them into one frame.
df_train = pd.concat(
    (read_from(file, usecols=usecols, s3_dir='ml_files') for file in training_files),
    ignore_index=True,
)

# Drop rows handled by excluded payment services. This now follows the
# configurable `excluded_processors` list (default: 'mes', 'paypalExpress')
# instead of repeating those names as literals, so an injected list is honoured.
# Rows with a missing payment_service_id are kept, as before.
df_train = df_train[~df_train['payment_service_id'].isin(excluded_processors)]

# Filters that are currently disabled, kept for reference:
#   df = df[~(df['new_status'] == 'Reversed')]
#   df = df[~df['payment_amount_usd'].isna()]
df_train.shape

In [None]:
# Evaluation set: rows of the eval files received in January 2020
# (2020-01-01 inclusive to 2020-02-01 exclusive, compared as strings).
_eval_parts = [read_from(path, usecols=usecols, s3_dir='ml_files') for path in eval_files]
df_eval = pd.concat(_eval_parts, ignore_index=True)
_in_eval_window = (df_eval['received_date'] >= '2020-01-01') & (df_eval['received_date'] < '2020-02-01')
df_eval = df_eval[_in_eval_window]
df_eval.shape

In [None]:
# Test set: rows of the test files received in February 2020
# (2020-02-01 inclusive to 2020-03-01 exclusive, compared as strings).
_test_parts = [read_from(path, usecols=usecols, s3_dir='ml_files') for path in test_files]
df_test = pd.concat(_test_parts, ignore_index=True)
_in_test_window = (df_test['received_date'] >= '2020-02-01') & (df_test['received_date'] < '2020-03-01')
df_test = df_test[_in_test_window]

df_test.shape


In [None]:
# Row count of the training frame. No re-balancing happens in this notebook,
# so the "balanced" size equals the original size.
original_size = balanced_size = len(df_train)

# Share of each label value in the training data. Despite the *_size names,
# these are fractions (normalize=True), not row counts.
_train_label_share = df_train[label].value_counts(normalize=True)
fail_size = _train_label_share[0.0]
success_size = _train_label_share[1.0]

In [None]:
from src.web.utils import is_expired

# Add the `is_expired` flag to all three splits, in the same way for each:
#   1. evaluate the project helper `is_expired` row by row;
#   2. force the flag to True wherever `date_increment` is present (not NaN).
for _frame in (df_train, df_eval, df_test):
    _frame['is_expired'] = _frame.apply(is_expired, axis=1)
    _frame.loc[~_frame['date_increment'].isna(), 'is_expired'] = True


In [None]:

# A longer, hard-coded additional_fields list used to live here; the list now
# comes from the variables cell. Keep only the additional fields that are not
# already categorical or numeric model features.
_model_feature_names = set(features_cat + features_num)
additional_fields = [name for name in additional_fields if name not in _model_feature_names]
fields = features_cat + features_num + additional_fields

features_dict_key = 'preprocessing__features_dict'

# Configuration bundle handed to the preprocessor and to build_and_train.
# Optional entries that are currently disabled: 'df_bin_profile',
# 'df_decline_type' (would be read from 'Decline_Type.csv') and 'group_dict'.
features_dict = {
    'LABEL': label,
    'FIELDS': fields,
    'FEATURES_CAT': features_cat,
    'FEATURES_NUM': features_num,
    'FEATURES_ENCODED': features_encoded,
    'FEATURES_NUM_ENCODED': features_num_encoded,
    'FEATURES_NUM_CALCULATED': features_num_calculated,
    'FEATURES_FLOAT': features_float,
    'FEATURES_GROUPED': features_grouped,
    'ADDITIONAL_FIELDS': additional_fields,
}

In [None]:
'''Prepares training parameters'''

from catboost import CatBoostClassifier, Pool
from src.web.train_util import *

# Estimator class (not an instance); build_and_train receives it as-is.
classifier = CatBoostClassifier

# Number of categorical columns: one per categorical feature plus one per
# grouped feature. The next cell flags the first `cat_features_len` columns of
# the transformed data as categorical.
cat_features_len = len(features_cat) + len(features_grouped)
input_data = df_train

features_dict['use_cat_encoder'] = False
# Fit the preprocessor on the training data. The target column is taken from
# the configurable `label` (default 'success') rather than a hard-coded name,
# so it stays in step with features_dict['LABEL'].
_preProcessor = PreProcessing().fit(input_data, input_data[label], features_dict=features_dict)

# Aliases used by the following cells.
_df_val = df_eval
_df_test = df_test


In [None]:
# Class weight for the positive label. A value supplied through variable
# injection is respected; only when none was supplied (None) is it derived as
# the fail/success ratio of the label.
# NOTE(review): the ratio is taken from the validation frame, not the training
# frame - confirm that this is intended.
if scale_pos_weight is None:
    _val_label_share = _df_val[label].value_counts(normalize=True)
    scale_pos_weight = _val_label_share[0.0] / _val_label_share[1.0]
best_parameters['scale_pos_weight'] = scale_pos_weight

# Validation features/target in the shape the model will see them. The target
# column follows the configurable `label` (default 'success').
_x_eval = _preProcessor.transform(_df_val)
_y_eval = _df_val[label]


alg_name = 'catboostclassifier'


# Positions 0 .. cat_features_len-1 are treated as categorical columns.
# presumably the preprocessor emits the categorical columns first - confirm.
cat_features = list(range(0, cat_features_len))

# Keyword arguments for the estimator's fit call. The "<alg_name>__" prefix is
# presumably how build_and_train routes them to the estimator step - confirm.
fit_params = {
    f"{alg_name}__verbose": True,
    f"{alg_name}__cat_features": cat_features,
    f"{alg_name}__plot": True,
    # Pool's third positional argument is cat_features.
    f"{alg_name}__eval_set": Pool(_x_eval, _y_eval, cat_features)
}


features_dict['fit_params'] = fit_params


In [None]:
""" Train the model"""

# Keyword options for the training helper, collected first so the call itself
# stays short.
_train_options = dict(
    best_param=best_parameters,
    features_dict=features_dict,
    test_data=_df_test,
    output_dir=output_dir,
)

# Returns the trained estimator and a result dict; the final cell pops
# 'hyper_params' out of that dict and stores the rest as evaluation metrics.
clf, result_d = build_and_train(
    input_data, classifier, tuned_parameters, alg_name, model_file,
    **_train_options
)

print("result_dict: ", result_d)

In [None]:
'''
output the model
'''

if training_runner is None:
    # Manual run: write the model, upload it and register a new version.
    model_id = 'ML-BPR-Test-1'
    version = get_latest_version(model_id, model_type) + 1
    model_name = model_id + '.' + str(version)
    model_file, model_file_name = write_model(clf, model_name)

    preprocess_repo_path = handle_preprocessing_file(model_id, version)
    # The last placeholder is labelled balanced_size, so report the dedicated
    # balanced_size variable (previously original_size was passed twice).
    size_desc = ", original size: %s (fail: %s, success: %s), balanced_size: %s" % (original_size, fail_size, success_size, balanced_size)
    # Free-text change log stored with the model; the three placeholders are
    # start_date, end_date and the size summary above.
    desc = 'Using more specific failed_decline_type. Assign scale_pos_weight. Remove site_id.  Remove days_between-card_brand and funding_source. Add days_between-failed_decline_code. Include data with null first_cal_attempt. Add bank_name-is_expired, handle card_brand to be lower when grouping. Remove days_between_first_cal. With days_between-card_brand, reduce model_size_reg. Handle sub_duration_group in preprocessing. Minus segment_num. Using updated bank_profile, bin_profile to end of March. Use duration, sub_age, segment_num as numeric. Update sub_duration_group, days_between-card_brand,  duration(handle 28, 29,31 366 and 731), card_brand, renew_att_num,  sub_age.  Add days_between-failed_decline_type.  bank_card_max_per_date.  Add issuer_country-is_expired and bin-is_expired .is_expired to be True for all non na date increment. More individual features. Add more specific failed_decline_type. 2020_03 as val data and 2020_04 as test data  {}_{}_for_calendar retry model,  eval_metric= BrierScore, with no date_increment, no payment amount and bin profile). {}'.format(start_date, end_date, size_desc)

    # Hyperparameters are stored separately; what remains in result_d is
    # serialized as the evaluation metrics.
    hyper_params = result_d.pop('hyper_params', None)
    extended_att = {"preprocess_repo_path": preprocess_repo_path, "input_features": input_features}
    repo_path = upload_artifact(model_file_name)
    insert_model_info(model_id, version, repo_path, desc=desc, model_type=model_type,eval_metrics=json.dumps(result_d), 
                      hyper_parameter=json.dumps(hyper_params), extended_att=json.dumps(extended_att), features_dict=features_dict, algorithm='CatBoostClassifier')

else:
    # Platform-driven run: dump the estimator to the injected model_file path.
    # NOTE(review): joblib.dump returns a list of written file names, so
    # model_file becomes a list here - confirm this is what the platform expects.
    model_file = joblib.dump(clf, model_file)

print('model_file generated: ', model_file)