# Dependencies
Installing **Anaconda** is recommended.

| Name             | Version | Numpy & Python Version   | Channel     |
| ---------------- |---------|--------------------------|-------------|
| cassandra-driver | 3.11.0  |      py35_1              | conda-forge |
| pandas           | 0.19.1  | np111py35_0              |             | 
| scikit-learn     | 0.18.1  | np111py35_0              |             |
| scipy            | 0.18.1  | np111py35_0              |             |
| matplotlib       | 2.0.0   | np111py35_0              |             |

In [None]:
'''
commands.
'''

# Load IPython's autoreload extension so edited project modules are picked up
# without restarting the kernel.
%load_ext autoreload
# Mode 2: reload all imported modules before each cell execution.
%autoreload 2
# Render matplotlib figures inside the notebook output.
%matplotlib inline

In [None]:
'''
Parameters. 
Parameters that are defined in this cell can be injected and overwritten by the machine learning platform.
'''

# MLP defined parameters
# The None values below are placeholders used for interactive runs; the
# platform is expected to overwrite them when it drives the notebook.

# Later cells branch on this: a truthy value enables the feature-importance
# dump, and None selects the local write_model() path for the trained model.
training_runner = None
# Not referenced by the visible notebook code.
project_id = None
training_id = None
metrics_feedback_url = None
# Passed to build_and_train() and, on platform runs, used as the joblib target.
model_file = None
# Passed to build_and_train() as output_dir.
output_dir = 'out'
# Metric file names; not referenced by the visible notebook code.
training_metrics_file = 'training.metrics'
cross_validation_metrics_file = 'cross_validation.metrics'
testing_metrics_file = 'testing.metrics'
# JSON file the feature-importance cell writes to.
feature_importance_file = "feature.importance"

# user defined parameters

# Name of the label column.
label = 'success'
# Directory holding the model and data files.
work_dir = '/var/spark/ml_files/'
model_type = 'ML-TOD'
# Date window; not referenced by the visible notebook code.
start_date = '2019-09-01'
end_date = '2019-12-31'

# data
training_data = f'{work_dir}tod_all_fields_2019_06.csv'
# Currently disabled inputs (kept for reference):
#   bin_profile_data     -> bin_profile_2019_01_to_2019_05.csv
#   payment_mid_bin_data -> payment_mid_bin_2019_01_to_05.csv
#   decline_type_data    -> Decline_Type.csv

# features
# Declared schema of the model inputs: field name -> {"type": declared type}.
input_features = {
    "billing_country": {"type": "string"},
    "bin": {"type": "string"},
    "bank_name": {"type": "string"},
    "card_brand": {"type": "string"},
    "card_category": {"type": "string"},
    "card_class": {"type": "string"},
    "card_usage": {"type": "string"},
    "day_of_month": {"type": "integer"},
    "funding_source": {"type": "string"},
    "issuer_country": {"type": "string"},
    "merchant_number": {"type": "string"},
    "payment_amount_usd": {"type": "number"},
    "payment_currency": {"type": "string"},
    "payment_method_id": {"type": "string"},
    "payment_service_id": {"type": "string"},
    "site_id": {"type": "string"},
    "transaction_date_in_string": {"type": "string"},
}

# Feature groups, keyed by how they are handed to the preprocessing step
# (see the features_dict assembled in a later cell).
features_cat = ['duration']
features_float = ['bin', 'renew_att_num']
features_num = ['segment_num']
features_num_encoded = []
features_num_calculated = []

features_cat_encoded = [
    'txn_hour_min_segment',
    'week_of_month',
    'day_of_week',
    'payment_service_id',
    'merchant_number',
]
features_encoded = [*features_cat_encoded, *features_num_encoded]

# Every grouped feature pairs the transaction time segment with one other field.
features_grouped = [
    ['txn_hour_min_segment', partner]
    for partner in ('bin', 'billing_country', 'site_id', 'week_of_month', 'payment_currency')
]

additional_fields = [
    'payment_amount_usd',
    'issuer_country',
    'billing_country',
    'day_of_month',
    'site_id',
    'merchant_number',
    'transaction_hour',
    'payment_service_id',
    'bin',
    'payment_currency',
    'bank_name',
    'transaction_date_in_string',
]

# Hyper-parameters.
scale_pos_weight = None
tuned_parameters = {}

# NOTE: the dict stores the value scale_pos_weight has right here (None);
# rebinding the variable later does not change this entry.
best_parameters = dict(
    depth=5,
    iterations=1201,
    random_seed=7,
    scale_pos_weight=scale_pos_weight,
    subsample=0.5,
    bagging_temperature=3.5,
    rsm=0.35,
    eval_metric='BrierScore',
    early_stopping_rounds=500,
    model_size_reg=2.5,
    l2_leaf_reg=20.9,
    random_strength=5.0,
)

# CSV files read by the loading cells, one list per data split.
training_data_paths = ['tod_all_fields_2019_11.csv']
eval_data_paths = ['tod_all_fields_2019_12.csv']
test_data_paths = ['tod_all_fields_2020_01.csv']
sub_seg_expire_files = [
    'sub_seg_expire_2019_all.csv',
    'sub_seg_expire_2020_01_2020_02.csv',
]

# Rows whose decline_type is one of these values are removed from every split.
exclude_decline_types = [
    'invalid_account',
    'invalid_cc',
    'invalid_txn',
    'correct_cc_retry',
    'expired_card',
    'format error',
    'no savings account',
    'revoc',
    'declined non generic',
    'do not try again/use alternate payment card',
]


In [None]:
from src.web.train_util import *

In [None]:
'''
configurations.
'''
# Widen pandas' notebook rendering so long cell values and wide frames are
# not truncated.
pd.set_option('display.max_colwidth', 300)
pd.set_option('display.max_columns', 100)

In [None]:
# Raw columns considered as model inputs or needed by the row filters.
feature_candidates = [
    'issuer_country', 'day_of_month', 'site_id', 'merchant_number', 'transaction_hour',
    'payment_service_id', 'bin', 'payment_currency', 'bank_name', "card_category",
    "date_increment", 'decline_type',
]

# Columns requested from every CSV: the candidates plus label, identifier and
# bookkeeping columns. Each column is listed exactly once ('cid' used to be
# repeated).
usecols = feature_candidates + [
    'renew_att_num', 'cid', 'payment_amount_usd', 'new_status', 'response_message',
    'subscription_id', 'subsegment_id', 'success', 'added_expiry_years',
    'received_date', 'billing_country', 'transaction_date_in_string', 'cc_expiration_date',
]

In [None]:
# Training set: concatenate every file in training_data_paths, then drop rows
# that must not be learned from. (To train on more months, extend
# training_data_paths in the parameters cell.)
df_train = pd.concat(
    [read_from(path, usecols=usecols) for path in training_data_paths],
    ignore_index=True,
)

train_mask = (
    (df_train['payment_service_id'] != 'mes')
    & (df_train['payment_service_id'] != 'paypalExpress')
    & ~df_train['payment_amount_usd'].isna()
    & (df_train['new_status'] != 'Reversed')
)
df_train = df_train[train_mask]

# Normalise bin to a string: missing -> '', and every literal '.0' is removed
# (regex=False, so this is a plain substring replacement).
bin_as_text = df_train['bin'].fillna('').astype(str)
df_train['bin'] = bin_as_text.str.replace('.0', '', regex=False)

print(training_data_paths)
df_train.shape

In [None]:
# Evaluation set: concatenate every file in eval_data_paths.
df_eval = pd.concat(
    [read_from(path, usecols=usecols) for path in eval_data_paths],
    ignore_index=True,
)

# Normalise bin to a string: missing -> '', and every literal '.0' is removed.
eval_bin_text = df_eval['bin'].fillna('').astype(str)
df_eval['bin'] = eval_bin_text.str.replace('.0', '', regex=False)

# Keep only rows with renew_att_num == 1, and apply the same service / amount /
# status exclusions as the training set.
eval_mask = (
    (df_eval['renew_att_num'] == 1)
    & (df_eval['payment_service_id'] != 'mes')
    & (df_eval['payment_service_id'] != 'paypalExpress')
    & ~df_eval['payment_amount_usd'].isna()
    & (df_eval['new_status'] != 'Reversed')
)
df_eval = df_eval[eval_mask]
df_eval.shape

In [None]:
# Test set: concatenate every file in test_data_paths.
df_test = pd.concat(
    [read_from(path, usecols=usecols) for path in test_data_paths],
    ignore_index=True,
)

# Normalise bin to a string: missing -> '', and every literal '.0' is removed.
test_bin_text = df_test['bin'].fillna('').astype(str)
df_test['bin'] = test_bin_text.str.replace('.0', '', regex=False)

# Keep only rows with renew_att_num == 1, and apply the same service / amount /
# status exclusions as the training set.
test_mask = (
    (df_test['renew_att_num'] == 1)
    & (df_test['payment_service_id'] != 'mes')
    & (df_test['payment_service_id'] != 'paypalExpress')
    & ~df_test['payment_amount_usd'].isna()
    & (df_test['new_status'] != 'Reversed')
)
df_test = df_test[test_mask]

df_test.shape

In [None]:
# Sub-segment lookup built from all files in sub_seg_expire_files.
df_sub_seg_expire = pd.concat(
    [read_from(path) for path in sub_seg_expire_files],
    ignore_index=True,
)

# Attach duration and segment_num to every split via a left join on
# subsegment_id, so rows without a match are kept (with missing values).
# NOTE(review): if subsegment_id repeats across the lookup files, the join
# will duplicate rows — confirm the ids are unique.
segment_lookup = df_sub_seg_expire[['subsegment_id', 'duration', 'segment_num']]
df_train = pd.merge(df_train, segment_lookup, on='subsegment_id', how='left')
df_eval = pd.merge(df_eval, segment_lookup, on='subsegment_id', how='left')
df_test = pd.merge(df_test, segment_lookup, on='subsegment_id', how='left')


In [None]:
'''
data manipulation.
'''


def _drop_excluded_rows(frame):
    """Return only the rows of ``frame`` that pass every exclusion rule.

    A row is kept when date_increment is missing, decline_type is not listed
    in exclude_decline_types, and neither bin nor cc_expiration_date equals
    the literal string 'nan'.
    """
    # NOTE(review): the loading cells turn missing bins into '' (fillna('')),
    # and cc_expiration_date is never cast to str in this notebook, so the two
    # 'nan' comparisons may never match a missing value — confirm the intent.
    keep = (
        frame['date_increment'].isna()
        & ~frame['decline_type'].isin(exclude_decline_types)
        & ~(frame['bin'] == 'nan')
        & ~(frame['cc_expiration_date'] == 'nan')
    )
    return frame[keep]


df_train = _drop_excluded_rows(df_train)
df_eval = _drop_excluded_rows(df_eval)
df_test = _drop_excluded_rows(df_test)

In [None]:
# Row count of the training set; no re-balancing is applied, so both sizes
# are the same number.
original_size = balanced_size = len(df_train)

# Relative class frequencies (normalize=True), i.e. shares rather than counts.
label_shares = df_train[label].value_counts(normalize=True)
fail_size = label_shares[0.0]
success_size = label_shares[1.0]

In [None]:
'''
feature manipulation and aggregation.
'''
# The feature lists used here (features_cat, features_float, features_num,
# features_*_encoded, features_grouped, additional_fields) come from the
# parameters cell.
features = features_cat + features_encoded

# Categorical features are already part of `fields`; do not list them twice.
additional_fields = [name for name in additional_fields if name not in features_cat]
fields = features_cat + features_num + additional_fields

features_dict_key = 'preprocessing__features_dict'

# Configuration bundle handed to the preprocessing / training helpers.
features_dict = {
    'LABEL': label,
    'FIELDS': fields,
    'FEATURES_CAT': features_cat,
    'FEATURES_NUM': features_num,
    'FEATURES_ENCODED': features_encoded,
    'FEATURES_NUM_ENCODED': features_num_encoded,
    'FEATURES_NUM_CALCULATED': features_num_calculated,
    'FEATURES_FLOAT': features_float,
    'FEATURES_GROUPED': features_grouped,
    'ADDITIONAL_FIELDS': additional_fields,
}


In [None]:
'''Prepares training parameters'''

from catboost import CatBoostClassifier, Pool
from src.web.train_util import *

classifier = CatBoostClassifier
alg_name = 'catboostclassifier'
input_data = df_train

# Default the positive-class weight to 1 when none was supplied. (A class-ratio
# based weight was tried before and is intentionally not used here.)
if not scale_pos_weight:
    scale_pos_weight = 1
# best_parameters was built while scale_pos_weight was still unset, so it holds
# None; rebinding the variable above does not reach the dict. Copy the
# effective value in, unless an explicit weight is already stored there.
if best_parameters.get('scale_pos_weight') is None:
    best_parameters['scale_pos_weight'] = scale_pos_weight

# Fit the preprocessing step on the training data, then transform the
# evaluation set so it can serve as CatBoost's eval_set.
features_dict['use_cat_encoder'] = False
_preProcessor = PreProcessing().fit(input_data, input_data[label], features_dict=features_dict)
_x_eval = _preProcessor.transform(df_eval)
_y_eval = df_eval[label]

# Indices of the categorical columns: one per categorical feature plus one per
# grouped feature.
# NOTE(review): assumes the preprocessor emits those columns first — confirm.
cat_features_len = len(features_cat) + len(features_grouped)
cat_features = list(range(cat_features_len))

# Keys carry the "<alg_name>__" prefix expected by the training helper.
fit_params = {
    f"{alg_name}__verbose": True,
    f"{alg_name}__cat_features": cat_features,
    f"{alg_name}__plot": True,
    f"{alg_name}__eval_set": Pool(_x_eval, _y_eval, cat_features),
}

features_dict['fit_params'] = fit_params

In [None]:
""" Train the model"""

# Keyword options for the training helper, collected in one place.
train_options = {
    'best_param': best_parameters,
    'features_dict': features_dict,
    'test_data': df_test,
    'output_dir': output_dir,
}

# build_and_train returns the fitted estimator and a result dictionary.
clf, result_d = build_and_train(
    input_data, classifier, tuned_parameters, alg_name, model_file, **train_options
)
print("result_dict: ", result_d)

In [None]:
'''
save feature importance scores.
'''

if training_runner:
    # Imported here so the cell does not depend on a wildcard import for json.
    import json
    import random

    # NOTE(review): the scores are uniform random placeholders in [0, 1]; they
    # are not derived from the trained model (clf).
    feature_importance = {
        feature_name: random.uniform(0.0, 1.0) for feature_name in input_features
    }

    # ensure_ascii=False may emit non-ASCII characters, so fix the encoding
    # instead of relying on the platform default.
    with open(feature_importance_file, 'w', encoding='utf-8') as feature_importance_out:
        json.dump(feature_importance, feature_importance_out, ensure_ascii=False, indent=4)

In [None]:
'''
output the model
'''

if training_runner is None:
    # Local run: store the model under the next version of the test model id.
    # get_latest_version / write_model are expected to come from the
    # src.web.train_util wildcard import.
    model_id = 'ML-TOD-TEST-1'
    version = get_latest_version(model_id, model_type) + 1
    model_name = f'{model_id}.{version}'
    model_file, model_file_name = write_model(clf, model_name)
else:
    # Platform run: persist to the injected path. joblib.dump returns the list
    # of file names it wrote, so the result is not assigned back — model_file
    # stays the path string.
    joblib.dump(clf, model_file)

print('model_file generated: ', model_file)