# Dependencies
Installing **Anaconda** is recommended.

| Name             | Version | Numpy & Python Version   |             |
| ---------------- |---------|--------------------------|-------------|
| cassandra-driver | 3.11.0  |      py35_1              | conda-forge |
| pandas           | 0.19.1  | np111py35_0              |             | 
| scikit-learn     | 0.18.1  | np111py35_0              |             |
| scipy            | 0.18.1  | np111py35_0              |             |
| matplotlib       | 2.0.0   | np111py35_0              |             |

In [None]:
'''
commands.
'''

# IPython magics (not plain Python):
# - autoreload mode 2 re-imports edited modules before each cell runs, so
#   changes to the project's own modules are picked up without a kernel restart.
# - matplotlib figures are rendered inline in the notebook output.
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
'''
Parameters. 
Parameters that are defined in this cell can be injected and overwritten by the machine learning platform.
'''

# MLP defined parameters 
# The values below are the defaults used when nothing is injected.
# training_runner: when it stays None, later cells create a Cassandra cluster
# object and version/write the model locally; otherwise the model is dumped
# to `model_file` with joblib.
training_runner = None
project_id = None
training_id = None
metrics_feedback_url = None
# model_file: passed to build_and_train and used as the joblib dump target.
model_file = None
# output_dir: passed to build_and_train as its output_dir argument.
output_dir = 'out'
# NOTE(review): the three *.metrics names are only printed in the visible
# source; presumably the platform or train_util reads them - confirm.
training_metrics_file = 'training.metrics'
cross_validation_metrics_file = 'cross_validation.metrics'
testing_metrics_file = 'testing.metrics'

# user defined parameters

# label keys
# Name of the target column in the training data (its 0.0 / 1.0 shares are
# read later to compute the class balance).
label = 'success'
# model file directory
# Must end with '/': the data paths below are built by plain concatenation.
work_dir = '/var/spark/ml_files/'
# model_type is passed to get_latest_version when the model is written locally.
model_type = 'ML-BR'
# start_date / end_date are only referenced by commented-out description code
# in the visible source.
start_date = '2018-01-01'
end_date = '2019-05-31'
# NOTE(review): if the line below is re-enabled, str.format does not fill
# '%s' placeholders; use '{}_{}_for_calendar_retry_attempt' instead.
# desc = '%s_%s_for_calendar_retry_attempt'.format(start_date, end_date)

# data
# CSV inputs read with pd.read_csv by the data-loading cells.
training_data = work_dir + 'calendar_retry_2019_04.csv'
bin_profile_data =  work_dir + 'bin_profile_2019_01_to_2019_05.csv'
payment_mid_bin_data = work_dir + 'payment_mid_bin_2019_01_to_05.csv'
decline_type_data = work_dir + 'Decline_Type.csv'

# features
# Input schema: each raw field name maps to {'type': <declared type>}.
# Written as (name, type) pairs and expanded below; every field gets its own
# dict object, exactly as a hand-written literal would.
input_features = {
    field_name: {'type': declared_type}
    for field_name, declared_type in (
        ('billing_country', 'string'),
        ('bin', 'string'),
        ('card_brand', 'string'),
        ('card_category', 'string'),
        ('card_class', 'string'),
        ('card_usage', 'string'),
        ('cc_expiration_date', 'string'),
        ('day_of_month', 'integer'),
        ('failed_attempt_date', 'string'),
        ('failed_response_code', 'string'),
        ('failed_response_message', 'string'),
        ('funding_source', 'string'),
        ('issuer_country', 'string'),
        ('merchant_number', 'string'),
        ('payment_amount_usd', 'number'),
        ('payment_currency', 'string'),
        ('payment_method_id', 'string'),
        ('payment_service_id', 'string'),
        ('renew_att_num', 'integer'),
        ('site_id', 'string'),
        ('transaction_date_in_string', 'string'),
    )
}

# Columns treated as categorical features (order is significant: it is the
# leading part of the `features` and `fields` lists built later).
features_cat = [
    'card_brand', 'funding_source', 'card_category', 'card_class',
    'card_usage', 'issuer_country', 'day_of_month', 'site_id',
    'failed_decline_type', 'merchant_number', 'payment_service_id',
    'payment_method_id', 'bin', 'renew_att_num', 'failed_day_of_month',
    'payment_currency', 'days_between', 'failed_response_code',
]

# Columns listed under FEATURES_FLOAT in the features dictionary.
features_float = ['bin', 'renew_att_num', 'failed_response_code']

# Plain numeric columns.
features_num = ['payment_amount_usd']

# Encoded columns; the bin-profile statistics are appended to this list later.
features_encoded = ['week_of_month', 'day_of_week', 'is_expired']

# Statistic columns taken from the bin profile data.
features_num_bin_profile = ['Mean', 'Median', 'StdDev', 'Max_99', 'Max']

# hyperparameters
# Search grid: one list of candidate values per XGBoost setting. Every list
# currently holds a single candidate.
tuned_parameters = dict(
    objective=['binary:logistic'],
    learning_rate=[0.2],  # so called `eta` value
    max_depth=[10],
    min_child_weight=[11],
    silent=[0],
    subsample=[0.5],
    colsample_bytree=[0.7],
    # number of trees; a wider grid such as [500, 1000] was tried previously
    n_estimators=[1000],
    missing=[-999],
    max_delta_step=[1],
    seed=[1337],
)

# Fixed parameter set passed to build_and_train as `best_param`.
best_parameters = dict(
    objective='binary:logistic',
    learning_rate=0.15,  # so called `eta` value
    max_depth=8,
    min_child_weight=8,
    silent=0,
    subsample=0.7,
    colsample_bytree=0.7,
    n_estimators=1000,  # number of trees
    missing=-999,
    seed=1337,
    # placeholder: the training cell overwrites this with fail/success ratio
    scale_pos_weight=1,
    gamma=1,
)

In [None]:
'''
print out all parameters.
'''

# Echo every parameter from the parameters cell so a run's log records the
# exact configuration, including values injected by the platform.
# All labels use the same 'name =' form.

# platform-defined parameters
print('training_runner =', training_runner)
print('project_id =', project_id)
print('training_id =', training_id)
print('metrics_feedback_url =', metrics_feedback_url)
print('model_file =', model_file)
print('output_dir =', output_dir)
print('training_metrics_file =', training_metrics_file)
print('cross_validation_metrics_file =', cross_validation_metrics_file)
print('testing_metrics_file =', testing_metrics_file)

# user-defined parameters
print('label =', label)
print('work_dir =', work_dir)
print('model_type =', model_type)
print('start_date =', start_date)
print('end_date =', end_date)

# data files
print('training_data =', training_data)
print('bin_profile_data =', bin_profile_data)
print('payment_mid_bin_data =', payment_mid_bin_data)
print('decline_type_data =', decline_type_data)

# features
print('input_features =', input_features)
print('features_cat =', features_cat)
print('features_float =', features_float)
print('features_num =', features_num)
print('features_num_bin_profile =', features_num_bin_profile)
print('features_encoded =', features_encoded)

# hyperparameters
print('tuned_parameters =', tuned_parameters)
print('best_parameters =', best_parameters)

In [None]:
'''
imports.
'''

import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import time
import numpy as np

# Only when no platform training runner is configured: import the Cassandra
# driver and build a Cluster object for the endpoint below.
# NOTE(review): the endpoint IP is hard-coded; consider exposing it as a
# parameter if this has to run against another environment.
if training_runner is None:
    # from cassandra
    from cassandra.cluster import Cluster
    cassandra_endpoint = '10.62.1.118'
    cluster = Cluster([cassandra_endpoint])
    
#import for training
from sklearn import cross_validation
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn import linear_model
from sklearn import tree
from sklearn import cross_validation
from sklearn import ensemble
from sklearn import linear_model
from sklearn import svm
from sklearn.dummy import DummyClassifier
# from sklearn.model_selection import GridSearchCV
from spark_sklearn import GridSearchCV
from sklearn.preprocessing import label_binarize
from sklearn.neighbors import KNeighborsClassifier
# from src.web.utils import PreProcessing
from src.web.preprocessing import PreProcessing
from src.web.encoder import EnhancedLeaveOneOutEncoder
from src.web.train_util import *
from xgboost import XGBClassifier
from src.web.train_util import *
from sklearn.externals import joblib

In [None]:
'''
configurations.
'''
# Raise pandas' display limits: up to 300 characters per cell value and up to
# 100 columns per frame.
pd.set_option('display.max_colwidth', 300)
pd.set_option('display.max_columns', 100)

In [26]:
'''
training data manipulation.
'''

# Read the training CSV, then store `bin` as text: missing values become ''
# and the literal substring '.0' (left by float formatting) is removed.
retry_success = pd.read_csv(training_data)
_training_bins = retry_success['bin'].fillna('').astype(str)
retry_success['bin'] = _training_bins.str.replace('.0', '', regex=False)
retry_success.shape
# Last expression of the cell: the notebook displays the first rows.
retry_success.head()

In [27]:
'''
bin profile data manipulation.
'''

# The bin profile is optional: with no path configured it stays None.
bin_profile = pd.read_csv(bin_profile_data) if bin_profile_data else None
if bin_profile is not None:
    # Same text normalisation as the training data's `bin` column:
    # missing -> '', literal '.0' removed.
    _profile_bins = bin_profile['bin'].fillna('').astype(str)
    bin_profile['bin'] = _profile_bins.str.replace('.0', '', regex=False)
    bin_profile.shape

In [31]:
'''
payment mid bin data manipulation.
'''

# Optional lookup of success rates keyed by
# (bin, payment_service_id, merchant_number); stays None when no path is set.
payment_mid_bin_dict = None
if payment_mid_bin_data:
    payment_mid_bin = pd.read_csv(payment_mid_bin_data)
    # Normalise `bin` the same way as the training and bin-profile loaders:
    # fill missing values with '' first, so a missing bin does not become the
    # string 'nan' and fail to match the '' used elsewhere.
    payment_mid_bin['bin'] = (
        payment_mid_bin['bin']
        .fillna('')
        .astype(str)
        .str.replace('.0', '', regex=False)
    )
    # Series.to_dict on a MultiIndex yields tuple keys; transposing a Series
    # is a no-op, so it is not needed here.
    payment_mid_bin_dict = (
        payment_mid_bin
        .set_index(['bin', 'payment_service_id', 'merchant_number'])['success_rate']
        .to_dict()
    )
    

In [None]:
'''
decline type data manipulation.
'''

# Read the decline-type CSV as-is; the frame is later stored in features_dict
# under the 'df_decline_type' key.
df_decline_type = pd.read_csv(decline_type_data)

In [None]:
# Class balance of the training set: total row count plus the normalised
# share of label 0.0 and label 1.0. Despite their names, fail_size and
# success_size hold fractions (value_counts with normalize=True), not counts.
original_size = len(retry_success)
_label_share = retry_success[label].value_counts(normalize=True)
fail_size = _label_share[0.0]
success_size = _label_share[1.0]

In [35]:
'''
feature manipulation and aggregation.
'''

# The bin-profile statistics double as the numeric encoded features.
features_num_encoded = list(features_num_bin_profile)  # , 'payment_mid_bin'

# Extend the encoded features with the bin-profile columns. Only names not
# already present are appended, so re-running this cell in the notebook does
# not keep growing features_encoded with duplicate columns.
features_encoded = features_encoded + [
    name for name in features_num_encoded if name not in features_encoded
]
features_num_calculated = []

# Columns the model is trained on.
features = features_cat + features_encoded

# Raw columns required from the input data.
fields = (
    features_cat
    + features_num
    + ['transaction_date_in_string', 'cc_expiration_date', 'failed_attempt_date']
    + features_num_bin_profile
)

# Configuration bundle handed to build_and_train.
features_dict = {
    'LABEL': label,
    'FIELDS': fields,
    'FEATURES_CAT': features_cat,
    'FEATURES_NUM': features_num,
    'FEATURES_ENCODED': features_encoded,
    'FEATURES_NUM_ENCODED': features_num_encoded,
    'FEATURES_NUM_CALCULATED': features_num_calculated,
    'FEATURES_FLOAT': features_float,
    'df_bin_profile': bin_profile,
    'df_decline_type': df_decline_type,
    # 'payment_mid_bin_dict': payment_mid_bin_dict,
    'FEATURES_NUM_BIN_PROFILE': features_num_bin_profile,
}
features_dict_key = 'preprocessing__features_dict'

In [None]:
'''
print out manipulated and aggregated features.
'''

# Each label names the variable actually printed. The first line used to show
# features_encoded under a mistyped 'feature_num_encoded' label.
print('features_num_encoded', features_num_encoded)
print('features_encoded', features_encoded)
print('features_num_calculated', features_num_calculated)
print('features', features)
print('fields', fields)
print('features_dict', features_dict)
print('features_dict_key', features_dict_key)

In [36]:
'''
train the model with xgboost classifier
'''

classifier = XGBClassifier

# Weight the positive class by the fail/success ratio of the training labels
# and record it in the fixed parameter set.
scale_pos_weight = fail_size / success_size
best_parameters.update(scale_pos_weight=scale_pos_weight)

# The evaluation metric is stored in features_dict, which is handed to
# build_and_train together with the data and both parameter sets.
features_dict.update(eval_metric='map')
xgb_clf, result_d = build_and_train(
    retry_success, classifier, tuned_parameters, 'xgbclassifier', model_file,
    best_param=best_parameters, features_dict=features_dict,
    output_dir=output_dir
)
print('result_dict: ', result_d)

In [None]:
'''
output the model
'''

if training_runner is None:
    # Stand-alone run: derive the next version for this model id and let
    # write_model persist the classifier under '<model_id>.<version>'.
    from src.web.model_info_repository import get_latest_version
    model_id = 'ML-BR-1'
    version = get_latest_version(model_id, model_type) + 1
    model_name = model_id + '.' + str(version)
    model_file, model_file_name = write_model(xgb_clf, model_name)
else:
    # Platform run: dump to the path supplied in `model_file`.
    # joblib.dump returns the list of files it wrote, so its result is not
    # assigned back; `model_file` keeps holding the target path.
    joblib.dump(xgb_clf, model_file)

print('model_file generated: ', model_file)

In [None]:
# """Upload model to Nexus repo and insert the model info into Cassandra table"""
# import json

# start_date = '2018-01-01'

# end_date = '2019-05-31'

# try:
#     repo_path = upload_artifact(model_file_name)
#     preprocess_repo_path = handle_preprocessing_file(model_id, version)
#     size_desc = str(", original size: %s (fail: %s, success: %s), balanced_size: %s" % (original_size, fail_size, success_size, original_size))
#     desc = '{}_{}_for_calendar retry model. {}'.format(start_date, end_date, size_desc)
#     hyper_params = result_d.pop('hyper_params', None)
#     extended_att = {"preprocess_repo_path": preprocess_repo_path, "input_features": INPUT_FEATURES}
#     insert_model_info(model_id, version, repo_path, desc=desc, model_type=MODEL_TYPE,eval_metrics=json.dumps(result_d), 
#                       hyper_parameter=json.dumps(hyper_params), extended_att=json.dumps(extended_att), features_dict=features_dict)
    
# except:
#     if not hyper_params:
#         result_d['hyper_params'] = hyper_params 