# Dependencies
Installing **Anaconda** is recommended.

| Name             | Version | Build (NumPy & Python)   | Channel     |
| ---------------- |---------|--------------------------|-------------|
| cassandra-driver | 3.11.0  |      py35_1              | conda-forge |
| pandas           | 0.19.1  | np111py35_0              |             | 
| scikit-learn     | 0.18.1  | np111py35_0              |             |
| scipy            | 0.18.1  | np111py35_0              |             |
| matplotlib       | 2.0.0   | np111py35_0              |             |

In [None]:
'''
commands.
'''

# Load IPython's autoreload extension and select mode 2, which re-imports
# modules before each cell is executed so edits to project code are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
'''
Parameters. 
Parameters that are defined in this cell can be injected and overwritten by the machine learning platform.
'''

# MLP defined parameters
# Left as None for a local run. Later cells test `training_runner is None` to
# decide whether to connect to Cassandra and version/write the model locally,
# or to dump the model to `model_destination` instead.
# project_id, training_id and metrics_feedback_url are only printed in the
# cells visible in this notebook.
training_runner = None
project_id = None
training_id = None
metrics_feedback_url = None
model_destination = None

# user defined parameters

# label keys
# Name of the target column in the training data; the class-balance cell
# indexes its value counts with 0.0 and 1.0.
label = 'success'
# model file directory
# Used below as the prefix of every input CSV path.
work_dir = '/var/spark/ml_files/'
# Passed to get_latest_version() when the model is written locally.
model_type = 'ML-ECO'
# start_date / end_date are not read by any active code in this notebook; they
# appear only in disabled model-description lines.
start_date = '2018-01-01'
end_date = '2019-05-31'
# NOTE(review): the disabled line below mixes %s placeholders with
# str.format(), so it would leave the %s text unsubstituted if re-enabled.
# desc = '%s_%s_for_calendar_retry_attempt'.format(start_date, end_date)

# data
# Training set plus two lookup tables; each lookup is loaded only when its
# path is truthy.
training_data = work_dir + 'eco_2019_06.csv'
date_increment_bin_data =  work_dir + 'date_increment_bin_2019_Q1.csv'
added_years_bin_data = work_dir + 'added_years_bin_2019_Q1.csv'

# features
# Declares the model's input fields as a mapping of field name to a type tag
# ('string', 'integer' or 'number'). In the cells visible here it is only
# printed; the disabled upload cell would store it with the model metadata.
# NOTE(review): presumably this is the request schema used at serving time -
# confirm against the platform.
input_features = {
    'billing_country': {
        'type': 'string'
    },
    'bin': {
        'type': 'string'
    },
    'card_brand': {
        'type': 'string'
    },
    'card_category': {
        'type': 'string'
    },
    'card_class': {
        'type': 'string'
    },
    'card_usage': {
        'type': 'string'
    },
    'cc_expiration_date': {
        'type': 'string'
    },
    'day_of_month': {
        'type': 'integer'
    },
    'funding_source': {
        'type': 'string'
    },
    'issuer_country': {
        'type': 'string'
    },
    'payment_amount_usd': {
        'type': 'number'
    },
    'payment_currency': {
        'type': 'string'
    },
    'payment_method_id': {
        'type': 'string'
    },
    'transaction_date_in_string': {
        'type': 'string'
    },
    'bank_name': {
        'type': 'string'
    }
}
# Feature groups handed to the training helper through features_dict (keys
# FEATURES_CAT, FEATURES_FLOAT, FEATURES_NUM, FEATURES_NUM_CALCULATED,
# FEATURES_NUM_ENCODED and FEATURES_ENCODED). How each group is treated is
# decided by the project's preprocessing code, which is not visible here.

# Stored as FEATURES_CAT; also the leading part of both `features` and
# `fields` built later.
# NOTE(review): 'date_increment', 'merchant_number' and 'payment_service_id'
# are listed here but are not declared in input_features - confirm intended.
features_cat = [
    'card_brand', 
    'funding_source', 
    'card_category', 
    'card_class', 
    'card_usage', 
    'issuer_country', 
    'day_of_month', 
    'payment_method_id', 
    'bin',  
    'payment_currency', 
    'date_increment', 
    'bank_name', 
    'merchant_number', 
    'payment_service_id'
]
# Stored as FEATURES_FLOAT. Both entries also appear in features_cat.
# NOTE(review): presumably these columns arrive as floats and need
# normalising before categorical handling - confirm in PreProcessing.
features_float = [
    'bin', 
    'date_increment'
]
# Currently empty; included in `fields` and stored as FEATURES_NUM.
features_num = []
# Currently empty; appended to features_encoded in the aggregation cell.
features_num_calculated = [] 
# Stored as FEATURES_NUM_ENCODED.
# NOTE(review): names suggest values derived during preprocessing (e.g. from
# the date-increment / added-years lookup tables) - confirm.
features_num_encoded = [
    'expired_years_diff', 
    'years_over', 
    'date_inc_bin', 
    'add_expiry_years_bin'
]
# Extended with features_num_calculated and then concatenated after
# features_cat to form `features`.
features_encoded = [
    'week_of_month', 
    'day_of_week', 
    'month', 
    'cc_month', 
    'is_expired'
]

# hyperparameters
# tuned_parameters is passed positionally to build_and_train(); None here.
# NOTE(review): presumably a search grid, with None meaning "skip the search
# and use best_parameters" - confirm in train_util.
tuned_parameters = None
# Fixed settings passed to build_and_train() as `best_param` together with the
# XGBClassifier class.
best_parameters = {
    'objective': 'binary:logistic',
    'learning_rate': 0.13,  # also known as `eta`
    'max_depth': 10,
    'min_child_weight': 6,
    'silent': 0,
    'subsample': 0.8,
    'colsample_bytree': 0.7,
    'n_estimators': 1000,
    'missing':-999,
    'seed': 1337,
    # Placeholder: the training cell overwrites this with
    # fail_size / success_size before the model is built.
    'scale_pos_weight': 1,
    # The training cell also sets features_dict['eval_metric'] to 'map'.
    'eval_metric': 'map',
    'gamma': 2  
}

In [None]:
'''
print out all parameters.
'''

# Echo every parameter so the effective configuration (including any values
# injected by the platform) is recorded in the run output.
# Each line is printed as "<name> = <value>". Generating the lines from one
# table keeps every label identical to its variable name and the separator
# uniform (previously one label was misspelled, one lacked " =" and two had a
# doubled space).
for _param_name, _param_value in [
    ('training_runner', training_runner),
    ('project_id', project_id),
    ('training_id', training_id),
    ('metrics_feedback_url', metrics_feedback_url),
    ('model_destination', model_destination),
    ('label', label),
    ('training_data', training_data),
    ('date_increment_bin_data', date_increment_bin_data),
    ('added_years_bin_data', added_years_bin_data),
    ('input_features', input_features),
    ('features_cat', features_cat),
    ('features_float', features_float),
    ('features_num', features_num),
    ('features_num_calculated', features_num_calculated),
    ('features_num_encoded', features_num_encoded),
    ('features_encoded', features_encoded),
    ('tuned_parameters', tuned_parameters),
    ('best_parameters', best_parameters),
]:
    print(_param_name + ' =', _param_value)

In [None]:
'''
imports.
'''

import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import time
import numpy as np

if training_runner is None:
    # No training runner was injected, so this branch runs: the Cassandra
    # driver is imported here (not at the top of the cell) and a Cluster is
    # built for a single hard-coded contact point.
    # NOTE(review): `cluster` is not referenced again in the visible cells -
    # presumably project helpers rely on it or open their own connection;
    # confirm, and consider turning the endpoint into a parameter.
    from cassandra.cluster import Cluster
    cassandra_endpoint = '10.62.1.118'
    cluster = Cluster([cassandra_endpoint])
    
# import for training
import numpy as np
from sklearn import cross_validation
from sklearn import metrics
from sklearn.model_selection import cross_val_score

from sklearn import linear_model
from sklearn import tree
from sklearn import cross_validation
from sklearn import ensemble
from sklearn import linear_model
from sklearn import svm
from sklearn.dummy import DummyClassifier
# from sklearn.model_selection import GridSearchCV
from spark_sklearn import GridSearchCV
from sklearn.preprocessing import label_binarize
from sklearn.neighbors import KNeighborsClassifier

# from src.web.utils import PreProcessing
from src.web.preprocessing import PreProcessing
from src.web.preprocessing import make_pipeline
from sklearn.preprocessing import Imputer
from src.web.encoder import EnhancedLeaveOneOutEncoder
from src.web.train_util import *
from xgboost import XGBClassifier
from src.web.train_util import *
from sklearn.externals import joblib

In [None]:
'''
configurations.
'''

# Widen pandas' display limits so long cell values (up to 300 characters) and
# wide frames (up to 100 columns) are shown without truncation.
pd.set_option('display.max_colwidth', 300)
pd.set_option('display.max_columns', 100)


In [None]:
'''
training data manipulation.
'''

# Load the training set from the CSV path configured in `training_data`.
eco = pd.read_csv(training_data)
# Bare expression: as the last statement of the cell it makes the notebook
# display the (rows, columns) shape of the loaded frame.
eco.shape

In [None]:
'''
date increment bin data manipulation.
'''

# Lookup of success_rate keyed by (bin, date_increment); stays None when no
# lookup file is configured.
date_increment_bin_dict = None

if date_increment_bin_data:
    # Read the table and discard the index column that was saved into the CSV.
    date_increment_bin = pd.read_csv(date_increment_bin_data).drop(columns=['Unnamed: 0'])
    # Turn both key columns into strings and remove the literal '.0' text that
    # float formatting leaves behind (e.g. '123456.0' -> '123456').
    date_increment_bin = date_increment_bin.assign(
        bin=date_increment_bin['bin'].apply(str).str.replace('.0', '', regex=False),
        date_increment=date_increment_bin['date_increment'].apply(str).str.replace('.0', '', regex=False)
    )
    # Two-level index -> dict whose keys are (bin, date_increment) tuples.
    date_increment_bin_dict = (
        date_increment_bin
        .set_index(['bin', 'date_increment'])['success_rate']
        .to_dict()
    )

In [None]:
'''
added years bin data manipulation.
'''

# Lookup of success_rate keyed by (bin, added_expiry_years); stays None when
# no lookup file is configured.
added_years_bin_dict = None
if added_years_bin_data:
    # Read the table and discard the index column that was saved into the CSV.
    added_years_bin = pd.read_csv(added_years_bin_data).drop(columns=['Unnamed: 0'])
    # Turn both key columns into strings and remove the literal '.0' text that
    # float formatting leaves behind.
    added_years_bin = added_years_bin.assign(
        bin=added_years_bin['bin'].apply(str).str.replace('.0', '', regex=False),
        added_expiry_years=added_years_bin['added_expiry_years'].apply(str).str.replace('.0', '', regex=False)
    )
    # Two-level index -> dict whose keys are (bin, added_expiry_years) tuples.
    added_years_bin_dict = (
        added_years_bin
        .set_index(['bin', 'added_expiry_years'])['success_rate']
        .to_dict()
    )

In [None]:
# Keep only rows that have both a date_increment and an added_expiry_years
# value.
eco = eco.dropna(subset=['date_increment', 'added_expiry_years'])
# eco.loc[eco.added_expiry_years == 'STALE', 'added_expiry_years'] = 0
# Expiration dates are handled as strings from here on.
eco['cc_expiration_date'] = eco['cc_expiration_date'].apply(str)
# Display the shape after filtering.
eco.shape

In [None]:
# Row count after filtering.
original_size = eco.shape[0]
# Relative frequency of each label value. Despite their names, fail_size and
# success_size are fractions of the data set (normalize=True), not row counts.
label_share = eco[label].value_counts(normalize=True)
fail_size = label_share[0.0]
success_size = label_share[1.0]

In [None]:
'''
feature manipulation and aggregation.
'''

# Pipeline parameter name under which the feature configuration is addressed.
features_dict_key = 'preprocessing__features_dict'

# Calculated numeric features are treated as part of the encoded group; the
# model's feature list is the categorical group followed by that combined group.
features_encoded = features_encoded + features_num_calculated
features = features_cat + features_encoded

# Columns taken from the raw data: categorical and numeric features plus the
# raw date / country columns.
# fields = features_cat + features_num + ['transaction_date_in_string', 'cc_expiration_date', 'failed_attempt_date', 'failed_cc_expiration_date'] + features_num_encoded + features_num_calculated
# fields = features_cat + features_num + ['transaction_date_in_string', 'cc_expiration_date', 'billing_country'] + features_num_encoded + features_num_calculated
fields = features_cat + features_num + [
    'transaction_date_in_string',
    'cc_expiration_date',
    'billing_country',
]

# df_decline_type = pd.read_csv(work_dir + 'Decline_Type.csv')

# Everything the training helper receives about features, assembled in one
# place: label, column groups and the two success-rate lookup tables.
features_dict = {
    'LABEL': label,
    'FIELDS': fields,
    'FEATURES_CAT': features_cat,
    'FEATURES_NUM': features_num,
    'FEATURES_ENCODED': features_encoded,
    'FEATURES_NUM_ENCODED': features_num_encoded,
    'FEATURES_NUM_CALCULATED': features_num_calculated,
    'FEATURES_FLOAT': features_float,
    'df_bin_profile': None,
    # 'df_decline_type': df_decline_type,
    # 'df_eco_bin': eco_bin_profile,
    'date_increment_bin_dict': date_increment_bin_dict,
    'added_years_bin_dict': added_years_bin_dict,
}

In [None]:
'''
train the model with xgboost classifier.
'''

# Weight the positive class by the fail/success ratio; this replaces the
# placeholder value stored in best_parameters.
scale_pos_weight = fail_size / success_size
best_parameters.update(scale_pos_weight=scale_pos_weight)

# The evaluation metric is also passed through the feature configuration.
features_dict.update(eval_metric='map')

# build_and_train receives the classifier class itself (not an instance); no
# model file path is supplied at this stage.
classifier = XGBClassifier
model_file = ''
xgb_clf, result_d = build_and_train(
    eco, classifier, tuned_parameters, 'xgbclassifier', model_file,
    best_param=best_parameters, features_dict=features_dict
)
print('result_dict: ', result_d)

In [None]:
'''
output the model
'''

if training_runner is not None:
    # Platform run: serialise the model straight to the injected destination.
    # NOTE(review): joblib.dump returns a list of written file names rather
    # than a single path, so model_file differs in type from the local branch
    # - confirm callers cope with that.
    model_file = joblib.dump(xgb_clf, model_destination)
else:
    # Local run: name the model '<model_id>.<next version>' and let the
    # project helper write it.
    from src.web.model_info_repository import get_latest_version
    model_id = 'ML-ECO-2'
    version = get_latest_version(model_id, model_type) + 1
    model_name = '{}.{}'.format(model_id, version)
    model_file, model_file_name = write_model(xgb_clf, model_name)

print('model_file generated: ', model_file)

In [None]:
# """Upload model to Nexus repo and insert the model info into Cassandra table"""
# import json

# # start_date = '2018-01-01'
# start_date = '2019-01-01'

# end_date = '2019-05-31'

# try:
#     repo_path = upload_artifact(model_file_name)
#     preprocess_repo_path = handle_preprocessing_file(model_id, version)
#     size_desc = str(", original size: %s (fail: %s, success: %s), balanced_size: %s" % (original_size, fail_size, success_size, original_size))
#     desc = '{}_{}_for_eco model with date_inc_bin and added_years_bin. {}'.format(start_date, end_date, size_desc)
#     hyper_params = result_d.pop('hyper_params', None)
#     extended_att = {"preprocess_repo_path": preprocess_repo_path, "input_features": INPUT_FEATURES}
#     insert_model_info(model_id, version, repo_path, desc=desc, model_type=MODEL_TYPE,eval_metrics=json.dumps(result_d), 
#                       hyper_parameter=json.dumps(hyper_params), extended_att=json.dumps(extended_att), features_dict=features_dict)
    
# except:
#     if not hyper_params:
#         result_d['hyper_params'] = hyper_params 