# Dependencies
Installing **Anaconda** is recommended.

| Name             | Version | Numpy & Python Version   |             |
| ---------------- |---------|--------------------------|-------------|
| cassandra-driver | 3.11.0  |      py35_1              | conda-forge |
| pandas           | 0.19.1  | np111py35_0              |             | 
| scikit-learn     | 0.18.1  | np111py35_0              |             |
| scipy            | 0.18.1  | np111py35_0              |             |
| matplotlib       | 2.0.0   | np111py35_0              |             |

In [None]:
'''
commands.
'''

%load_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
'''
Parameters. 
Parameters that are defined in this cell can be injected and overwritten by the machine learning platform.
'''

# MLP defined parameters 
# Defaults for a local/manual run. Later cells test `training_runner is None`
# to decide whether this notebook versions and registers the model itself.
training_runner = None
project_id = None
training_id = None
# forwarded to build_and_train as `metrics_feedback_url`
metrics_feedback_url = None
# NOTE(review): reassigned to '' in the training-parameters cell before it is used
model_file = None
# NOTE(review): output_dir and the three *.metrics names are not referenced by
# any other cell visible in this notebook.
output_dir = 'out'
training_metrics_file = 'training.metrics'
cross_validation_metrics_file = 'cross_validation.metrics'
testing_metrics_file = 'testing.metrics'
# path the feature-importance JSON is written to
feature_importance_file = "feature.importance"

# user defined parameters

# Name of the target column; rows are split on label == 0 / label == 1 elsewhere in the notebook.
label = 'success'
# Model type and model id. The id is joined with a version number
# ("<model_id>.<version>") to build the model name when the model is registered.
model_type = 'ML-TOD'
model_id = 'ML-TOD-2'
# NOTE: start_date / end_date are only used in the model description, and both are
# reassigned in the "output the model" cell right before that description is built.
start_date = '2019-09-01'

end_date = '2019-12-31'

# features
# Declared input schema: feature name -> {"type": "string" | "integer" | "number"}.
# In this notebook it is printed, its keys are used as the key set of the
# feature-importance file, and it is stored in the model's extended attributes.
input_features = {
            "billing_country": {
                "type": "string"
            },
            "bin": {
                "type": "string"
            },
            "bank_name": {
                "type": "string"
            },
            "card_brand": {
                "type": "string"
            },
            "card_category": {
                "type": "string"
            },
            "card_class": {
                "type": "string"
            },
            "card_usage": {
                "type": "string"
            },
            "day_of_month": {
                "type": "integer"
            },
            "funding_source": {
                "type": "string"
            },
            "issuer_country": {
                "type": "string"
            },
            "merchant_number": {
                "type": "string"
            },
            "payment_amount_usd": {
                "type": "number"
            },
            "payment_currency": {
                "type": "string"
            },
            "payment_method_id": {
                "type": "string"
            },
            "payment_service_id": {
                "type": "string"
            },
            "site_id": {
                "type": "string"
            },
            "transaction_date_in_string": {
                "type": "string"
            },
            "duration": {
                "type": "integer"
            },
            "segment_num": {
                "type": "integer"
            },
            "sub_age": {
                "type": "integer"
            }
}

# Categorical feature columns. len(features_cat) + len(features_grouped) is the
# number of leading columns later flagged as categorical for CatBoost.
# The trailing comment on the last line lists candidates that are currently switched off.
features_cat = ['bin',  
                'segment_num_group', 
                'sub_duration_group',  
                'sub_age_group',
                'renew_att_num',
                'is_first_renewal',
                'payment_currency']  #, 'txn_hour_group', 'funding_source', 'card_brand', 'duration', 'segment_num', 'merchant_number' , 'bank_name', 'card_category',
# handed to the preprocessor as FEATURES_FLOAT
# NOTE(review): presumably columns that arrive as floats and need normalising — verify in PreProcessing
features_float = ['bin', 'renew_att_num']
# no plain numeric features are used at the moment; former candidates are listed below
features_num = [ 
    
]
# 'duration', 
#     'sub_age',
#     'segment_num'

features_num_encoded = []
features_num_calculated = []

# Categorical columns handed to the preprocessor as part of FEATURES_ENCODED.
# NOTE(review): 'txn_hour_min_segment', 'week_of_month' and 'day_of_week' are not
# computed in any cell of this notebook — presumably derived inside PreProcessing; verify.
features_cat_encoded = ['txn_hour_min_segment', 'week_of_month', 'day_of_week', 'txn_hour_group',
                        'segment_num_group', 
                        'sub_duration_group',  
                        'sub_age_group',
                        'is_first_renewal',
                        'payment_service_id', 'merchant_number'] 
features_encoded = features_cat_encoded + features_num_encoded

# features_grouped = [['txn_hour_min_segment', 'bin'], 
#                     ['txn_hour_min_segment', 'payment_service_id'],
#                     ['txn_hour_min_segment', 'bank_name'], 
#                     ['txn_hour_min_segment', 'day_of_week'],
#                     ['txn_hour_min_segment', 'day_of_month'],
#                     ['txn_hour_min_segment', 'week_of_month'],
#                     ['txn_hour_min_segment'],
#                     ['txn_hour_group', 'bin'],
#                     ['transaction_hour', 'bin'], 
#                     ['transaction_hour', 'payment_service_id'],
#                     ['transaction_hour', 'bank_name'], 
#                     ['transaction_hour', 'day_of_week'],
#                     ['transaction_hour', 'day_of_month'],
#                     ['transaction_hour', 'week_of_month']
#                    ]

# features_grouped = [
#                     ['bank_name', 'card_category'],
#                     ['txn_hour_group', 'payment_service_id'],
#                     ['payment_service_id', 'merchant_number'],
#                     ['txn_hour_min_segment', 'payment_service_id'],
#                     ['txn_hour_group', 'day_of_week'],
#                     ['txn_hour_group', 'week_of_month'],
#                     ['txn_hour_group', 'day_of_month'],
#                     ['card_brand', 'funding_source'],
#                     ['txn_hour_min_segment'],
#                     ['txn_hour_min_segment', 'bin']
#                    ]

# Column groups handed to the preprocessor as FEATURES_GROUPED. Each inner list
# counts as one extra categorical feature (len(features_grouped) is added to the
# categorical-column count in the training-parameters cell).
# NOTE(review): how a group is turned into a single feature is decided inside
# PreProcessing and is not visible here.
features_grouped = [
                    ['txn_hour_min_segment', 'bank_name', 'card_category'],
                    ['txn_hour_min_segment', 'payment_service_id'],
                    ['payment_service_id', 'merchant_number'],
                    ['txn_hour_min_segment', 'day_of_week'],
                    ['txn_hour_min_segment', 'week_of_month'],
                    ['txn_hour_min_segment', 'day_of_month'],
                    ['txn_hour_group', 'bank_name', 'card_category'],
                    ['txn_hour_group', 'payment_service_id'],
                    ['txn_hour_group', 'bin'],
                    ['txn_hour_group', 'day_of_month'],
                    ['card_brand', 'funding_source'],
                    ['txn_hour_group'],
                    ['transaction_hour'],
                    ['txn_hour_min_segment', 'bin']
                   ]

#                      ['txn_hour_min_segment', 'bank_name'],
#                     ['txn_hour_group', 'bank_name'],


# Extra raw columns passed to the preprocessor in addition to the model features.
# Entries that also appear in features_cat are removed from this list in the
# "feature manipulation and aggregation" cell.
additional_fields =  [ 'payment_amount_usd' ,'issuer_country', 'billing_country', 'day_of_month', 'site_id', 'merchant_number', 'transaction_hour',
                'payment_service_id', 'bin', 'payment_currency', 'bank_name', 'transaction_date_in_string',  'card_category', 'card_brand', 'funding_source']

# Raw transaction columns that form the first part of `usecols` when the monthly files are read.
feature_candidates = [ 'card_brand', 'issuer_country', 'day_of_month', 'site_id', 'merchant_number', 'transaction_hour', 'funding_source',
                'payment_service_id', 'bin', 'payment_currency', 'bank_name', 'card_category', 'date_increment', 'decline_type']


# Columns requested from the raw monthly transaction files (passed as `usecols=`
# to read_from). The original list named 'cid' twice; each column is now listed once.
usecols = feature_candidates + [
    'renew_att_num', 'cid', 'payment_amount_usd', 'new_status', 'response_message',
    'subscription_id', 'subsegment_id', 'success', 'received_date', 'billing_country',
    'transaction_date_in_string', 'cc_expiration_date',
]

# Positive-class weight for the classifier. A falsy value (0 / None) makes the
# training-parameters cell derive it from the class ratio of the training data
# (share of label 0 / share of label 1, plus 0.5).
scale_pos_weight = 1
# passed positionally to build_and_train as its tuned-parameters argument; empty here
tuned_parameters = {}

# Passed to build_and_train as `best_param`. 'scale_pos_weight' is overwritten
# with the final value in the training-parameters cell.
best_parameters = {
              'depth': 5,
              'iterations': 1201,
              'random_seed': 7,
              'scale_pos_weight': scale_pos_weight,
              'subsample': 0.4,
              'bagging_temperature': 3.5,
              'rsm': 0.35,
              'eval_metric': 'AUC:hints=skip_train~false',
              'early_stopping_rounds': 500,
              'model_size_reg': 1.0,
              'random_strength': 5.0
              }


#  'l2_leaf_reg': 20.9,

# Monthly transaction extracts. Training covers 2019-08 .. 2020-05 except 2020-04,
# which is held out as the evaluation set; 2020-06 is the test set.
training_data_paths =  ['tod_all_fields_2019-08.csv', 'tod_all_fields_2019-09.csv', 'tod_all_fields_2019-10.csv', 
                        'tod_all_fields_2019-11.csv', 'tod_all_fields_2019-12.csv', 'tod_all_fields_2020-01.csv', 
                        'tod_all_fields_2020-02.csv', 'tod_all_fields_2020-03.csv', 'tod_all_fields_2020-05.csv']
eval_data_paths = ['tod_all_fields_2020-04.csv']
test_data_paths = ['tod_all_fields_2020-06.csv']

# Files concatenated into `subs_creation_date`; their SUBSCRIPTION_ID / CREATION_DATE
# columns are renamed to subscription_id / subs_activation_date after loading.
subs_creation_date_files = [
     'subs_subscription_creation_date_2017_01_2017_12.csv', 
     'subs_subscription_creation_date_2018_01_2018_05.csv',
     'subs_subscription_creation_date_2018_06_2018_12.csv',
     'subs_subscription_creation_date_2019_01_2019_12.csv', 
     'subs_subscription_creation_date_2020_01_2020_03.csv',
     'subs_subscription_creation_date_2020_04.csv'
]

# Files concatenated into `df_subs`, the source of renewal_window,
# grace_period_date and next_renewal_date (joined on subsegment_id).
subs_files = [
    'subs_subscription_2018_12_to_2020_01.csv', 
    'subs_li_item_2020_02_to_2020_02_20.csv',  
    'subs_li_item_2020_03_2020_05.csv'
]

# Files concatenated into `df_sub_seg_expire`, the source of duration and
# segment_num (joined on subsegment_id, first row per subsegment kept).
sub_seg_expire_files = [
    'sub_seg_expire_2019_all.csv', 
    'sub_seg_expire_2020_01_2020_02.csv', 
    'sub_seg_expire_2020_03_to_2020_04.csv', 
    'sub_seg_expire_2020_04_2020_07.csv'
]

'''data  conditions'''

# rows whose decline_type is in this list are dropped from train / eval / test
exclude_decline_types = ['invalid_account', 'invalid_cc', 'invalid_txn','correct_cc_retry', 'expired_card']
# rows whose payment_service_id is in this list are dropped by process_data
excluded_processors = ['paypalExpress'] #['mes', 
# NOTE: only referenced by a commented-out filter, so it currently has no effect
included_site_ids = [] #['kasperbr']
# process_data keeps only rows whose billing_country is in this list
included_billing_countries = ['BR']

In [None]:
'''
print out all parameters.
'''

print('training_runner = ', training_runner)
print('project_id =', project_id)
print('training_id =', training_id)
print('metrics_feedback_url =', metrics_feedback_url)
print('model_file =', model_file)
print('label =', label)

print('training_data_paths =', training_data_paths)
print('eval_data_paths =', eval_data_paths)
print('test_data_paths =', test_data_paths)
print('sub_seg_expire_files =', sub_seg_expire_files)
print('exclude_decline_types =', exclude_decline_types)


'''
print out manipulated and aggregated features.
'''
print('\n============== training parameters & features ================ ')
print('input_features =', input_features)
print('additional_fields =', additional_fields)
print('tuned_parameters =', tuned_parameters)
print('best_parameters =', best_parameters)
print('features_cat =', features_cat)
print('features_float =', features_float)
print('features_num =', features_num)
print('features_grouped =', features_grouped)

# Fixed: this line used to print `features_encoded` under the
# "feature_num_encoded" label, so the encoded numeric list was never shown.
print('features_num_encoded', features_num_encoded)
print('features_encoded', features_encoded)
print('features_num_calculated', features_num_calculated)


In [None]:
'''
imports.
'''

#import for training
import numpy as np
from sklearn import cross_validation
from sklearn import metrics
from sklearn.model_selection import cross_val_score

from sklearn import cross_validation
from sklearn import ensemble
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import GridSearchCV
# from spark_sklearn import GridSearchCV
from sklearn.preprocessing import label_binarize

# from src.web.utils import PreProcessing
from src.web.preprocessing import PreProcessing
# from src.web.encoder import EnhancedLeaveOneOutEncoder
from src.web.train_util import *

In [None]:
'''
configurations.
'''
pd.options.display.max_colwidth = 300
pd.options.display.max_columns = 100

In [None]:
from src.web.utils import to_date
from src.web.utils import days_between
from src.web.utils import is_expired

def days_between_period(df):
    """Return the absolute number of days between renewal and grace-period dates.

    Args:
        df: a mapping-like record (e.g. one DataFrame row) exposing
            ``next_renewal_date`` and ``grace_period_date``; both values are
            converted with ``to_date`` before being subtracted.

    Returns:
        The non-negative whole-day difference between the two dates.
    """
    renewal_on = to_date(df['next_renewal_date'])
    grace_ends_on = to_date(df['grace_period_date'])
    gap = grace_ends_on - renewal_on
    return abs(gap.days)

def process_data(df):
    """Filter raw transactions and normalise the ``bin`` column.

    NOTE(review): a four-argument ``process_data`` is defined immediately after
    this function in the same cell and replaces it, so this version is not
    reachable by name once the cell has run. It is kept for reference.

    Reads the notebook-level ``excluded_processors`` and
    ``included_billing_countries`` lists.

    Args:
        df: DataFrame with ``payment_service_id``, ``billing_country`` and
            ``bin`` columns.

    Returns:
        A new DataFrame (the input is left untouched) containing only rows
        whose processor is not excluded and whose billing country is included,
        with ``bin`` rendered as a string without the literal ``.0`` that
        float-typed values carry.
    """
    keep = (
        ~df['payment_service_id'].isin(excluded_processors)
        & df['billing_country'].isin(included_billing_countries)
    )
    # Explicit copy: the original assigned into a filtered slice, which
    # triggers pandas' SettingWithCopyWarning.
    df = df[keep].copy()

    df['bin'] = df['bin'].fillna('').astype(str).str.replace('.0', '', regex=False)

    return df
    
def process_data(dca_df, df_sub, df_sub_seg, df_creation_date):
    """Filter raw transactions and enrich them with subscription attributes.

    Reads the notebook-level ``included_billing_countries`` and
    ``excluded_processors`` lists. Steps, in order:

    1. keep rows whose billing country is included;
    2. set aside rows processed by ``mes`` with an American Express or
       Discover card;
    3. drop rows of excluded processors and rows that have a ``date_increment``;
    4. left-join renewal columns from ``df_sub`` and ``duration`` /
       ``segment_num`` from ``df_sub_seg`` on ``subsegment_id``;
    5. append the rows set aside in step 2;
    6. left-join ``df_creation_date`` on ``subscription_id``, default missing
       activation dates and compute ``sub_age``.

    NOTE(review): the rows from step 2 are never removed from the main frame,
    so those that survive step 3 appear twice in the result (once enriched,
    once without the columns added in step 4). Behaviour is preserved here;
    confirm whether that duplication is intended.
    NOTE(review): ``df_sub`` is not de-duplicated by this function; repeated
    ``subsegment_id`` rows in it would multiply transaction rows in step 4.

    Args:
        dca_df: raw transactions; needs ``billing_country``,
            ``payment_service_id``, ``card_brand``, ``date_increment``,
            ``subsegment_id``, ``subscription_id`` and
            ``transaction_date_in_string``. Not modified.
        df_sub: needs ``subsegment_id``, ``renewal_window``,
            ``grace_period_date`` and ``next_renewal_date``.
        df_sub_seg: needs ``subsegment_id``, ``duration`` and ``segment_num``.
        df_creation_date: needs ``subscription_id`` and
            ``subs_activation_date``.

    Returns:
        A new DataFrame with the joined columns plus ``sub_age``.
    """
    # Boolean indexing already returns a new frame, so no up-front full copy is needed.
    df = dca_df[dca_df.billing_country.isin(included_billing_countries)]
    df_amex = df[(df.payment_service_id == 'mes') & (df.card_brand.isin(['American Express', 'Discover']))]

    df = df[~(df.payment_service_id.isin(excluded_processors))]
    df = df[df['date_increment'].isna()]

    df = pd.merge(
        df,
        df_sub[['subsegment_id', 'renewal_window', 'grace_period_date', 'next_renewal_date']],
        on='subsegment_id',
        how='left',
    )
    df = pd.merge(
        df,
        df_sub_seg[['subsegment_id', 'duration', 'segment_num']],
        on='subsegment_id',
        how='left',
    )

    df = pd.concat([df, df_amex])
    df = pd.merge(df, df_creation_date, on='subscription_id', how='left')

    # Assign the filled column back instead of calling fillna(inplace=True) on
    # an attribute-accessed Series, which is chained assignment and does not
    # reliably update the frame.
    df['subs_activation_date'] = df['subs_activation_date'].fillna('2017-01-01 00:00:00')
    df['sub_age'] = df.apply(lambda x: days_between(x.transaction_date_in_string, x.subs_activation_date), axis=1)

    return df

In [None]:

# Stack all subscription creation-date files into one frame and rename the two
# upper-case columns to the names process_data joins on / reads.
subs_creation_date =  pd.concat((read_from(file, s3_dir='ml_files') for file in subs_creation_date_files) , ignore_index=True)
subs_creation_date = subs_creation_date.rename(columns={"SUBSCRIPTION_ID": "subscription_id", "CREATION_DATE": "subs_activation_date"})
subs_creation_date.shape

In [None]:

# Stack all subscription files; process_data takes renewal_window,
# grace_period_date and next_renewal_date from this frame.
# NOTE(review): no drop_duplicates here, unlike df_sub_seg_expire — confirm
# subsegment_id is unique across these files.
df_subs =  pd.concat((read_from(file, s3_dir='training_files') for file in subs_files) , ignore_index=True)

df_subs.head()

In [None]:
# Stack the subsegment files and keep only the first row per subsegment_id, so
# the later left-join on subsegment_id cannot fan out on this table.
# NOTE(review): read_from is called without s3_dir here, unlike the other loaders — confirm intended.
df_sub_seg_expire =  pd.concat((read_from(file) for file in sub_seg_expire_files) , ignore_index=True).drop_duplicates(subset=['subsegment_id'], keep='first')

In [None]:
# training_data_paths = ['tod_all_fields_2019_09.csv', 'tod_all_fields_2019_10.csv', 'tod_all_fields_2019_11.csv']
# training_data_paths =  ['tod_all_fields_2019_11.csv']

# Load every monthly training file (restricted to `usecols`), stack them, then
# filter and enrich the result with the four-argument process_data.
df_train =  pd.concat((read_from( file, usecols=usecols, s3_dir='ml_files') for file in training_data_paths) , ignore_index=True)
df_train = process_data(df_train, df_subs, df_sub_seg_expire, subs_creation_date)
print(training_data_paths)
df_train.shape

In [None]:
# eval_data_paths = ['tod_all_fields_2019_12.csv']
# df_eval =  read_from(eval_data_path, usecols=usecols)
# Same load + process_data pipeline as the training set, applied to the evaluation file(s).
df_eval =  pd.concat((read_from( file, usecols=usecols, s3_dir='ml_files') for file in eval_data_paths) , ignore_index=True)
df_eval = process_data(df_eval, df_subs, df_sub_seg_expire, subs_creation_date)

print(df_eval.shape)


In [None]:
# df_test =  read_from(test_data_path, usecols=usecols)
# Same load + process_data pipeline as the training set, applied to the test file(s).
df_test =  pd.concat((read_from( file, usecols=usecols, s3_dir='ml_files') for file in test_data_paths) , ignore_index=True)
df_test = process_data(df_test, df_subs, df_sub_seg_expire, subs_creation_date)


df_test.shape

In [None]:
# df_sub_seg_expire =  pd.concat((read_from(file) for file in sub_seg_expire_files) , ignore_index=True).drop_duplicates(subset=['subsegment_id'], keep='first')
# # df_sub_seg_expire_2020 = pd.read_csv(WORK_DIR + 'sub_seg_expire_2020_01_2020_02.csv')

# df_train = pd.merge(df_train, df_sub_seg_expire[['subsegment_id', 'duration', 'segment_num']], left_on='subsegment_id', right_on='subsegment_id', how='left')
# df_eval = pd.merge(df_eval, df_sub_seg_expire[['subsegment_id', 'duration', 'segment_num']], left_on='subsegment_id', right_on='subsegment_id', how='left')
# df_test = pd.merge(df_test, df_sub_seg_expire[['subsegment_id', 'duration', 'segment_num']], left_on='subsegment_id', right_on='subsegment_id', how='left')


In [None]:
df_test.head()

In [None]:
'''
data manipulation.
'''

def _apply_row_filters(frame):
    """Keep rows with no date_increment whose decline_type is not excluded."""
    without_increment = frame[frame['date_increment'].isna()]
    return without_increment[~(without_increment['decline_type'].isin(exclude_decline_types))]


# The same two filters are applied, in the same order, to each data set.
df_train = _apply_row_filters(df_train)
df_eval = _apply_row_filters(df_eval)
df_test = _apply_row_filters(df_test)

#Exclude some data
# df_train = df_train[~(df_train['bin'] == 'nan')]
# df_eval = df_eval[~(df_eval['bin'] == 'nan')]
# df_test = df_test[~(df_test['bin'] == 'nan')]

# df_train = df_train[~(df_train['cc_expiration_date'] == 'nan')]
# df_eval = df_eval[~(df_eval['cc_expiration_date'] == 'nan')]
# df_test = df_test[~(df_test['cc_expiration_date'] == 'nan')]

In [None]:
# Bin edges for the hour of the transaction.
# NOTE(review): pd.cut builds right-closed bins by default, so hour 0 lies
# outside the first bin (0, 2] and is labelled 'nan' — confirm this is intended.
txn_hour_group = [0,2, 6, 10, 14, 18, 22, 25]

# Label each row with its hour bin (as text, without any literal '.0').
for _frame in (df_train, df_eval, df_test):
    _hour_bin = pd.cut(_frame['transaction_hour'], txn_hour_group)
    _frame['txn_hour_group'] = _hour_bin.astype(str).str.replace('.0', '', regex=False)


In [None]:
# Bin edges for the subscription segment number; the -1 lower edge puts
# segment 0 inside the first (right-closed) bin.
segment_num_group = [-1, 1, 2, 3, 4, 5, 6, 7, 8, 15, 20, 25, 30, 40, 50, 70, 100, 150]

# Label each row with its segment-number bin (as text, without any literal '.0').
for _frame in (df_train, df_eval, df_test):
    _segment_bin = pd.cut(_frame['segment_num'], segment_num_group)
    _frame['segment_num_group'] = _segment_bin.astype(str).str.replace('.0', '', regex=False)


In [None]:
for _frame in (df_train, df_eval, df_test):
    # First renewal: segment number in [0, 2). Missing segment numbers compare False.
    _segment = _frame['segment_num']
    _frame['is_first_renewal'] = (_segment < 2) & (_segment >= 0)

    # Any card brand starting with "american" (case-insensitive) gets its own card category.
    _is_american = _frame.card_brand.str.lower().str.startswith('american', na=False)
    _frame.loc[_is_american, 'card_category'] = 'american_express'

In [None]:
# Snap near-month / leap-year durations onto one canonical value:
# 28, 29, 31 -> 30; 366 -> 365; 731 -> 730. No target value is itself a source
# value, so applying the pairs one after another equals the combined update.
_duration_fixups = ((28, 30), (29, 30), (31, 30), (366, 365), (731, 730))

for _frame in (df_train, df_eval, df_test):
    for _observed, _canonical in _duration_fixups:
        _frame.loc[_frame['duration'] == _observed, 'duration'] = _canonical

In [None]:
# Bin edges (in days) shared by subscription duration and subscription age.
# NOTE(review): bins are right-closed, so a value of exactly 0 falls outside
# the first bin and is labelled 'nan' — confirm this is intended.
duration_group = [0, 3, 6, 9, 13, 17, 20, 25, 27, 33, 39, 43, 62, 70, 80, 88, 94, 100, 118, 125, 130, 146, 155, 176, 184, 200, 213, 230, 263, 300, 363, 368, 373,729, 733, 1000, 2000]

# (new column, source column) pairs, binned with the same edges.
_binned_columns = (('sub_duration_group', 'duration'), ('sub_age_group', 'sub_age'))

for _frame in (df_train, df_eval, df_test):
    for _target, _source in _binned_columns:
        _bins = pd.cut(_frame[_source], duration_group)
        _frame[_target] = _bins.astype(str).str.replace('.0', '', regex=False)

In [None]:
df_train.head()

In [None]:
# Row count of the training set; no re-balancing is applied in this notebook,
# so both sizes are the same number.
original_size = balanced_size = len(df_train)

# Despite the names, these are class *shares* (value_counts is normalised),
# looked up by the label values 0.0 and 1.0.
_label_share = df_train[label].value_counts(normalize=True)
fail_size = _label_share[0.0]
success_size = _label_share[1.0]

In [None]:
'''
feature manipulation and aggregation.
'''
features = features_cat + features_encoded

# Additional fields must not repeat a categorical feature.
_categorical_names = set(features_cat)
additional_fields = [name for name in additional_fields if name not in _categorical_names]
fields = features_cat + features_num + additional_fields

# Configuration handed to the preprocessor / training helpers.
features_dict_key = 'preprocessing__features_dict'
features_dict = {
    'LABEL': label,
    'FIELDS': fields,
    'FEATURES_CAT': features_cat,
    'FEATURES_NUM': features_num,
    'FEATURES_ENCODED': features_encoded,
    'FEATURES_NUM_ENCODED': features_num_encoded,
    'FEATURES_NUM_CALCULATED': features_num_calculated,
    'FEATURES_FLOAT': features_float,
    'FEATURES_GROUPED': features_grouped,
    'ADDITIONAL_FIELDS': additional_fields,
}

In [None]:
print('fields', fields)
print('features_dict', features_dict)
print('features_dict_key', features_dict_key)

In [None]:
fields

In [None]:
def over_sampling(df, random_state=None):
    """Resample label-0 rows so they number half of the label-1 rows.

    Label-0 rows are drawn *with replacement* until there are
    ``len(label-1 rows) // 2`` of them; all label-1 rows are kept. Despite the
    name, this shrinks the label-0 class whenever it is larger than that
    target. The class shares before and after, and the resulting shape, are
    printed. Reads the notebook-level ``label`` column name.

    Args:
        df: DataFrame containing the ``label`` column with values 0 and 1.
        random_state: optional seed (or RandomState) forwarded to
            ``DataFrame.sample`` to make the draw reproducible. The default,
            ``None``, keeps the original non-deterministic behaviour.

    Returns:
        A new DataFrame with the resampled label-0 rows followed by the
        label-1 rows; original index labels are preserved (and may repeat).

    Raises:
        ValueError: if label-0 rows must be drawn but none exist.
    """
    print(df[label].value_counts(normalize=True))
    negatives = df[df[label] == 0]
    positives = df[df[label] == 1]

    target_size = len(positives) // 2
    if target_size and negatives.empty:
        raise ValueError("cannot resample label-0 rows: the input contains none")

    resampled = negatives.sample(target_size, replace=True, random_state=random_state)
    df = pd.concat([resampled, positives])

    print(df[label].value_counts(normalize=True))
    print(df.shape)
    return df

In [None]:
'''filters data'''

# input_data = df_train[df_train.renew_att_num==1]
# _df_eval = df_eval[df_eval.renew_att_num==1]
# _df_test = df_test[df_test.renew_att_num==1]

input_data = df_train
_df_eval = df_eval
_df_test = df_test

# input_data = over_sampling(input_data)

# _df_eval = over_sampling(_df_eval)
# _df_test = over_sampling(_df_test)

In [None]:
scale_pos_weight

In [None]:
'''Prepares training parameters'''

from catboost import CatBoostClassifier, Pool
import src.web.preprocessing
from src.web.preprocessing import PreProcessing
from src.web.train_util import *
from importlib import import_module
import sys

classifier = CatBoostClassifier

# Number of leading columns flagged as categorical: one per categorical feature
# plus one per feature group.
# NOTE(review): assumes the preprocessor emits exactly those columns first — verify in PreProcessing.
cat_features_len = len(features_cat) + len(features_grouped)

# Only derive the class weight when none was configured (falsy value).
if not scale_pos_weight:
    _class_share = input_data[label].value_counts(normalize=True)
    scale_pos_weight = (_class_share[0.0] / _class_share[1.0]) + 0.5

best_parameters['scale_pos_weight'] = scale_pos_weight

features_dict['use_cat_encoder'] = False
# Fit the preprocessor on the training data and transform the evaluation set with it.
# The target column comes from the `label` parameter instead of a hard-coded
# 'success', so an injected label is honoured consistently.
_preProcessor = PreProcessing().fit(input_data, input_data[label], features_dict=features_dict)
_x_eval = _preProcessor.transform(_df_eval)
_y_eval = _df_eval[label]

alg_name = 'catboostclassifier'

cat_features = list(range(cat_features_len))

# Fit-time arguments, keyed with the "<alg_name>__" prefix.
fit_params = {
    f"{alg_name}__verbose": True,
    f"{alg_name}__cat_features": cat_features,
    f"{alg_name}__plot": True,
    f"{alg_name}__eval_set": Pool(_x_eval, _y_eval, cat_features)
}

features_dict['fit_params'] = fit_params

# NOTE(review): this discards any `model_file` injected through the parameters
# cell; the non-local branch of the output cell later calls
# joblib.dump(clf, model_file) with this empty string — confirm intended.
model_file = ''

In [None]:
""" Train the model"""
# Local run: reserve the next version number and record the resulting model name.
if training_runner is None:
    version = 1 + get_latest_version(model_id, model_type)
    model_name = '.'.join((model_id, str(version)))
    features_dict['model_name'] = model_name

# Train; the helper returns the fitted model and a dict of results.
_train_kwargs = dict(
    best_param=best_parameters,
    features_dict=features_dict,
    test_data=_df_test,
    metrics_feedback_url=metrics_feedback_url,
)
clf, result_d = build_and_train(
    input_data, classifier, tuned_parameters, alg_name, model_file, **_train_kwargs
)

print("result_dict: ", result_d)

In [None]:
'''
save feature importance scores.
'''

import random

# saving feature importance
# NOTE(review): the scores written here are random placeholders drawn from
# [0, 1]; they are NOT taken from the trained model. Replace with the model's
# real importances before anyone relies on this file.
feature_importance = {
    feature_name: random.uniform(0.0, 1.0) for feature_name in input_features
}

# ensure_ascii=False may emit non-ASCII characters, so the file encoding is
# pinned to UTF-8 instead of depending on the platform default.
with open(feature_importance_file, 'w', encoding='utf-8') as feature_importance_out:
    json.dump(feature_importance, feature_importance_out, ensure_ascii=False, indent=4)

In [None]:
'''
output the model
'''

# Date range quoted in the model description (overrides the values from the parameters cell).
start_date = '2019-08-01'
end_date = '2020-05-30'
if training_runner is None:
    # Local run: write the model, upload it and register it with its metadata.
    # NOTE(review): the version is looked up again here rather than reused from
    # the training cell; if another version was registered in between, this
    # name differs from features_dict['model_name'].
    version = get_latest_version(model_id, model_type) + 1
    model_name = model_id + '.' + str(version)
    model_file, model_file_name = write_model(clf, model_name)

    preprocess_repo_path = handle_preprocessing_file(model_id, version)
    # Fixed: the last placeholder used to receive original_size a second time
    # instead of balanced_size.
    size_desc = ", original size: %s (fail: %s, success: %s), balanced_size: %s" % (
        original_size, fail_size, success_size, balanced_size)
    desc = f'TOD for kasperbr. Include all renew_att. Use txn_hour_min_segment to aggregate instead of thx_hour_group. Add card_brand-funding_source, txn_hour_min_segment. Only include date_increment is na. Include duration_group, sub_age_group and segment_num_group. With new ROC AUC using predict_proba. No oversampling failed samples. Cal attempt 1 only. Add txn_hour_group. {start_date}_{end_date}_for training data, 2020-04 for eval data, 2020-06 for test data, eval_metric=AUC. Size: {size_desc}'

    # Hyper-parameters are stored in their own column, so take them out of the metrics dict.
    hyper_params = result_d.pop('hyper_params', None)
    extended_att = {"preprocess_repo_path": preprocess_repo_path, "input_features": input_features}
    repo_path = upload_artifact(model_file_name)
    insert_model_info(model_id, version, repo_path, desc=desc, model_type=model_type,eval_metrics=json.dumps(result_d), 
                      hyper_parameter=json.dumps(hyper_params), extended_att=json.dumps(extended_att), features_dict=features_dict, algorithm='CatBoostClassifier')

else:
    # Platform run: only serialise the model to the configured path.
    model_file = joblib.dump(clf, model_file)

print('model_file generated: ', model_file)