In [1]:
%load_ext autoreload
%autoreload 2
import matplotlib.pyplot as plt
import matplotlib
import time
import numpy as np
%matplotlib inline

import pandas as pd
pd.options.display.max_colwidth = 300
pd.options.display.max_columns = 100

from src.web.train_util import read_from
from SortedSet.sorted_set import SortedSet

In [2]:
'''
Parameters. 
Parameters that are defined in this cell can be injected and overwritten by the machine learning platform.
'''

# MLP defined parameters
# All five default to None in this notebook; per the note at the top of this
# cell they can be injected/overwritten by the machine learning platform.
training_runner = None
project_id = None
training_id = None
metrics_feedback_url = None
model_destination = None

# user defined parameters

# label keys
# Name of the target column; it is read from the data frames further down
# (e.g. df_train[label]).
label = 'success'
# model file directory
work_dir = '/var/spark/ml_files/'
model_type = 'ML-BR'
model_id = 'ML-BR-1'
# desc = '%s_%s_for_calendar_retry_attempt'.format(start_date, end_date)

# data
# Left empty here; in the visible part of the notebook it is only printed.
training_data = ''
# bin_profile_data =  work_dir + 'bin_profile_2019_01_to_2019_05.csv'
# payment_mid_bin_data = work_dir + 'payment_mid_bin_2019_01_to_05.csv'
# decline_type_data = work_dir + 'Decline_Type.csv'

# features
# Declared input schema: field name -> type name ("string", "integer" or
# "number"). The pairs are listed in the order the keys must appear in the
# resulting dict.
_input_feature_types = [
    ("billing_country", "string"),
    ("bin", "string"),
    ("card_brand", "string"),
    ("card_category", "string"),
    ("card_class", "string"),
    ("card_usage", "string"),
    ("cc_expiration_date", "string"),
    ("day_of_month", "integer"),
    ("failed_attempt_date", "string"),
    ("failed_response_code", "string"),
    ("failed_response_message", "string"),
    ("funding_source", "string"),
    ("issuer_country", "string"),
    ("merchant_number", "string"),
    ("payment_amount_usd", "number"),
    ("payment_currency", "string"),
    ("payment_method_id", "string"),
    ("payment_service_id", "string"),
    ("renew_att_num", "integer"),
    ("site_id", "string"),
    ("transaction_date_in_string", "string"),
    ("renewal_window", "integer"),
    ("duration", "integer"),
    ("segment_num", "integer"),
    ("sub_age", "integer"),
    ("bank_name", "string"),
    ("first_calendar_attempt_date", "string"),
    ("previous_cal_response_message_1", "string"),
    ("previous_cal_response_message_2", "string"),
    ("previous_cal_response_message_3", "string"),
    ("previous_cal_response_code_1", "string"),
    ("previous_cal_response_code_2", "string"),
    ("previous_cal_response_code_3", "string"),
    ("first_cal_response_message_1", "string"),
    ("first_cal_response_message_2", "string"),
    ("first_cal_response_message_3", "string"),
]

# Each field gets its own {"type": ...} dict, exactly like the literal form.
input_features = {
    field_name: {"type": type_name}
    for field_name, type_name in _input_feature_types
}

# Categorical feature columns.
# len(features_cat) + len(features_grouped) is used further down to build the
# positional cat_features index list, so the number and order of entries in
# these two lists matter.
features_cat = [
    'failed_decline_type',
    'failed_decline_type_from_first_cal',
    'day_of_month',
    'funding_source',
    'payment_currency',
    'days_between',
    'days_between_from_first_cal',
    'renewal_window',
    'renew_att_num',
    'is_first_renewal',
    'failed_response_codes_from_previous_cal',
    'payment_amount_group',
    'billing_country',
    'card_brand']

# Columns passed to the preprocessor as FEATURES_FLOAT.
# NOTE(review): 'renewal_window' is listed both here and in features_cat --
# confirm that is intended.
features_float = [
    'bin',
    'failed_response_code',
    'date_increment',
    'renewal_window'
]

# Plain numeric feature columns.
features_num = [
    'duration',
    'sub_age',
    'payment_amount_usd'
]
# Both currently empty; they are still forwarded to the preprocessor below.
features_num_calculated = []
features_num_bin_profile = []

features_num_encoded = ['success_bank_card_count_per_day_of_month', 'txn_amount_bank_card_max_per_date_diff']

# Columns passed to the preprocessor as part of FEATURES_ENCODED.
# NOTE(review): 'month' appears twice in this list -- confirm whether the
# duplicate is intentional before removing it, since downstream handling of
# duplicates is not visible here.
features_cat_encoded = [
    'month',
    'days_between',
    'days_between_from_first_cal',
    'failed_decline_type_from_first_cal',
    'renew_att_num',
    'day_of_week',
    'num_of_days',
    'payment_service_id',
    'merchant_number',
    'month',
    'is_expired',
    'segment_num_group',
    'sub_duration_group',
    'payment_amount_group',
    'sub_age_group',
    'is_first_renewal',
    'card_brand',
    'failed_response_codes_from_previous_cal',
    'failed_response_codes_from_first_cal'
]


# Column groups passed to the preprocessor as FEATURES_GROUPED; each inner
# list names the columns of one group (some groups hold a single column).
# presumably each group becomes one combined categorical feature -- verify
# against PreProcessing.
features_grouped = [
    ['payment_service_id', 'merchant_number'],
    ['failed_decline_type', 'payment_service_id'],
    ['failed_decline_type', 'funding_source'],
    ['failed_decline_type', 'card_brand'],
    ['failed_decline_type', 'is_expired'],
    ['failed_decline_type', 'failed_response_codes_from_previous_cal'],
    ['payment_service_id', 'merchant_number', 'payment_currency'],
    ['bin', 'is_expired'],
    ['bank_name', 'is_expired'],
    ['bank_name', 'card_category'],
    ['card_brand', 'funding_source'],
    ['payment_service_id', 'is_expired'],
    ['days_between', 'failed_decline_type'],
    ['days_between', 'failed_response_codes_from_previous_cal'],
    ['failed_response_codes_from_previous_cal', 'funding_source'],
    ['days_between_from_first_cal', 'failed_response_codes_from_first_cal'],
    ['days_between_from_first_cal', 'renewal_window'],
    ['days_between_from_first_cal', 'funding_source'],
    ['days_between', 'funding_source'],
    ['card_brand', 'is_expired'],
    ['sub_age_group', 'sub_duration_group'],
    ['sub_duration_group', 'is_expired'],
    ['segment_num_group', 'is_expired'],
    ['day_of_week'],
    ['is_expired'],
    ['issuer_country', 'is_expired']
]

# Groups that were tried and are currently disabled:
#     ['failed_response_messages_from_first_cal_sorted', 'funding_source'],
#     ['failed_response_messages_from_previous_cal', 'funding_source'],
#     ['days_between_from_first_cal', 'failed_decline_type_from_first_cal'],
#     ['renew_att_num', 'days_between', 'renewal_window'],
# ['days_between_from_first_cal', 'renewal_window'],

# Snapshot taken here: a later reassignment of features_num_encoded does not
# change this list.
features_encoded = features_cat_encoded + features_num_encoded

# Extra columns carried alongside the model features. Further down this list
# is filtered to drop anything already in features_cat / features_num.
# NOTE(review): 'funding_source' is listed twice; it is also in features_cat,
# so the later filter removes both occurrences.
additional_fields = [
    'card_brand',
    'segment_num',
    'segment_num_group',
    'payment_amount_group',
    'bank_name',
    'duration',
    'is_expired',
    'renewal_window',
    'payment_currency',
    'funding_source',
    'card_category',
    'card_class',
    'card_usage',
    'renew_att_num',
    'site_id',
    'bin',
    'merchant_number',
    'billing_country',
    'funding_source',
    "payment_service_id",
    'day_of_month',
    'failed_decline_type',
    'failed_day_of_month',
    'failed_response_code',
    'payment_amount_usd',
    'issuer_country',
    'failed_response_message',
    'days_between',
    'transaction_date_in_string',
    'cc_expiration_date',
    'failed_attempt_date',
    'days_between_from_first_cal',
    'failed_decline_type_from_first_cal',
    'failed_response_codes_from_previous_cal',
    'failed_response_codes_from_first_cal'
]

# Raw columns requested from the DCA files (first part of usecols).
# NOTE(review): 'failed_decline_type_from_first_cal' is listed twice.
feature_candidates = ['card_brand', 'funding_source', 'card_category', 'card_class', 'card_usage', 'issuer_country',
                 'day_of_month', 'site_id', 'failed_decline_type', 'merchant_number',
                'payment_service_id', 'payment_method_id', 'bin', 'renew_att_num', 'failed_day_of_month',
                'payment_currency', 'days_between', 'failed_response_code', 'payment_amount_usd', 'date_increment',
                'transaction_hour', 'failed_response_messages_from_previous_cal', 'failed_response_codes_from_previous_cal',
                'failed_decline_type_from_previous_cal', 'failed_response_messages_from_first_cal', 'failed_decline_type_from_first_cal', 'days_between_from_first_cal',
                'failed_decline_type_from_first_cal','failed_response_codes_from_first_cal']

# Full column selection handed to read_from(..., usecols=usecols) when the
# training / eval / test files are loaded.
# NOTE(review): 'date_increment' and 'failed_response_code' style duplicates
# aside, 'date_increment' is present in both halves of this concatenation.
usecols = feature_candidates +  ['new_status','subscription_id', 'subsegment_id', 'success', 'cid' ,'bank_name','added_expiry_years', 'failed_response_message','date_increment', 'received_date', 'billing_country', 'transaction_date_in_string', 'cc_expiration_date', 'failed_attempt_date']

'''
data parameters
'''
# Payment processors filtered out in process_dca_data.
excluded_processors = ['mes', 'paypalExpress']

# Subscription creation-date exports for 2017-2019 (irregular date spans).
subs_creation_date_files_2019 = [
    f'subs_subscription_creation_date_{span}.csv'
    for span in ('2017_01_2017_12', '2018_01_2018_05', '2018_06_2018_12', '2019_01_2019_12')
]

# Monthly subscription creation-date exports, 2020-01 .. 2020-09.
subs_creation_date_files = [
    f'subs_subscription_creation_date_2020-{month:02d}.csv' for month in range(1, 10)
]

# One combined export up to 2020-01, then monthly line-item exports
# 2020-02 .. 2020-09.
subs_files = ['subs_subscription_2018_12_to_2020_01.csv'] + [
    f'subs_li_item_2020-{month:02d}.csv' for month in range(2, 10)
]

# Segment-expiry exports: all of 2019, 2020-01 (underscore-named), then the
# hyphen-named monthly files 2020-02 .. 2020-09.
sub_seg_expire_files = ['sub_seg_expire_2019_all.csv', 'sub_seg_expire_2020_01.csv'] + [
    f'sub_seg_expire_2020-{month:02d}.csv' for month in range(2, 10)
]


#     'sub_seg_expire_2020_01_2020_02.csv', 
#     'sub_seg_expire_2020_03_to_2020_04.csv', 
#     'sub_seg_expire_2020_04_2020_07.csv'

#'dca_2019_06.csv', 'dca_2019_07.csv', 'dca_2019_08.csv', 'dca_2019_09.csv','dca_2019_10.csv'
# TO DO : dca_2019_11
# Date window of the experiment. In the visible part of the notebook these two
# values are only referenced by the commented-out `desc` parameter.
start_date = '2019-10-01'
end_date = '2020-08-31'

# 'dca_2019_01.csv', 'dca_2019_02.csv', 'dca_2019_03.csv', 'dca_2019_04.csv',
#                   'dca_2019_05.csv', 'dca_2019_06.csv', 'dca_2019_07.csv',
#                   'dca_2019_08.csv',

# Only consumed by the commented-out "additional success sample" code in the
# df_train loading cell.
addition_success_samples = ['dca_2019_05.csv', 'dca_2019_06.csv', 'dca_2019_07.csv', 'dca_2019_08.csv']
# training_files = [
#                     'dca_2019_06.csv', 'dca_2019_07.csv', 'dca_2019_08.csv',
#                    'dca_2019_09.csv', 'dca_2019_10.csv', 'dca_2019_11.csv', 'dca_2019_12.csv',
#                   'dca_2020_01.csv', 'dca_2020_02.csv', 'dca_2020_03.csv', 'dca_2020_04.csv', 'dca_2020_06.csv']

# Files concatenated into df_train.
# NOTE(review): the three 2019 Q4 files are listed twice, so their rows are
# loaded (and trained on) twice. Confirm whether this is deliberate
# over-weighting or a copy/paste slip.
training_files = [
                'dca_2019_10.csv', 'dca_2019_11.csv', 'dca_2019_12.csv',
                'dca_2019_10.csv', 'dca_2019_11.csv', 'dca_2019_12.csv',
                'dca_2020_01.csv', 'dca_2020_02.csv', 'dca_2020_03.csv', 'dca_2020_06.csv', 'dca_2020_05.csv',
                'dca_2020_04.csv', 'dca_2020_08.csv']

# training_files = [
#                    'dca_2019_07.csv', 'dca_2019_08.csv',
#                    'dca_2019_09.csv', 'dca_2019_10.csv', 'dca_2019_11.csv', 'dca_2019_12.csv',
#                   'dca_2020_01.csv', 'dca_2020_02.csv', 'dca_2020_03.csv', 'dca_2020_05.csv']

# Files concatenated into df_val and df_test respectively.
eval_files = [ 'dca_2020_07.csv' ]
test_files = [ 'dca_2020_09.csv']
# Pre-aggregated profile tables; loaded with read_from further down and turned
# into lookup dicts keyed by bin / bank and date parts.
bin_profile_per_date_month_path = 'bin_profile_per_date_month_2018_2020_08.csv'
bank_profile_per_date_month_path = 'bank_profile_per_date_month_2018_2020_08.csv'
bin_profile_per_day_of_month_path = 'bin_profile_per_day_of_month_2018_2020_08.csv'
bank_profile_per_day_of_month_path = 'bank_profile_per_day_of_month_2018_2020_08.csv'



'''
training hyperparameters
'''
# None means "derive the class weight from the data later on" (see the
# training-parameter cell, which fills it in when it is falsy).
scale_pos_weight = None

tuned_parameters = {}

# Hyperparameters for the classifier; key order matches the original literal.
best_parameters = dict(
    depth=6,
    iterations=1201,
    random_seed=7,
    scale_pos_weight=scale_pos_weight,
    subsample=0.5,
    bagging_temperature=3.5,
    rsm=0.35,
    eval_metric='BrierScore',
    early_stopping_rounds=500,
    model_size_reg=2.5,
    l2_leaf_reg=20.9,
    random_strength=5.0,
)

#  'BrierScore',

# included_billing_countries = ['US']

# Card attribute columns that get lower-cased / de-spaced / blanked when they
# hold a "no data found" marker (see process_dca_data).
card_fields=['card_brand', 'card_category', 'card_class', 'card_usage']
# Response codes subtracted in convert_str_to_sorted_set; empty here, so that
# subtraction currently removes nothing.
generic_decline_codes = SortedSet([])

In [3]:

from src.web.utils import to_date
from src.web.utils import days_between
from src.web.utils import is_expired
from SortedSet.sorted_set import SortedSet



def convert_str_to_sorted_set(s):
    """Normalise a comma-separated list of response codes into one string.

    The value is stringified, every space and every '.0' occurrence is
    removed, and the result is split on commas into a SortedSet with empty
    tokens dropped. Codes listed in the module-level ``generic_decline_codes``
    are removed unless that would leave nothing, in which case the full set
    is kept. The remaining codes are joined with ','.

    NOTE(review): '.0' is removed wherever it occurs, not only at the end of
    a token -- confirm codes never contain '.0' in the middle.
    """
    compact = str(s).replace(' ', '').replace('.0', '')
    all_codes = SortedSet(compact.split(',')) - SortedSet([''])
    specific_codes = all_codes - generic_decline_codes
    # Fall back to the full set when only generic codes were present.
    chosen = specific_codes if specific_codes else all_codes
    return ",".join(chosen)

def days_between_period(df):
    """Return the absolute day difference between two date fields of ``df``.

    ``df`` is indexed with 'next_renewal_date' and 'grace_period_date'; both
    values are converted with ``to_date`` and the magnitude of the difference
    in days is returned.
    """
    renewal_day = to_date(df['next_renewal_date'])
    grace_day = to_date(df['grace_period_date'])
    gap = grace_day - renewal_day
    return abs(gap.days)

def process_dca_data(dca_df, df_sub, df_sub_seg, df_creation_date):
    """Filter and enrich raw DCA rows for training / evaluation.

    Steps, in order:
      1. Drop rows whose ``payment_service_id`` is in ``excluded_processors``,
         except 'mes' rows with card brand 'American Express' or 'Discover'.
      2. Left-join subscription data (``renewal_window``,
         ``grace_period_date``, ``next_renewal_date``) and segment data
         (``duration``, ``segment_num``) on ``subsegment_id``.
      3. Compute ``is_expired`` per row; rows with a ``date_increment`` are
         forced to True.
      4. Drop rows without duration, bin, card expiration date or USD amount,
         and rows whose ``new_status`` is 'Reversed'.
      5. Left-join creation dates on ``subscription_id``, default missing
         activation dates to '2017-01-01 00:00:00' and derive ``sub_age``.
      6. Rename ``next_renewal_date`` to ``first_calendar_attempt_date``,
         take ``failed_decline_type`` from the previous calendar attempt and
         normalise the response-code and card columns.

    Args:
        dca_df: raw DCA rows; not modified (a copy is processed).
        df_sub: frame with ``subsegment_id``, ``renewal_window``,
            ``grace_period_date`` and ``next_renewal_date``.
        df_sub_seg: frame with ``subsegment_id``, ``duration`` and
            ``segment_num``.
        df_creation_date: frame joined on ``subscription_id``; it must provide
            ``subs_activation_date``.

    Returns:
        The processed DataFrame.
    """
    df = dca_df.copy()

    # Exclude subs on an excluded processor, but keep 'mes' rows that used an
    # amex or discover card. A single mask (instead of filter + concat) keeps
    # those rows exactly once even if 'mes' is not in excluded_processors.
    on_excluded_processor = df['payment_service_id'].isin(excluded_processors)
    amex_on_mes = (df['payment_service_id'] == 'mes') & (
        df['card_brand'].isin(['American Express', 'Discover']))
    df = df[~on_excluded_processor | amex_on_mes]

#     df = df[df.billing_country.isin(included_billing_countries)]
    df = pd.merge(df, df_sub[['subsegment_id', 'renewal_window', 'grace_period_date', 'next_renewal_date']], left_on='subsegment_id', right_on='subsegment_id', how='left')
    df = pd.merge(df, df_sub_seg[['subsegment_id', 'duration', 'segment_num']], left_on='subsegment_id', right_on='subsegment_id', how='left')

    df['is_expired'] = df.apply(is_expired, axis=1)
    # Any row carrying a date_increment is treated as expired.
    df.loc[df['date_increment'].notna(), 'is_expired'] = True

    df = df[df['duration'].notna()]
    df['bin'] = df['bin'].astype(str)
    df = df[df['bin'] != 'nan']
    # cc_expiration_date is never cast to str, so a plain == 'nan' comparison
    # misses real NaN values; treat both representations as missing.
    missing_expiry = df['cc_expiration_date'].isna() | (df['cc_expiration_date'] == 'nan')
    df = df[~missing_expiry]

    df = df[df['new_status'] != 'Reversed']
    df = df[df['payment_amount_usd'].notna()]

    df = pd.merge(df, df_creation_date, left_on='subscription_id', right_on='subscription_id', how='left')
    # Assign the filled column back instead of calling fillna(inplace=True) on
    # an attribute access: the chained in-place form is not guaranteed to
    # write through to the frame (it does not under pandas Copy-on-Write).
    df['subs_activation_date'] = df['subs_activation_date'].fillna('2017-01-01 00:00:00')
    df['sub_age'] = df.apply(lambda x: days_between(x.transaction_date_in_string, x.subs_activation_date), axis=1)

    df = df.rename(columns={"next_renewal_date": "first_calendar_attempt_date"})

#     #new addition
#     df = df[~df.first_calendar_attempt_date.isna()]

    # Item assignment always writes a column; attribute assignment would only
    # set a Python attribute if the column were ever missing.
    df['failed_decline_type'] = df['failed_decline_type_from_previous_cal']
#     df.loc[df['failed_decline_type_from_first_cal'] == 'Base', 'failed_decline_type_from_first_cal'] = df.failed_decline_type

    df['failed_response_codes_from_previous_cal'] = df['failed_response_codes_from_previous_cal'].apply(convert_str_to_sorted_set)
    df['failed_response_codes_from_first_cal'] = df['failed_response_codes_from_first_cal'].apply(convert_str_to_sorted_set)

#     df.failed_response_messages_from_previous_cal = df.failed_response_messages_from_previous_cal_sorted
#     df.failed_response_messages_from_first_cal = df.failed_response_messages_from_first_cal_sorted

    # Lower-case the card columns, strip spaces, and blank out cells that
    # consist solely of one of the "no data found" markers.
    df[card_fields] = df[card_fields].astype(str).apply(
                lambda x: x.str.lower().replace(' ', '', regex=True) \
                .replace("nodatafound',value:'n/a", "", regex=False) \
                .replace("nodatafound", "",regex=False) \
                .replace("nodatafound'value:'n/a", "",regex=False))

    return df


In [4]:

# 2017-2019 exports use upper-case column names, so they are renamed to the
# names used for the later merge on subscription_id.
subs_creation_date_2019 = pd.concat(
    [read_from(name, s3_dir='ml_files') for name in subs_creation_date_files_2019],
    ignore_index=True,
).rename(columns={"SUBSCRIPTION_ID": "subscription_id", "CREATION_DATE": "subs_activation_date"})

# The 2020 exports are concatenated without renaming.
subs_creation_date = pd.concat(
    [read_from(name, s3_dir='ml_files') for name in subs_creation_date_files],
    ignore_index=True,
)
# subs_creation_date = subs_creation_date.rename(columns={"SUBSCRIPTION_ID": "subscription_id", "CREATION_DATE": "subs_activation_date"})

# Stack 2020 on top of 2017-2019 (original row indices are kept).
subs_creation_date = pd.concat([subs_creation_date, subs_creation_date_2019])
subs_creation_date.head()

Unnamed: 0,subscription_id,subs_activation_date
0,13824581610,2020-01-24 13:49:26
1,13835225210,2020-01-27 11:57:02
2,13749247110,2020-01-06 07:01:08
3,13847169310,2020-01-30 11:12:54
4,13830408310,2020-01-26 08:20:54


In [5]:

# Subscription / line-item exports stacked into one frame with a fresh index.
df_subs = pd.concat(
    [read_from(name, s3_dir='training_files') for name in subs_files],
    ignore_index=True,
)

df_subs.head()

Unnamed: 0,subsegment_id,next_renewal_date,grace_period_date,segment_number,line_item_type,renewal_window
0,17904709400,2018-12-15,2018-12-26,8.0,RENEWED,11
1,17958051000,2018-12-20,2019-01-19,2.0,RENEWED,30
2,17795605900,2018-12-04,2019-01-03,7.0,RENEWED,30
3,14777236400,2018-12-04,2018-12-19,,RENEWED,15
4,15218507800,2019-01-27,2019-02-20,,RENEWED,24


In [6]:
# Display the subsegment_id column of df_subs for inspection.
df_subs.subsegment_id

0           17904709400
1           17958051000
2           17795605900
3           14777236400
4           15218507800
               ...     
53468944    24364356400
53468945    20989957000
53468946    24650442000
53468947    24417006400
53468948    24628199800
Name: subsegment_id, Length: 53468949, dtype: int64

In [7]:
# Segment-expiry exports stacked into one frame; for a subsegment_id that
# occurs more than once only its first row is kept.
df_sub_seg_expire = pd.concat(
    [read_from(name, s3_dir='training_files') for name in sub_seg_expire_files],
    ignore_index=True,
)
df_sub_seg_expire = df_sub_seg_expire.drop_duplicates(subset=['subsegment_id'], keep='first')

In [None]:

# Load every training file with the selected columns and stack them.
df_train = pd.concat(
    [read_from(name, usecols=usecols, s3_dir='ml_files') for name in training_files],
    ignore_index=True,
)

# Filter and enrich with subscription, segment and creation-date data.
df_train = process_dca_data(df_train, df_subs, df_sub_seg_expire, subs_creation_date)


'''Additional success sample for df_train'''
# df_train_2 = pd.concat((read_from(file, usecols=usecols, s3_dir='ml_files') for file in addition_success_samples) , ignore_index=True)
# df_train_2 = df_train_2[(df_train_2.success == 1)]
# df_train_2 = process_dca_data(df_train_2, df_subs, df_sub_seg_expire, subs_creation_date)
# df_train = pd.concat([df_train, df_train_2])

df_train.shape #(6543250, 42)

  sort=sort,
  sort=sort,


In [None]:
# Load the evaluation files with the selected columns.
df_val = pd.concat(
    [read_from(name, usecols=usecols, s3_dir='ml_files') for name in eval_files],
    ignore_index=True,
)
# Cast the join key to int before merging with the subscription frames.
df_val['subsegment_id'] = df_val['subsegment_id'].astype(int)

df_val = process_dca_data(df_val, df_subs, df_sub_seg_expire, subs_creation_date)
df_val.shape

In [None]:

# Load the test files with the selected columns.
df_test = pd.concat(
    [read_from(name, usecols=usecols, s3_dir='ml_files') for name in test_files],
    ignore_index=True,
)

# Cast the join key to int before merging with the subscription frames.
df_test['subsegment_id'] = df_test['subsegment_id'].astype(int)
df_test = process_dca_data(df_test, df_subs, df_sub_seg_expire, subs_creation_date)
df_test.shape

In [None]:
# Exploration: for US rows, success statistics per (days_between, funding_source).
gp = df_train[df_train['billing_country'] == 'US'].groupby(['days_between', 'funding_source'])
df_gp = gp.agg({'success': ['sum', 'count'], 'subsegment_id': ['nunique']})

# success_rate = successful rows per distinct subsegment, as a percentage.
_success_total = df_gp[('success', 'sum')]
_distinct_subs = df_gp[('subsegment_id', 'nunique')]
df_gp[('', 'success_rate')] = _success_total / _distinct_subs * 100

# Flatten the two-level column index, then give the aggregates readable names.
df_gp.columns = df_gp.columns.droplevel(0)
df_gp = df_gp.rename(columns={'sum': 'success', 'nunique': 'num_of_subs'})

# Show only groups with more than 10 distinct subsegments, best rate first.
_enough_subs = df_gp['num_of_subs'] > 10
df_gp[_enough_subs].sort_values(by=['success_rate', 'num_of_subs'], ascending=False)

In [None]:
#import for training
import numpy as np
from sklearn import cross_validation
from sklearn import metrics
from sklearn.model_selection import cross_val_score

from sklearn import cross_validation
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import GridSearchCV
# from spark_sklearn import GridSearchCV
from sklearn.preprocessing import label_binarize


# from src.web.utils import PreProcessing
from src.web.preprocessing import PreProcessing
from src.web.encoder import EnhancedLeaveOneOutEncoder


In [None]:
# Bin profile per (bin, month, day_of_month): lookup of the 'Max_99' column.
bin_profile_per_date_month = read_from(bin_profile_per_date_month_path)

# Stringify the bin and drop the '.0' left over from float formatting.
bin_profile_per_date_month['bin'] = (
    bin_profile_per_date_month['bin'].apply(str).str.replace('.0', '', regex=False)
)
_max_99_by_date_month = bin_profile_per_date_month.set_index(['bin', 'month', 'day_of_month'])['Max_99']
max_per_date_month_dict = _max_99_by_date_month.to_dict()

# Bin profile per (bin, day_of_month): same lookup without the month level.
bin_profile_per_day_of_month = read_from(bin_profile_per_day_of_month_path)

bin_profile_per_day_of_month['bin'] = (
    bin_profile_per_day_of_month['bin'].apply(str).str.replace('.0', '', regex=False)
)
_max_99_by_day_of_month = bin_profile_per_day_of_month.set_index(['bin', 'day_of_month'])['Max_99']
max_per_day_of_month_dict = _max_99_by_day_of_month.to_dict()

In [None]:
def _normalize_bank_name(raw):
    # Lower-case, drop spaces, abbreviate "nationalassociation", drop commas.
    return raw.lower().replace(' ', '').replace("nationalassociation", "n.a").replace(",", "")


def _normalize_card_category(raw):
    # Lower-case, drop spaces and commas.
    return raw.lower().replace(' ', '').replace(",", "")


# Bank profile per (bank_name, card_category, month, day_of_month).
bank_profile_per_date_month = read_from(bank_profile_per_date_month_path)
bank_profile_per_date_month['bank_name'] = (
    bank_profile_per_date_month['bank_name'].astype(str).apply(_normalize_bank_name)
)
bank_profile_per_date_month['card_category'] = (
    bank_profile_per_date_month['card_category'].astype(str).apply(_normalize_card_category)
)

_bank_max_99_by_date_month = bank_profile_per_date_month.set_index(
    ['bank_name', 'card_category', 'month', 'day_of_month'])['Max_99']
max_per_bank_card_date_month_dict = _bank_max_99_by_date_month.to_dict()

# Bank profile per (bank_name, card_category, day_of_month).
bank_profile_per_day_of_month = read_from(bank_profile_per_day_of_month_path)
bank_profile_per_day_of_month['bank_name'] = (
    bank_profile_per_day_of_month['bank_name'].astype(str).apply(_normalize_bank_name)
)
bank_profile_per_day_of_month['card_category'] = (
    bank_profile_per_day_of_month['card_category'].astype(str).apply(_normalize_card_category)
)

_bank_max_99_by_day_of_month = bank_profile_per_day_of_month.set_index(
    ['bank_name', 'card_category', 'day_of_month'])['Max_99']
max_per_bank_card_day_of_month_dict = _bank_max_99_by_day_of_month.to_dict()


In [None]:
# Display the normalised bank profile (per day of month) for inspection.
bank_profile_per_day_of_month

In [None]:
# success_per_date_month_dict = bin_profile_per_date_month.set_index(['bin', 'month', 'day_of_month'])['count'].T.to_dict()
# success_per_day_of_month_dict = bin_profile_per_day_of_month.set_index(['bin', 'day_of_month'])['count'].T.to_dict()

# Lookups of the 'count' column of the bank profiles, keyed like the Max_99
# lookups built from the same frames.
_count_by_date_month = bank_profile_per_date_month.set_index(
    ['bank_name', 'card_category', 'month', 'day_of_month'])['count']
success_per_bank_card_date_month_dict = _count_by_date_month.to_dict()

_count_by_day_of_month = bank_profile_per_day_of_month.set_index(
    ['bank_name', 'card_category', 'day_of_month'])['count']
success_per_bank_card_day_of_month_dict = _count_by_day_of_month.to_dict()

In [None]:
# Shuffle the training rows and renumber the index.
# The shuffle is seeded with the model's random_seed so the row order is
# reproducible between runs; if best_parameters carries no random_seed the
# lookup yields None and the shuffle is unseeded, as before.
df_train = df_train.sample(
    frac=1, random_state=best_parameters.get('random_seed')
).reset_index(drop=True)
df_train.shape

In [None]:
# Row count of the training frame (no re-balancing happens here, so both
# names hold the same value).
original_size = balanced_size = len(df_train)

# Despite the *_size names these are class shares (fractions of rows) for
# label values 0.0 and 1.0.
_label_share = df_train[label].value_counts(normalize=True)
fail_size = _label_share[0.0]
success_size = _label_share[1.0]

In [None]:
def _normalize_card_column(col):
    # Lower-case, strip spaces, then blank out cells that consist solely of
    # one of the "no data found" markers.
    return (col.str.lower().replace(' ', '', regex=True)
            .replace("nodatafound',value:'n/a", "", regex=False)
            .replace("nodatafound", "", regex=False)
            .replace("nodatafound'value:'n/a", "", regex=False))


# Apply the same card-field clean-up to all three data sets and flag rows
# where both card_brand and card_category ended up empty.
for _frame in (df_train, df_val, df_test):
    _frame[card_fields] = _frame[card_fields].astype(str).apply(_normalize_card_column)
    _frame['card_info_is_empty'] = (_frame['card_brand'] == '') & (_frame['card_category'] == '')

In [None]:
# Bin edges for segment_num.
segment_num_group = [-1, 1, 2, 3, 4, 5, 6, 7, 8, 15, 20, 25, 30, 40, 50, 70, 100, 150]

# Bucket segment_num in every data set; the interval label is stored as a
# string with any '.0' removed.
for _frame in (df_train, df_val, df_test):
    _buckets = pd.cut(_frame['segment_num'], segment_num_group)
    _frame['segment_num_group'] = _buckets.astype(str).str.replace('.0', '', regex=False)


In [None]:
# Bin edges for payment_amount_usd.
amount_group = [-1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 170, 190, 210, 250, 300, 400, 500, 1000, 1500, 2000, 5000, 10000, 20000]

# Bucket the USD amount in every data set; the interval label is stored as a
# string with any '.0' removed.
for _frame in (df_train, df_val, df_test):
    _buckets = pd.cut(_frame['payment_amount_usd'], amount_group)
    _frame['payment_amount_group'] = _buckets.astype(str).str.replace('.0', '', regex=False)


In [None]:
for _frame in (df_train, df_val, df_test):
    # First renewal: segment_num in the half-open range [0, 2).
    _segment = _frame['segment_num']
    _frame['is_first_renewal'] = (_segment >= 0) & (_segment < 2)

for _frame in (df_train, df_val, df_test):
    # Card brands starting with "american" get a fixed card_category.
    _is_amex = _frame['card_brand'].str.lower().str.startswith('american', na=False)
    _frame.loc[_is_amex, 'card_category'] = 'american_express'

In [None]:
# Snap near-month / leap-year durations to their canonical values in every
# data set: 28, 29, 31 -> 30; 366 -> 365; 731 -> 730.
for _frame in (df_train, df_val, df_test):
    _frame.loc[_frame['duration'].isin([28, 29, 31]), 'duration'] = 30
    _frame.loc[_frame['duration'] == 366, 'duration'] = 365
    _frame.loc[_frame['duration'] == 731, 'duration'] = 730

In [None]:
# Bin edges (in days) shared by the duration and the subscription-age buckets.
duration_group = [0, 3, 6, 9, 13, 17, 20, 25, 27, 33, 39, 43, 62, 70, 80, 88, 94, 100, 118, 125, 130, 146, 155, 176, 184, 200, 213, 230, 263, 300, 363, 368, 373,729, 733, 1000, 2000]

# Bucket 'duration' in every data set; labels are strings with '.0' removed.
for _frame in (df_train, df_val, df_test):
    _buckets = pd.cut(_frame['duration'], duration_group)
    _frame['sub_duration_group'] = _buckets.astype(str).str.replace('.0', '', regex=False)

# Same bucketing for 'sub_age'.
for _frame in (df_train, df_val, df_test):
    _buckets = pd.cut(_frame['sub_age'], duration_group)
    _frame['sub_age_group'] = _buckets.astype(str).str.replace('.0', '', regex=False)

In [None]:
from sklearn.model_selection import GridSearchCV
# from spark_sklearn import GridSearchCV

from src.web.preprocessing import PreProcessing
from src.web.preprocessing import make_pipeline
from sklearn.preprocessing import Imputer


# additional_fields = ['card_brand',  'segment_num', 'segment_num_group' ,'bank_name', 'duration', 'is_expired', 'renewal_window', 'payment_currency', 'funding_source', 'card_category', 'card_class', 'card_usage', 'renew_att_num', 'site_id', 'bin', 'merchant_number', 'billing_country', 'funding_source', "payment_service_id", 'day_of_month', 'failed_decline_type',  'failed_day_of_month', 'failed_response_code', 'payment_amount_usd', 'issuer_country',  'failed_response_message','days_between',  'transaction_date_in_string', 'cc_expiration_date', 'failed_attempt_date']

# Keep only the additional fields that are not already model features.
_model_features = set(features_cat + features_num)
additional_fields = [x for x in additional_fields if x not in _model_features]
fields = features_cat + features_num + additional_fields
features_num_encoded = features_num_encoded + features_num_bin_profile

features_dict_key = 'preprocessing__features_dict'

# NOTE(review): the two per-date-month lookups built earlier are replaced by
# empty dicts here, so only the per-day-of-month lookups carry data into the
# preprocessor. Confirm this is the intended way to switch them off.
max_per_bank_card_date_month_dict = {}
success_per_bank_card_date_month_dict = {}

# Everything the preprocessor receives, in one place.
features_dict = {
    'LABEL': label,
    'FIELDS': fields,
    'FEATURES_CAT': features_cat,
    'FEATURES_NUM': features_num,
    'FEATURES_ENCODED': features_encoded,
    'FEATURES_NUM_ENCODED': features_num_encoded,
    'FEATURES_NUM_CALCULATED': features_num_calculated,
    'FEATURES_FLOAT': features_float,
    'df_bin_profile': None,
    'df_decline_type': None,
    'FEATURES_NUM_BIN_PROFILE': features_num_bin_profile,
    'FEATURES_GROUPED': features_grouped,
    'ADDITIONAL_FIELDS': additional_fields,
    'group_dict': {
        "max_per_date_month_dict": max_per_date_month_dict,
        "max_per_day_of_month_dict": max_per_day_of_month_dict,
        "max_per_bank_card_date_month_dict": max_per_bank_card_date_month_dict,
        "max_per_bank_card_day_of_month_dict": max_per_bank_card_day_of_month_dict,
        "success_per_bank_card_date_month_dict": success_per_bank_card_date_month_dict,
        "success_per_bank_card_day_of_month_dict": success_per_bank_card_day_of_month_dict,
    },
}



In [None]:
'''Prepares training parameters'''

from catboost import CatBoostClassifier, Pool
import src.web.preprocessing
from src.web.preprocessing import PreProcessing
from src.web.train_util import *
from importlib import import_module
import sys

classifier = CatBoostClassifier

# Number of categorical columns: the plain categorical features plus the
# grouped (combined) features.
cat_features_len = len(features_cat) + len(features_grouped)
input_data = df_train

features_dict['use_cat_encoder'] = False
# Use the configurable `label` column (not a hard-coded 'success') so the
# preprocessor is fitted against the same target as features_dict['LABEL'].
_preProcessor = PreProcessing().fit(input_data, input_data[label], features_dict=features_dict)

_df_val = df_val
_df_test = df_test

if not scale_pos_weight:
    # Ratio of negative (0.0) to positive (1.0) class share.
    # NOTE(review): this is derived from the validation frame rather than
    # the training frame — confirm that is intentional.
    _class_share = _df_val[label].value_counts(normalize=True)
    scale_pos_weight = _class_share[0.0] / _class_share[1.0]
best_parameters['scale_pos_weight'] = scale_pos_weight

_x_eval = _preProcessor.transform(_df_val)
_y_eval = _df_val[label]


alg_name = 'catboostclassifier'


model_file = ''


# Positional indices 0..cat_features_len-1 are declared categorical.
# NOTE(review): assumes the preprocessor emits the categorical and grouped
# columns first — confirm against PreProcessing.transform.
cat_features = list(range(0, cat_features_len))

# Keys are prefixed with the algorithm name followed by a double underscore.
fit_params = {
    f"{alg_name}__verbose": True,
    f"{alg_name}__cat_features": cat_features,
    f"{alg_name}__plot": True,
    f"{alg_name}__eval_set": Pool(_x_eval, _y_eval, cat_features)
}


features_dict['fit_params'] = fit_params


In [None]:
'''
Print out all parameters.
'''

# Platform-injected and user-defined run parameters.
print('training_runner = ', training_runner)
print('project_id =', project_id)
print('training_id =', training_id)
print('metrics_feedback_url =', metrics_feedback_url)
print('model_destination =', model_destination)
print('label =', label)
print('training_data =', training_data)

# Input file lists and processor exclusions.
print('training_files =', training_files)
print('eval_files =', eval_files)
print('test_files =', test_files)
print('sub_seg_expire_files =', sub_seg_expire_files)
print('subs_files =', subs_files)
print('subs_creation_date_files =', subs_creation_date_files)
print('excluded_processors =', excluded_processors)

'''
Print out manipulated and aggregated features.
'''
print('\n============== training parameters & features ================ ')
print('input_features =', input_features)
print('additional_fields =', additional_fields)
print('tuned_parameters =', tuned_parameters)
print('best_parameters =', best_parameters)
print('features_cat =', features_cat)
print('features_float =', features_float)
print('features_num =', features_num)
print('features_grouped =', features_grouped)

# Previously this line printed features_encoded under a misspelled label,
# so features_num_encoded was never shown.
print('features_num_encoded =', features_num_encoded)
print('features_encoded =', features_encoded)
print('features_num_calculated =', features_num_calculated)


In [None]:
""" Train the model"""
# Local (non-platform) run: derive the next model version and name.
# NOTE(review): `version`, `model_name` and `output_dir` are only bound in
# this branch, yet `output_dir` is read unconditionally by the call below —
# confirm it is defined elsewhere when a training runner is supplied.
if training_runner is None:
    version = get_latest_version(model_id, model_type) + 1
    model_name = f"{model_id}.{version}"
    features_dict['model_name'] = model_name
    output_dir = None

# Fit the classifier; returns the trained estimator and a result dictionary.
clf, result_d = build_and_train(
    input_data, classifier, tuned_parameters, alg_name, model_file,
    best_param=best_parameters,
    features_dict=features_dict,
    test_data=_df_test,
    output_dir=output_dir,
)

print("result_dict: ", result_d)

In [None]:
'''
Output the model.
'''

if training_runner is None:
    # Local run: persist the model, register the preprocessing file, then
    # upload the artifact and record the model metadata.
    model_file, model_file_name = write_model(clf, model_name)

    preprocess_repo_path = handle_preprocessing_file(model_id, version)
    # NOTE(review): 'balanced_size' is filled with original_size — confirm
    # whether a separate balanced-size value was intended.
    size_desc = ", original size: %s (fail: %s, success: %s), balanced_size: %s" % (
        original_size, fail_size, success_size, original_size)
    # Change-log style description stored with the model. The literal used to
    # be a single-quoted string broken across two physical lines, which is not
    # valid Python; it is now assembled from adjacent literals with the same
    # text (the raw line break removed).
    desc = (
        'Global model. Fixing preprocessing bug. Include 530 as response_code. '
        'Only using day of month bank_profile. Start_date from 2019-10. '
        'Update bank_profile, test data: 2020-09, eval data: 2020-07.  '
        'Add day_of_week-funding_source. Remove day_of_week-bank_name.  '
        'Remove days_between-bank_name-card_category. Exclude site_id. '
        'Start Date: 2019-09. Test data: 2020-08. Eval data: 2020-04.  '
        'Not using max_per_bank_card_date_month_dict and success_per_bank_card_date_month_dict. '
        'Fix empty duration, segment_num and renewal_window. Remove site_id. '
        'Add failed_decline_type-is_expired. Test file: 2020-06. '
        'Replace all na/ no data found value with emptry string. '
        'Add failed_decline_type-card_brand. 2020-04 as eval data. '
        'Revert skip max_per_date_month_dict. '
        'Use failed_response_codes_from_previous_cal-funding_source. '
        'Skip success_per_date_month. Revert bank_profile to use data until 2020-03. '
        '2020-01 as eval data. Fix renewal_window on df_test. '
        'Add failed_response_codes_from_first_cal-funding-source. '
        'Remove failed_response_codes_from_first_cal-failed_response_codes_from_previous. '
        'Remove days_between_from_first_cal_failed_decline_type_from_first_cal. '
        'Training from 2020-01. Add failed_response_codes_from_previous_cal-days_between. '
        'Fix bin_profile_per_day_of_month_path file_path. '
        'Add fixed failed_response_messages_from_previous_cal-funding_source. '
        'Update decline type dict. Rearrange stop recurring decline type. '
        'Add card_brand-funding_source. Update decline_type dict 328:.  '
        'Use payment_amount_group only.  '
        'Add failed_decline_type-funding-source. Add payment_amount_group. '
        'Use txn_amount_bank_card_max_per_date_diff. '
        'Add failed_decline_type_from_first_cal. '
        'Add days_between-failed_response_codes_from_previous_cal '
        'Fix failed_response_codes_from_previous_cal. Add failed_response_message. '
        'Add failed_decline_type-failed_response_codes_from_previous_cal. '
        'Remove card_class. Add failed_decline_type-payment_service_id. '
        'Add payment_amount_usd.  Revert filter condition. '
        'Assign days_between_from_first_cal as days_between when na.  '
        'Add days_between_from_first_cal and group with renewal_window and funding_source. '
        'Using more specific failed_decline_type. '
        'Add days_between_from_first_cal-funding-source. '
        'Remove renew_att_num-days_between-renewal_window. '
        'Add days_between_from_first_cal-failed_decline_type_from_first_cal. '
        'Include first_calendar_attempt_date = na. Training data from 2020-06. '
        'Add is_expired. Revert bank_name-is_expired, bank_name-card_category-is_expired. '
        'Use success_bank_card_count_per_day_of_month instead of success_bin. '
        'Use days_between_from_first_cal-renewal_window only instead of days_between-renewal_window. '
        'Training data until 2020-05. Use 2020-04 as eval data. '
        'Eval data: 2020-04. Test data: 2020-06. '
        'Group segment_num 0 and 1 as the same group. Use scale_pos_weight. '
        'Add days_between_first_cal and renewal_window. Assign scale_pos_weight. '
        'Remove site_id.  Remove days_between-card_brand and funding_source. '
        'Add days_between-failed_decline_code. Include data with null first_cal_attempt. '
        'Add bank_name-is_expired, handle card_brand to be lower when grouping. '
        'Remove days_between_first_cal. With days_between-card_brand, reduce model_size_reg. '
        'Handle sub_duration_group in preprocessing. Minus segment_num. '
        'Using updated bank_profile, bin_profile to end of March. '
        'Use duration, sub_age, segment_num as numeric. '
        'Update sub_duration_group, days_between-card_brand,  duration(handle 28, 29,31 366 and 731), '
        'card_brand, renew_att_num,  sub_age.  '
        'Add days_between-failed_decline_type.  bank_card_max_per_date.  '
        'Add issuer_country-is_expired and bin-is_expired .is_expired to be True for all non na date increment. '
        'More individual features. Add more specific failed_decline_type.  '
        '{}_{}_for_calendar retry model,  eval_metric= BrierScore, '
        'with no date_increment, no payment amount and bin profile). {}'
    ).format(start_date, end_date, size_desc)

    # Hyper-parameters are stored separately from the evaluation metrics.
    hyper_params = result_d.pop('hyper_params', None)
    extended_att = {"preprocess_repo_path": preprocess_repo_path, "input_features": input_features}
    repo_path = upload_artifact(model_file_name)
    insert_model_info(model_id, version, repo_path, desc=desc, model_type=model_type, eval_metrics=json.dumps(result_d),
                      hyper_parameter=json.dumps(hyper_params), extended_att=json.dumps(extended_att), features_dict=features_dict, algorithm='CatBoostClassifier')

else:
    # Platform run: serialise the trained estimator to the injected destination.
    model_file = joblib.dump(clf, model_destination)

print('model_file generated: ', model_file)