In [None]:
import os
import time
import datetime
import pandas as pd
import cloudpickle as cp

from sklearn.tree import DecisionTreeClassifier

from simulator.shared import load_transactions
from simulator.training import get_train_test_set


In [None]:
# Where the training data is read from: a directory of files, or a single .csv path.
DIR_INPUT = "./data/simulated/training"
# Where trained models are written (note the trailing slash).
DIR_OUTPUT = "./data/models/"


In [None]:
# Name of the label column the classifier is trained to predict.
output_feature = "TX_FRAUD"

# Columns fed to the classifier, in the order they are passed to fit/predict.
input_features = [
    # columns describing the transaction itself
    "TX_AMOUNT",
    "TX_DURING_WEEKEND",
    "TX_DURING_NIGHT",
    # CUSTOMER_ID_* columns over 1 / 7 / 30 day windows
    "CUSTOMER_ID_NB_TX_1DAY_WINDOW",
    "CUSTOMER_ID_AVG_AMOUNT_1DAY_WINDOW",
    "CUSTOMER_ID_NB_TX_7DAY_WINDOW",
    "CUSTOMER_ID_AVG_AMOUNT_7DAY_WINDOW",
    "CUSTOMER_ID_NB_TX_30DAY_WINDOW",
    "CUSTOMER_ID_AVG_AMOUNT_30DAY_WINDOW",
    # TERMINAL_ID_* columns over 1 / 7 / 30 day windows
    "TERMINAL_ID_NB_TX_1DAY_WINDOW",
    "TERMINAL_ID_RISK_1DAY_WINDOW",
    "TERMINAL_ID_NB_TX_7DAY_WINDOW",
    "TERMINAL_ID_RISK_7DAY_WINDOW",
    "TERMINAL_ID_NB_TX_30DAY_WINDOW",
    "TERMINAL_ID_RISK_30DAY_WINDOW",
]


In [None]:
# Load a single file or all files in a directory.
if DIR_INPUT.endswith('.csv'):
    # DIR_INPUT points directly at one CSV file.
    files = [DIR_INPUT]
else:
    # Load all training files generated by the simulator.
    # - sorted(): os.listdir() returns entries in arbitrary order, so sort to
    #   make the load order reproducible across runs and machines.
    # - os.path.isfile(): skip sub-directories (e.g. `.ipynb_checkpoints`)
    #   so that only regular files are handed to the loader.
    files = [
        path
        for path in (
            os.path.join(DIR_INPUT, name) for name in sorted(os.listdir(DIR_INPUT))
        )
        if os.path.isfile(path)
    ]

# Load the training data from the collected paths.
tx_df = load_transactions(files)


In [None]:
# Anchor the split at the earliest transaction timestamp in the loaded data.
start_date_training = tx_df['TX_DATETIME'].min()
# The same anchor rendered as a YYYY-MM-DD string.
start_date = start_date_training.strftime("%Y-%m-%d")

# Carve the transactions into a training frame and a test frame.
# NOTE(review): the delta_* arguments look like period lengths (train / delay / test),
# presumably in days — confirm against simulator.training.get_train_test_set.
train_df, test_df = get_train_test_set(
    tx_df,
    start_date_training,
    delta_train=7,
    delta_delay=7,
    delta_test=7,
)


In [None]:
# Build a shallow decision tree: depth is capped at 2 to keep the tree
# interpretable, and random_state is pinned to 0 so reruns are reproducible.
classifier = DecisionTreeClassifier(random_state=0, max_depth=2)

# Train on the selected input columns against the label column.
X_train = train_df[input_features]
y_train = train_df[output_feature]
classifier.fit(X_train, y_train)


In [None]:
# DEBUG: sanity-check the classifier on the test rows labelled as fraud.
# .copy() makes fraud_df an independent frame, so adding the prediction
# columns below does not trigger pandas' SettingWithCopyWarning and cannot
# write through to test_df. The label column is referenced via output_feature
# to stay consistent with the column used for training.
fraud_df = test_df.loc[test_df[output_feature] == 1].copy()

# Hard class prediction for each selected row.
fraud_df['TX_FRAUD_PREDICTION'] = classifier.predict(fraud_df[input_features])

# Column index 1 of predict_proba — presumably the probability of the fraud
# class (label 1); verify against classifier.classes_.
fraud_df['TX_FRAUD_PROBABILITY'] = classifier.predict_proba(
    fraud_df[input_features])[:, 1]


In [None]:
# Save the model/classifier.
# exist_ok=True creates the directory if needed without the check-then-create race.
os.makedirs(DIR_OUTPUT, exist_ok=True)

# Numeric suffix derived from the current time (epoch seconds * 100000),
# used to give each saved model a distinct file name.
ts = int(datetime.datetime.now().timestamp() * 100000)

# Serialize the classifier twice: once under the timestamped name and once
# as "model_latest.pkl". os.path.join works whether or not DIR_OUTPUT ends
# with a path separator, and the `with` block guarantees each file handle is
# flushed and closed even if serialization fails.
for model_name in (f"model_{ts}.pkl", "model_latest.pkl"):
    with open(os.path.join(DIR_OUTPUT, model_name), "wb") as model_file:
        cp.dump(classifier, model_file)
