In [None]:
import os
import sys
import boto3
import datetime
import pandas as pd
import cloudpickle as cp

from pathlib import Path
from dotenv import load_dotenv

from sklearn import model_selection
from sklearn.linear_model import LogisticRegression

from shared import load_csv_from_dir, save_transactions_day

# Read key/value pairs from a .env file (if python-dotenv finds one) into the
# process environment before any later cell runs.
# NOTE(review): presumably this supplies the AWS credentials/region for boto3 —
# confirm, since no cell in view reads os.environ directly.
load_dotenv()


In [None]:
# --- Locations -------------------------------------------------------------
# Paths are assembled by plain string concatenation (local_prefix + <DIR>),
# so local_prefix must keep its trailing slash.
local_prefix = './data/'
INPUT_DIR = 'training'
OUTPUT_DIR = 'models/'

# --- Candidate feature sets --------------------------------------------------
# Widest set: raw transaction fields plus every customer and terminal
# rolling-window aggregate (1, 7 and 30 day windows).
input_features_all = [
    'TX_AMOUNT',
    'TX_DURING_WEEKEND',
    'TX_DURING_NIGHT',
    'CUSTOMER_ID_NB_TX_1DAY_WINDOW',
    'CUSTOMER_ID_AVG_AMOUNT_1DAY_WINDOW',
    'CUSTOMER_ID_NB_TX_7DAY_WINDOW',
    'CUSTOMER_ID_AVG_AMOUNT_7DAY_WINDOW',
    'CUSTOMER_ID_NB_TX_30DAY_WINDOW',
    'CUSTOMER_ID_AVG_AMOUNT_30DAY_WINDOW',
    'TERMINAL_ID_NB_TX_1DAY_WINDOW',
    'TERMINAL_ID_RISK_1DAY_WINDOW',
    'TERMINAL_ID_NB_TX_7DAY_WINDOW',
    'TERMINAL_ID_RISK_7DAY_WINDOW',
    'TERMINAL_ID_NB_TX_30DAY_WINDOW',
    'TERMINAL_ID_RISK_30DAY_WINDOW',
]

# Middle set: raw fields, the terminal identifier, the customer aggregates
# and a single terminal aggregate.
input_features_medium = [
    'TX_AMOUNT',
    'TX_DURING_WEEKEND',
    'TX_DURING_NIGHT',
    'TERMINAL_ID',
    'CUSTOMER_ID_NB_TX_1DAY_WINDOW',
    'CUSTOMER_ID_AVG_AMOUNT_1DAY_WINDOW',
    'CUSTOMER_ID_NB_TX_7DAY_WINDOW',
    'CUSTOMER_ID_AVG_AMOUNT_7DAY_WINDOW',
    'CUSTOMER_ID_NB_TX_30DAY_WINDOW',
    'CUSTOMER_ID_AVG_AMOUNT_30DAY_WINDOW',
    'TERMINAL_ID_NB_TX_1DAY_WINDOW',
]

# Smallest set: only fields that need no rolling-window computation.
input_features_minimum = [
    'TX_AMOUNT',
    'TX_DURING_WEEKEND',
    'TX_DURING_NIGHT',
    'TERMINAL_ID',
]

# --- Target ------------------------------------------------------------------
# Column handed to the classifier as the label when the model is fitted.
# NOTE(review): the name reads like a model output rather than a ground-truth
# label — confirm this is the intended training target.
output_feature = "TX_FRAUD_PREDICTION"


In [None]:
# Knobs for this training run.

# Date range handed to the CSV loader.
# NOTE(review): whether END_DATE is inclusive depends on load_csv_from_dir — confirm.
START_DATE, END_DATE = '2020-04-01', '2020-04-30'

# Passed to LogisticRegression as max_iter.
training_iterations = 2000

# Feature set to train on; this is a reference to the chosen list, not a copy.
input_features = input_features_minimum

In [None]:
# Read the merged transaction data for the configured date range and carve it
# into a training frame and a held-out test frame.
input_dir = f"{local_prefix}{INPUT_DIR}"

tx_df = load_csv_from_dir(input_dir, START_DATE, END_DATE)

# Replace the TX_DATETIME column with its parsed datetime form.
tx_df['TX_DATETIME'] = pd.to_datetime(tx_df['TX_DATETIME'])

# No test_size is given, so scikit-learn's default proportion applies; the
# fixed random_state makes the shuffle (and therefore the split) repeatable.
train_df, test_df = model_selection.train_test_split(
    tx_df,
    random_state=43,
)

In [None]:
# Train a logistic-regression classifier on the selected feature columns.
train_features = train_df[input_features]
train_labels = train_df[output_feature]

# max_iter caps the number of solver iterations.
lr = LogisticRegression(max_iter=training_iterations)

# Kept as the cell's last expression so the fitted estimator is still displayed.
lr.fit(train_features, train_labels)

In [None]:
# Persist the fitted classifier to disk.
output_dir = local_prefix + OUTPUT_DIR

# exist_ok=True removes the check-then-create race of a separate
# os.path.exists() test and is a no-op when the folder is already there.
os.makedirs(output_dir, exist_ok=True)

# Current time since the epoch in units of 10 microseconds; embedded in the
# file name so each run leaves its own snapshot next to "model_latest.pkl".
ts = int(datetime.datetime.now().timestamp() * 100000)

# Write the same estimator twice: once under a timestamped name (history) and
# once under a fixed name that is overwritten on every run.
# Each file is opened in a `with` block so the handle is flushed and closed
# deterministically instead of being left to the garbage collector.
for model_filename in (f"model_{ts}.pkl", "model_latest.pkl"):
    # os.path.join yields the same path as before and stays correct even if
    # OUTPUT_DIR is later configured without a trailing slash.
    with open(os.path.join(output_dir, model_filename), "wb") as model_file:
        cp.dump(lr, model_file)