In [None]:
import os
import sys
import boto3
import datetime
import pandas as pd

from pathlib import Path
from dotenv import load_dotenv

from shared import load_transactions, save_transactions_day, download_bucket_folder

# NOTE(review): presumably reads a local .env file so that settings needed by
# the helpers imported from `shared` (e.g. S3 access) are available as
# environment variables — confirm against `shared`.
load_dotenv()


In [None]:
# --- Notebook configuration ---------------------------------------------

# Reference day (YYYY-MM-DD): later cells compute day offsets relative to it
# and pass it to save_transactions_day().
START_DATE = "2020-04-01"

# Local working directory; the folder names below are appended to it verbatim,
# so it has to end with a path separator.
local_prefix = "./data/"

# Input folders: passed to download_bucket_folder() and read back from
# local_prefix + <folder>.
TX_DIR = "archive"
FRAUD_TX_DIR = "fraud"

# Output folder (below local_prefix) for the merged data.
OUTPUT_DIR = "merged/"

In [None]:
# Fetch both input folders (archived transactions first, then fraud
# transactions) from S3.
for bucket_folder in (TX_DIR, FRAUD_TX_DIR):
    download_bucket_folder(bucket_folder)


In [None]:
# Load the archived transactions from the local copy of the TX_DIR folder.
input_dir = local_prefix + TX_DIR

# os.listdir() returns entries in arbitrary order; sorting the paths gives
# load_transactions() the same file order on every run and every machine.
files = sorted(os.path.join(input_dir, f) for f in os.listdir(input_dir))
tx_df = load_transactions(files)


In [None]:
# Load the fraud transactions from the local copy of the FRAUD_TX_DIR folder.
input_dir = local_prefix + FRAUD_TX_DIR

# os.listdir() returns entries in arbitrary order; sorting the paths gives
# load_transactions() the same file order on every run and every machine.
files = sorted(os.path.join(input_dir, f) for f in os.listdir(input_dir))
fraud_df = load_transactions(files)


In [None]:
# Sort both frames by TRANSACTION_ID, drop rows that are exact duplicates and
# give each frame a clean 0..n-1 index (the label-based assignment further
# down relies on tx_df having a unique index).
tx_df = (
    tx_df
    .sort_values('TRANSACTION_ID')
    .drop_duplicates()
    .reset_index(drop=True)
)

fraud_df = (
    fraud_df
    .sort_values('TRANSACTION_ID')
    .drop_duplicates()
    .reset_index(drop=True)
)

# merge fraud data into archived data
#
# Vectorised replacement for a row-by-row iterrows() loop: that loop rescanned
# all of tx_df for every fraud row and, because iterrows() upcasts each row to
# a single dtype, could end up comparing transaction ids as floats.
#
# Build a TRANSACTION_ID -> TX_FRAUD_SCENARIO lookup:
#   * rows without a TRANSACTION_ID are skipped — they can never identify a
#     transaction, and isin() would otherwise pair NaN with NaN;
#   * keep='last' means that if one id is listed with several scenarios, the
#     last row in id order wins (later rows used to overwrite earlier ones).
scenario_by_txid = (
    fraud_df
    .dropna(subset=['TRANSACTION_ID'])
    .drop_duplicates(subset='TRANSACTION_ID', keep='last')
    .set_index('TRANSACTION_ID')['TX_FRAUD_SCENARIO']
)

# Rows of tx_df whose id appears in the fraud data.
is_fraud = tx_df['TRANSACTION_ID'].isin(scenario_by_txid.index)

# Leave tx_df untouched when nothing matches.
if is_fraud.any():
    tx_df.loc[is_fraud, 'TX_FRAUD_SCENARIO'] = (
        tx_df.loc[is_fraud, 'TRANSACTION_ID'].map(scenario_by_txid)
    )
    tx_df.loc[is_fraud, 'TX_FRAUD_PREDICTION'] = 1
    tx_df.loc[is_fraud, 'TX_FRAUD_PROBABILITY'] = 1.0

# sort chronological; mergesort is stable, so transactions sharing a timestamp
# stay in TRANSACTION_ID order instead of an arbitrary one.
tx_df = (
    tx_df
    .sort_values('TX_DATETIME', kind='mergesort')
    .reset_index(drop=True)
)


In [None]:
# add columns that make sorting, grouping etc easier
start_date = datetime.datetime.strptime(START_DATE, "%Y-%m-%d")

# Whole seconds since the Unix epoch. Floor-dividing the time difference by a
# one-second Timedelta stays in integer arithmetic and does not depend on the
# unit the datetime column is stored in (a raw numeric view divided by 1e9 is
# only correct for nanosecond storage).
unix_epoch = pd.Timestamp('1970-01-01')
tx_df['TX_TIME_SECONDS'] = (tx_df['TX_DATETIME'] - unix_epoch) // pd.Timedelta(seconds=1)

# Whole days elapsed since START_DATE (vectorised .dt accessor instead of a
# per-row lambda).
tx_df['TX_TIME_DAYS'] = (tx_df['TX_DATETIME'] - start_date).dt.days


In [None]:
# Write the merged transactions, grouped by day, below the local output folder.
merged_output_dir = local_prefix + OUTPUT_DIR
save_transactions_day(tx_df, START_DATE, output_dir=merged_output_dir)
