In [None]:
import os
import sys
import boto3
import pandas as pd

from pathlib import Path
from dotenv import load_dotenv

from simulator.shared import load_transactions, read_from_pkl

# Read key=value pairs from a local .env file (if one exists) into the process
# environment before anything else runs.
# NOTE(review): presumably this supplies the AWS credentials/region that boto3
# picks up from the environment — confirm against the project's .env template.
load_dotenv()


In [None]:
# Base directory that later cells read the 'fraud' and 'archive' data from.
# Paths are built by plain string concatenation, so keep the trailing slash.
local_prefix = './data/'

In [None]:
def download_folder(key, bucket_name='fsi-fraud-detection', local_prefix='./data/'):
    """Download every S3 object whose key starts with ``key`` to local disk.

    Object keys are mirrored below ``local_prefix``: an object stored as
    ``archive/2023/tx.csv`` ends up at ``<local_prefix>archive/2023/tx.csv``.
    Missing local directories are created on the fly.

    Args:
        key: Key prefix selecting the objects to download (e.g. ``'archive'``).
        bucket_name: Name of the S3 bucket to read from.
        local_prefix: String prepended verbatim to each object key to build the
            local path, so it should end with a path separator.
    """
    s3 = boto3.resource('s3')
    bucket = s3.Bucket(bucket_name)

    for obj in bucket.objects.filter(Prefix=key):
        target = Path(f"{local_prefix}{obj.key}")

        # Keys ending in '/' are "folder" placeholder objects, not files.
        # Mirror them as directories; asking S3 to download one would try to
        # write a file onto a directory path and fail.
        if obj.key.endswith('/'):
            target.mkdir(parents=True, exist_ok=True)
            continue

        # create nested directory structure
        target.parent.mkdir(parents=True, exist_ok=True)
        # save file with full path locally
        bucket.download_file(obj.key, str(target))

In [None]:
# get all the data from S3
# Pass local_prefix explicitly so the download destination always matches the
# directory the loading cell reads from, rather than relying on
# download_folder's default happening to hold the same value.
for s3_prefix in ('archive', 'fraud'):
    download_folder(s3_prefix, local_prefix=local_prefix)

In [None]:
# load the data
def _load_sorted_transactions(subdir):
    """Load every entry of ``local_prefix + subdir`` into one DataFrame.

    The paths are handed to ``load_transactions``; the result is returned
    sorted by ``TRANSACTION_ID`` with a fresh 0..n-1 index.
    """
    input_dir = local_prefix + subdir
    # os.listdir returns entries in arbitrary order; sort so that repeated
    # runs feed load_transactions the same sequence of paths.
    files = [os.path.join(input_dir, f) for f in sorted(os.listdir(input_dir))]
    return (
        load_transactions(files)
        .sort_values('TRANSACTION_ID')
        .reset_index(drop=True)
    )


fraud_df = _load_sorted_transactions('fraud')
tx_df = _load_sorted_transactions('archive')

In [None]:
# merge fraud data into the archived data
# Vectorised: one membership test and one lookup for the whole frame instead
# of scanning tx_df once per fraud row.
#
# Build a TRANSACTION_ID -> TX_FRAUD_SCENARIO lookup. When an id occurs more
# than once in fraud_df the last occurrence wins, i.e. the same outcome as
# applying the fraud rows one after another in order.
fraud_scenarios = (
    fraud_df
    .dropna(subset=['TRANSACTION_ID'])  # a missing id must never match anything
    .drop_duplicates('TRANSACTION_ID', keep='last')
    .set_index('TRANSACTION_ID')['TX_FRAUD_SCENARIO']
)
is_fraud = tx_df['TRANSACTION_ID'].isin(fraud_scenarios.index)

# Leave tx_df completely untouched when no transaction is flagged.
if is_fraud.any():
    matched_ids = tx_df.loc[is_fraud, 'TRANSACTION_ID']
    # .to_numpy() assigns positionally, so the result does not depend on
    # tx_df having a unique index.
    tx_df.loc[is_fraud, 'TX_FRAUD_SCENARIO'] = matched_ids.map(fraud_scenarios).to_numpy()
    tx_df.loc[is_fraud, 'TX_FRAUD_PREDICTION'] = 1
    tx_df.loc[is_fraud, 'TX_FRAUD_PROBABILITY'] = 1.0


In [None]:
DIR_OUTPUT = "./data/training/"

# Make sure the output directory exists. exist_ok=True makes this a no-op when
# it is already there and avoids the check-then-create race of testing
# os.path.exists() first.
os.makedirs(DIR_OUTPUT, exist_ok=True)

# Persist the merged transactions without the positional index column.
tx_df.to_csv(DIR_OUTPUT + 'merged.csv', index=False)