# Simulate AWS DMS output

In my thesis, the input files are assumed to have been written by AWS DMS (Database Migration Service). Instead of actually creating a relational database and extracting it with AWS DMS, I have chosen to create the output files directly.

In [1]:
import pandas as pd
import random
import os
from faker import Faker
from datetime import datetime, timedelta
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Fixed seed so that "Restart Kernel -> Run All" regenerates the same dataset.
# Both generators are seeded because the notebook draws from the stdlib
# `random` module as well as from Faker.
SEED = 1337
random.seed(SEED)

fake = Faker()
Faker.seed(SEED)

In [2]:
def generate_customers(first_registration:datetime, n_customers:int=9):
    """Generate a pandas DataFrame of fake customer rows.

    Row 0 is a fixed reference customer: its username and its ``created`` /
    ``modified`` timestamps are hard-coded, only its password hash is random.
    It is followed by ``n_customers`` generated rows with ids
    ``1..n_customers``.

    Args:
        first_registration: ``created`` timestamp of the first generated
            customer (id 1). Every later customer is created 1-3599 seconds
            after the previous one.
        n_customers: Number of generated customers (the reference customer is
            not counted). The default of 9 yields the original ten-row table.

    Returns:
        A ``pd.DataFrame`` with the columns ``id``, ``username``,
        ``password``, ``created`` and ``modified``.

    Raises:
        ValueError: If ``n_customers`` is negative.
    """
    if n_customers < 0:
        raise ValueError("n_customers must be non-negative")

    # A single hard-coded customer for reference.
    # NOTE(review): the year 1290 is outside the range pandas can represent
    # as a nanosecond Timestamp; presumably a deliberate edge case - confirm.
    customers = [
        {
            "id": 0,
            "username": 'janisourander@kamk.fi',
            "password": fake.md5(),
            "created": datetime(1290, 1, 15, 0, 0, 0),
            "modified": datetime(1970, 2, 20, 12, 34, 56),
        }
    ]

    # Creation time of the customer currently being generated; starts at
    # `first_registration` and only moves forward.
    created = first_registration

    for i in range(1, n_customers + 1):
        if i > 1:
            # Every customer after the first registers 1-3599 seconds later
            # than the previous one.
            created = created + timedelta(seconds=random.randrange(1, 3600))

        customers.append({
            "id": i,
            "username": fake.unique.ascii_email(),
            "password": fake.md5(),
            "created": created,
            # Modified at some point within one year after creation.
            "modified": created + timedelta(seconds=random.randrange(0, 3600 * 24 * 365)),
        })

    return pd.DataFrame.from_records(customers)

In [3]:
# Settings for path prefixing
src_container = 'S3'
src_prefix = 'staging'
src_system = 'alpha'
src_db = 'customers'
src_table = 'customers'
tgt_dir = os.path.join(src_container, src_prefix, src_system, src_db, src_table)
tgt_filename = 'LOAD' + '1'.zfill(8) + '.parquet'

# Create directory to contain all the Datasets
if not os.path.exists(tgt_dir):
    os.makedirs(tgt_dir)

# Generate the Dataset - Customers

df_c = generate_customers(first_registration=datetime(2020, 1, 15, 13, 37, 0))
output_path = os.path.join(tgt_dir, tgt_filename)

print('Writing to:', output_path)

Writing to: S3\staging\alpha\customers\customers\LOAD00000001.parquet


In [4]:
# Persist the generated customers DataFrame as a Parquet file at `output_path`.
df_c.to_parquet(output_path)

In [5]:
# Display the generated customers table as the cell output.
df_c

Unnamed: 0,id,username,password,created,modified
0,0,janisourander@kamk.fi,97abaa87d5127504399cf73ef35e3ac8,1290-01-15 00:00:00,1970-02-20 12:34:56
1,1,fbryant@hotmail.com,d55ed13ff10e1f44fb917828756d8b7c,2020-01-15 13:37:00,2020-08-16 04:10:56
2,2,sandracoffey@phelps-harris.org,398633c285f98e4f848a362af0608fd8,2020-01-15 14:07:24,2020-12-12 10:26:01
3,3,abigailnavarro@hotmail.com,7904848a525464cc2e25117527e96d89,2020-01-15 15:04:43,2020-08-01 10:13:53
4,4,melissa69@glover-hernandez.org,5cf62ef34d196ec0e08a4aaf756fabe9,2020-01-15 15:37:40,2020-11-02 01:18:47
5,5,heathercampbell@garcia-hernandez.org,da3272ebe29b5fb83fe7edba5ebe86dd,2020-01-15 16:11:12,2020-04-05 22:23:03
6,6,wramirez@yahoo.com,1197f14a4ade6e7a573d6d7181e1cc1c,2020-01-15 17:08:15,2020-09-06 09:16:54
7,7,lauriemoore@moreno.net,d52e62add46aadb25e39375185f17b05,2020-01-15 17:40:42,2020-02-22 11:11:14
8,8,gparker@yahoo.com,f8049ff99d476ad65b13a325ebbc7774,2020-01-15 18:30:16,2020-07-13 13:40:10
9,9,donaldcunningham@hotmail.com,1ea7b70cab7b98569effbb547cf31445,2020-01-15 19:12:11,2020-10-14 04:18:29


In [9]:
# Inspect the `tzinfo` attribute of a datetime built without a timezone
# argument (the same value used for the reference customer's "modified" field).
datetime(1970, 2, 20, 12, 34, 56).tzinfo