# Simulate AWS DMS output

In my thesis, the input files are treated as if they had been written by AWS DMS (Database Migration Service). Instead of actually creating a relational database and extracting it with AWS DMS, I have chosen to create the output files directly.

In [33]:
import pandas as pd
import random
import os
from faker import Faker
from datetime import datetime, timedelta, date
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Seed both random sources so that Restart & Run All regenerates identical datasets.
SEED = 42
random.seed(SEED)
Faker.seed(SEED)

fake = Faker()

## Functions

In [188]:
def generate_customers(first_registration: datetime, n_random_customers: int = 9) -> pd.DataFrame:
    """Build the customers table: one fixed reference row plus random customers.

    The row with id 0 is hard-coded (fixed username and fixed ``created`` /
    ``modified`` timestamps, random password hash). It is followed by
    ``n_random_customers`` Faker-generated customers whose ids continue from 1.

    Parameters
    ----------
    first_registration : datetime
        ``created`` timestamp of the first random customer (id 1). Every later
        customer is created 1-3599 seconds after the previous one.
    n_random_customers : int, default 9
        Number of random customers generated after the reference row.

    Returns
    -------
    pd.DataFrame
        One row per customer with the columns ``id``, ``username``,
        ``password``, ``created`` and ``modified``.
    """
    # A single hard-coded customer for reference with an id of 0.
    # NOTE(review): 'created' (year 1290) precedes the Unix epoch and 'modified'
    # by centuries - presumably a deliberate edge case for downstream timestamp
    # handling; confirm before "fixing" it.
    customers = [
        {
            'id': 0,
            'username': 'janisourander@kamk.fi',
            'password': fake.md5(),
            'created': datetime(1290, 1, 15, 0, 0, 0),
            'modified': datetime(1970, 2, 20, 12, 34, 56),
        }
    ]

    # Creation time of the most recently generated random customer.
    created = None

    for _ in range(n_random_customers):
        if created is None:
            # The first random customer registers exactly at `first_registration`.
            created = first_registration
        else:
            # Every other customer was created n seconds later than the previous one.
            created = created + timedelta(seconds=random.randrange(1, 3600))

        customers.append({
            'id': len(customers),
            'username': fake.unique.ascii_email(),
            'password': fake.md5(),
            'created': created,
            # Last modification happens within one year after creation.
            'modified': created + timedelta(seconds=random.randrange(0, 3600 * 24 * 365)),
        })

    return pd.DataFrame.from_records(customers)
    
    
def generate_device_models():
    """Return the fixed catalogue of four device models as a DataFrame.

    Nothing here is random: the ids are 0-3, every description is
    'lorem ipsum', and each model's 'modified' timestamp equals its
    'created' timestamp.
    """
    # One tuple per model: (release_date, name, color, created).
    # The id is the tuple's position in this list.
    catalogue = [
        (date(2010, 5, 15), 'Super Gadget 100', 'Red', datetime(2010, 3, 21, 12, 0, 1)),
        (date(2010, 5, 15), 'Super Gadget 100', 'Black', datetime(2010, 3, 21, 12, 0, 2)),
        (date(2010, 11, 1), 'Super Gadget 100', 'Pink', datetime(2010, 8, 5, 7, 0, 0)),
        (date(2018, 5, 13), 'Super Gadget 200', 'White', datetime(2018, 3, 20, 12, 1, 1)),
    ]

    records = [
        {
            'id': model_id,
            'release_date': released,
            'name': model_name,
            'color': color,
            'description': 'lorem ipsum',
            'created': stamp,
            'modified': stamp,
        }
        for model_id, (released, model_name, color, stamp) in enumerate(catalogue)
    ]

    return pd.DataFrame.from_records(records)


def generate_devices(customers: pd.DataFrame, devices: pd.DataFrame) -> pd.DataFrame:
    """Generate one or two owned devices for every customer.

    Parameters
    ----------
    customers : pd.DataFrame
        Customer rows; the ``id`` and ``created`` columns are read.
    devices : pd.DataFrame
        Device-model rows; only the ``id`` column is read, as the pool of
        values for ``device_fk``.

    Returns
    -------
    pd.DataFrame
        One row per generated device with the columns ``id``, ``customer_fk``,
        ``device_fk``, ``serial_number``, ``created`` and ``modified``.
        All devices of one customer share the same ``created`` timestamp.
    """
    # Unique device-model ids to pick from.
    model_ids = list(devices.id.unique())

    # Collected device records. Kept under a separate name so the `devices`
    # parameter is not shadowed.
    device_rows = []

    for _idx, cust in customers.iterrows():

        # Devices are registered up to an hour after the customer was created.
        created = cust.created + timedelta(seconds=random.randrange(1, 3600))

        # Create 1 or 2 devices per customer at random.
        for _ in range(random.randrange(1, 3)):
            device_rows.append({
                'id': len(device_rows),
                'customer_fk': cust.id,
                'device_fk': random.choice(model_ids),
                # A Faker SSN string stands in for the serial number.
                'serial_number': fake.ssn(),
                'created': created,
                'modified': created + timedelta(seconds=random.randrange(1, 3600)),
            })

    return pd.DataFrame.from_records(device_rows)

        
def generate_customer_details(customers: pd.DataFrame):
    """Create one Faker-populated detail record for every customer row.

    Each record gets a sequential ``id`` starting at 0, points back to its
    customer through ``customer_fk``, and copies the customer's ``created``
    timestamp into both ``created`` and ``modified``. All remaining fields
    are random Faker values.
    """
    records = [
        {
            'id': position,
            'customer_fk': customer.id,
            'birthday': fake.date_of_birth(),
            'language': fake.language_code(),
            'street_address': fake.street_address(),
            'postal_code': fake.postalcode(),
            'city': fake.city(),
            'country': fake.country_code(),
            'phone_number': fake.phone_number(),
            'created': customer.created,
            'modified': customer.created,
        }
        for position, (_, customer) in enumerate(customers.iterrows())
    ]

    return pd.DataFrame.from_records(records)

## Create Directories

In [40]:
# Settings for path prefixing
container = 'S3'
prefix = 'staging'
tool = 'dms'
system = 'company_rds'

# Mapping of '<db>.<table>' -> path of that table's Parquet file
table_paths = {}

# Combinations to create: (db, table)
tables = [
    ('customers', 'customers'),
    ('customers', 'customer_details'),
    ('devices', 'device_models'),
    ('devices', 'devices')
]

for db, table in tables:
    # Generate the rdbms-style key and the path to the file.
    # The file name mimics the first full-load file written by AWS DMS.
    dt = f'{db}.{table}'
    dp = os.path.join(container, prefix, tool, system, db, table, 'LOAD00000001.parquet')

    # Register the path under its '<db>.<table>' key
    table_paths[dt] = dp

    # Create the directory tree. exist_ok=True makes re-running the cell a
    # no-op and avoids a separate exists-check before creating.
    os.makedirs(os.path.dirname(dp), exist_ok=True)

## Generate Datasets

### Customers

In [190]:
# Generate the customers table; the first random customer registers at 13:37 on 15 Jan 2020
df_c = generate_customers(first_registration=datetime(2020, 1, 15, 13, 37, 0))

# Write (currently disabled): uncomment to persist the frame as Parquet
# df_c.to_parquet(table_paths['customers.customers'])

# Show the generated frame
df_c

Unnamed: 0,id,username,password,created,modified
0,0,janisourander@kamk.fi,75b773c6b8e52daed135a8c810d0c0b8,1290-01-15 00:00:00,1970-02-20 12:34:56
1,1,david04@gmail.com,8fb9c6f1b9403e60bfdfc98f5b70f9ee,2020-01-15 13:37:00,2020-06-08 18:21:03
2,2,jonesdorothy@hancock.com,bda9a6830504e441e83c1bbf1646e6e2,2020-01-15 14:27:47,2020-01-26 19:00:04
3,3,kruegerkatherine@gmail.com,137420430f094d6f592703af872e2196,2020-01-15 15:01:39,2020-04-14 05:26:27
4,4,wagnerruben@phelps.com,283fd66e2a5388c531d6ce5093f78e62,2020-01-15 15:22:13,2020-06-08 20:37:15
5,5,marc10@hotmail.com,ee4897d952dc6b0162dd2791b8f03aef,2020-01-15 15:42:17,2020-11-06 11:28:29
6,6,john98@hotmail.com,90772adc4ffb8ebb1fe1b2d50aaf1e6b,2020-01-15 16:32:25,2020-11-17 15:20:25
7,7,amy10@hotmail.com,83313021a9c43e2c93d884d2982be182,2020-01-15 16:55:36,2020-07-05 17:49:37
8,8,franciskelsey@fletcher-martin.com,f394f1cdd91231ebd498b66da3848355,2020-01-15 17:19:53,2020-04-25 18:21:29
9,9,ingramvictor@brown.com,1371cfc677d4c3c77978a2b2a30c5f49,2020-01-15 18:08:38,2020-05-10 01:40:07


### Device Models

In [191]:
# Generate the fixed device-model catalogue
df_dm = generate_device_models()

# Write (currently disabled): uncomment to persist the frame as Parquet.
# NOTE: the key used to be 'customers.customers' (copy-paste slip), which would
# have written the device models over the customers file.
# df_dm.to_parquet(table_paths['devices.device_models'])

# Show the generated frame
df_dm

Unnamed: 0,id,release_date,name,color,description,created,modified
0,0,2010-05-15,Super Gadget 100,Red,lorem ipsum,2010-03-21 12:00:01,2010-03-21 12:00:01
1,1,2010-05-15,Super Gadget 100,Black,lorem ipsum,2010-03-21 12:00:02,2010-03-21 12:00:02
2,2,2010-11-01,Super Gadget 100,Pink,lorem ipsum,2010-08-05 07:00:00,2010-08-05 07:00:00
3,3,2018-05-13,Super Gadget 200,White,lorem ipsum,2018-03-20 12:01:01,2018-03-20 12:01:01


### Devices

In [192]:
# Generate 1-2 devices per customer, each referencing a randomly chosen device model
df_d = generate_devices(df_c, df_dm)

# Show the generated frame
df_d

Unnamed: 0,id,customer_fk,device_fk,serial_number,created,modified
0,0,0,0,497-17-1134,1290-01-15 00:47:11,1290-01-15 01:26:53
1,1,0,2,305-03-5680,1290-01-15 00:47:11,1290-01-15 01:33:09
2,2,1,2,325-31-8248,2020-01-15 13:53:00,2020-01-15 13:55:32
3,3,2,1,532-29-9859,2020-01-15 14:41:38,2020-01-15 14:43:29
4,4,3,2,317-74-7481,2020-01-15 15:07:00,2020-01-15 15:26:11
5,5,4,3,752-33-0215,2020-01-15 16:02:12,2020-01-15 16:47:50
6,6,4,3,260-77-5487,2020-01-15 16:02:12,2020-01-15 16:50:27
7,7,5,2,537-38-8480,2020-01-15 15:54:32,2020-01-15 16:43:37
8,8,5,1,608-25-0178,2020-01-15 15:54:32,2020-01-15 15:57:09
9,9,6,1,843-51-4830,2020-01-15 17:29:09,2020-01-15 18:19:01


### Customer_details

In [193]:
# Generate one detail row per customer
df_cd = generate_customer_details(df_c)

# Show the generated frame
df_cd

Unnamed: 0,id,customer_fk,birthday,language,street_address,postal_code,city,country,phone_number,created,modified
0,0,0,1932-01-05,ko,095 Ramsey Springs,51617,Gibsonfort,FJ,(063)073-1117x746,1290-01-15 00:00:00,1290-01-15 00:00:00
1,1,1,1923-08-02,se,82904 Karen Land Apt. 044,89198,Robertmouth,AZ,(354)274-5376x9060,2020-01-15 13:37:00,2020-01-15 13:37:00
2,2,2,2012-03-23,li,963 Stewart Branch,76794,North Charlesshire,TR,643-691-3159x1268,2020-01-15 14:27:47,2020-01-15 14:27:47
3,3,3,1946-01-03,tn,958 Long Summit Suite 041,75461,Tylerbury,CR,001-324-466-0029x22069,2020-01-15 15:01:39,2020-01-15 15:01:39
4,4,4,1984-06-27,szl,8448 Lisa Rapid,13253,Port Michaelside,JO,444.295.7187x659,2020-01-15 15:22:13,2020-01-15 15:22:13
5,5,5,2018-10-16,mi,07600 Moreno Ridge Apt. 373,26143,Johnsonmouth,KW,236.010.6833x58767,2020-01-15 15:42:17,2020-01-15 15:42:17
6,6,6,2001-11-06,it,18797 Miles Points,79351,New Sandraberg,CG,001-858-901-9920,2020-01-15 16:32:25,2020-01-15 16:32:25
7,7,7,1908-04-11,ia,67339 Nicholas Viaduct,31415,Davidbury,SK,(726)162-6995x2842,2020-01-15 16:55:36,2020-01-15 16:55:36
8,8,8,1929-08-24,fr,888 Stephen Skyway,61204,Taylorberg,DJ,(754)363-4594x74974,2020-01-15 17:19:53,2020-01-15 17:19:53
9,9,9,1975-05-07,mn,815 Victor Ford,67057,Adammouth,MY,952-464-9166x59204,2020-01-15 18:08:38,2020-01-15 18:08:38
