# Simulate AWS DMS output

In my thesis, the input files are pretended to be written by AWS DMS. Instead of actually creating a relational database and extracting it using AWS DMS, I have chosen to directly create the output files.

In [2]:
import pandas as pd
import random
import os
from helpers.paths import PathMerger
from faker import Faker
from datetime import datetime, timedelta, date, timezone
from dateutil import tz
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

fake = Faker()

## Functions

In [10]:
def curr(offset=None):
    """
    Params:
      * (Optional) offset: a timedelta object.
      
    Creates a datime object in seconds-precision.
    """

    now = datetime.now(tz=timezone.utc)
    
    if offset:
        now = now + offset
    
    return datetime(now.year, now.month, now.day, now.hour, now.minute, now.second)

def datetime_to_dms_str(input_datetime):
    return input_datetime.strftime('%Y-%m-%d %H:%M:%S')


def generate_customers(first_registration:datetime):
    customers = []
    
    # Memorized value for the creation time of the previous customer.
    created_prev = None
    
    # A single hard-coded customer for reference with an id of 0
    customer = {
        'dms_timestamp': datetime_to_dms_str(curr()),
        'id': 1, 
        'username': 'janisourander@kamk.fi',
        # 'password': fake.md5(),
        'created': datetime(1970, 1, 15, 10, 0, 0),
        'modified': datetime(1970, 2, 20, 12, 34, 56),
    }
    
    customers.append(customer)

    for i in range(9):
        
        if created_prev is None:
            # The first ever customer was created at 13:37, 15th of Jan 2020.
            created = first_registration
        else:
            # Each other customer were created n second later than the previous.
            created = created_prev + timedelta(seconds=random.randrange(1, 3600))
        
        customer = {
            'dms_timestamp': datetime_to_dms_str(curr()),
            'id': len(customers), 
            'username': fake.unique.ascii_email(),
             # 'password': fake.md5(),
            'created': created,
            'modified': created + timedelta(seconds=random.randrange(0, 3600 * 24 * 365)),
        }
        
        customers.append(customer)
        
        # Set current created datetime as previous
        created_prev = created

    return pd.DataFrame.from_records(customers)
    
    
def generate_device_models():
    devices = [
        {
            'dms_timestamp': datetime_to_dms_str(curr()),
            'id': 1, 
            'release_date': date(2010, 5, 15),
            'name': 'Super Gadget 100',
            'color': 'Red',
            'description': 'lorem ipsum',
            'created': datetime(2010, 3, 21, 12, 0, 1),
            'modified': datetime(2010, 3, 21, 12, 0, 1),
        },
        {
            'dms_timestamp': datetime_to_dms_str(curr()),
            'id': 2, 
            'release_date': date(2010, 5, 15),
            'name': 'Super Gadget 100',
            'color': 'Black',
            'description': 'lorem ipsum',
            'created': datetime(2010, 3, 21, 12, 0, 2),
            'modified': datetime(2010, 3, 21, 12, 0, 2),
        },
        {
            'dms_timestamp': datetime_to_dms_str(curr()),
            'id': 3, 
            'release_date': date(2010, 11, 1),
            'name': 'Super Gadget 100',
            'color': 'Pink',
            'description': 'lorem ipsum',
            'created': datetime(2010, 8, 5, 7, 0, 0),
            'modified': datetime(2010, 8, 5, 7, 0, 0),
        },
        {
            'dms_timestamp': datetime_to_dms_str(curr()),
            'id': 4, 
            'release_date': date(2018, 5, 13),
            'name': 'Super Gadget 200',
            'color': 'White',
            'description': 'lorem ipsum',
            'created': datetime(2018, 3, 20, 12, 1, 1),
            'modified': datetime(2018, 3, 20, 12, 1, 1),
        },
    ]
    
    return pd.DataFrame.from_records(devices)


def generate_devices(customers: pd.DataFrame, devices: pd.DataFrame):

    # List of unique model numbers
    unique_device_models = list(devices.id.unique())
    
    # Container
    devices = []
    # i = 0

    for (_, cust) in customers.iterrows():
        
        # Fetch the creation date of user and add some
        created = cust.created + timedelta(seconds=random.randrange(1, 3600))
        
        # Create 1 or 2 devices per user by random
        for x in range(random.randrange(1, 3)):
            device = {
                'dms_timestamp': datetime_to_dms_str(curr()),
                'id': len(devices) + 1, 
                'customer_fk': cust.id,
                'model_fk': random.choice(unique_device_models),
                'serial_number': fake.ssn(),
                'created': created,
                'modified': created + timedelta(seconds=random.randrange(1, 3600)),
            }

            devices.append(device)
    
    return pd.DataFrame.from_records(devices)

        
def generate_customer_details(customers: pd.DataFrame):
    
    # Container
    details = []
    
    for (_, cust) in customers.iterrows():
        detail = {
            'dms_timestamp': datetime_to_dms_str(curr()),
            'id': len(details) + 1,
            'customer_fk': cust.id,
            'birthday': fake.date_of_birth(),
            'language': fake.language_code(),
            'street_address': fake.street_address(),
            'postal_code': fake.postalcode(),
            'city': fake.city(),
            'country': fake.country_code(),
            'phone_number': fake.phone_number(),
            'created': cust.created,
            'modified': cust.created,
        }
        
        details.append(detail)
        
    return pd.DataFrame.from_records(details)

## Create Directories

In [11]:
# Table paths
table_paths = {}

# Map of combinations to create: (db, table, data_generator_function)
tables = [
    ('customers', 'customers'), 
    ('customers', 'customer_details'),
    ('devices', 'device_models'),
    ('devices', 'devices')
]

for db, table in tables:
    dt = f'{db}.{table}'
    sp = PathMerger(db, table).staging
    dp = os.path.join(sp, 'LOAD00000001.parquet')
    
    # Create dirs
    if not os.path.exists(sp):
        os.makedirs(sp)
        
    table_paths[dt] = dp

## Generate Datasets

### Customers

In [12]:
# Generate
df_c = generate_customers(first_registration=datetime(2020, 1, 15, 13, 37, 0))

# Write
df_c.to_parquet(table_paths['customers.customers'])

# Show
df_c

Unnamed: 0,dms_timestamp,id,username,created,modified
0,2021-09-11 11:29:58,1,janisourander@kamk.fi,1970-01-15 10:00:00,1970-02-20 12:34:56
1,2021-09-11 11:29:58,1,wilsonkeith@hotmail.com,2020-01-15 13:37:00,2020-12-24 16:26:37
2,2021-09-11 11:29:58,2,brian08@ellis.biz,2020-01-15 14:25:31,2020-12-16 16:57:23
3,2021-09-11 11:29:58,3,vargasalexander@gmail.com,2020-01-15 14:53:36,2020-02-03 11:15:41
4,2021-09-11 11:29:58,4,davidmiller@pierce.info,2020-01-15 15:02:14,2020-06-27 19:40:24
5,2021-09-11 11:29:58,5,carolhughes@yahoo.com,2020-01-15 15:21:33,2020-11-08 17:03:15
6,2021-09-11 11:29:58,6,abrooks@gmail.com,2020-01-15 16:20:07,2020-03-17 13:01:56
7,2021-09-11 11:29:58,7,kevinromero@mclaughlin.com,2020-01-15 17:19:35,2020-05-03 09:07:44
8,2021-09-11 11:29:58,8,jamesrivera@villanueva-brewer.net,2020-01-15 18:12:36,2020-08-19 19:16:28
9,2021-09-11 11:29:58,9,rebecca89@roy.info,2020-01-15 18:29:07,2020-06-13 07:16:17


### Device Models

In [13]:
# Generate
df_dm = generate_device_models()

# Write
df_dm.to_parquet(table_paths['devices.device_models'])

# Show
df_dm

Unnamed: 0,dms_timestamp,id,release_date,name,color,description,created,modified
0,2021-09-11 11:30:04,1,2010-05-15,Super Gadget 100,Red,lorem ipsum,2010-03-21 12:00:01,2010-03-21 12:00:01
1,2021-09-11 11:30:04,2,2010-05-15,Super Gadget 100,Black,lorem ipsum,2010-03-21 12:00:02,2010-03-21 12:00:02
2,2021-09-11 11:30:04,3,2010-11-01,Super Gadget 100,Pink,lorem ipsum,2010-08-05 07:00:00,2010-08-05 07:00:00
3,2021-09-11 11:30:04,4,2018-05-13,Super Gadget 200,White,lorem ipsum,2018-03-20 12:01:01,2018-03-20 12:01:01


### Devices

In [14]:
# Generate devices based on customers and device models
df_d = generate_devices(df_c, df_dm)

# Write
df_d.to_parquet(table_paths['devices.devices'])

# Show
df_d

Unnamed: 0,dms_timestamp,id,customer_fk,model_fk,serial_number,created,modified
0,2021-09-11 11:30:13,1,1,3,754-42-2330,1970-01-15 10:06:21,1970-01-15 10:39:07
1,2021-09-11 11:30:13,2,1,3,886-67-5966,2020-01-15 13:50:19,2020-01-15 13:56:20
2,2021-09-11 11:30:13,3,2,1,230-28-0710,2020-01-15 15:02:22,2020-01-15 15:23:42
3,2021-09-11 11:30:13,4,2,1,726-54-0610,2020-01-15 15:02:22,2020-01-15 15:31:21
4,2021-09-11 11:30:13,5,3,4,387-99-5212,2020-01-15 15:52:03,2020-01-15 16:10:12
5,2021-09-11 11:30:13,6,4,1,133-78-4165,2020-01-15 16:01:32,2020-01-15 16:35:56
6,2021-09-11 11:30:13,7,5,2,443-39-2038,2020-01-15 15:36:19,2020-01-15 16:17:17
7,2021-09-11 11:30:13,8,5,1,370-05-7782,2020-01-15 15:36:19,2020-01-15 15:57:16
8,2021-09-11 11:30:13,9,6,3,798-82-6441,2020-01-15 17:00:17,2020-01-15 17:20:12
9,2021-09-11 11:30:13,10,6,1,372-86-3075,2020-01-15 17:00:17,2020-01-15 17:27:39


### Customer_details

In [15]:
# Generate customer details. One per customer.
df_cd = generate_customer_details(df_c)

# Write
df_cd.to_parquet(table_paths['customers.customer_details'])

df_cd

Unnamed: 0,dms_timestamp,id,customer_fk,birthday,language,street_address,postal_code,city,country,phone_number,created,modified
0,2021-09-11 11:30:27,1,1,1986-12-07,oc,807 Contreras Center,64641,Joeside,BD,923.864.4079x69991,1970-01-15 10:00:00,1970-01-15 10:00:00
1,2021-09-11 11:30:27,2,1,2018-02-17,ne,848 Lawrence Shore Apt. 394,92082,Juliehaven,VE,608.855.2686x402,2020-01-15 13:37:00,2020-01-15 13:37:00
2,2021-09-11 11:30:27,3,2,1929-11-18,ast,248 Oconnell Ranch Apt. 023,94900,Port Michael,TO,(828)927-0165,2020-01-15 14:25:31,2020-01-15 14:25:31
3,2021-09-11 11:30:27,4,3,1915-02-07,gu,697 Delgado Keys,33199,Lake Teresa,SY,001-767-452-1963x425,2020-01-15 14:53:36,2020-01-15 14:53:36
4,2021-09-11 11:30:27,5,4,2016-12-28,xh,200 Jennifer Lodge,19134,Johnnyview,SC,+1-174-300-0262x78541,2020-01-15 15:02:14,2020-01-15 15:02:14
5,2021-09-11 11:30:27,6,5,1909-11-11,hne,4062 Perez Inlet,95532,Valerieview,PE,626.617.8780x34224,2020-01-15 15:21:33,2020-01-15 15:21:33
6,2021-09-11 11:30:27,7,6,1948-11-12,nl,331 Young Lodge,67256,North Carly,KP,674.285.0881x38462,2020-01-15 16:20:07,2020-01-15 16:20:07
7,2021-09-11 11:30:27,8,7,1981-07-11,gd,665 Patricia Manors,53769,Fergusonburgh,BS,(351)779-9163x16329,2020-01-15 17:19:35,2020-01-15 17:19:35
8,2021-09-11 11:30:27,9,8,1948-06-18,dv,6634 Edward Street Suite 011,72919,East Joseton,MG,001-973-395-3383,2020-01-15 18:12:36,2020-01-15 18:12:36
9,2021-09-11 11:30:27,10,9,1989-08-05,mi,270 Jason Gateway Suite 026,2222,Thomasstad,SI,100.102.7295x02122,2020-01-15 18:29:07,2020-01-15 18:29:07


## List resulting files

In [16]:
# Maximum lenght of a path
max_w = max([len(x[1]) for x in table_paths.items()]) + 2

# UNIX epoch start
epoch = datetime(1970, 1, 1, tzinfo=timezone.utc)

for k, f in table_paths.items():

    s = os.stat(f)
    f_path = '.' + os.sep + f 
    f_size = s.st_size // 1024 
    f_mod = (
        (epoch + timedelta(seconds=s.st_mtime))
        .astimezone(tz.gettz('Europe/Helsinki'))
        .strftime('%y-%m-%d %H:%M:%S')
    )
    
    print(f"{f_mod} | {f_path:<{max_w}} | {f_size:>2d} KB")

21-09-11 14:29:58 | .\S3\staging\dms\abc\customers\customers\LOAD00000001.parquet        |  4 KB
21-09-11 14:30:27 | .\S3\staging\dms\abc\customers\customer_details\LOAD00000001.parquet |  9 KB
21-09-11 14:30:04 | .\S3\staging\dms\abc\devices\device_models\LOAD00000001.parquet      |  6 KB
21-09-11 14:30:13 | .\S3\staging\dms\abc\devices\devices\LOAD00000001.parquet            |  6 KB


## Preview only first lines

In [17]:
from IPython.display import HTML

panda_tables = [df_c, df_cd, df_dm, df_d]

for pt in panda_tables:
    display(HTML(pt.head(1).to_html(index=False)))

dms_timestamp,id,username,created,modified
2021-09-11 11:29:58,1,janisourander@kamk.fi,1970-01-15 10:00:00,1970-02-20 12:34:56


dms_timestamp,id,customer_fk,birthday,language,street_address,postal_code,city,country,phone_number,created,modified
2021-09-11 11:30:27,1,1,1986-12-07,oc,807 Contreras Center,64641,Joeside,BD,923.864.4079x69991,1970-01-15 10:00:00,1970-01-15 10:00:00


dms_timestamp,id,release_date,name,color,description,created,modified
2021-09-11 11:30:04,1,2010-05-15,Super Gadget 100,Red,lorem ipsum,2010-03-21 12:00:01,2010-03-21 12:00:01


dms_timestamp,id,customer_fk,model_fk,serial_number,created,modified
2021-09-11 11:30:13,1,1,3,754-42-2330,1970-01-15 10:06:21,1970-01-15 10:39:07
