# Exercise 3: Parallel ETL

In [1]:
%load_ext sql

In [2]:
from time import time
import configparser
import matplotlib.pyplot as plt
import pandas as pd

# STEP 1: Get the params of the created redshift cluster 
- We need:
    - The redshift cluster <font color='red'>endpoint</font>
    - The <font color='red'>IAM role ARN</font> that give access to Redshift to read from S3

In [3]:
config = configparser.ConfigParser()
config.read_file(open('dwh.cfg'))
KEY=config.get('AWS','key')
SECRET= config.get('AWS','secret')

DWH_DB= config.get("DWH","DWH_DB")
DWH_DB_USER= config.get("DWH","DWH_DB_USER")
DWH_DB_PASSWORD= config.get("DWH","DWH_DB_PASSWORD")
DWH_PORT = config.get("DWH","DWH_PORT")

In [6]:
# FILL IN THE REDSHIFT ENPOINT HERE
# e.g. DWH_ENDPOINT="redshift-cluster-1.csmamz5zxmle.us-west-2.redshift.amazonaws.com" 
DWH_ENDPOINT="redshift-cluster-de-assignment.ceoqlgitrobq.us-east-1.redshift.amazonaws.com" 
    
#FILL IN THE IAM ROLE ARN you got in step 2.2 of the previous exercise
#e.g DWH_ROLE_ARN="arn:aws:iam::988332130976:role/dwhRole"
DWH_ROLE_ARN="arn:aws:iam::206790211102:role/myRedshiftRole"

# STEP 2: Connect to the Redshift Cluster

In [25]:
conn_string="postgresql://{}:{}@{}:{}/{}".format(DWH_DB_USER, DWH_DB_PASSWORD, DWH_ENDPOINT, DWH_PORT,DWH_DB)
print(conn_string)
%sql $conn_string

postgresql://awsuser:Passw0rd@redshift-cluster-de-assignment.ceoqlgitrobq.us-east-1.redshift.amazonaws.com:5439/dev


In [12]:
import boto3

s3 = boto3.resource('s3',
                       region_name="us-east-1",
                       aws_access_key_id=KEY,
                       aws_secret_access_key=SECRET
                     )

sampleDbBucket =  s3.Bucket("my-demo-data-redshift")

for obj in sampleDbBucket.objects.filter():
    print(obj)

s3.ObjectSummary(bucket_name='my-demo-data-redshift', key='recurly_accounts.csv')


# STEP 3: Create Tables

In [29]:
%%sql 
DROP TABLE IF EXISTS "accounts";
CREATE TABLE "accounts" ( 
	batch_id             bigint    ENCODE RAW,
	date_id              integer    ENCODE RAW,
	date_id_ps           integer    ENCODE RAW,
	month_id             integer    ENCODE RAW,
	month_id_ps          integer    ENCODE az64,
	handle_batch_id      bigint    ENCODE az64,
	id                   varchar(1024)    ENCODE RAW,
	code                 varchar(65535)    ENCODE lzo,
	org_id               bigint    ENCODE RAW,
	parent_account_id    varchar(65535)    ENCODE lzo,
	bill_to              varchar(65535)    ENCODE lzo,
	"state"              varchar(65535)    ENCODE lzo,
	username             varchar(65535)    ENCODE lzo,
	email                varchar(65535)    ENCODE lzo,
	email_id             integer    ENCODE RAW,
	preferred_locale     varchar(65535)    ENCODE lzo,
	first_name           varchar(65535)    ENCODE lzo,
	last_name            varchar(65535)    ENCODE lzo,
	company              varchar(65535)    ENCODE lzo,
	country_code         varchar(2)    ENCODE RAW,
	custom_fields        varchar    ENCODE zstd,
	has_live_subscription boolean    ENCODE RAW,
	has_active_subscription boolean    ENCODE RAW,
	has_future_subscription boolean    ENCODE RAW,
	has_canceled_subscription boolean    ENCODE RAW,
	has_paused_subscription boolean    ENCODE RAW,
	has_past_due_invoice boolean    ENCODE RAW,
	created_at           timestamp    ENCODE RAW,
	updated_at           timestamp    ENCODE RAW,
	deleted_at           timestamp    ENCODE RAW,
	created_at_ps        timestamp    ENCODE RAW,
	updated_at_ps        timestamp    ENCODE RAW,
	deleted_at_ps        timestamp    ENCODE RAW,
	sales_channel        integer    ENCODE RAW,
	sales_rep            varchar(1024)    ENCODE RAW,
	renewed_by           integer    ENCODE RAW,
	original_orgid       bigint    ENCODE RAW,
	sale_credit_sharing  decimal(18,4)    ENCODE RAW,
	cs_credit_sharing    decimal(18,4)    ENCODE RAW,
	partner_credit_sharing decimal(18,4)    ENCODE RAW,
	ecom_credit_sharing  decimal(18,4)    ENCODE RAW,
	credit_sharing       varchar    ENCODE zstd
 )   DISTSTYLE AUTO;

 * postgresql://awsuser:***@redshift-cluster-de-assignment.ceoqlgitrobq.us-east-1.redshift.amazonaws.com:5439/dev
Done.
Done.


[]

In [32]:
%sql select * from accounts limit 10

 * postgresql://awsuser:***@redshift-cluster-de-assignment.ceoqlgitrobq.us-east-1.redshift.amazonaws.com:5439/dev
10 rows affected.


batch_id,date_id,date_id_ps,month_id,month_id_ps,handle_batch_id,id,code,org_id,parent_account_id,bill_to,state,username,email,email_id,preferred_locale,first_name,last_name,company,country_code,custom_fields,has_live_subscription,has_active_subscription,has_future_subscription,has_canceled_subscription,has_paused_subscription,has_past_due_invoice,created_at,updated_at,deleted_at,created_at_ps,updated_at_ps,deleted_at_ps,sales_channel,sales_rep,renewed_by,original_orgid,sale_credit_sharing,cs_credit_sharing,partner_credit_sharing,ecom_credit_sharing,credit_sharing
1302,20200326,20200326,202003,202003,1605,mlapmucae71s,organization-6081,6081,,self,active,nielsen.com,email_385455,385455,,,,6081-Organization tanmay.nanda,,[],False,False,False,False,False,False,2020-03-26 17:02:34,2020-10-14 06:57:27,,2020-03-26 09:02:34,2020-10-13 22:57:27,,3,,,,0.0,0.0,0.0,0.0,
1302,20200316,20200316,202003,202003,1605,mjcu7skgf86m,organization-76683,76683,,self,active,izen.me,email_451477,451477,,,,76683-Organization joe,,[],False,False,False,False,False,False,2020-03-16 22:03:33,2020-10-14 06:56:44,,2020-03-16 14:03:33,2020-10-13 22:56:44,,3,,,,0.0,0.0,0.0,0.0,
1302,20200318,20200318,202003,202003,1605,mjnshlfgmy1l,organization-76851,76851,,self,active,addymuruginjagi@gmail.com,email_645780,645780,,,,76851-Organization addymuruginjagi,KE,[],False,False,False,False,False,False,2020-03-18 10:53:27,2020-10-14 06:56:57,,2020-03-18 02:53:27,2020-10-13 22:56:57,,3,,,,0.0,0.0,0.0,0.0,
1302,20200324,20200323,202003,202003,1605,mktmpl0d9z6u,organization-78519,78519,,self,active,loan1@mailinator.com,email_919642,919642,,,,11111,,[],False,False,False,False,False,False,2020-03-24 07:35:54,2020-10-14 06:57:18,,2020-03-23 23:35:54,2020-10-13 22:57:18,,3,,,,0.0,0.0,0.0,0.0,
1302,20200305,20200305,202003,202003,1605,mh49lovo2i2r,organization-73369,73369,,self,active,eksource.com,email_375936,375936,,,,73369-Organization sam,,[],False,False,False,False,False,False,2020-03-05 15:05:29,2020-10-14 06:55:23,,2020-03-05 07:05:29,2020-10-13 22:55:23,,3,,,,0.0,0.0,0.0,0.0,
1302,20200310,20200310,202003,202003,1605,mi3d5lks5pg6,organization-2104,2104,,self,active,tylergprada@gmail.com,email_379692,379692,,,,2104-Convention Data Services,,[],False,False,False,False,False,False,2020-03-10 13:07:45,2020-10-14 06:55:46,,2020-03-10 05:07:45,2020-10-13 22:55:46,,3,,,,0.0,0.0,0.0,0.0,
1302,20200302,20200301,202003,202003,1605,mgfj117iwi3l,organization-46126,46126,,self,active,edd.fr,email_555391,555391,,,,Aday,FR,[],True,True,False,False,False,False,2020-03-02 03:53:47,2022-03-07 02:00:24,,2020-03-01 19:53:47,2022-03-06 18:00:24,,3,,,,0.0,0.0,0.0,0.0,
1302,20200326,20200326,202003,202003,1605,ml9bypfbwk4e,organization-79163,79163,,self,active,a2wmaster.com.br,email_867261,867261,,,,79163-Organization keila,,[],False,False,False,False,False,False,2020-03-26 12:24:10,2020-10-14 06:57:27,,2020-03-26 04:24:10,2020-10-13 22:57:27,,3,,,,0.0,0.0,0.0,0.0,
1302,20200327,20200327,202003,202003,1605,mlg1njg0rhmc,organization-79429,79429,,self,active,kms-technology.com,email_405807,405807,,,,79429-Organization loantran floating licenses 30,,[],False,False,False,False,False,False,2020-03-27 10:58:51,2020-10-14 06:57:30,,2020-03-27 02:58:51,2020-10-13 22:57:30,,3,,,,0.0,0.0,0.0,0.0,
1302,20200302,20200301,202003,202003,1605,mgflcdnwz8k7,organization-22468,22468,,self,active,loantest1@test.com,email_476136,476136,,,,22468-Organization loantest1,,[],False,False,False,False,False,False,2020-03-02 04:06:45,2020-10-14 06:54:48,,2020-03-01 20:06:45,2020-10-13 22:54:48,,3,,,,0.0,0.0,0.0,0.0,


# STEP 4: Load Data into the cluster

In [31]:
%%time
qry = """
    copy accounts from 's3://my-demo-data-redshift/recurly_accounts.csv'
    credentials 'aws_iam_role={}'
    csv 
    IGNOREHEADER 1;
""".format(DWH_ROLE_ARN)

%sql $qry

 * postgresql://awsuser:***@redshift-cluster-de-assignment.ceoqlgitrobq.us-east-1.redshift.amazonaws.com:5439/dev
Done.
CPU times: total: 0 ns
Wall time: 2.04 s


[]

In [23]:
%%sql
SELECT errors.tbl, *
FROM stl_load_errors errors

 * postgresql://awsuser:***@redshift-cluster-de-assignment.ceoqlgitrobq.us-east-1.redshift.amazonaws.com:5439/dev
2 rows affected.


tbl,userid,slice,tbl_1,starttime,session,query,filename,line_number,colname,type,col_length,position,raw_line,raw_field_value,err_code,err_reason,is_partial,start_offset
108075,100,0,108075,2022-04-19 14:44:43.790270,1075309855,28068,s3://my-demo-data-redshift/recurly_accounts.csv,1,batch_id,int8,0,2,"""batch_id"",""date_id"",""date_id_ps"",""month_id"",""month_id_ps"",""handle_batch_id"",""id"",""code"",""org_id"",""parent_account_id"",""bill_to"",""state"",""username"",""email"",""email_id"",""preferred_locale"",""first_name"",""last_name"",""company"",""country_code"",""custom_fields"",""has_live_subscription"",""has_active_subscription"",""has_future_subscription"",""has_canceled_subscription"",""has_paused_subscription"",""has_past_due_invoice"",""created_at"",""updated_at"",""deleted_at"",""created_at_ps"",""updated_at_ps"",""deleted_at_ps"",""sales_channel"",""sales_rep"",""renewed_by"",""original_orgid"",""sale_credit_sharing"",""cs_credit_sharing"",""partner_credit_sharing"",""ecom_credit_sharing"",""credit_sharing""",batch_id,1207,"Invalid digit, Value 'b', Pos 0, Type: Long",0,0
108077,100,1,108077,2022-04-19 14:45:23.172046,1075309855,28073,s3://my-demo-data-redshift/recurly_accounts.csv,1,batch_id,int8,0,2,"""batch_id"",""date_id"",""date_id_ps"",""month_id"",""month_id_ps"",""handle_batch_id"",""id"",""code"",""org_id"",""parent_account_id"",""bill_to"",""state"",""username"",""email"",""email_id"",""preferred_locale"",""first_name"",""last_name"",""company"",""country_code"",""custom_fields"",""has_live_subscription"",""has_active_subscription"",""has_future_subscription"",""has_canceled_subscription"",""has_paused_subscription"",""has_past_due_invoice"",""created_at"",""updated_at"",""deleted_at"",""created_at_ps"",""updated_at_ps"",""deleted_at_ps"",""sales_channel"",""sales_rep"",""renewed_by"",""original_orgid"",""sale_credit_sharing"",""cs_credit_sharing"",""partner_credit_sharing"",""ecom_credit_sharing"",""credit_sharing""",batch_id,1207,"Invalid digit, Value 'b', Pos 0, Type: Long",0,0
