In [93]:
# boto3 is used to interact with AWS through python environment
# pandas for data manipulations and transformation on file
import boto3
import pandas as pd 
import psycopg2
import json

In [94]:
import configparser

# Parse the INI-style project configuration (AWS credentials and Redshift settings).
config = configparser.ConfigParser()
# Open the file with a context manager so the handle is closed as soon as parsing
# finishes; read_file on an explicitly opened file still fails loudly if it is missing.
with open('config/configuration files.config') as config_file:
    config.read_file(config_file)


In [95]:
# Sanity check: show the database name parsed from the [DWH] section.
config.get("DWH", "DWH_DB")

'aws-etl-redshift'

In [96]:
def _dwh_setting(option):
    """Return the string value of ``option`` from the [DWH] section of ``config``."""
    return config.get('DWH', option)


# AWS credentials come from the [AWS] section; avoid printing or displaying them.
KEY = config.get('AWS', 'KEY')
SECRET = config.get('AWS', 'SECRET')

# Redshift Serverless settings, read in the same order as they are declared below.
DWH_WORKGROUP_NAME = _dwh_setting('DWH_WORKGROUP_NAME')
DWH_NAMESPACE_NAME = _dwh_setting('DWH_NAMESPACE_NAME')
DWH_DB = _dwh_setting('DWH_DB')
DWH_DB_USER = _dwh_setting('DWH_DB_USER')
DWH_DB_PASSWORD = _dwh_setting('DWH_DB_PASSWORD')
DWH_PORT = config.getint('DWH', 'DWH_PORT')  # the only setting parsed as an integer
DWH_IAM_ROLE_NAME = _dwh_setting('DWH_IAM_ROLE_NAME')
REGION = _dwh_setting('REGION')

In [81]:
# Region and credentials shared by every boto3 handle created in this cell.
_aws_connection_kwargs = dict(
    region_name=REGION,
    aws_access_key_id=KEY,
    aws_secret_access_key=SECRET,
)

# High-level resource objects for EC2 and S3.
ec2 = boto3.resource('ec2', **_aws_connection_kwargs)
s3 = boto3.resource('s3', **_aws_connection_kwargs)

# Low-level clients for IAM and Redshift Serverless.
iam = boto3.client('iam', **_aws_connection_kwargs)
redshift = boto3.client('redshift-serverless', **_aws_connection_kwargs)

In [82]:
# Inspect the S3 bucket that holds the compressed CSV files.
bucket_name = 'ridwanclouds-bucket'
bucket = s3.Bucket(bucket_name)

# Filtering on an empty prefix matches every object, so this collects all keys.
log_data_file = [
    object_summary.key
    for object_summary in bucket.objects.filter(Prefix='')
]
log_data_file


['compressed_customers.csv.gz',
 'compressed_products.csv.gz',
 'compressed_sales.csv.gz']

In [83]:
# Fetch the Redshift Serverless workgroup description and print the raw response.
# Start from None so a failed lookup never leaves an undefined or stale value
# behind for the cells that read workgroup_info afterwards.
workgroup_info = None
try:
    workgroup_info = redshift.get_workgroup(workgroupName=DWH_WORKGROUP_NAME)
    print("Workgroup Info:")
    print(workgroup_info)
except Exception as e:
    # Report the failure; the previous handler evaluated workgroup_info here,
    # which raised NameError whenever the call itself had failed.
    print(f"Error describing workgroup: {e}")

Workgroup Info:
{'workgroup': {'baseCapacity': 16, 'configParameters': [{'parameterKey': 'auto_mv', 'parameterValue': 'true'}, {'parameterKey': 'datestyle', 'parameterValue': 'ISO, MDY'}, {'parameterKey': 'enable_case_sensitive_identifier', 'parameterValue': 'false'}, {'parameterKey': 'enable_user_activity_logging', 'parameterValue': 'true'}, {'parameterKey': 'query_group', 'parameterValue': 'default'}, {'parameterKey': 'require_ssl', 'parameterValue': 'false'}, {'parameterKey': 'search_path', 'parameterValue': '$user, public'}, {'parameterKey': 'use_fips_ssl', 'parameterValue': 'false'}, {'parameterKey': 'max_query_execution_time', 'parameterValue': '14400'}], 'creationDate': datetime.datetime(2024, 5, 22, 13, 38, 23, 869000, tzinfo=tzutc()), 'endpoint': {'address': 'default-workgroup.339713018722.us-east-1.redshift-serverless.amazonaws.com', 'port': 5439, 'vpcEndpoints': [{'networkInterfaces': [{'availabilityZone': 'us-east-1b', 'networkInterfaceId': 'eni-0756a60271051dbdb', 'private

In [84]:
# Tabulate the raw response: its top-level keys ('workgroup', 'ResponseMetadata')
# become the columns, and the nested keys become the row index.
workgroup_info_df = pd.DataFrame(data=workgroup_info)
workgroup_info_df.head(n=2)

Unnamed: 0,workgroup,ResponseMetadata
baseCapacity,16,
configParameters,"[{'parameterKey': 'auto_mv', 'parameterValue':...",


In [85]:
# Look up the Redshift Serverless namespace and print the raw response.
try:
    namespace_info = redshift.get_namespace(namespaceName=DWH_NAMESPACE_NAME)
    print("Namespace Info:", namespace_info, sep="\n")
except Exception as lookup_error:
    print(f"Error describing namespace: {lookup_error}")

Namespace Info:
{'namespace': {'adminUsername': 'admin', 'creationDate': datetime.datetime(2024, 5, 22, 13, 38, 23, 349000, tzinfo=tzutc()), 'dbName': 'dev', 'iamRoles': [], 'kmsKeyId': 'AWS_OWNED_KMS_KEY', 'logExports': [], 'namespaceArn': 'arn:aws:redshift-serverless:us-east-1:339713018722:namespace/02da2352-e369-4565-b22d-83b652e65ab3', 'namespaceId': '02da2352-e369-4565-b22d-83b652e65ab3', 'namespaceName': 'default-namespace', 'status': 'AVAILABLE'}, 'ResponseMetadata': {'RequestId': '03a714e1-0513-4cb0-8661-47c45e73c4fe', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': '03a714e1-0513-4cb0-8661-47c45e73c4fe', 'date': 'Fri, 24 May 2024 14:25:19 GMT', 'content-type': 'application/x-amz-json-1.1', 'content-length': '382', 'connection': 'keep-alive'}, 'RetryAttempts': 0}}


In [86]:
# Smoke test: connect to Redshift Serverless and run a simple query to confirm
# that the configured user/role can actually reach the database.

# Host name of the workgroup endpoint (the 'address' reported by get_workgroup).
workgroup_endpoint = 'default-workgroup.339713018722.us-east-1.redshift-serverless.amazonaws.com'

# Stays None if connecting fails, so the cleanup below knows there is nothing to close.
conn = None
try:
    conn = psycopg2.connect(
        dbname=DWH_DB,
        user=DWH_DB_USER,
        password=DWH_DB_PASSWORD,
        host=workgroup_endpoint,
        port=DWH_PORT
    )
    # The cursor context manager closes the cursor even when the query raises.
    with conn.cursor() as cur:
        cur.execute('SELECT * FROM "aws-etl-redshift"."public"."shoes"')
        rows = cur.fetchall()

    # Print results
    print("Results:")
    for row in rows:
        print(row)

except Exception as e:
    print(f"Error: {e}")
finally:
    # Release the connection on both the success and the error path; previously
    # a failing query skipped the close() calls and leaked the connection.
    if conn is not None:
        conn.close()

Error: Relation shoes does not exist in the database.



In [78]:
## Upload the compressed data files to the S3 bucket

# Destination bucket for the uploads.
bucket_name = 'ridwanclouds-bucket'

# Local directory holding the gzip-compressed CSV files.
# NOTE(review): absolute, machine-specific path; the notebook only runs where it exists.
_local_data_dir = "/Users/villy/Documents/GitHub/aws-ETL-pipeline/data"

# One compressed CSV per dataset.
compressed_customer_csv_path = _local_data_dir + "/compressed_customers.csv.gz"
compressed_product_csv_path = _local_data_dir + "/compressed_products.csv.gz"
compressed_sales_csv_path = _local_data_dir + "/compressed_sales.csv.gz"



# Helper that uploads one local file to an S3 bucket
def upload_to_s3(file_path, bucket_name, object_name):
    """Upload a local file to S3 and print whether it succeeded.

    Args:
        file_path: Path of the local file to upload.
        bucket_name: Name of the destination S3 bucket.
        object_name: Key under which the file is stored in the bucket.

    Returns:
        None. Any exception raised during the upload is caught and reported
        on stdout instead of being propagated.
    """
    try:
        s3_client = s3.meta.client
        s3_client.upload_file(file_path, bucket_name, object_name)
        print(f"File uploaded successfully to S3 bucket: {bucket_name}")
    except Exception as upload_error:
        print(f"Error uploading file to S3 bucket: {upload_error}")

# Upload each compressed CSV, using its file name as the S3 object key
for local_path, object_key in (
    (compressed_customer_csv_path, 'compressed_customers.csv.gz'),
    (compressed_product_csv_path, 'compressed_products.csv.gz'),
    (compressed_sales_csv_path, 'compressed_sales.csv.gz'),
):
    upload_to_s3(local_path, bucket_name, object_key)


File uploaded successfully to S3 bucket: ridwanclouds-bucket
File uploaded successfully to S3 bucket: ridwanclouds-bucket
File uploaded successfully to S3 bucket: ridwanclouds-bucket


### Using the COPY command for the ETL process

- The COPY command is used to load data into the data warehouse (DW) on Redshift. It is seamless and fast because the compressed
data is already stored as objects in an S3 bucket.


In [91]:
# Show the current contents of the customers table (expected to be empty before the load).
conn = psycopg2.connect(
    dbname=DWH_DB,
    user=DWH_DB_USER,
    password=DWH_DB_PASSWORD,
    host=workgroup_endpoint,
    port=DWH_PORT
)
try:
    # The cursor context manager closes the cursor even when the query raises.
    with conn.cursor() as cur:
        cur.execute('''SELECT * FROM "aws-etl-redshift"."public"."customers";''')
        result = cur.fetchall()
finally:
    # Always release the connection; it used to stay open until the name
    # `conn` was rebound by a later cell.
    conn.close()
print(result)


[]


In [92]:
# Load the compressed CSV files from S3 into Redshift with the COPY command.

# Destination table -> S3 object holding its gzip-compressed CSV.
# Each URI carries exactly one "s3://" scheme; the customers and products
# entries previously read "s3://s3://...", which is not a valid S3 location.
COPY_SOURCES = {
    'customers': 's3://ridwanclouds-bucket/compressed_customers.csv.gz',
    'products': 's3://ridwanclouds-bucket/compressed_products.csv.gz',
    'sales': 's3://ridwanclouds-bucket/compressed_sales.csv.gz',
}


def build_copy_statement(table, s3_uri, iam_role):
    """Return the COPY statement that loads one gzipped CSV file into ``table``.

    Args:
        table: Name of the destination table.
        s3_uri: ``s3://bucket/key`` location of the compressed CSV file.
        iam_role: Value placed inside the quotes of the IAM_ROLE clause.

    Returns:
        The SQL text of the COPY statement (CSV format, GZIP compression,
        first line skipped as a header).
    """
    return f"""
    COPY {table}
    FROM '{s3_uri}'
    IAM_ROLE '{iam_role}'
    CSV
    GZIP
    IGNOREHEADER 1;
    """


# Stays None if connecting fails, so the cleanup below knows there is nothing to close.
conn = None
try:
    conn = psycopg2.connect(
        dbname=DWH_DB,
        user=DWH_DB_USER,
        password=DWH_DB_PASSWORD,
        host=workgroup_endpoint,
        port=DWH_PORT
    )
    with conn.cursor() as cur:
        # NOTE(review): COPY's IAM_ROLE clause takes a role ARN (or `default`);
        # DWH_IAM_ROLE_NAME is assumed to hold the full ARN -- confirm in the config file.
        for table, s3_uri in COPY_SOURCES.items():
            cur.execute(build_copy_statement(table, s3_uri, DWH_IAM_ROLE_NAME))

    # Commit once, after all three loads succeeded, so a failure loads nothing.
    conn.commit()

    print("Data loaded successfully.")

except Exception as e:
    print(f"Error: {e}")
finally:
    # Close on both paths; closing discards any uncommitted work after an error.
    if conn is not None:
        conn.close()

Error: cross-database reference to database "aws-etl-redshift" is not supported

