In [29]:
import time
import configparser
import matplotlib.pyplot as plt
import pandas as pd
import configparser
import psycopg2
import boto3
import sql_queries
import json

import importlib
from botocore.exceptions import ClientError

In [30]:
def create_iam_role():
    try:
        print("1.1 Creating a new IAM Role") 
        dwhRole = iam.create_role(
            Path='/',
            RoleName=IAM_ROLE_NAME,
            Description = "Allows Redshift clusters to call AWS services on your behalf.",
            AssumeRolePolicyDocument=json.dumps(
                {'Statement': [{'Action': 'sts:AssumeRole',
                   'Effect': 'Allow',
                   'Principal': {'Service': 'rexdshift.amazonaws.com'}}],
                 'Version': '2012-10-17'})
        )    
    except Exception as e:
        print(e)

    print("1.2 Attaching Policy")

    iam.attach_role_policy(RoleName=IAM_ROLE_NAME,
                           PolicyArn="arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess"
                          )['ResponseMetadata']['HTTPStatusCode']

    print("1.3 Get the IAM role ARN")
    return iam.get_role(RoleName=IAM_ROLE_NAME)['Role']['Arn']

In [31]:
def create_new_cluster():
    print("Creating a new Redshift cluster")

    try:
        response = redshift.create_cluster(        
            #HW
            ClusterType=DB_CLUSTER_TYPE,
            NodeType=DB_NODE_TYPE,
            NumberOfNodes=int(DB_NUM_NODES),
            
            #Snapshots
            AutomatedSnapshotRetentionPeriod=DB_SNAPSHOT_RETENTION,

            #Identifiers & Credentials
            DBName=DB_NAME,
            ClusterIdentifier=DB_CLUSTER_IDENTIFIER,
            MasterUsername=DB_USER,
            MasterUserPassword=DB_PASSWORD,

            #Roles (for s3 access)
            IamRoles=[roleArn]
        )
        
        wait_for_cluster('available')
    except Exception as e:
        print(e)

In [50]:
def pause_cluster():
    
    print(f'Pausing cluster {DB_CLUSTER_IDENTIFIER}')
    print(f'Deleting old snapshot ({DB_SNAPSHOT_IDENTIFIER})')
    try:
        delete_snapshots(DB_SNAPSHOT_IDENTIFIER)
        print(f'Deleting cluster {DB_CLUSTER_IDENTIFIER} while retaining snapshot ({DB_SNAPSHOT_IDENTIFIER})')
        redshift.delete_cluster( ClusterIdentifier=DB_CLUSTER_IDENTIFIER,  SkipFinalClusterSnapshot=False, FinalClusterSnapshotIdentifier=DB_SNAPSHOT_IDENTIFIER)
        wait_for_cluster('deleted')
        print(f'Cluster deleted with snapshot {DB_SNAPSHOT_IDENTIFIER} retained.')
    except Exception as e:
        print(e)
    

In [33]:
def resume_cluster():
    print(f'Resuming cluster from snapshot {DB_SNAPSHOT_IDENTIFIER}')
    redshift.restore_from_cluster_snapshot( ClusterIdentifier=DB_CLUSTER_IDENTIFIER, SnapshotIdentifier=DB_SNAPSHOT_IDENTIFIER, IamRoles=[roleArn]  )
    
    wait_for_cluster('available')  

In [206]:
def delete_snapshots(snapshot_identifier=None):
    count_snapshots=len(redshift.describe_cluster_snapshots(ClusterIdentifier=DB_CLUSTER_IDENTIFIER)['Snapshots'])
    if count_snapshots > 0:
        print(f"Found {count_snapshots} snapshot(s):")
        try:
            if snapshot_identifier != None:
                if check_for_snapshot():
                    print(f'Deleting snapshot {snapshot_identifier}')
                    redshift.delete_cluster_snapshot(SnapshotIdentifier=snapshot_identifier)
                else:
                    print(f'Snapshot {snapshot_identifier} not present - continuing')
            else:
                
                for snapshot in redshift.describe_cluster_snapshots(ClusterIdentifier=DB_CLUSTER_IDENTIFIER)['Snapshots']:
                    if snapshot['SnapshotType']=='manual':
                        print(f"Deleting snapshot {snapshot['SnapshotIdentifier']}")
                        redshift.delete_cluster_snapshot(SnapshotIdentifier=snapshot['SnapshotIdentifier'])
                    elif snapshot['SnapshotType']=='automated':
                        print(f"Automated snapshot {snapshot['SnapshotIdentifier']} cannot be manually deleted but will be droped after {DB_SNAPSHOT_RETENTION} day retention period")
                    else:
                        print(f"Found unknown snapshot {snapshot['SnapshotIdentifier']} - taking no action")
        except Exception as e:
            print(e)

In [35]:
def delete_cluster():
    print(f'Deleting {DB_CLUSTER_IDENTIFIER}')
    redshift.delete_cluster( ClusterIdentifier=DB_CLUSTER_IDENTIFIER,  SkipFinalClusterSnapshot=True)
    wait_for_cluster('deleted')
    delete_snapshots()

In [36]:
def wait_for_cluster(desired_status):

    i=0
    try:
        while redshift.describe_clusters(ClusterIdentifier=DB_CLUSTER_IDENTIFIER)['Clusters'][0]['ClusterStatus']!=desired_status:
            print (f"Waiting for cluster to be {desired_status} (status currently \'{redshift.describe_clusters(ClusterIdentifier=DB_CLUSTER_IDENTIFIER)['Clusters'][0]['ClusterStatus']}\')")
            time.sleep(10)
            i += 1

            if i > 30:
        #       Error
                print(f'Error - cluster not {desired_status} after 5 minutes')
                exit(1)

        # TODO - handle error better    
        print(f'Cluster {desired_status}')
        
    except Exception as e:
        print(e)


In [37]:
def create_tables(cur, conn):
    """
    - Creates the tables
    """
    print('Creating tables')
    for query in sql_queries.create_table_queries:
        cur.execute(query.format(DB_ROLE_ARN))
        conn.commit()

   

In [38]:
def copy_to_dim(cur, conn):
    """
    - populates the dimension tables
    """
    print('Populating dimension tables')
    for query in sql_queries.copy_to_dim_queries:
        print(query.format(DB_ROLE_ARN))
        cur.execute(query.format(DB_ROLE_ARN))
        conn.commit()     

In [39]:
def copy_to_fact(cur, conn):
    """
    - populates the fact tables
    """
    print('Populating fact tables')
    for query in sql_queries.copy_to_fact_queries:
        cur.execute(query.format(DB_ROLE_ARN))
        conn.commit()    

In [40]:
def drop_tables(cur, conn):
    """
    - drops the dimension tables
    """
    print('Dropping tables')
    for query in sql_queries.drop_dim_table_queries:
        print(query.format(DB_ROLE_ARN))
        cur.execute(query.format(DB_ROLE_ARN))
        conn.commit()  

In [41]:
def prettyRedshiftProps(props):
    pd.set_option('display.max_colwidth', -1)
    keysToShow = ["ClusterIdentifier", "NodeType", "ClusterStatus", "MasterUsername", "DBName", "Endpoint", "NumberOfNodes", 'VpcId']
    x = [(k, v) for k,v in props.items() if k in keysToShow]
    return pd.DataFrame(data=x, columns=["Key", "Value"])

In [199]:
def check_for_snapshot():
    try:
        snapshot_exists = DB_SNAPSHOT_IDENTIFIER in [snapshot['SnapshotIdentifier'] for snapshot in redshift.describe_cluster_snapshots(SnapshotIdentifier=DB_SNAPSHOT_IDENTIFIER)['Snapshots']]
        return snapshot_exists
    except Exception as e:
        print(e)

In [14]:
DB_SNAPSHOT_IDENTIFIER = 'dwh-snapshot'
DB_SNAPSHOT_RETENTION = 1

In [15]:
config = configparser.ConfigParser()
config.read_file(open('aws.cfg'))

DB_CLUSTER_TYPE        = config.get("CLUSTER","DB_CLUSTER_TYPE")
DB_NUM_NODES           = config.get("CLUSTER","DB_NUM_NODES")
DB_NODE_TYPE           = config.get("CLUSTER","DB_NODE_TYPE")

DB_CLUSTER_IDENTIFIER  = config.get("CLUSTER","DB_CLUSTER_IDENTIFIER")
DB_NAME                = config.get("CLUSTER","DB_NAME")
DB_USER                = config.get("CLUSTER","DB_USER")
DB_PASSWORD         = config.get("CLUSTER","DB_PASSWORD")
DB_PORT                = config.get("CLUSTER","DB_PORT")

IAM_ROLE_NAME          = config.get("IAM_ROLE", "IAM_ROLE_NAME")
ARN                    = config.get("IAM_ROLE", "ARN")

(DB_USER, DB_PASSWORD, DB_NAME)

pd.DataFrame({"Param":
                  ["DB_CLUSTER_TYPE", "DB_NUM_NODES", "DB_NODE_TYPE", "DB_CLUSTER_IDENTIFIER", 
                   "DB_NAME", "DB_USER", "DB_PASSWORD", "DB_PORT", "IAM_ROLE_NAME","ARN"],
              "Value":
                  [DB_CLUSTER_TYPE, DB_NUM_NODES, DB_NODE_TYPE, DB_CLUSTER_IDENTIFIER, 
                   DB_NAME, DB_USER, DB_PASSWORD, DB_PORT, IAM_ROLE_NAME, ARN],
             })

config.read_file(open('aws.credentials'))

KEY                    = config.get('AWS','KEY')
SECRET                 = config.get('AWS','SECRET')


DB_ENDPOINT = ''
DB_ROLE_ARN = ''


### Create clients for IAM, EC2 and Redshift
**Note**: We are creating these resources in the the **us-west-2** region. Choose the same region in the your AWS web console to the see these resources.

In [16]:
iam = boto3.client('iam',aws_access_key_id=KEY,
                     aws_secret_access_key=SECRET,
                     region_name='us-west-2'
                  )
ec2 = boto3.resource('ec2',
                       region_name="us-west-2",
                       aws_access_key_id=KEY,
                       aws_secret_access_key=SECRET
                    )
redshift = boto3.client('redshift',
                       region_name="us-west-2",
                       aws_access_key_id=KEY,
                       aws_secret_access_key=SECRET
                       )

roleArn=create_iam_role()


### Create an IAM Role that makes Redshift able to access S3 bucket (ReadOnly)

1.1 Creating a new IAM Role
An error occurred (EntityAlreadyExists) when calling the CreateRole operation: Role with name dwhRole already exists.
1.2 Attaching Policy
1.3 Get the IAM role ARN
arn:aws:iam::109203719027:role/dwhRole


### - Create a [RedShift Cluster](https://console.aws.amazon.com/redshiftv2/home)
#### - For complete arguments to `create_cluster`, see [docs](https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/redshift.html#Redshift.Client.create_cluster)

In [200]:
# resume a saved cluster or create a new one
if check_for_snapshot()==True:
    print(f'Resuming cluster operation based on snapshot {DB_SNAPSHOT_IDENTIFIER}')
#    resume_cluster()
else:
    print(f'No saved instances of {DB_CLUSTER_IDENTIFIER} found - creating a new cluster')
#    create_new_cluster()

Resuming cluster operation based on snapshot dwh-snapshot


In [16]:
conn = psycopg2.connect("host={} dbname={} user={} password={} port={}"
                        .format(DB_ENDPOINT, DB_NAME, DB_USER, DB_PASSWORD, DB_PORT))

cur = conn.cursor()

In [17]:
importlib.reload(sql_queries)
create_tables(cur, conn)
copy_to_dim(cur, conn)
copy_to_fact(cur, conn)
# drop_tables(cur, conn)

Creating staging tables


In [201]:
pause_cluster()

Pausing cluster dwhCluster
Deleting old snapshot (dwh-snapshot)
Deleting snaphot dwh-snapshot
local variable 'snapshot' referenced before assignment
Deleting cluster dwhCluster while retaining snapshot (dwh-snapshot)
An error occurred (ClusterSnapshotAlreadyExists) when calling the DeleteCluster operation: Cannot create the snapshot because a snapshot with the identifier dwh-snapshot already exists.


In [208]:
resume_cluster()

Resuming cluster from snapshot dwh-snapshot
Waiting for cluster to be available (status currently 'creating')
Waiting for cluster to be available (status currently 'creating')
Waiting for cluster to be available (status currently 'creating')
Waiting for cluster to be available (status currently 'creating')
Waiting for cluster to be available (status currently 'creating')
Waiting for cluster to be available (status currently 'creating')
Waiting for cluster to be available (status currently 'creating')
Waiting for cluster to be available (status currently 'creating')
Waiting for cluster to be available (status currently 'creating')
Waiting for cluster to be available (status currently 'creating')
Waiting for cluster to be available (status currently 'creating')
Waiting for cluster to be available (status currently 'creating')
Waiting for cluster to be available (status currently 'creating')
Waiting for cluster to be available (status currently 'creating')
Waiting for cluster to be availa

In [209]:
delete_cluster()

Deleting dwhCluster
Waiting for cluster to be deleted (status currently 'deleting')
Waiting for cluster to be deleted (status currently 'deleting')
Waiting for cluster to be deleted (status currently 'deleting')
Waiting for cluster to be deleted (status currently 'deleting')
Waiting for cluster to be deleted (status currently 'deleting')
Waiting for cluster to be deleted (status currently 'deleting')
Waiting for cluster to be deleted (status currently 'deleting')
Waiting for cluster to be deleted (status currently 'deleting')
An error occurred (ClusterNotFound) when calling the DescribeClusters operation: Cluster dwhcluster not found.
Found 1 snapshot(s):
Deleting snapshot dwh-snapshot
