# Create A Redshift Cluster using the AWS python SDK 

**Prerequisits**<br>
Make sure you have an AWS secret and access key

- Create a new IAM user in your AWS account
- Give it `AdministratorAccess`, From `Attach existing policies directly` Tab
- Take note of the access key and secret 
- Insert into the file `dwh.cfg` as follows:
<font color='red'>
<BR>
\[AWS]<BR>
KEY= YOUR_AWS_KEY<BR>
SECRET= YOUR_AWS_SECRET<BR>
<font/>

In [1]:
import pandas as pd
import boto3
from botocore.exceptions import ClientError
import configparser
import json

In [2]:
%load_ext sql

## Load DWH Params from config file

In [6]:
config = configparser.ConfigParser()
config.read("dwh.cfg")

KEY                    = config.get("AWS","KEY")
SECRET                 = config.get("AWS","SECRET")

DWH_CLUSTER_TYPE       = config.get("DWH","DWH_CLUSTER_TYPE")
DWH_CLUSTER_IDENTIFIER = config.get("DWH","DWH_CLUSTER_IDENTIFIER")
DWH_NUM_NODES          = config.get("DWH","DWH_NUM_NODES")
DWH_NODE_TYPE          = config.get("DWH","DWH_NODE_TYPE")

DWH_IAM_ROLE_NAME      = config.get("DWH", "DWH_IAM_ROLE_NAME")

DWH_DB                 = config.get("CLUSTER","DB_NAME")
DWH_DB_USER            = config.get("CLUSTER","DB_USER")
DWH_DB_PASSWORD        = config.get("CLUSTER","DB_PASSWORD")
DWH_PORT               = config.get("CLUSTER","DB_PORT")

# (DWH_DB_USER, DWH_DB_PASSWORD, DWH_DB)

# Output params in a dataframe
pd.DataFrame({"Param":
                  ["DWH_CLUSTER_TYPE", 
                   "DWH_NUM_NODES", 
                   "DWH_NODE_TYPE", 
                   "DWH_CLUSTER_IDENTIFIER", 
                   "DWH_DB", "DWH_DB_USER", 
                   "DWH_DB_PASSWORD", 
                   "DWH_PORT", 
                   "DWH_IAM_ROLE_NAME",
                   ],
              "Value":
                  [DWH_CLUSTER_TYPE,
                   DWH_NUM_NODES,
                   DWH_NODE_TYPE,
                   DWH_CLUSTER_IDENTIFIER,
                   DWH_DB, DWH_DB_USER,
                   DWH_DB_PASSWORD,
                   DWH_PORT, 
                   DWH_IAM_ROLE_NAME,
                   ]
             })

Unnamed: 0,Param,Value
0,DWH_CLUSTER_TYPE,multi-node
1,DWH_NUM_NODES,2
2,DWH_NODE_TYPE,dc2.large
3,DWH_CLUSTER_IDENTIFIER,dwhCluster
4,DWH_DB,dwh
5,DWH_DB_USER,dwhuser
6,DWH_DB_PASSWORD,Passw0rd
7,DWH_PORT,5439
8,DWH_IAM_ROLE_NAME,dwhRole


## Create clients for EC2, S3, IAM, and Redshift

In [7]:
import boto3

ec2 = boto3.resource("ec2",
                     region_name="eu-west-1",
                     aws_access_key_id=KEY,
                     aws_secret_access_key=SECRET
                     )

s3 = boto3.resource("s3",
                     region_name="eu-west-1",
                     aws_access_key_id=KEY,
                     aws_secret_access_key=SECRET
                     )

iam = boto3.client("iam",
                   region_name="eu-west-1",
                   aws_access_key_id=KEY,
                   aws_secret_access_key=SECRET
                   )

redshift = boto3.client("redshift",
                        region_name="eu-west-1",
                        aws_access_key_id=KEY,
                        aws_secret_access_key=SECRET
                        )

## Create an IAM Role that makes Redshift able to access S3 bucket (ReadOnly)

In [8]:
# Create the IAM role
try:
    print('1.1 Creating a new IAM Role')
    dwhRole = iam.create_role(
        Path="/",
        RoleName=DWH_IAM_ROLE_NAME,
        Description = "Allows Redshift clusters to call AWS services",
        AssumeRolePolicyDocument=json.dumps(
            {'Statement': [{'Action': 'sts:AssumeRole',
               'Effect': 'Allow',
               'Principal': {'Service': 'redshift.amazonaws.com'}}],
             'Version': '2012-10-17'})
    )
except Exception as e:
    print(e)

1.1 Creating a new IAM Role


In [9]:
# Attach Policy
print('1.2 Attaching Policy')
iam.attach_role_policy(RoleName=DWH_IAM_ROLE_NAME,
                       PolicyArn="arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess"
                       )['ResponseMetadata']['HTTPStatusCode']

1.2 Attaching Policy


200

In [10]:
# Get and print the IAM role ARN
print('1.3 Get the IAM role ARN')
roleArn = iam.get_role(RoleName=DWH_IAM_ROLE_NAME)["Role"]["Arn"]

print(roleArn)

1.3 Get the IAM role ARN
arn:aws:iam::873674308518:role/dwhRole


## Create a RedShift Cluster
For complete arguments to `create_cluster`, see [docs](https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/redshift.html#Redshift.Client.create_cluster)

In [11]:
try:
    response = redshift.create_cluster(        
        # Add parameters for hardware
        ClusterType=DWH_CLUSTER_TYPE,
        NodeType=DWH_NODE_TYPE,
        NumberOfNodes=int(DWH_NUM_NODES),

        # Add parameters for identifiers & credentials
        DBName=DWH_DB,
        ClusterIdentifier=DWH_CLUSTER_IDENTIFIER,
        MasterUsername=DWH_DB_USER,
        MasterUserPassword=DWH_DB_PASSWORD,
        
        # Add parameter for role (to allow s3 access)
         IamRoles=[roleArn]
    )
except Exception as e:
    print(e)

*Describe* the cluster to see its status

(run this block several times until the cluster status becomes `Available`)

In [15]:
def prettyRedshiftProps(props):
    pd.set_option('display.max_colwidth', None)
    keysToShow = ["ClusterIdentifier", "NodeType", "ClusterStatus", "MasterUsername", "DBName", "Endpoint", "NumberOfNodes", 'VpcId']
    x = [(k, v) for k,v in props.items() if k in keysToShow]
    return pd.DataFrame(data=x, columns=["Key", "Value"])

myClusterProps = redshift.describe_clusters(
    ClusterIdentifier=DWH_CLUSTER_IDENTIFIER)['Clusters'][0]
prettyRedshiftProps(myClusterProps)

Unnamed: 0,Key,Value
0,ClusterIdentifier,dwhcluster
1,NodeType,dc2.large
2,ClusterStatus,available
3,MasterUsername,dwhuser
4,DBName,dwh
5,Endpoint,"{'Address': 'dwhcluster.cu3xqkpifx2m.eu-west-1.redshift.amazonaws.com', 'Port': 5439}"
6,VpcId,vpc-9e6524f8
7,NumberOfNodes,2


**Take note** of the cluster endpoint and role ARN. Insert into the file `dwh.cfg` as follows:

\[CLUSTER]<br>
HOST=DWH_ENDPOINT

\[IAM_ROLE]<br>
ARN=DWH_ROLE_ARN

<font color='red'>DO NOT RUN THIS unless the cluster status becomes "Available" </font>

In [16]:
DWH_ENDPOINT = myClusterProps['Endpoint']['Address']
DWH_ROLE_ARN = myClusterProps['IamRoles'][0]['IamRoleArn']
print("DWH_ENDPOINT :: ", DWH_ENDPOINT)
print("DWH_ROLE_ARN :: ", DWH_ROLE_ARN)

DWH_ENDPOINT ::  dwhcluster.cu3xqkpifx2m.eu-west-1.redshift.amazonaws.com
DWH_ROLE_ARN ::  arn:aws:iam::873674308518:role/dwhRole


## Open an incoming TCP port to access the cluster endpoint

(Attention: Using IP `0.0.0.0/0` is very insecure, only use for demo purposes)

In [21]:
try:
    vpc = ec2.Vpc(id=myClusterProps['VpcId'])
    defaultSg = list(vpc.security_groups.all())[-1]  # accesses the default group, shaky code ;-)
    print(defaultSg)
    
    defaultSg.authorize_ingress(
        GroupName= defaultSg.group_name,
        CidrIp='0.0.0.0/0',
        IpProtocol='TCP',
        FromPort=int(DWH_PORT),
        ToPort=int(DWH_PORT)
    )
except Exception as e:
    print(e)

ec2.SecurityGroup(id='sg-5792ac2a')


**Make sure you can connect to the cluster**

In [22]:
conn_string=f"postgresql://{DWH_DB_USER}:{DWH_DB_PASSWORD}@{DWH_ENDPOINT}:{DWH_PORT}/{DWH_DB}"

print(conn_string)
%sql $conn_string

postgresql://dwhuser:Passw0rd@dwhcluster.cu3xqkpifx2m.eu-west-1.redshift.amazonaws.com:5439/dwh


'Connected: dwhuser@dwh'

---

## IN THE END: Clean up your resources

<b><font color='red'>DO NOT RUN THIS UNLESS YOU ARE SURE NOT TO HAVE TO USE THESE RESSOURCES AGAIN</b>

In [25]:
# Run to delete the created resources
redshift.delete_cluster( ClusterIdentifier=DWH_CLUSTER_IDENTIFIER,  SkipFinalClusterSnapshot=True)

{'Cluster': {'ClusterIdentifier': 'dwhcluster',
  'NodeType': 'dc2.large',
  'ClusterStatus': 'deleting',
  'ClusterAvailabilityStatus': 'Modifying',
  'MasterUsername': 'dwhuser',
  'DBName': 'dwh',
  'Endpoint': {'Address': 'dwhcluster.cu3xqkpifx2m.eu-west-1.redshift.amazonaws.com',
   'Port': 5439},
  'ClusterCreateTime': datetime.datetime(2020, 5, 28, 20, 4, 59, 754000, tzinfo=tzutc()),
  'AutomatedSnapshotRetentionPeriod': 1,
  'ManualSnapshotRetentionPeriod': -1,
  'ClusterSecurityGroups': [],
  'VpcSecurityGroups': [{'VpcSecurityGroupId': 'sg-5792ac2a',
    'Status': 'active'}],
  'ClusterParameterGroups': [{'ParameterGroupName': 'default.redshift-1.0',
    'ParameterApplyStatus': 'in-sync'}],
  'ClusterSubnetGroupName': 'default',
  'VpcId': 'vpc-9e6524f8',
  'AvailabilityZone': 'eu-west-1c',
  'PreferredMaintenanceWindow': 'thu:05:00-thu:05:30',
  'PendingModifiedValues': {},
  'ClusterVersion': '1.0',
  'AllowVersionUpgrade': True,
  'NumberOfNodes': 3,
  'PubliclyAccessible'

In [28]:
# Check status
myClusterProps = redshift.describe_clusters(ClusterIdentifier=DWH_CLUSTER_IDENTIFIER)['Clusters'][0]
prettyRedshiftProps(myClusterProps)

ClusterNotFoundFault: An error occurred (ClusterNotFound) when calling the DescribeClusters operation: Cluster dwhcluster not found.

In [27]:
# Run to delete the created resources
iam.detach_role_policy(RoleName=DWH_IAM_ROLE_NAME, PolicyArn="arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess")
iam.delete_role(RoleName=DWH_IAM_ROLE_NAME)

{'ResponseMetadata': {'RequestId': '2db2a23c-704e-4cba-99a9-51fefcfb9fa7',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '2db2a23c-704e-4cba-99a9-51fefcfb9fa7',
   'content-type': 'text/xml',
   'content-length': '200',
   'date': 'Thu, 28 May 2020 21:07:44 GMT'},
  'RetryAttempts': 0}}

---