In [1]:
import pandas as pd
import boto3
import json

# STEP 0: Make sure you have an AWS secret and access key


Create a new IAM user in your AWS account
Give it AdministratorAccess, From Attach existing policies directly Tab
Take note of the access key and secret
Edit the file dwh.cfg in the same folder as this notebook and fill
[AWS]
KEY= YOUR_AWS_KEY
SECRET= YOUR_AWS_SECRET

## Load DWH Parameters from a file

In [2]:
import configparser
config = configparser.ConfigParser()
#config.read_file(open('dwh.cfg'))
config.read('dwh.cfg')

KEY                    = config.get('AWS','KEY')
SECRET                 = config.get('AWS','SECRET')

DWH_CLUSTER_TYPE       = config.get("DWH","DWH_CLUSTER_TYPE")
DWH_NODE_TYPE          = config.get("DWH","DWH_NODE_TYPE")
DWH_NUMBER_NODES    = config.get("DWH", "DWH_NUMBER_NODES")
DWH_CLUSTER_IDENTIFIER = config.get("DWH","DWH_CLUSTER_IDENTIFIER")
DWH_DB                 = config.get("DWH","DWH_DB")
DWH_DB_USER            = config.get("DWH","DWH_DB_USER")
DWH_DB_PASSWORD        = config.get("DWH","DWH_DB_PASSWORD")
DWH_PORT               = config.get("DWH","DWH_PORT")
DWH_IAM_ROLE_NAME      = config.get("DWH", "DWH_IAM_ROLE_NAME")


(DWH_DB_USER, DWH_DB_PASSWORD, DWH_DB)

pd.DataFrame({"Param":
                  ["DWH_CLUSTER_TYPE",  "DWH_NODE_TYPE", "DWH_CLUSTER_IDENTIFIER", "DWH_DB", "DWH_DB_USER", "DWH_DB_PASSWORD", "DWH_PORT"],
              "Value":
                  [DWH_CLUSTER_TYPE,  DWH_NODE_TYPE, DWH_CLUSTER_IDENTIFIER, DWH_DB, DWH_DB_USER, DWH_DB_PASSWORD, DWH_PORT]
             })

Unnamed: 0,Param,Value
0,DWH_CLUSTER_TYPE,multi-node
1,DWH_NODE_TYPE,dc2.large
2,DWH_CLUSTER_IDENTIFIER,redshift-cluster-1
3,DWH_DB,dev
4,DWH_DB_USER,awsuser
5,DWH_DB_PASSWORD,AWSuser1
6,DWH_PORT,5439


## Create clients for IAM, EC2, S3 and Redshift

In [3]:
import boto3

ec2 = boto3.resource('ec2',
                       region_name="eu-west-2",
                       aws_access_key_id=KEY,
                       aws_secret_access_key=SECRET
                    )

s3 = boto3.resource('s3',
                       region_name="eu-west-2",
                       aws_access_key_id=KEY,
                       aws_secret_access_key=SECRET
                   )

iam = boto3.client('iam',aws_access_key_id=KEY,
                     aws_secret_access_key=SECRET,
                     region_name='eu-west-2'
                  )

redshift = boto3.client('redshift',
                       region_name="eu-west-2",
                       aws_access_key_id=KEY,
                       aws_secret_access_key=SECRET
                       )

## Check out the sample data sources on S3


In [5]:
#sampleDbBucket =  s3.Bucket("awssampledbeuwest2")

#for obj in sampleDbBucket.objects.filter(Prefix="ssbgz"):
 #   print(obj)
# for obj in sampleDbBucket.objects.all():
#     print(obj)

#for bucket in s3.buckets.all():
 #   print(bucket.name)
    
sampleDbBucket =  s3.Bucket("udacity-dend")
for obj in sampleDbBucket.objects.filter(Prefix="song-data/A/S/S/"):
    print(obj)

s3.ObjectSummary(bucket_name='udacity-dend', key='song-data/A/S/S/TRASSCL12903CAC2F7.json')
s3.ObjectSummary(bucket_name='udacity-dend', key='song-data/A/S/S/TRASSGQ128F9351BA1.json')
s3.ObjectSummary(bucket_name='udacity-dend', key='song-data/A/S/S/TRASSHP128F146D28F.json')
s3.ObjectSummary(bucket_name='udacity-dend', key='song-data/A/S/S/TRASSHQ128F42250A7.json')
s3.ObjectSummary(bucket_name='udacity-dend', key='song-data/A/S/S/TRASSHY128F42B30FC.json')
s3.ObjectSummary(bucket_name='udacity-dend', key='song-data/A/S/S/TRASSJE128F427927A.json')
s3.ObjectSummary(bucket_name='udacity-dend', key='song-data/A/S/S/TRASSMH128F4251B73.json')
s3.ObjectSummary(bucket_name='udacity-dend', key='song-data/A/S/S/TRASSNW128F92F7728.json')
s3.ObjectSummary(bucket_name='udacity-dend', key='song-data/A/S/S/TRASSPU128F9358C09.json')
s3.ObjectSummary(bucket_name='udacity-dend', key='song-data/A/S/S/TRASSRC128F9351B5C.json')
s3.ObjectSummary(bucket_name='udacity-dend', key='song-data/A/S/S/TRASSRM128F427

# STEP 1: IAM ROLE
Create an IAM Role that makes Redshift able to access S3 bucket (ReadOnly)

In [6]:
from botocore.exceptions import ClientError

#1.1 Create the role, 
try:
    print("1.1 Creating a new IAM Role") 
    dwhRole = iam.create_role(
        Path='/',
        RoleName=DWH_IAM_ROLE_NAME,
        Description = "Allows Redshift clusters to call AWS services on your behalf.",
        AssumeRolePolicyDocument=json.dumps(
            {'Statement': [{'Action': 'sts:AssumeRole',
               'Effect': 'Allow',
               'Principal': {'Service': 'redshift.amazonaws.com'}}],
             'Version': '2012-10-17'})
    )    
except Exception as e:
    print(e)
    
    
print("1.2 Attaching Policy")

iam.attach_role_policy(RoleName=DWH_IAM_ROLE_NAME,
                       PolicyArn="arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess"
                      )['ResponseMetadata']['HTTPStatusCode']

print("1.3 Get the IAM role ARN")
roleArn = iam.get_role(RoleName=DWH_IAM_ROLE_NAME)['Role']['Arn']

{
            "Effect": "Allow",
            "Action": "iam:PassRole",
            "Resource": [
                "arn:aws:iam::795712660208:role/dwhRole"
               
             ]
        }


print(roleArn)

1.1 Creating a new IAM Role
1.2 Attaching Policy
1.3 Get the IAM role ARN
arn:aws:iam::795712660208:role/dwhRole


# STEP 2: Redshift Cluster
Create a RedShift Cluster
For complete arguments to create_cluster, see docs

In [7]:
try:
    response = redshift.create_cluster(        
        #HW
        ClusterType=DWH_CLUSTER_TYPE,
        NodeType=DWH_NODE_TYPE,
        NumberOfNodes=int(DWH_NUMBER_NODES),

        #Identifiers & Credentials
        DBName=DWH_DB,
        ClusterIdentifier=DWH_CLUSTER_IDENTIFIER,
        MasterUsername=DWH_DB_USER,
        MasterUserPassword=DWH_DB_PASSWORD,
        
        #Roles (for s3 access)
        IamRoles=[roleArn]  
    )
except Exception as e:
    print(e)

## 2.1 Describe the cluster to see its status
run this block several times until the cluster status becomes Available

In [8]:
def prettyRedshiftProps(props):
    pd.set_option('display.max_colwidth', None)
    keysToShow = ["ClusterIdentifier", "NodeType", "ClusterStatus", "MasterUsername", "DBName", "Endpoint", "NumberOfNodes", 'VpcId']
    x = [(k, v) for k,v in props.items() if k in keysToShow]
    return pd.DataFrame(data=x, columns=["Key", "Value"])

myClusterProps = redshift.describe_clusters(ClusterIdentifier=DWH_CLUSTER_IDENTIFIER)['Clusters'][0]
prettyRedshiftProps(myClusterProps)

Unnamed: 0,Key,Value
0,ClusterIdentifier,redshift-cluster-1
1,NodeType,dc2.large
2,ClusterStatus,available
3,MasterUsername,awsuser
4,DBName,dev
5,Endpoint,"{'Address': 'redshift-cluster-1.c67h7beejfsl.eu-west-2.redshift.amazonaws.com', 'Port': 5439}"
6,VpcId,vpc-d6f6acbe
7,NumberOfNodes,4


## 2.2 Take note of the cluster endpoint and role ARN 
DO NOT RUN THIS unless the cluster status becomes "Available"

In [9]:
DWH_ENDPOINT = myClusterProps['Endpoint']['Address']
DWH_ROLE_ARN = myClusterProps['IamRoles'][0]['IamRoleArn']
#print("DWH_ENDPOINT :: ", DWH_ENDPOINT)
#print("DWH_ROLE_ARN :: ", DWH_ROLE_ARN)
print(DWH_ENDPOINT)
print(DWH_ROLE_ARN)


redshift-cluster-1.c67h7beejfsl.eu-west-2.redshift.amazonaws.com
arn:aws:iam::795712660208:role/dwhRole


# STEP 3: Open an incoming TCP port to access the cluster ednpoint

In [12]:
try:
    vpc = ec2.Vpc(id=myClusterProps['VpcId'])
    defaultSg = list(vpc.security_groups.all())[0]
    print(defaultSg)
    defaultSg.authorize_ingress(
        GroupName=defaultSg.group_name,
        CidrIp='0.0.0.0/0',
        IpProtocol='TCP',
        FromPort=int(DWH_PORT),
        ToPort=int(DWH_PORT)
    )
except Exception as e:
    print(e)

ec2.SecurityGroup(id='sg-0ebbaadb28b17da9c')
An error occurred (InvalidPermission.Duplicate) when calling the AuthorizeSecurityGroupIngress operation: the specified rule "peer: 0.0.0.0/0, TCP, from port: 5439, to port: 5439, ALLOW" already exists


# STEP 4: Make sure you can connect to the cluster

In [38]:
%load_ext sql

The sql extension is already loaded. To reload it, use:
  %reload_ext sql


In [39]:
conn_string="postgresql://{}:{}@{}:{}/{}".format(DWH_DB_USER, DWH_DB_PASSWORD, DWH_ENDPOINT, DWH_PORT,DWH_DB)
print(conn_string)
%sql $conn_string

postgresql://awsuser:AWSuser1@redshift-cluster-1.c67h7beejfsl.eu-west-2.redshift.amazonaws.com:5439/dev


# STEP 5: Clean up your resources
DO NOT RUN THIS UNLESS YOU ARE SURE
We will be using these resources in the next exercises

In [22]:
#### CAREFUL!!
#-- Uncomment & run to delete the created resources
redshift.delete_cluster( ClusterIdentifier=DWH_CLUSTER_IDENTIFIER,  SkipFinalClusterSnapshot=True)
#### CAREFUL!!

{'Cluster': {'ClusterIdentifier': 'redshift-cluster-1',
  'NodeType': 'dc2.large',
  'ClusterStatus': 'deleting',
  'ClusterAvailabilityStatus': 'Modifying',
  'MasterUsername': 'awsuser',
  'DBName': 'dev',
  'Endpoint': {'Address': 'redshift-cluster-1.c67h7beejfsl.eu-west-2.redshift.amazonaws.com',
   'Port': 5439},
  'ClusterCreateTime': datetime.datetime(2021, 3, 15, 11, 15, 22, 95000, tzinfo=tzutc()),
  'AutomatedSnapshotRetentionPeriod': 1,
  'ManualSnapshotRetentionPeriod': -1,
  'ClusterSecurityGroups': [],
  'VpcSecurityGroups': [{'VpcSecurityGroupId': 'sg-0ebbaadb28b17da9c',
    'Status': 'active'},
   {'VpcSecurityGroupId': 'sg-5472a52f', 'Status': 'active'}],
  'ClusterParameterGroups': [{'ParameterGroupName': 'default.redshift-1.0',
    'ParameterApplyStatus': 'in-sync'}],
  'ClusterSubnetGroupName': 'default',
  'VpcId': 'vpc-d6f6acbe',
  'AvailabilityZone': 'eu-west-2b',
  'PreferredMaintenanceWindow': 'sat:20:00-sat:20:30',
  'PendingModifiedValues': {},
  'ClusterVersi

run the  block several times until the cluster really deleted

In [23]:
myClusterProps = redshift.describe_clusters(ClusterIdentifier=DWH_CLUSTER_IDENTIFIER)['Clusters'][0]
prettyRedshiftProps(myClusterProps)

Unnamed: 0,Key,Value
0,ClusterIdentifier,redshift-cluster-1
1,NodeType,dc2.large
2,ClusterStatus,deleting
3,MasterUsername,awsuser
4,DBName,dev
5,Endpoint,"{'Address': 'redshift-cluster-1.c67h7beejfsl.eu-west-2.redshift.amazonaws.com', 'Port': 5439}"
6,VpcId,vpc-d6f6acbe
7,NumberOfNodes,2


In [24]:
#### CAREFUL!!
#-- Uncomment & run to delete the created resources
iam.detach_role_policy(RoleName=DWH_IAM_ROLE_NAME, PolicyArn="arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess")
iam.delete_role(RoleName=DWH_IAM_ROLE_NAME)
#### CAREFUL!!

{'ResponseMetadata': {'RequestId': '66825fc1-6c23-467e-bb2c-258a14e653a8',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '66825fc1-6c23-467e-bb2c-258a14e653a8',
   'content-type': 'text/xml',
   'content-length': '200',
   'date': 'Mon, 15 Mar 2021 11:50:01 GMT'},
  'RetryAttempts': 0}}