## Pre-requisite steps - Create an IAM user and save the credentials

### Add aws.credentials to .gitignore
1. echo "aws.credentials" >> .gitignore

### Set up an AWS user whose credentials you are going to use
1. Launch AWS (I did it from the Udacity console
1. Navigate to Services --> IAM --> Users
1. Choose a name of your choice.
1. Select "Programmatic access" as the access type. Click Next.
1. Choose the Attach existing policies directly tab, and select the "AdministratorAccess". Click Next.
1. Skip adding any tags. Click Next.
1. Review and create the user. It will show you a pair of access key ID and secret.
1. Take note of the pair of access key ID and secret. This pair is collectively known as Access key.
1. Add the access key id and the secret key id to the aws.credentials file 

```
[AWS]
KEY=#####################
SECRET=#################################

```

see [Udacity exercise](https://learn.udacity.com/nanodegrees/nd027-ent-rbs/parts/cd0055/lessons/2ea9a35d-4276-48f0-aba8-3fd5d9057a7c/concepts/69da665c-acbe-4dcc-8846-46434098d418) for further details.

### Import packages

In [1]:
import pandas as pd
import boto3
import json
import time
import configparser
import os
import re

### Load AWS config from file

In [2]:
config = configparser.ConfigParser()
config.read_file(open('aws.cfg'))

DB_CLUSTER_TYPE        = config.get("CLUSTER","DB_CLUSTER_TYPE")
DB_NUM_NODES           = config.get("CLUSTER","DB_NUM_NODES")
DB_NODE_TYPE           = config.get("CLUSTER","DB_NODE_TYPE")

DB_CLUSTER_IDENTIFIER  = config.get("CLUSTER","DB_CLUSTER_IDENTIFIER")
DB_NAME                = config.get("CLUSTER","DB_NAME")
DB_USER                = config.get("CLUSTER","DB_USER")
DB_PASSWORD         = config.get("CLUSTER","DB_PASSWORD")
DB_PORT                = config.get("CLUSTER","DB_PORT")

IAM_ROLE_NAME          = config.get("IAM_ROLE", "IAM_ROLE_NAME")
ARN                    = config.get("IAM_ROLE", "ARN")

(DB_USER, DB_PASSWORD, DB_NAME)

pd.DataFrame({"Param":
                  ["DB_CLUSTER_TYPE", "DB_NUM_NODES", "DB_NODE_TYPE", "DB_CLUSTER_IDENTIFIER", 
                   "DB_NAME", "DB_USER", "DB_PASSWORD", "DB_PORT", "IAM_ROLE_NAME","ARN"],
              "Value":
                  [DB_CLUSTER_TYPE, DB_NUM_NODES, DB_NODE_TYPE, DB_CLUSTER_IDENTIFIER, 
                   DB_NAME, DB_USER, DB_PASSWORD, DB_PORT, IAM_ROLE_NAME, ARN],
             })

Unnamed: 0,Param,Value
0,DB_CLUSTER_TYPE,multi-node
1,DB_NUM_NODES,4
2,DB_NODE_TYPE,dc2.large
3,DB_CLUSTER_IDENTIFIER,dwhCluster
4,DB_NAME,dwh
5,DB_USER,dwhuser
6,DB_PASSWORD,Passw0rd
7,DB_PORT,5439
8,IAM_ROLE_NAME,dwhRole
9,ARN,arn:aws:iam::109203719027:role/dwhRole


### Read in AWS credentials

In [3]:
config.read_file(open('aws.credentials'))

KEY                    = config.get('AWS','KEY')
SECRET                 = config.get('AWS','SECRET')

### Create client for S3
**Note**: We are creating these resources in the the **us-west-2** region. Choose the same region in the your AWS web console to the see these resources.

In [4]:
s3 = boto3.resource('s3',
                       region_name="us-west-2",
                       aws_access_key_id=KEY,
                       aws_secret_access_key=SECRET
                   )

### Create S3 buckets

In [1]:
S3_BUCKET  = "tbcp"
print("1.1 Creating s3 bucket") 
try:
    s3.create_bucket(Bucket=S3_BUCKET, CreateBucketConfiguration={
                    'LocationConstraint': 'us-west-2'})
except Exception as e:
    print(e)

1.1 Creating s3 bucket
name 's3' is not defined


### Check that the new bucket exists

In [6]:
s3_client = boto3.client('s3',
                   region_name="us-west-2",
                   aws_access_key_id=KEY,
                   aws_secret_access_key=SECRET)
response = s3_client.list_buckets()

# Output the bucket names
print('Existing buckets:')
for bucket in response['Buckets']:
    print(f'  {bucket["Name"]}')

Existing buckets:
  aws-logs-109203719027-us-east-1
  tom-baird-capstone-project-2


### This will delete the bucket (unless it contains files)

In [7]:
# my_bucket = 'tom-baird-capstone-project-2'
# objects = s3_client.list_objects_v2(Bucket=my_bucket)
# fileCount = objects['KeyCount']

# if fileCount == 0:
#  response = s3_client.delete_bucket(Bucket=my_bucket)
#  print("{} has been deleted successfully !!!".format(my_bucket))
# else:
#  print("{} is not empty {} objects present".format(my_bucket,fileCount))
#  print("Please make sure S3 bucket is empty before deleting it !!!")

### Upload data

In [43]:
s3.meta.client.upload_file('immigration_data_sample.csv','tom-baird-capstone-project-2','immigration_data_sample.csv')






In [30]:
S3_BUCKET='tom-baird-capstone-project-2'
destination_filepath='tmp/whatever'

local_filepath='data/S3bucket_temp/I94_data/i94yr=2016/i94mon=8/part-00199-11fe5f6a-6006-4092-b8d6-2ab88c9a6e6a.c000.snappy.parquet'
filename = re.findall('([^\/]*$)', local_filepath)[0]
print(filename)
filename
#os.path.isfile(local_filepath)

part-00199-11fe5f6a-6006-4092-b8d6-2ab88c9a6e6a.c000.snappy.parquet
tmp/whatever/part-00199-11fe5f6a-6006-4092-b8d6-2ab88c9a6e6a.c000.snappy.parquet


In [62]:
S3_BUCKET='tom-baird-capstone-project-2'
destination_folder='tmp/whatever'
local_filepath='data/S3bucket_temp/I94_data/i94yr=2016/i94mon=8/part-00199-11fe5f6a-6006-4092-b8d6-2ab88c9a6e6a.c000.snappy.parquet'

uploadToS3(local_filepath, S3_BUCKET, destination_folder)

Transfering file: data/S3bucket_temp/I94_data/i94yr=2016/i94mon=8/part-00199-11fe5f6a-6006-4092-b8d6-2ab88c9a6e6a.c000.snappy.parquet ==> tom-baird-capstone-project-2/tmp/whatever/part-00199-11fe5f6a-6006-4092-b8d6-2ab88c9a6e6a.c000.snappy.parquet



In [82]:
S3_BUCKET='tom-baird-capstone-project-2'
destination_folder='tmp/whatever'
local_filepath='data/S3bucket_temp/I94_data/i94yr=2016/i94mon=8/'

uploadToS3(local_filepath, S3_BUCKET, destination_folder)

0 Transfering file: data/S3bucket_temp/I94_data/i94yr=2016/i94mon=8/ ==> tom-baird-capstone-project-2/tmp/whatever/.part-00101-11fe5f6a-6006-4092-b8d6-2ab88c9a6e6a.c000.snappy.parquet.crc

1 Transfering file: data/S3bucket_temp/I94_data/i94yr=2016/i94mon=8/ ==> tom-baird-capstone-project-2/tmp/whatever/part-00092-11fe5f6a-6006-4092-b8d6-2ab88c9a6e6a.c000.snappy.parquet

2 Transfering file: data/S3bucket_temp/I94_data/i94yr=2016/i94mon=8/ ==> tom-baird-capstone-project-2/tmp/whatever/part-00133-11fe5f6a-6006-4092-b8d6-2ab88c9a6e6a.c000.snappy.parquet

3 Transfering file: data/S3bucket_temp/I94_data/i94yr=2016/i94mon=8/ ==> tom-baird-capstone-project-2/tmp/whatever/.part-00098-11fe5f6a-6006-4092-b8d6-2ab88c9a6e6a.c000.snappy.parquet.crc

4 Transfering file: data/S3bucket_temp/I94_data/i94yr=2016/i94mon=8/ ==> tom-baird-capstone-project-2/tmp/whatever/.part-00084-11fe5f6a-6006-4092-b8d6-2ab88c9a6e6a.c000.snappy.parquet.crc

5 Transfering file: data/S3bucket_temp/I94_data/i94yr=2016/i94mon

In [81]:
def uploadToS3(local_filepath, bucket_name, destination_folder, destination_filename=None):
    """
    
    """
#    print(local_filepath)    
    #if a directory is passed as origin_filepath parameter all files are copied into the destination_folder
    i=0
    if os.path.isfile(local_filepath):
        if destination_filename == None:
            destination_filename = re.findall('([^\/]*$)', local_filepath)[0]
        
        destination_filepath=destination_folder+'/'+destination_filename
        
        print(f'Transfering file: {local_filepath} ==> {bucket_name}/{destination_filepath}\n')
        
#         s3.meta.client.upload_file(local_filepath, bucket_name, destination_filepath)
                
    elif os.path.isdir(local_filepath):
        filelist = os.walk(local_filepath)

        for root, subFolders, files in filelist:
            
            for file in files:
                local_filename = os.path.join(root, file)
                destination_filepath = os.path.join(destination_folder, local_filename.replace(local_filepath, ''))
                
                print(f'{i} Transfering file: {local_filepath} ==> {bucket_name}/{destination_filepath}\n')
                s3.meta.client.upload_file(local_filename, bucket_name, destination_filepath)
                
                i=i+1
                if i > 5:
                    break 
    else: 
        
        print('Error')

In [25]:
uploadToS3(local_filepath,S3_BUCKET,'raw/I94_data','')

data/S3bucket_temp/I94_data/i94yr=2016/i94mon=8/part-00199-11fe5f6a-6006-4092-b8d6-2ab88c9a6e6a.c000.snappy.parquet
Transfering file:  data/S3bucket_temp/I94_data/i94yr=2016/i94mon=8/part-00199-11fe5f6a-6006-4092-b8d6-2ab88c9a6e6a.c000.snappy.parquet  ==>  tom-baird-capstone-project-2 / raw/I94_data/i94yr=2016/i94mon=8#/part-00199-11fe5f6a-6006-4092-b8d6-2ab88c9a6e6a.c000.snappy.parquet 



### Check the files in the bucket

In [13]:
my_bucket = s3.Bucket('tom-baird-capstone-project-2')
# my_bucket = s3.Bucket('aws-logs-109203719027-us-east-1')

for my_bucket_object in my_bucket.objects.all():
    print(my_bucket_object.key)

/.ipynb_checkpoints/I94MODE-checkpoint.csv
raw/dim/I94ADDR.csv
raw/dim/I94MODE.csv
raw/dim/I94PORT.csv
raw/dim/I94RES.csv
raw/dim/I94VISA.csv
raw/imm/._SUCCESS.crc
raw/imm/_SUCCESS
raw/imm/i94yr=2016/i94mon=1/.part-00000-1c04fe9e-a2e6-43aa-84b4-23e749fa3167.c000.snappy.parquet.crc
raw/imm/i94yr=2016/i94mon=1/.part-00001-1c04fe9e-a2e6-43aa-84b4-23e749fa3167.c000.snappy.parquet.crc
raw/imm/i94yr=2016/i94mon=1/.part-00002-1c04fe9e-a2e6-43aa-84b4-23e749fa3167.c000.snappy.parquet.crc
raw/imm/i94yr=2016/i94mon=1/.part-00003-1c04fe9e-a2e6-43aa-84b4-23e749fa3167.c000.snappy.parquet.crc
raw/imm/i94yr=2016/i94mon=1/.part-00004-1c04fe9e-a2e6-43aa-84b4-23e749fa3167.c000.snappy.parquet.crc
raw/imm/i94yr=2016/i94mon=1/.part-00005-1c04fe9e-a2e6-43aa-84b4-23e749fa3167.c000.snappy.parquet.crc
raw/imm/i94yr=2016/i94mon=1/.part-00006-1c04fe9e-a2e6-43aa-84b4-23e749fa3167.c000.snappy.parquet.crc
raw/imm/i94yr=2016/i94mon=1/.part-00007-1c04fe9e-a2e6-43aa-84b4-23e749fa3167.c000.snappy.parquet.crc
raw/imm/i94

ModuleNotFoundError: No module named 'pyarrow'

### Delete the sample file

In [45]:
# s3.Object('tom-baird-capstone-project-2', 'immigration_data_sample.csv').delete()



### Create clients for IAM, EC2 and Redshift
**Note**: We are creating these resources in the the **us-west-2** region. Choose the same region in the your AWS web console to the see these resources.

In [16]:
iam = boto3.client('iam',aws_access_key_id=KEY,
                     aws_secret_access_key=SECRET,
                     region_name='us-west-2'
                  )
ec2 = boto3.resource('ec2',
                       region_name="us-west-2",
                       aws_access_key_id=KEY,
                       aws_secret_access_key=SECRET
                    )
redshift = boto3.client('redshift',
                       region_name="us-west-2",
                       aws_access_key_id=KEY,
                       aws_secret_access_key=SECRET
                       )

### Create an IAM Role that makes Redshift able to access S3 bucket (ReadOnly)

In [17]:
from botocore.exceptions import ClientError

#1.1 Create the role, 
try:
    print("1.1 Creating a new IAM Role") 
    dwhRole = iam.create_role(
        Path='/',
        RoleName=IAM_ROLE_NAME,
        Description = "Allows Redshift clusters to call AWS services on your behalf.",
        AssumeRolePolicyDocument=json.dumps(
            {'Statement': [{'Action': 'sts:AssumeRole',
               'Effect': 'Allow',
               'Principal': {'Service': 'redshift.amazonaws.com'}}],
             'Version': '2012-10-17'})
    )    
except Exception as e:
    print(e)
    
print("1.2 Attaching Policy")

iam.attach_role_policy(RoleName=IAM_ROLE_NAME,
                       PolicyArn="arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess"
                      )['ResponseMetadata']['HTTPStatusCode']

print("1.3 Get the IAM role ARN")
roleArn = iam.get_role(RoleName=IAM_ROLE_NAME)['Role']['Arn']

print(roleArn)

1.1 Creating a new IAM Role
1.2 Attaching Policy
1.3 Get the IAM role ARN
arn:aws:iam::109203719027:role/dwhRole


### - Create a [RedShift Cluster](https://console.aws.amazon.com/redshiftv2/home)
#### - For complete arguments to `create_cluster`, see [docs](https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/redshift.html#Redshift.Client.create_cluster)

In [14]:
print("1.4 Creating a Redshift cluster")

try:
    response = redshift.create_cluster(        
        #HW
        ClusterType=DB_CLUSTER_TYPE,
        NodeType=DB_NODE_TYPE,
        NumberOfNodes=int(DB_NUM_NODES),

        #Identifiers & Credentials
        DBName=DB_NAME,
        ClusterIdentifier=DB_CLUSTER_IDENTIFIER,
        MasterUsername=DB_USER,
        MasterUserPassword=DB_PASSWORD,
        
        #Roles (for s3 access)
        IamRoles=[roleArn]  
    )
except Exception as e:
    print(e)

1.4 Creating a Redshift cluster
An error occurred (ClusterAlreadyExists) when calling the CreateCluster operation: Cluster already exists


### Wait until cluster is available

In [15]:

i=0
while redshift.describe_clusters(ClusterIdentifier=DB_CLUSTER_IDENTIFIER)['Clusters'][0]['ClusterStatus']!='available':
    print ('Waiting for cluster to become available')
    time.sleep(10)
    i += 1
    
    if i > 30:
#       Error
        print('Error - cluster not available after 5 minutes')
        exit(1)
        
# TODO - handle error better    
print('Cluster Available')    


Waiting for cluster to become available
Waiting for cluster to become available
Waiting for cluster to become available
Waiting for cluster to become available
Waiting for cluster to become available
Waiting for cluster to become available
Cluster Available


### Create a function to parse the cluster details

In [9]:
def prettyRedshiftProps(props):
    pd.set_option('display.max_colwidth', -1)
    keysToShow = ["ClusterIdentifier", "NodeType", "ClusterStatus", "MasterUsername", "DBName", "Endpoint", "NumberOfNodes", 'VpcId']
    x = [(k, v) for k,v in props.items() if k in keysToShow]
    return pd.DataFrame(data=x, columns=["Key", "Value"])

### Display the cluster details

In [10]:
time.sleep(10)

myClusterProps = redshift.describe_clusters(ClusterIdentifier=DB_CLUSTER_IDENTIFIER)['Clusters'][0]
prettyRedshiftProps(myClusterProps)


DB_ENDPOINT = myClusterProps['Endpoint']['Address']
DB_ROLE_ARN = myClusterProps['IamRoles'][0]['IamRoleArn']
print("DB_ENDPOINT :: ", DB_ENDPOINT)
print("DB_ROLE_ARN :: ", DB_ROLE_ARN)

DB_ENDPOINT ::  dwhcluster.csasogv133my.us-west-2.redshift.amazonaws.com
DB_ROLE_ARN ::  arn:aws:iam::109203719027:role/dwhRole


### Delete the cluster

In [11]:
#### CAREFUL!!
#-- Uncomment & run to delete the created resources
redshift.delete_cluster( ClusterIdentifier=DB_CLUSTER_IDENTIFIER,  SkipFinalClusterSnapshot=True)
#### CAREFUL!!

def prettyRedshiftProps(props):
    pd.set_option('display.max_colwidth', -1)
    keysToShow = ["ClusterIdentifier", "NodeType", "ClusterStatus", "MasterUsername", "DBName", "Endpoint", "NumberOfNodes", 'VpcId']
    x = [(k, v) for k,v in props.items() if k in keysToShow]
    return pd.DataFrame(data=x, columns=["Key", "Value"])

myClusterProps = redshift.describe_clusters(ClusterIdentifier=DB_CLUSTER_IDENTIFIER)['Clusters'][0]
prettyRedshiftProps(myClusterProps)

#### CAREFUL!!
#-- Uncomment & run to delete the created resources
iam.detach_role_policy(RoleName=IAM_ROLE_NAME, PolicyArn="arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess")
iam.delete_role(RoleName=IAM_ROLE_NAME)
#### CAREFUL!!

print('Commands set to stop the Redshift cluster')

InvalidClusterStateFault: An error occurred (InvalidClusterState) when calling the DeleteCluster operation: There is an operation running on the Cluster. Please try to delete it at a later time.