## Pre-requisite steps - Create an IAM user and save the credentials

### Add aws.credentials to .gitignore
1. echo "aws.credentials" >> .gitignore

### Set up an AWS user whose credentials you are going to use
1. Launch AWS (I did it from the Udacity console
1. Navigate to Services --> IAM --> Users
1. Choose a name of your choice.
1. Select "Programmatic access" as the access type. Click Next.
1. Choose the Attach existing policies directly tab, and select the "AdministratorAccess". Click Next.
1. Skip adding any tags. Click Next.
1. Review and create the user. It will show you a pair of access key ID and secret.
1. Take note of the pair of access key ID and secret. This pair is collectively known as Access key.
1. Add the access key id and the secret key id to the aws.credentials file 

```
[AWS]
KEY=#####################
SECRET=#################################

```

see [Udacity exercise](https://learn.udacity.com/nanodegrees/nd027-ent-rbs/parts/cd0055/lessons/2ea9a35d-4276-48f0-aba8-3fd5d9057a7c/concepts/69da665c-acbe-4dcc-8846-46434098d418) for further details.

### Import packages

In [1]:
import pandas as pd
import boto3
import json
import time
import configparser
import os
import re



In [20]:
def uploadToS3(local_filepath, bucket_name, destination_folder, destination_filename=None):
    """
    
    
    """
#    print(local_filepath)    
    #if a directory is passed as origin_filepath parameter all files are copied into the destination_folder
    i=0
    if os.path.isfile(local_filepath):
        if destination_filename == None:
            destination_filename = re.findall('([^\/]*$)', local_filepath)[0]
        
        destination_filepath=destination_folder+'/'+destination_filename
        
        print(f'Transfering file: {local_filepath} ==> {bucket_name}/{destination_filepath}\n')
        
        s3.meta.client.upload_file(local_filepath, bucket_name, destination_filepath)
                
    elif os.path.isdir(local_filepath):
        filelist = os.walk(local_filepath)

        for root, subFolders, files in filelist:
            
            for file in files:
                
                if file[-4:] != '.crc':
                    
                    local_filename = os.path.join(root, file)
                    destination_filepath = os.path.join(destination_folder, local_filename.replace(local_filepath, ''))

                    print(f'Transfering file: {local_filepath} ==> {bucket_name}/{destination_filepath}\n')
                    s3.meta.client.upload_file(local_filename, bucket_name, destination_filepath)
                
    else: 
        
        print('Error - specified local file or directory not found.')

In [7]:
def list_files_in_bucket():
    """
    
    
    """
    my_bucket = s3.Bucket(S3_BUCKET)

    for my_bucket_object in my_bucket.objects.all():
        print(f'{my_bucket_object.key}\t\t{my_bucket_object.size}')

raw/dim/I94ADDR.csv		545
raw/dim/I94MODE.csv		51
raw/dim/I94PORT.csv		25805
raw/dim/I94RES.csv		4183
raw/dim/I94VISA.csv		49
raw/imm/_SUCCESS		0
raw/imm/i94yr=2016/i94mon=1/part-00000-5e74142c-084b-4b36-ac39-973f7b8ba87c.c000.snappy.parquet		3325181
raw/imm/i94yr=2016/i94mon=1/part-00001-5e74142c-084b-4b36-ac39-973f7b8ba87c.c000.snappy.parquet		3113287
raw/imm/i94yr=2016/i94mon=1/part-00002-5e74142c-084b-4b36-ac39-973f7b8ba87c.c000.snappy.parquet		3123661
raw/imm/i94yr=2016/i94mon=1/part-00003-5e74142c-084b-4b36-ac39-973f7b8ba87c.c000.snappy.parquet		3133351
raw/imm/i94yr=2016/i94mon=1/part-00004-5e74142c-084b-4b36-ac39-973f7b8ba87c.c000.snappy.parquet		3117410
raw/imm/i94yr=2016/i94mon=1/part-00005-5e74142c-084b-4b36-ac39-973f7b8ba87c.c000.snappy.parquet		3089914
raw/imm/i94yr=2016/i94mon=1/part-00006-5e74142c-084b-4b36-ac39-973f7b8ba87c.c000.snappy.parquet		3131823
raw/imm/i94yr=2016/i94mon=1/part-00007-5e74142c-084b-4b36-ac39-973f7b8ba87c.c000.snappy.parquet		2964949
raw/imm/i94yr=2

### Load AWS config from file

In [6]:
config = configparser.ConfigParser()
config.read_file(open('aws.cfg'))

DB_CLUSTER_TYPE        = config.get("CLUSTER","DB_CLUSTER_TYPE")
DB_NUM_NODES           = config.get("CLUSTER","DB_NUM_NODES")
DB_NODE_TYPE           = config.get("CLUSTER","DB_NODE_TYPE")

DB_NAME                = config.get("CLUSTER","DB_NAME")
DB_USER                = config.get("CLUSTER","DB_USER")
DB_PASSWORD            = config.get("CLUSTER","DB_PASSWORD")
DB_PORT                = config.get("CLUSTER","DB_PORT")

DB_CLUSTER_IDENTIFIER  = config.get("CLUSTER","DB_CLUSTER_IDENTIFIER")
DB_SNAPSHOT_IDENTIFIER = config.get("CLUSTER","DB_SNAPSHOT_IDENTIFIER")
DB_SNAPSHOT_RETENTION  = config.get("CLUSTER","DB_SNAPSHOT_RETENTION")

IAM_ROLE_NAME          = config.get("IAM_ROLE", "IAM_ROLE_NAME")
ARN                    = config.get("IAM_ROLE", "ARN")

S3_BUCKET              = config.get("S3", "S3_BUCKET")
S3_FOLDER              = config.get("S3", "S3_FOLDER")

I94_DATASET_PATH       = config.get("LOCAL_DATA", "I94_DATASET_PATH")
CLEAN_DATA_DIR         = config.get("LOCAL_DATA", "CLEAN_DATA_DIR")
LOCAL_FILEPATH         = config.get("LOCAL_DATA", "LOCAL_FILEPATH")
I94_LABELS             = config.get("LOCAL_DATA", "I94_LABELS")

### Read in AWS credentials

In [3]:
config = configparser.ConfigParser()
config.read_file(open('aws.credentials'))

KEY                    = config.get('AWS','KEY')
SECRET                 = config.get('AWS','SECRET')

### Create client for S3
**Note**: We are creating these resources in the the **us-west-2** region. Choose the same region in the your AWS web console to the see these resources.

In [4]:
s3 = boto3.resource('s3',
                       region_name="us-west-2",
                       aws_access_key_id=KEY,
                       aws_secret_access_key=SECRET
                   )

### Create S3 buckets

In [12]:
S3_BUCKET='tbcp'
print("1.1 Creating s3 bucket") 
try:
    s3.create_bucket(Bucket=S3_BUCKET, CreateBucketConfiguration={
                    'LocationConstraint': 'us-west-2'})
except Exception as e:
    print(e)

1.1 Creating s3 bucket
An error occurred (BucketAlreadyOwnedByYou) when calling the CreateBucket operation: Your previous request to create the named bucket succeeded and you already own it.


### Check that the new bucket exists

In [13]:
s3_client = boto3.client('s3',
                   region_name="us-west-2",
                   aws_access_key_id=KEY,
                   aws_secret_access_key=SECRET)
response = s3_client.list_buckets()

# Output the bucket names
print('Existing buckets:')
for bucket in response['Buckets']:
    print(f'  {bucket["Name"]}')

Existing buckets:
  aws-logs-109203719027-us-east-1
  tbcp
  tom-baird-capstone-project-2


### This will delete the bucket (unless it contains files)

In [14]:
# my_bucket = S3_BUCKET
# objects = s3_client.list_objects_v2(Bucket=my_bucket)
# fileCount = objects['KeyCount']

# if fileCount == 0:
#  response = s3_client.delete_bucket(Bucket=my_bucket)
#  print("{} has been deleted successfully !!!".format(my_bucket))
# else:
#  print("{} is not empty {} objects present".format(my_bucket,fileCount))
#  print("Please make sure S3 bucket is empty before deleting it !!!")

### Upload data

In [21]:

destination_folder='raw/'
local_filepath='data/'

uploadToS3(local_filepath, S3_BUCKET, destination_folder)

Transfering file: data/ ==> tbcp/raw/dim/I94VISA.csv

Transfering file: data/ ==> tbcp/raw/dim/I94PORT.csv

Transfering file: data/ ==> tbcp/raw/dim/I94ADDR.csv

Transfering file: data/ ==> tbcp/raw/dim/I94MODE.csv

Transfering file: data/ ==> tbcp/raw/dim/I94RES.csv

Transfering file: data/ ==> tbcp/raw/imm/_SUCCESS

Transfering file: data/ ==> tbcp/raw/imm/i94yr=2016/i94mon=10/part-00013-7d940b12-cf61-4c28-bf2a-9303a3d8fc3c.c000.snappy.parquet

Transfering file: data/ ==> tbcp/raw/imm/i94yr=2016/i94mon=10/part-00001-7d940b12-cf61-4c28-bf2a-9303a3d8fc3c.c000.snappy.parquet

Transfering file: data/ ==> tbcp/raw/imm/i94yr=2016/i94mon=10/part-00008-7d940b12-cf61-4c28-bf2a-9303a3d8fc3c.c000.snappy.parquet

Transfering file: data/ ==> tbcp/raw/imm/i94yr=2016/i94mon=10/part-00016-7d940b12-cf61-4c28-bf2a-9303a3d8fc3c.c000.snappy.parquet

Transfering file: data/ ==> tbcp/raw/imm/i94yr=2016/i94mon=10/part-00003-7d940b12-cf61-4c28-bf2a-9303a3d8fc3c.c000.snappy.parquet

Transfering file: data/ =

### Check the files in the bucket

In [9]:
my_bucket = s3.Bucket(S3_BUCKET)

for my_bucket_object in my_bucket.objects.all():
    print(f'{my_bucket_object.key}\t\t{my_bucket_object.size}')

#### Delete files in the bucket

In [None]:
my_bucket = s3.Bucket(S3_BUCKET)

for my_bucket_object in my_bucket.objects.all():
    my_bucket_object.delete()