# Introduction

Follow full tutorials here: https://github.com/aws/aws-sdk-pandas/blob/main/tutorials/001%20-%20Introduction.ipynb

## Pre-requisites
This part assumes that the terraform backend has been provisioned properly (and has not been destroyed).
To follow this guide, a Redshift cluster has to be provisioned.
Simply run `terraform apply` in the current folder to get a cluster up and running, so that you can run this notebook and practice with AWS Wrangler.

In [1]:
import awswrangler as wr
wr.__version__

'3.4.2'

# Sessions
Create a customized session using `boto3.Session()`.
Reference: https://boto3.amazonaws.com/v1/documentation/api/latest/reference/core/session.html

In [2]:
import boto3
from dotenv import load_dotenv
import os

# Read the local dotenv file first so the three settings below can be looked
# up from the environment; a setting that is not defined resolves to None.
load_dotenv(".env.aws_credentials")
AWS_ACCESS_KEY, AWS_SECRET_KEY, AWS_REGION = (
    os.environ.get(setting)
    for setting in ("AWS_ACCESS_KEY", "AWS_SECRET_KEY", "AWS_REGION")
)

# Custom boto3 session built from my own AWS development credentials; it is
# passed explicitly to the awswrangler calls further down.
my_session = boto3.Session(
    region_name=AWS_REGION,
    aws_access_key_id=AWS_ACCESS_KEY,
    aws_secret_access_key=AWS_SECRET_KEY,
)

In [3]:
# Sanity check of the custom session: probe a key named "fake" in a public
# bucket; the cell displays the boolean result.
wr.s3.does_object_exist(
    path="s3://noaa-ghcn-pds/fake",
    boto3_session=my_session,
)

False

In [4]:
# Amazon S3
# Bucket that holds the data extracted in the prefect exercise.
bucket_name = "zoomcamp-extracted-data"

# Confirm that the parquet file produced by that exercise is present in the
# bucket before trying to download it.
wr.s3.does_object_exist(
    path=f"s3://{bucket_name}/yellow_tripdata_2023-09.parquet",
    boto3_session=my_session,
)

True

In [5]:
# Download the parquet object from S3 to a local file path.
import os
import pandas as pd

local_file_dir = "./download/"
s3_file_name = "yellow_tripdata_2023-09.parquet"
s3_file_path = f"s3://{bucket_name}/{s3_file_name}"
local_file = os.path.join(local_file_dir, s3_file_name)

# The target directory is not part of a fresh checkout; create it up front so
# the download does not fail with FileNotFoundError when opening local_file.
os.makedirs(local_file_dir, exist_ok=True)

# Use the same custom session as every other awswrangler call in this
# notebook, so the download runs with the development credentials rather than
# whatever default credentials happen to be configured on the machine.
wr.s3.download(
    path=s3_file_path,
    local_file=local_file,
    boto3_session=my_session,
)


In [6]:
# Read the downloaded parquet file and keep only its first 100 rows as the
# demo sample used in the rest of the notebook.
taxi_df = pd.read_parquet(local_file).head(100)

# Preview the sample.
taxi_df.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee
0,1,2023-09-01 00:15:37,2023-09-01 00:20:21,1.0,0.8,1.0,N,163,230,2,6.5,3.5,0.5,0.0,0.0,1.0,11.5,2.5,0.0
1,2,2023-09-01 00:18:40,2023-09-01 00:30:28,2.0,2.34,1.0,N,236,233,1,14.2,1.0,0.5,2.0,0.0,1.0,21.2,2.5,0.0
2,2,2023-09-01 00:35:01,2023-09-01 00:39:04,1.0,1.62,1.0,N,162,236,1,8.6,1.0,0.5,2.0,0.0,1.0,15.6,2.5,0.0
3,2,2023-09-01 00:45:45,2023-09-01 00:47:37,1.0,0.74,1.0,N,141,229,1,5.1,1.0,0.5,1.0,0.0,1.0,11.1,2.5,0.0
4,2,2023-09-01 00:01:23,2023-09-01 00:38:05,1.0,9.85,1.0,N,138,230,1,45.0,6.0,0.5,17.02,0.0,1.0,73.77,2.5,1.75


In [7]:

# Database name for the Redshift connection, taken from the environment
# (None when the variable is not set).
REDSHIFT_DATABASE = os.environ.get("REDSHIFT_DATABASE")


In [9]:
# Prerequisite: the security group of the Redshift cluster's VPC must allow
# inbound access on port 5439, otherwise the connection attempt cannot succeed.
glue_connection_name = "zoomcamp_redshift_glue_connection"

# Open the connection described by the Glue connection entry, using the
# custom session and the database name read from the environment.
con = wr.redshift.connect(
    boto3_session=my_session,
    connection=glue_connection_name,
    dbname=REDSHIFT_DATABASE,
)

# Smoke test: a trivial query proves the connection is usable.
with con.cursor() as cur:
    cur.execute("SELECT 1;")
    print(cur.fetchall())

# The connection is deliberately left open here because the copy step that
# follows reuses `con`; call con.close() once you are done with it.

([1],)


In [10]:
s3_staging_folder = "copy_to_redshift_staging"
table_name = "demo_taxi_data"

# Copy the DataFrame to the Redshift cluster, using the S3 path below as the
# staging location for the load.
# mode="overwrite" replaces the target table on every run. With the default
# mode ("append") each re-run of this cell would add another copy of the rows,
# so the count printed below would no longer match len(taxi_df).
# NOTE(review): wr.redshift.to_sql(..., mode="overwrite") looks like the
# alternative without S3 staging - confirm against the awswrangler docs.
wr.redshift.copy(
    df=taxi_df,
    path=f"s3://{bucket_name}/{s3_staging_folder}",
    con=con,
    table=table_name,
    schema="public",
    mode="overwrite",
    boto3_session=my_session,
)

# Verify the load: the row count should equal the number of rows in taxi_df.
with con.cursor() as cursor:
    cursor.execute(f"SELECT COUNT(*) FROM {table_name};")
    print(cursor.fetchall())

([100],)


# After Session
Please ensure you destroy the provisioned resources to avoid any charges.
Simply run `terraform destroy` in the `aws_wrangler_tutorial` folder.