## Connect

In [None]:
import pandas as pd
import psycopg2
import boto3
import json
from sqlalchemy import create_engine
from sqlalchemy import text

In [None]:
def get_secret(secret_name='wysde', region_name="us-east-1"):
    """Fetch a JSON secret from AWS Secrets Manager and return it parsed.

    Args:
        secret_name: Name (or ARN) of the secret, passed as ``SecretId``.
        region_name: AWS region of the Secrets Manager endpoint. Defaults
            to ``"us-east-1"``, the region this notebook originally
            hard-coded.

    Returns:
        The value produced by ``json.loads`` on the secret's
        ``SecretString`` payload.

    Raises:
        KeyError: If the response carries no ``SecretString`` (for example
            a secret stored as binary).
        json.JSONDecodeError: If ``SecretString`` is not valid JSON.
    """
    client = boto3.session.Session().client(
        service_name='secretsmanager',
        region_name=region_name,
    )
    response = client.get_secret_value(SecretId=secret_name)
    if 'SecretString' not in response:
        # Same exception type as the bare lookup would raise, but with a
        # message that says what is actually wrong.
        raise KeyError(
            f"Secret {secret_name!r} has no 'SecretString' payload; "
            "only string secrets containing JSON are supported"
        )
    return json.loads(response['SecretString'])

from urllib.parse import quote_plus

# Pull the Redshift connection details from the secret returned by get_secret().
secret_vals = get_secret()

redshift_endpoint = secret_vals['REDSHIFT_HOST']
redshift_user = secret_vals['REDSHIFT_USERNAME']
redshift_pass = secret_vals['REDSHIFT_PASSWORD']
port = 5439
dbname = "dev"

# The user and password are URL-encoded before being placed in the
# connection URL: characters such as '@', ':', '/' or '%' in a credential
# would otherwise be parsed as URL delimiters and break the connection.
engine_string = "postgresql+psycopg2://%s:%s@%s:%d/%s" % (
    quote_plus(redshift_user),
    quote_plus(redshift_pass),
    redshift_endpoint,
    port,
    dbname,
)
engine = create_engine(engine_string)

In [None]:
def _fetch_frame(sql):
    """Run *sql* through the module-level ``engine`` and return a DataFrame."""
    return pd.read_sql_query(text(sql), engine)


# List every table that lives outside the pg_catalog and
# information_schema schemas.
query = """
SELECT *
FROM pg_catalog.pg_tables
WHERE schemaname != 'pg_catalog' AND 
    schemaname != 'information_schema';
"""
df = _fetch_frame(query)

# Read the whole public.users table; this rebinds ``query`` and ``df``,
# so the catalog listing above is no longer referenced afterwards.
query = """
SELECT * FROM "dev"."public"."users";
"""
df = _fetch_frame(query)

## Process Taxi Data and Save It to Redshift Using AWS Wrangler

In [None]:
! pip install psycopg2-binary awscli boto3 awswrangler

In [None]:
import os
import boto3
import json
from time import time

import psycopg2
import pandas as pd
from sqlalchemy import create_engine
import awswrangler as wr


import subprocess

# Configuration for the yellow-taxi load.
SECRET_NAME = "wysde"
URL = "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-01.parquet"
INFILE = "yellow_tripdata_2022-01.parquet"
S3_PATH = "s3://wysde/taxi"
DB_NAME = "dev"
TABLE_NAME = "yellow_tripdata_2022_01"
CONN_NAME = "aws-data-wrangler-redshift-dev"

# Redshift connection looked up by its connection name (CONN_NAME).
CONN = wr.redshift.connect(CONN_NAME)

# Download the source file. check=True raises CalledProcessError when wget
# fails, so a missing or truncated file is never uploaded; passing the
# arguments as a list avoids going through a shell.
subprocess.run(["wget", URL, "-O", INFILE], check=True)

# Upload the file to S3, then read it back lazily in chunks.
S3_OBJECT = S3_PATH + "/" + INFILE
wr.s3.upload(local_file=INFILE, path=S3_OBJECT)
dfs = wr.s3.read_parquet(path=S3_OBJECT, chunked=100000)

for chunk_index, df in enumerate(dfs):
    df["tpep_pickup_datetime"] = pd.to_datetime(df["tpep_pickup_datetime"])
    df["tpep_dropoff_datetime"] = pd.to_datetime(df["tpep_dropoff_datetime"])
    # VendorID is not unique per trip, so upserting on it would delete the
    # rows loaded by earlier chunks that share a VendorID with the current
    # chunk. Instead, the first chunk replaces the table (so a re-run does
    # not duplicate data) and every later chunk is appended.
    wr.redshift.copy(
        df=df,
        path=S3_OBJECT,  # S3 location passed to copy() for its staged files
        con=CONN,
        schema='public',
        table=TABLE_NAME,
        mode="overwrite" if chunk_index == 0 else "append",
    )