## Install the libraries

In [None]:
%%writefile requirements.txt
# dbt-core and its adapter must be pinned to the same release: dbt-postgres
# declares an exact dependency on the matching dbt-core version, so mixing
# 1.2.0 with 1.1.1 makes the install unresolvable.
dbt-core==1.2.0
dbt-postgres==1.2.0
psycopg2-binary==2.9.3
ipython-sql==0.4.1
boto3==1.24.31

In [None]:
!pip install -r requirements.txt

In [None]:
import json
from pathlib import Path
from urllib.parse import quote_plus

import boto3
import pandas as pd
import psycopg2
from sqlalchemy import create_engine

In [None]:
# Postgres schema that receives the raw tables; the dbt profile and the dbt
# sources written later in this notebook use the same schema name.
DBT_SCHEMA = "dbt_taxi"

## Extract

In this step, we download the May 2022 yellow-taxi trip data and the taxi-zone lookup table, and keep a 1,000-row sample of the trips.

In [None]:
!wget --show-progress https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-05.parquet
pd.read_parquet("yellow_tripdata_2022-05.parquet").sample(1000).to_parquet("yellow_tripdata_2022-05_sample_1000.parquet")

In [None]:
!wget -q --show-progress https://d37ci6vzurychx.cloudfront.net/misc/taxi+_zone_lookup.csv

## Explore

In [None]:
# Read the sampled trips back from disk and preview the first rows.
trips_sample_path = 'yellow_tripdata_2022-05_sample_1000.parquet'
yellow_tripdata_df = pd.read_parquet(trips_sample_path)
yellow_tripdata_df.head()

In [None]:
# Column names, dtypes and non-null counts of the sampled trips.
yellow_tripdata_df.info()

In [None]:
# Read the taxi-zone lookup table and preview the first rows.
zone_lookup_path = 'taxi+_zone_lookup.csv'
lookup_zone = pd.read_csv(zone_lookup_path)
lookup_zone.head()

In [None]:
# Column names, dtypes and non-null counts of the zone lookup table.
lookup_zone.info()

## Preprocessing

In [None]:
# Keep only the columns of interest and give them snake_case names.
# Maps source column -> name used in Postgres and in the dbt models.
trip_column_renames = {
    'VendorID': 'vendor_id',
    'tpep_pickup_datetime': 'pickup_datetime',
    'tpep_dropoff_datetime': 'dropoff_datetime',
    'passenger_count': 'passenger_count',
    'PULocationID': 'pickup_location_id',
    'DOLocationID': 'dropoff_location_id',
    'fare_amount': 'fare_amount',
}

yellow_tripdata_df = (
    yellow_tripdata_df[list(trip_column_renames)]
    .rename(columns=trip_column_renames)
)

yellow_tripdata_df

In [None]:
# Lower-case column names, matching what the dbt models select.
zone_column_names = ['locationid', 'borough', 'zone', 'service_zone']
lookup_zone.columns = zone_column_names

lookup_zone

## Load

In this step, we load both DataFrames into PostgreSQL.

In [None]:
# Setup the credentials

def get_secret(secret_name, region_name="us-east-1"):
    """Fetch a secret from AWS Secrets Manager and decode it as JSON.

    Args:
        secret_name: Name (SecretId) of the secret to read.
        region_name: AWS region that holds the secret. Defaults to
            "us-east-1", the region this notebook was written against.

    Returns:
        The secret's ``SecretString`` payload parsed with ``json.loads``.

    Raises:
        KeyError: If the response carries no ``SecretString`` entry.
        json.JSONDecodeError: If the secret string is not valid JSON.
    """
    client = boto3.session.Session().client(
        service_name='secretsmanager',
        region_name=region_name,
    )
    response = client.get_secret_value(SecretId=secret_name)
    return json.loads(response['SecretString'])

# Pull the database login from Secrets Manager instead of hard-coding it.
db_credentials = get_secret(secret_name='wysde')

USERNAME = db_credentials["RDS_POSTGRES_USERNAME"]
PASSWORD = db_credentials["RDS_POSTGRES_PASSWORD"]
HOST = "database-1.cy8ltogyfgas.us-east-1.rds.amazonaws.com"
PORT = 5432
DBNAME = "sparsh"

# Percent-encode the user and password before embedding them in the URL:
# characters such as "@", ":" or "/" in a raw secret would otherwise be read
# as URL delimiters and produce a wrong (or unparsable) connection string.
# USERNAME and PASSWORD themselves stay unencoded.
CONN = f"postgresql://{quote_plus(USERNAME)}:{quote_plus(PASSWORD)}@{HOST}:{PORT}/{DBNAME}"

In [None]:
# Create the SQLAlchemy engine and open the connection used by the loads below.
alchemyEngine = create_engine(CONN, pool_recycle=3600)
postgreSQLConnection = alchemyEngine.connect()

In [None]:
# Same value as the DBT_SCHEMA assigned near the top of the notebook; it is
# set again here, directly before the loads that use it as the target schema.
DBT_SCHEMA = "dbt_taxi"

In [None]:
# Write the zone lookup table, replacing any table left by a previous run.
lookup_zone.to_sql(
    name='taxi_zone_lookup',
    con=postgreSQLConnection,
    schema=DBT_SCHEMA,
    if_exists='replace',
    index=False,
)

In [None]:
# Write the trips table, replacing any table left by a previous run.
# The trailing semicolon suppresses the cell output.
yellow_tripdata_df.to_sql(
    name='yellow_taxi_trips',
    con=postgreSQLConnection,
    schema=DBT_SCHEMA,
    if_exists='replace',
    index=False,
);

In [None]:
# Both tables are loaded; release the connection.
postgreSQLConnection.close()

## Data Load Validation

In [None]:
# Load (or reload) the extension that provides the %sql magic used below.
%reload_ext sql

In [None]:
# Point %sql at the same database URL that was used for the load.
%sql {CONN}

In [None]:
# Preview ten rows of the loaded trips table.
%sql select * from {DBT_SCHEMA}.yellow_taxi_trips limit 10;

In [None]:
%sql select count(*) from {DBT_SCHEMA}.yellow_taxi_trips limit 10;

In [None]:
%sql select count(*) from {DBT_SCHEMA}.taxi_zone_lookup limit 10;

## dbt Setup

Initialize the dbt project

In [None]:
!dbt init

In [None]:
# Work from inside the project directory; the relative ./models paths written
# by later cells depend on it.
%cd nyctaxi

In [None]:
# List the files of the project directory.
!tree --du -h -C .

In [None]:
# List the contents of ~/.dbt, the directory the profile is written to below.
!tree --du -h -C ~/.dbt

In [None]:
# Print the current profile file.
# NOTE(review): the output may contain credentials — clear it before sharing.
!cat ~/.dbt/profiles.yml

Set up the database credentials

In [None]:
# Render ~/.dbt/profiles.yml from the connection settings defined in the Load
# section (HOST, PORT, USERNAME, PASSWORD, DBNAME, DBT_SCHEMA). A hand-written
# profile with an empty `pass:` cannot authenticate, and repeating the host,
# database and schema by hand lets them drift from the values used to load.
profiles_path = Path.home() / ".dbt" / "profiles.yml"
profiles_path.parent.mkdir(parents=True, exist_ok=True)

# json.dumps produces a double-quoted scalar, which is also valid YAML, so
# special characters in any of the values are written safely.
profiles_yml = f"""\
nyctaxi:
  outputs:
    dev:
      type: postgres
      threads: 1
      host: {json.dumps(HOST)}
      port: {PORT}
      user: {json.dumps(USERNAME)}
      pass: {json.dumps(PASSWORD)}
      dbname: {json.dumps(DBNAME)}
      schema: {json.dumps(DBT_SCHEMA)}
  target: dev
"""
profiles_path.write_text(profiles_yml)

# The file now holds a password: restrict it to the current user.
profiles_path.chmod(0o600)

In [None]:
# Run dbt's diagnostics before running any models.
# NOTE(review): presumably validates the profile and database connection — confirm.
!dbt debug

In [None]:
# First run, on the project as initialized (the staging models are added below).
!dbt run

In [None]:
# Run the tests defined in the project at this point.
!dbt test

In [None]:
# List the project directory again, after the run and test commands.
!tree --du -h -C .

In [None]:
# Directory for the staging models and their schema.yml written in the next cells.
!mkdir -p ./models/staging

In [None]:
%%writefile ./models/staging/schema.yml
version: 2

# Raw tables loaded into Postgres by the notebook.
sources:
  - name: source
    schema: dbt_taxi
    tables:
      - name: yellow_taxi_trips
      - name: taxi_zone_lookup

# Staging models; every column must be populated.
models:
  - name: taxi_zone_lookup_model
    description: "A list of all taxi zones with codes in NYC"
    columns:
      - name: locationid
        tests: [not_null]
      - name: borough
        tests: [not_null]
      - name: zone
        tests: [not_null]
      - name: service_zone
        tests: [not_null]

  - name: yellow_taxi_trips_models
    description: "A reduced version of yellow taxi trip data in NYC"
    columns:
      - name: vendor_id
        tests:
          - not_null
          - accepted_values:
              values: ['1', '2', '4']
      - name: pickup_datetime
        tests: [not_null]
      - name: dropoff_datetime
        tests: [not_null]
      - name: passenger_count
        tests: [not_null]
      - name: pickup_location_id
        tests: [not_null]
      - name: dropoff_location_id
        tests: [not_null]
      - name: fare_amount
        tests: [not_null]

In [None]:
%%writefile ./models/staging/yellow_taxi_trips_models.sql
-- Staging model over the raw trips table loaded by the notebook.
with trips as (

    select * from {{ source('source', 'yellow_taxi_trips') }}

)

select
    vendor_id,
    pickup_datetime,
    dropoff_datetime,
    passenger_count,
    pickup_location_id,
    dropoff_location_id,
    fare_amount
from trips

In [None]:
%%writefile ./models/staging/taxi_zone_lookup_model.sql
-- Staging model over the raw zone lookup table loaded by the notebook.
with zones as (

    select * from {{ source('source', 'taxi_zone_lookup') }}

)

select
    locationid,
    borough,
    zone,
    service_zone
from zones

We will now create another dbt model, which combines data from the two staging models. Let's assume we want to write a query to join the staging tables on the location ID fields and add the actual location names to the pickup and dropoff locations of the taxi ride data.

In [None]:
%%writefile ./models/trips_with_borough_name_model.sql
-- Trips enriched with the borough of the pickup and the dropoff location.
-- The zone lookup is joined twice, once per location id; left joins keep
-- trips whose location id has no match in the lookup.
with trips as (

    select * from {{ ref('yellow_taxi_trips_models') }}

),

zones as (

    select * from {{ ref('taxi_zone_lookup_model') }}

)

select
    trips.vendor_id,
    trips.pickup_datetime,
    trips.dropoff_datetime,
    pickup_zone.borough as pickup_borough,
    dropoff_zone.borough as dropoff_borough,
    trips.passenger_count,
    trips.fare_amount
from trips
left join zones as pickup_zone
    on trips.pickup_location_id = pickup_zone.locationid
left join zones as dropoff_zone
    on trips.dropoff_location_id = dropoff_zone.locationid

In [None]:
%%writefile ./models/schema.yml
version: 2

models:
  # The name must equal the model's file name without ".sql"
  # (models/trips_with_borough_name_model.sql); otherwise the description and
  # column properties below are not attached to any model.
  - name: trips_with_borough_name_model
    description: "Combines taxi rides with the borough names for pickup and dropoff locations."
    columns:
      - name: vendor_id
      - name: pickup_datetime
      - name: dropoff_datetime
      - name: pickup_borough
      - name: dropoff_borough
      - name: passenger_count
      - name: fare_amount

In [None]:
# Run again now that the staging models and the joined model have been written.
!dbt run

In [None]:
# Generate the project documentation.
!dbt docs generate

In [None]:
# Serve the generated documentation.
# NOTE(review): presumably starts a local web server and keeps this cell
# running until it is interrupted — confirm before using Run All.
!dbt docs serve