## Setup

In [3]:
# Create the AWS configuration directory; -p makes this a no-op if it already exists.
!mkdir -p ~/.aws

In [4]:
%%writefile ~/.aws/credentials
[default]
aws_access_key_id=
aws_secret_access_key=
region=us-east-1
output=table

Writing /root/.aws/credentials


In [8]:
# Create the directory dbt reads profiles.yml from; -p makes this a no-op if it already exists.
!mkdir -p ~/.dbt

In [9]:
%%writefile ~/.dbt/profiles.yml
default:
  outputs:
    dev:
      # Must be the same database the Extract and Load cells write to (DBNAME).
      dbname: sparsh
      host: database-1.cy8ltogyfgas.us-east-1.rds.amazonaws.com
      pass: 
      port: 5432
      schema: dbt_taxi
      threads: 1
      type: postgres
      user: postgres
  target: dev

Writing /root/.dbt/profiles.yml


In [11]:
%%writefile requirements.txt
# dbt pinned to the version shown in the dbt logs of this notebook.
dbt-core==1.3.1
dbt-postgres==1.3.1
psycopg2-binary==2.9.3
ipython-sql==0.4.1
boto3==1.24.31
# Imported by the notebook; kept on the 1.x lines that work together with ipython-sql 0.4.1.
pandas<2.0
SQLAlchemy>=1.4,<2.0

Overwriting requirements.txt


In [None]:
!pip install -r requirements.txt

In [1]:
import json
from urllib.parse import quote

import boto3
import pandas as pd
import psycopg2
from sqlalchemy import create_engine

In [2]:
# Postgres schema the raw trip tables are loaded into; it matches the
# `schema` value in the dbt profile written during setup.
DBT_SCHEMA = "dbt_taxi"

In [3]:
def get_secret(secret_name, region_name="us-east-1"):
    """Fetch a secret from AWS Secrets Manager and decode it as JSON.

    Args:
        secret_name: Identifier of the secret, passed to the API as ``SecretId``.
        region_name: AWS region of the Secrets Manager endpoint. Defaults to
            "us-east-1", the region that was previously hard-coded here.

    Returns:
        The secret's ``SecretString`` payload parsed with ``json.loads``.

    Raises:
        KeyError: If the response carries no ``SecretString`` field.
        json.JSONDecodeError: If the secret string is not valid JSON.
        Any error raised by boto3 while calling the service propagates
        unchanged.
    """
    client = boto3.session.Session().client(
        service_name="secretsmanager",
        region_name=region_name,
    )
    response = client.get_secret_value(SecretId=secret_name)
    return json.loads(response["SecretString"])

In [4]:
# Read the database login from AWS Secrets Manager rather than hard-coding it.
db_credentials = get_secret(secret_name='wysde')

USERNAME = db_credentials["RDS_POSTGRES_USERNAME"]
PASSWORD = db_credentials["RDS_POSTGRES_PASSWORD"]
HOST = "database-1.cy8ltogyfgas.us-east-1.rds.amazonaws.com"
PORT = 5432
DBNAME = "sparsh"
# Percent-encode user and password (safe="" also encodes "/") so characters
# such as "@", ":" or "/" in the secret are not read as URL delimiters.
CONN = (
    f"postgresql://{quote(USERNAME, safe='')}:{quote(PASSWORD, safe='')}"
    f"@{HOST}:{PORT}/{DBNAME}"
)

In [5]:
# Build the SQLAlchemy engine from the connection URL; pool_recycle=3600 asks
# the pool to replace connections that are more than an hour old.
alchemyEngine = create_engine(CONN, pool_recycle=3600)
# Open a single connection that the to_sql cells below reuse. It is not
# explicitly closed anywhere in the cells shown here.
postgreSQLConnection = alchemyEngine.connect()

## Extract and Load

### external_fhv_tripdata

In [6]:
# Re-runnable download: -nc skips the fetch when the archive is already on
# disk, gunzip -k keeps the archive and -f overwrites a CSV from an earlier run.
!wget -q --show-progress -nc https://github.com/DataTalksClub/nyc-tlc-data/releases/download/fhv/fhv_tripdata_2019-02.csv.gz
!gunzip -kf fhv_tripdata_2019-02.csv.gz



In [9]:
# Read the decompressed FHV trip file for 2019-02 into a DataFrame and preview it.
df = pd.read_csv("fhv_tripdata_2019-02.csv")
df

Unnamed: 0,dispatching_base_num,pickup_datetime,dropOff_datetime,PUlocationID,DOlocationID,SR_Flag,Affiliated_base_number
0,B00037,2019-02-01 00:08:44,2019-02-01 00:23:35,264.0,265.0,,B00037
1,B00037,2019-02-01 00:27:51,2019-02-01 00:32:54,264.0,265.0,,B00037
2,B00037,2019-02-01 00:18:30,2019-02-01 00:25:45,264.0,265.0,,B00037
3,B00037,2019-02-01 00:43:15,2019-02-01 00:48:29,264.0,265.0,,B00037
4,B00037,2019-02-01 00:01:45,2019-02-01 00:09:13,264.0,265.0,,B00037
...,...,...,...,...,...,...,...
1707644,B03129,2019-02-28 23:55:25,2019-03-01 00:06:18,94.0,169.0,,B03129
1707645,B03157,2019-02-28 23:35:26,2019-02-28 23:39:00,264.0,265.0,,B03157
1707646,B03157,2019-02-28 23:13:40,2019-02-28 23:17:24,264.0,265.0,,B03157
1707647,B03157,2019-02-28 23:18:45,2019-02-28 23:56:57,264.0,265.0,,B03157


In [11]:
# Lower-case the headers so the Postgres columns are plain lower-case identifiers.
df.columns = [x.lower() for x in df.columns]
# Load a 10,000-row sample into <DBT_SCHEMA>.external_fhv_tripdata, replacing any
# existing table. The fixed random_state keeps the sample, and therefore the
# downstream dbt results, identical from run to run.
df.sample(10000, random_state=42).to_sql('external_fhv_tripdata', postgreSQLConnection, if_exists='replace', schema=DBT_SCHEMA, index=False)

### ny_yellow_tripdata

In [12]:
# Re-runnable download: -nc skips the fetch when the archive is already on
# disk, gunzip -k keeps the archive and -f overwrites a CSV from an earlier run.
!wget -q --show-progress -nc https://github.com/DataTalksClub/nyc-tlc-data/releases/download/yellow/yellow_tripdata_2019-02.csv.gz
!gunzip -kf yellow_tripdata_2019-02.csv.gz



In [13]:
# Read the decompressed yellow-taxi trip file for 2019-02 into a DataFrame and
# preview it. This rebinds `df`, discarding the previously loaded frame.
df = pd.read_csv("yellow_tripdata_2019-02.csv")
df

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge
0,1,2019-02-01 00:59:04,2019-02-01 01:07:27,1,2.1,1,N,48,234,1,9.0,0.5,0.5,2.0,0.0,0.3,12.3,0.0
1,1,2019-02-01 00:33:09,2019-02-01 01:03:58,1,9.8,1,N,230,93,2,32.0,0.5,0.5,0.0,0.0,0.3,33.3,0.0
2,1,2019-02-01 00:09:03,2019-02-01 00:09:16,1,0.0,1,N,145,145,2,2.5,0.5,0.5,0.0,0.0,0.3,3.8,0.0
3,1,2019-02-01 00:45:38,2019-02-01 00:51:10,1,0.8,1,N,95,95,2,5.5,0.5,0.5,0.0,0.0,0.3,6.8,0.0
4,1,2019-02-01 00:25:30,2019-02-01 00:28:14,1,0.8,1,N,140,263,2,5.0,0.5,0.5,0.0,0.0,0.3,6.3,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7019370,2,2019-02-28 23:29:08,2019-02-28 23:29:11,1,0.0,1,N,193,193,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7019371,2,2019-02-28 22:48:47,2019-02-28 23:50:19,1,0.0,1,N,141,193,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.5
7019372,2,2019-02-28 23:41:23,2019-02-28 23:42:23,1,0.0,1,N,264,264,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7019373,2,2019-02-28 23:12:52,2019-02-28 23:14:16,1,0.0,1,N,264,193,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
# Lower-case the headers so the Postgres columns are plain lower-case identifiers.
df.columns = [x.lower() for x in df.columns]
# Load a 10,000-row sample into <DBT_SCHEMA>.ny_yellow_tripdata, replacing any
# existing table. The fixed random_state keeps the sample, and therefore the
# downstream dbt results, identical from run to run.
df.sample(10000, random_state=42).to_sql('ny_yellow_tripdata', postgreSQLConnection, if_exists='replace', schema=DBT_SCHEMA, index=False)

### ny_green_tripdata

In [15]:
# Re-runnable download: -nc skips the fetch when the archive is already on
# disk, gunzip -k keeps the archive and -f overwrites a CSV from an earlier run.
!wget -q --show-progress -nc https://github.com/DataTalksClub/nyc-tlc-data/releases/download/green/green_tripdata_2019-02.csv.gz
!gunzip -kf green_tripdata_2019-02.csv.gz



In [16]:
# Read the decompressed green-taxi trip file for 2019-02 into a DataFrame and
# preview it. This rebinds `df`, discarding the previously loaded frame.
df = pd.read_csv("green_tripdata_2019-02.csv")
df

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge
0,2,2019-02-01 00:10:19,2019-02-01 00:21:43,N,1,92,135,1,2.79,11.0,0.5,0.5,3.08,0.0,,0.3,15.38,1,1,0.00
1,2,2019-02-01 00:02:16,2019-02-01 00:24:37,N,1,66,36,1,4.46,17.5,0.5,0.5,3.76,0.0,,0.3,22.56,1,1,0.00
2,2,2019-02-01 00:37:19,2019-02-01 00:43:07,N,1,255,112,1,1.26,6.0,0.5,0.5,1.46,0.0,,0.3,8.76,1,1,0.00
3,1,2019-02-01 00:10:10,2019-02-01 00:12:21,N,1,75,238,1,0.70,4.0,0.5,0.5,0.00,0.0,,0.3,5.30,2,1,0.00
4,1,2019-02-01 00:30:19,2019-02-01 00:46:14,N,1,75,48,1,3.90,14.5,0.5,0.5,0.00,0.0,,0.3,15.80,2,1,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
575680,2,2019-02-28 23:44:17,2019-02-28 23:51:14,N,1,42,116,1,1.34,7.0,0.5,0.5,1.66,0.0,,0.3,9.96,1,1,0.00
575681,2,2019-02-28 23:56:53,2019-03-01 00:10:06,N,1,42,41,1,2.09,8.5,0.5,0.5,0.00,0.0,,0.3,9.80,2,1,0.00
575682,2,2019-02-28 23:34:53,2019-02-28 23:43:26,N,1,65,144,1,1.96,9.0,0.5,0.5,2.61,0.0,,0.3,15.66,1,1,2.75
575683,2,2019-02-28 23:04:18,2019-02-28 23:08:48,N,1,41,42,1,0.79,5.5,0.5,0.5,0.00,0.0,,0.3,6.80,2,1,0.00


In [17]:
# Lower-case the headers so the Postgres columns are plain lower-case identifiers.
df.columns = [x.lower() for x in df.columns]
# Load a 10,000-row sample into <DBT_SCHEMA>.ny_green_tripdata, replacing any
# existing table. The fixed random_state keeps the sample, and therefore the
# downstream dbt results, identical from run to run.
df.sample(10000, random_state=42).to_sql('ny_green_tripdata', postgreSQLConnection, if_exists='replace', schema=DBT_SCHEMA, index=False)

## Transformation

In [4]:
# Check the dbt setup: profiles.yml, dbt_project.yml, git, and the database connection.
!dbt debug

[0m09:30:13  Running with dbt=1.3.1
dbt version: 1.3.1
python version: 3.9.7
python path: /Users/sparshagarwal/anaconda3/envs/env-spacy/bin/python
os info: macOS-10.16-x86_64-i386-64bit
Using profiles.yml file at /Users/sparshagarwal/.dbt/profiles.yml
Using dbt_project.yml file at /Users/sparshagarwal/Desktop/projects/de/de-main/03-processing/dbt/_lab-nyctaxi-lookup/dbt_project.yml

Configuration:
  profiles.yml file [[32mOK found and valid[0m]
  dbt_project.yml file [[32mOK found and valid[0m]

Required dependencies:
 - git [[32mOK found[0m]

Connection:
  host: database-1.cy8ltogyfgas.us-east-1.rds.amazonaws.com
  port: 5432
  user: postgres
  database: sparsh
  schema: dbt_taxi
  search_path: None
  keepalives_idle: 0
  sslmode: None
  Connection test: [[32mOK connection ok[0m]

[32mAll checks passed![0m


In [8]:
# Install the packages declared in packages.yml (dbt_utils for this project).
!dbt deps

[0m09:02:12  Running with dbt=1.3.1
[0m09:02:13  Installing dbt-labs/dbt_utils
[0m09:02:15    Installed from version 0.8.0
[0m09:02:15    Updated version available: 1.0.0
[0m09:02:15  
[0m09:02:15  Updates available for packages: ['dbt-labs/dbt_utils']                 
Update your versions in packages.yml, then run dbt deps


In [10]:
# Load the project's seed file (taxi_zone_lookups) into the target schema;
# --full-refresh rebuilds the seed table instead of reusing an existing one.
!dbt seed --full-refresh

[0m09:38:54  Running with dbt=1.3.1
[0m09:38:54  Found 7 models, 11 tests, 0 snapshots, 0 analyses, 477 macros, 0 operations, 1 seed file, 3 sources, 0 exposures, 0 metrics
[0m09:38:54  
[0m09:39:06  Concurrency: 1 threads (target='dev')
[0m09:39:06  
[0m09:39:06  1 of 1 START seed file dbt_taxi.taxi_zone_lookups .............................. [RUN]
[0m09:39:11  1 of 1 OK loaded seed file dbt_taxi.taxi_zone_lookups .......................... [[32mCREATE 265[0m in 4.63s]
[0m09:39:14  
[0m09:39:14  Finished running 1 seed in 0 hours 0 minutes and 20.04 seconds (20.04s).
[0m09:39:14  
[0m09:39:14  [32mCompleted successfully[0m
[0m09:39:14  
[0m09:39:14  Done. PASS=1 WARN=0 ERROR=0 SKIP=0 TOTAL=1


In [14]:
# Build all models of the project (tables and views) in the target schema.
!dbt run

[0m09:53:43  Running with dbt=1.3.1
[0m09:53:44  Found 7 models, 11 tests, 0 snapshots, 0 analyses, 477 macros, 0 operations, 1 seed file, 3 sources, 0 exposures, 0 metrics
[0m09:53:44  
[0m09:53:56  Concurrency: 1 threads (target='dev')
[0m09:53:56  
[0m09:53:56  1 of 7 START sql table model dbt_taxi.dim_zones ................................ [RUN]
[0m09:54:00  1 of 7 OK created sql table model dbt_taxi.dim_zones ........................... [[32mSELECT 265[0m in 4.57s]
[0m09:54:00  2 of 7 START sql view model dbt_taxi.stg_fhv_tripdata .......................... [RUN]
[0m09:54:05  2 of 7 OK created sql view model dbt_taxi.stg_fhv_tripdata ..................... [[32mCREATE VIEW[0m in 4.08s]
[0m09:54:05  3 of 7 START sql view model dbt_taxi.stg_green_tripdata ........................ [RUN]
[0m09:54:09  3 of 7 OK created sql view model dbt_taxi.stg_green_tripdata ................... [[32mCREATE VIEW[0m in 4.36s]
[0m09:54:09  4 of 7 START sql view model dbt_taxi.stg_yello

In [15]:
# Run the tests defined in the project against the models built above.
!dbt test

[0m09:54:41  Running with dbt=1.3.1
[0m09:54:41  Found 7 models, 11 tests, 0 snapshots, 0 analyses, 477 macros, 0 operations, 1 seed file, 3 sources, 0 exposures, 0 metrics
[0m09:54:41  
[0m09:54:50  Concurrency: 1 threads (target='dev')
[0m09:54:50  
[0m09:54:50  1 of 11 START test accepted_values_stg_green_tripdata_Payment_type__False___var_payment_type_values_  [RUN]
[0m09:54:54  1 of 11 PASS accepted_values_stg_green_tripdata_Payment_type__False___var_payment_type_values_  [[32mPASS[0m in 3.58s]
[0m09:54:54  2 of 11 START test accepted_values_stg_yellow_tripdata_Payment_type__False___var_payment_type_values_  [RUN]
[0m09:54:57  2 of 11 PASS accepted_values_stg_yellow_tripdata_Payment_type__False___var_payment_type_values_  [[32mPASS[0m in 3.33s]
[0m09:54:57  3 of 11 START test not_null_dm_monthly_zone_revenue_revenue_monthly_total_amount  [RUN]
[0m09:55:01  3 of 11 PASS not_null_dm_monthly_zone_revenue_revenue_monthly_total_amount ..... [[32mPASS[0m in 3.43s]
[0m0

In [16]:
# Generate the documentation artifacts; the catalog is written to target/catalog.json.
!dbt docs generate

[0m09:55:39  Running with dbt=1.3.1
[0m09:55:39  Found 7 models, 11 tests, 0 snapshots, 0 analyses, 477 macros, 0 operations, 1 seed file, 3 sources, 0 exposures, 0 metrics
[0m09:55:39  
[0m09:55:47  Concurrency: 1 threads (target='dev')
[0m09:55:47  
[0m09:55:48  Done.
[0m09:55:48  Building catalog
[0m09:55:51  Catalog written to /Users/sparshagarwal/Desktop/projects/de/de-main/03-processing/dbt/_lab-nyctaxi-lookup/target/catalog.json


In [17]:
# Serve the generated docs on port 8080. The command keeps running until it is
# interrupted, so this cell blocks the kernel; keep it as the last cell.
!dbt docs serve

[0m09:55:59  Running with dbt=1.3.1
[0m09:55:59  Serving docs at 0.0.0.0:8080
[0m09:55:59  To access from your browser, navigate to:  http://localhost:8080
[0m09:55:59  
[0m09:55:59  
[0m09:55:59  Press Ctrl+C to exit.
127.0.0.1 - - [27/Feb/2023 15:26:00] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [27/Feb/2023 15:26:02] "GET /manifest.json?cb=1677491762217 HTTP/1.1" 200 -
127.0.0.1 - - [27/Feb/2023 15:26:02] "GET /catalog.json?cb=1677491762217 HTTP/1.1" 200 -


![lineage-graph](https://user-images.githubusercontent.com/62965911/221533101-79779e99-5d2b-4438-8702-a76a253de226.png)