# Import Yellow Taxi Data - Part 2

Create a Hive external table over the Yellow Taxi Parquet files stored in S3, then query it through Trino.


## Create Hive Table

In [17]:
import urllib3

# Silence urllib3's InsecureRequestWarning. The Trino connection opened later in
# this notebook passes "verify": false, which presumably is what triggers it.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

In [18]:
# Load the `sql` IPython extension; the %sql / %%sql magics used below come from it.
%load_ext sql

The sql extension is already loaded. To reload it, use:
  %reload_ext sql


In [19]:
# Turn on SqlMagic's named-parameter handling for the SQL cells.
# NOTE(review): the only substitution visible in this notebook is the
# S3A_BUCKET template in the CREATE TABLE cell; confirm this setting is
# actually required for it.
%config SqlMagic.named_parameters = "enabled"

In [20]:
# Connect to Trino as user `admin` over HTTPS on port 8443, with `hive` as the catalog.
# The host comes from DOCKER_HOST_OR_IP (presumably an environment variable).
# SECURITY: "verify": false turns off TLS certificate verification; only use this
# against a trusted development endpoint.
%sql trino://admin@${DOCKER_HOST_OR_IP}:8443/hive --connection_arguments '{"http_scheme":"https", "verify": false}'

In [21]:
%%sql

-- List the catalogs visible on this Trino connection.
SHOW CATALOGS

Catalog
hive
iceberg
system
tpcds
vast


In [22]:
import os

# Name of the bucket that holds the Parquet files; it is templated into the
# external_location of the CREATE TABLE cell further down.
S3A_BUCKET = os.getenv('S3A_BUCKET')

# Fail fast: os.getenv() returns None when the variable is unset, and that value
# would otherwise be templated into the table location instead of a bucket name.
if not S3A_BUCKET:
    raise RuntimeError("Environment variable S3A_BUCKET is not set")

In [23]:
# Display the bucket name directly from Python instead of interpolating it
# into a shell command.
print(S3A_BUCKET)

csnow-bucket


In [24]:
%%sql

-- Make sure the nyt schema exists in the hive catalog before creating tables in it.
CREATE SCHEMA IF NOT EXISTS hive.nyt

In [25]:
%%sql

-- List the schemas of the current catalog to confirm that nyt is present.
SHOW SCHEMAS

Schema
default
information_schema
nyt
social_media


In [38]:
%%sql

-- Remove any existing table definition so the CREATE TABLE cell can be re-run.
DROP TABLE IF EXISTS nyt.yellow_trip_data

In [39]:
%%sql

-- Register the Parquet files under the yellow_tripdata/ prefix of the
-- S3A_BUCKET bucket as an external table in the nyt schema.
-- NOTE(review) the pickup and dropoff datetimes are declared VARCHAR and
-- store_and_fwd_flag is declared DOUBLE; confirm both against the schema
-- of the Parquet files.
CREATE TABLE nyt.yellow_trip_data (
    vendorid               VARCHAR,
    tpep_pickup_datetime   VARCHAR,
    tpep_dropoff_datetime  VARCHAR,
    passenger_count        BIGINT,
    trip_distance          DOUBLE,
    ratecodeid             BIGINT,
    store_and_fwd_flag     DOUBLE,
    pulocationid           INTEGER,
    dolocationid           INTEGER,
    payment_type           VARCHAR,
    fare_amount            DOUBLE,
    extra                  DOUBLE,
    mta_tax                DOUBLE,
    tip_amount             DOUBLE,
    tolls_amount           DOUBLE,
    improvement_surcharge  DOUBLE,
    total_amount           DOUBLE,
    congestion_surcharge   DOUBLE,
    airport_fee            DOUBLE,
    dropoff_latitude       DOUBLE,
    dropoff_longitude      DOUBLE,
    pickup_latitude        DOUBLE,
    pickup_longitude       DOUBLE,
    surcharge              DOUBLE
)
WITH (
    external_location = 's3a://{{S3A_BUCKET}}/yellow_tripdata/',
    format = 'PARQUET'
)

In [43]:
%%sql

-- Count every row in the table and render the total with thousands separators.
SELECT format('%,d', COUNT(*)) AS row_count
FROM nyt.yellow_trip_data

row_count
1763456499


In [41]:
%%sql

-- Preview up to 100 rows as a sanity check of the column mapping.
SELECT *
FROM nyt.yellow_trip_data
LIMIT 100

passenger_count,trip_distance,ratecodeid,store_and_fwd_flag,pulocationid,dolocationid,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee,dropoff_latitude,dropoff_longitude,pickup_latitude,pickup_longitude,surcharge
2,9.8,,0.0,-999,-999,24.1,-999.0,,5.65,4.15,-999.0,33.9,-999.0,-999.0,40.774278,-73.87347099999998,40.762107,-73.978843,0.0
1,2.2,,0.0,-999,-999,9.3,-999.0,,1.54,0.0,-999.0,10.84,-999.0,-999.0,40.747773,-73.97288899999998,40.719872,-73.987342,0.0
1,3.7,,0.0,-999,-999,14.5,-999.0,,2.32,0.0,-999.0,16.82,-999.0,-999.0,40.74506,-73.997792,40.77255,-73.95920099999998,0.0
3,1.7,,0.0,-999,-999,10.1,-999.0,,0.0,0.0,-999.0,10.1,-999.0,-999.0,40.759942,-73.984448,40.738061,-73.99652299999998,0.0
1,1.6,,0.0,-999,-999,8.9,-999.0,,0.0,0.0,-999.0,8.9,-999.0,-999.0,40.752788,-73.98509199999998,40.760026,-73.96490199999998,0.0
1,0.6999999999999998,,0.0,-999,-999,5.3,-999.0,,0.0,0.0,-999.0,5.3,-999.0,-999.0,40.719782,-74.010428,40.722629,-74.003809,0.0
3,3.1,,0.0,-999,-999,12.9,-999.0,,0.0,0.0,-999.0,12.9,-999.0,-999.0,40.751457,-74.001508,40.763873,-73.96203,0.0
1,1.45,,,-999,-999,6.9,-999.0,,0.0,0.0,-999.0,6.9,-999.0,-999.0,40.750332,-73.99134499999998,40.768275,-73.982107,0.0
2,0.76,,,-999,-999,5.7,-999.0,,4.0,0.0,-999.0,9.7,-999.0,-999.0,40.760553,-73.97216,40.752022,-73.97732999999998,0.0
1,0.54,,,-999,-999,4.9,-999.0,,0.0,0.0,-999.0,5.4,-999.0,-999.0,0.0,0.0,0.0,0.0,0.5


In [30]:
%%sql

-- Show the table definition as recorded by Trino.
SHOW CREATE TABLE nyt.yellow_trip_data

Create Table
"CREATE TABLE hive.nyt.yellow_trip_data (  vendorid varchar,  tpep_pickup_datetime varchar,  tpep_dropoff_datetime varchar,  passenger_count bigint,  trip_distance double,  ratecodeid bigint,  store_and_fwd_flag double,  pulocationid integer,  dolocationid integer,  payment_type varchar,  fare_amount double,  extra double,  mta_tax double,  tip_amount double,  tolls_amount double,  improvement_surcharge double,  total_amount double,  congestion_surcharge double,  airport_fee double,  dropoff_latitude double,  dropoff_longitude double,  pickup_latitude double,  pickup_longitude double,  surcharge double ) WITH (  external_location = 's3a://csnow-bucket/yellow_tripdata',  format = 'PARQUET' )"


In [31]:
%%sql

-- Show the query plan Trino produces for the 100-row preview query.
EXPLAIN
    SELECT *
    FROM nyt.yellow_trip_data
    LIMIT 100


Query Plan
"Trino version: 429 Fragment 0 [SINGLE]  Output layout: [vendorid, tpep_pickup_datetime, tpep_dropoff_datetime, passenger_count, trip_distance, ratecodeid, store_and_fwd_flag, pulocationid, dolocationid, payment_type, fare_amount, extra, mta_tax, tip_amount, tolls_amount, improvement_surcharge, total_amount, congestion_surcharge, airport_fee, dropoff_latitude, dropoff_longitude, pickup_latitude, pickup_longitude, surcharge]  Output partitioning: SINGLE []  Output[columnNames = [vendorid, tpep_pickup_datetime, tpep_dropoff_datetime, passenger_count, trip_distance, ratecodeid, store_and_fwd_flag, pulocationid, dolocationid, payment_type, fare_amount, extra, mta_tax, tip_amount, tolls_amount, improvement_surcharge, total_amount, congestion_surcharge, airport_fee, dropoff_latitude, dropoff_longitude, pickup_latitude, pickup_longitude, surcharge]]  │ Layout: [vendorid:varchar, tpep_pickup_datetime:varchar, tpep_dropoff_datetime:varchar, passenger_count:bigint, trip_distance:double, ratecodeid:bigint, store_and_fwd_flag:double, pulocationid:integer, dolocationid:integer, payment_type:varchar, fare_amount:double, extra:double, mta_tax:double, tip_amount:double, tolls_amount:double, improvement_surcharge:double, total_amount:double, congestion_surcharge:double, airport_fee:double, dropoff_latitude:double, dropoff_longitude:double, pickup_latitude:double, pickup_longitude:double, surcharge:double]  │ Estimates: {rows: 100 (38.28kB), cpu: 0, memory: 0B, network: 0B}  └─ Limit[count = 100]  │ Layout: [vendorid:varchar, tpep_pickup_datetime:varchar, tpep_dropoff_datetime:varchar, passenger_count:bigint, trip_distance:double, ratecodeid:bigint, store_and_fwd_flag:double, pulocationid:integer, dolocationid:integer, payment_type:varchar, fare_amount:double, extra:double, mta_tax:double, tip_amount:double, tolls_amount:double, improvement_surcharge:double, total_amount:double, congestion_surcharge:double, airport_fee:double, dropoff_latitude:double, dropoff_longitude:double, 
pickup_latitude:double, pickup_longitude:double, surcharge:double]  │ Estimates: {rows: 100 (38.28kB), cpu: 38.28k, memory: 0B, network: 0B}  └─ LocalExchange[partitioning = SINGLE]  │ Layout: [vendorid:varchar, tpep_pickup_datetime:varchar, tpep_dropoff_datetime:varchar, passenger_count:bigint, trip_distance:double, ratecodeid:bigint, store_and_fwd_flag:double, pulocationid:integer, dolocationid:integer, payment_type:varchar, fare_amount:double, extra:double, mta_tax:double, tip_amount:double, tolls_amount:double, improvement_surcharge:double, total_amount:double, congestion_surcharge:double, airport_fee:double, dropoff_latitude:double, dropoff_longitude:double, pickup_latitude:double, pickup_longitude:double, surcharge:double]  │ Estimates: {rows: 100 (38.28kB), cpu: 0, memory: 0B, network: 0B}  └─ RemoteSource[sourceFragmentIds = [1]]  Layout: [vendorid:varchar, tpep_pickup_datetime:varchar, tpep_dropoff_datetime:varchar, passenger_count:bigint, trip_distance:double, ratecodeid:bigint, store_and_fwd_flag:double, pulocationid:integer, dolocationid:integer, payment_type:varchar, fare_amount:double, extra:double, mta_tax:double, tip_amount:double, tolls_amount:double, improvement_surcharge:double, total_amount:double, congestion_surcharge:double, airport_fee:double, dropoff_latitude:double, dropoff_longitude:double, pickup_latitude:double, pickup_longitude:double, surcharge:double] Fragment 1 [SOURCE]  Output layout: [vendorid, tpep_pickup_datetime, tpep_dropoff_datetime, passenger_count, trip_distance, ratecodeid, store_and_fwd_flag, pulocationid, dolocationid, payment_type, fare_amount, extra, mta_tax, tip_amount, tolls_amount, improvement_surcharge, total_amount, congestion_surcharge, airport_fee, dropoff_latitude, dropoff_longitude, pickup_latitude, pickup_longitude, surcharge]  Output partitioning: SINGLE []  LimitPartial[count = 100]  │ Layout: [vendorid:varchar, tpep_pickup_datetime:varchar, tpep_dropoff_datetime:varchar, passenger_count:bigint, 
trip_distance:double, ratecodeid:bigint, store_and_fwd_flag:double, pulocationid:integer, dolocationid:integer, payment_type:varchar, fare_amount:double, extra:double, mta_tax:double, tip_amount:double, tolls_amount:double, improvement_surcharge:double, total_amount:double, congestion_surcharge:double, airport_fee:double, dropoff_latitude:double, dropoff_longitude:double, pickup_latitude:double, pickup_longitude:double, surcharge:double]  │ Estimates: {rows: 100 (38.28kB), cpu: 38.28k, memory: 0B, network: 0B}  └─ TableScan[table = hive:nyt:yellow_trip_data]  Layout: [vendorid:varchar, tpep_pickup_datetime:varchar, tpep_dropoff_datetime:varchar, passenger_count:bigint, trip_distance:double, ratecodeid:bigint, store_and_fwd_flag:double, pulocationid:integer, dolocationid:integer, payment_type:varchar, fare_amount:double, extra:double, mta_tax:double, tip_amount:double, tolls_amount:double, improvement_surcharge:double, total_amount:double, congestion_surcharge:double, airport_fee:double, dropoff_latitude:double, dropoff_longitude:double, pickup_latitude:double, pickup_longitude:double, surcharge:double]  Estimates: {rows: ? 
(?), cpu: ?, memory: 0B, network: 0B}  surcharge := surcharge:double:REGULAR  fare_amount := fare_amount:double:REGULAR  pickup_longitude := pickup_longitude:double:REGULAR  tpep_dropoff_datetime := tpep_dropoff_datetime:string:REGULAR  congestion_surcharge := congestion_surcharge:double:REGULAR  vendorid := vendorid:string:REGULAR  passenger_count := passenger_count:bigint:REGULAR  tolls_amount := tolls_amount:double:REGULAR  improvement_surcharge := improvement_surcharge:double:REGULAR  dropoff_latitude := dropoff_latitude:double:REGULAR  trip_distance := trip_distance:double:REGULAR  dolocationid := dolocationid:int:REGULAR  pulocationid := pulocationid:int:REGULAR  store_and_fwd_flag := store_and_fwd_flag:double:REGULAR  payment_type := payment_type:string:REGULAR  dropoff_longitude := dropoff_longitude:double:REGULAR  ratecodeid := ratecodeid:bigint:REGULAR  total_amount := total_amount:double:REGULAR  pickup_latitude := pickup_latitude:double:REGULAR  extra := extra:double:REGULAR  tip_amount := tip_amount:double:REGULAR  mta_tax := mta_tax:double:REGULAR  airport_fee := airport_fee:double:REGULAR  tpep_pickup_datetime := tpep_pickup_datetime:string:REGULAR"
