# Import Yellow Taxi Data - Part 2

Create a Hive external table using the S3 Parquet files.


## Create Hive Table

In [7]:
import urllib3

# The Trino connection opened later in this notebook passes "verify": false, which
# would otherwise trigger urllib3's InsecureRequestWarning; silence only that category.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

In [8]:
# Load the `sql` extension, which provides the %sql / %%sql magics used by the cells below.
%load_ext sql

The sql extension is already loaded. To reload it, use:
  %reload_ext sql


In [9]:
# Switch on SqlMagic's named-parameter handling.
# NOTE(review): presumably this lets :name placeholders in SQL cells be bound from
# notebook variables — confirm against the SQL magic's docs; no cell in this section uses one.
%config SqlMagic.named_parameters = "enabled"

In [10]:
# Open the Trino connection: user `admin`, host from DOCKER_HOST_OR_IP, port 8443,
# default catalog `hive`, over HTTPS.
# NOTE(review): "verify": false disables TLS certificate verification (which is why
# InsecureRequestWarning is suppressed earlier in this notebook). Presumably the cluster
# uses a self-signed certificate — confirm before pointing this at anything shared.
%sql trino://admin@${DOCKER_HOST_OR_IP}:8443/hive --connection_arguments '{"http_scheme":"https", "verify": false}'

In [11]:
%%sql

-- List the catalogs visible through this connection.
SHOW CATALOGS

Catalog
hive
iceberg
system
tpcds
vast


In [12]:
import os

# Name of the S3 bucket that holds the Parquet files. The CREATE TABLE cell below
# substitutes it into external_location via {{S3A_BUCKET}}, so stop here with a clear
# message when it is missing or empty instead of building a path like s3a://None/...
S3A_BUCKET = os.getenv('S3A_BUCKET')
if not S3A_BUCKET:
    raise RuntimeError(
        "Environment variable S3A_BUCKET is not set; "
        "it must name the bucket that contains yellow_tripdata/"
    )

In [13]:
# Show the resolved bucket name. A plain print produces the same text as the shell
# echo without handing an environment-derived value to a shell just to display it.
print(S3A_BUCKET)

csnow-bucket


In [14]:
%%sql

-- Create the nyt schema in the hive catalog; IF NOT EXISTS keeps the cell re-runnable.
CREATE SCHEMA IF NOT EXISTS hive.nyt

In [15]:
%%sql

-- List schemas in the connection's current catalog to confirm that nyt is present.
SHOW SCHEMAS

Schema
default
information_schema
nyt
social_media


In [45]:
%%sql

-- Remove any earlier definition so the CREATE TABLE cell that follows can be re-run.
DROP TABLE IF EXISTS nyt.yellow_trip_data

In [46]:
%%sql

-- Define nyt.yellow_trip_data over the Parquet files under yellow_tripdata/ in the
-- bucket named by the S3A_BUCKET notebook variable (substituted into external_location).
CREATE TABLE nyt.yellow_trip_data (
    VendorID               INTEGER,
    tpep_pickup_datetime   TIMESTAMP,
    tpep_dropoff_datetime  TIMESTAMP,
    passenger_count        INTEGER,
    trip_distance          DOUBLE,
    RatecodeID             INTEGER,
    store_and_fwd_flag     VARCHAR,
    PULocationID           INTEGER,
    DOLocationID           INTEGER,
    payment_type           INTEGER,
    fare_amount            DOUBLE,
    extra                  DOUBLE,
    mta_tax                DOUBLE,
    tip_amount             DOUBLE,
    tolls_amount           DOUBLE,
    improvement_surcharge  DOUBLE,
    total_amount           DOUBLE,
    congestion_surcharge   DOUBLE,
    airport_fee            DOUBLE
)
WITH (
    external_location = 's3a://{{S3A_BUCKET}}/yellow_tripdata/',
    format = 'PARQUET'
)

In [47]:
%%sql

-- Sanity check that the new table can read the Parquet files.
SELECT
    COUNT(*) AS row_count
FROM nyt.yellow_trip_data

row_count
58665597


In [49]:
%%sql

-- Preview rows to eyeball the column values.
-- NOTE(review) - the saved output shows fewer than 100 rows, presumably capped by the
-- SQL magic's display limit rather than by this query; confirm.
SELECT *
FROM nyt.yellow_trip_data
LIMIT 100

vendorid,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,ratecodeid,store_and_fwd_flag,pulocationid,dolocationid,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
2,2019-01-30 20:09:16,2019-01-31 20:07:58,1,1.0,1,N,162,107,1,5.5,0.5,0.5,0.0,0.0,0.3,6.8,0.0,0.0
2,2019-01-30 20:20:50,2019-01-30 20:27:04,1,0.68,1,N,107,113,1,6.0,0.5,0.5,1.46,0.0,0.3,8.76,0.0,0.0
2,2019-01-30 20:31:39,2019-01-30 20:46:36,1,1.68,1,N,114,232,1,11.0,0.5,0.5,4.0,0.0,0.3,16.3,0.0,0.0
2,2019-01-30 20:50:23,2019-01-30 21:03:03,1,1.56,1,N,232,114,1,9.5,0.5,0.5,2.16,0.0,0.3,12.96,0.0,0.0
2,2019-01-30 20:13:37,2019-01-30 20:57:05,1,18.6,2,N,132,230,1,52.0,0.0,0.5,11.71,5.76,0.3,70.27,0.0,0.0
2,2019-01-30 20:04:13,2019-01-30 20:18:04,1,2.38,1,N,230,236,1,11.0,0.5,0.5,2.0,0.0,0.3,14.3,0.0,0.0
2,2019-01-30 20:31:00,2019-01-30 20:39:51,1,1.33,1,N,161,142,1,7.5,0.5,0.5,1.76,0.0,0.3,10.56,0.0,0.0
2,2019-01-30 20:47:03,2019-01-30 21:01:16,1,1.78,1,N,142,233,1,10.5,0.5,0.5,2.36,0.0,0.3,14.16,0.0,0.0
2,2019-01-30 20:23:44,2019-01-30 20:34:39,1,1.69,1,N,164,230,2,9.0,0.5,0.5,0.0,0.0,0.3,10.3,0.0,0.0
2,2019-01-30 20:41:43,2019-01-30 20:50:43,1,1.71,1,N,161,107,1,8.0,0.5,0.5,1.86,0.0,0.3,11.16,0.0,0.0


In [19]:
%%sql

-- Show the table definition as stored by the catalog.
-- NOTE(review) - the saved output of this cell has execution count 19, while the
-- DROP TABLE / CREATE TABLE cells above are numbered 45 and 46. It lists passenger_count
-- and ratecodeid as double, but the current CREATE TABLE declares them INT, so the
-- saved output is stale. Re-run this cell after the CREATE TABLE cell to refresh it.
SHOW CREATE TABLE nyt.yellow_trip_data

Create Table
"CREATE TABLE hive.nyt.yellow_trip_data (  vendorid integer,  tpep_pickup_datetime timestamp(3),  tpep_dropoff_datetime timestamp(3),  passenger_count double,  trip_distance double,  ratecodeid double,  store_and_fwd_flag varchar,  pulocationid integer,  dolocationid integer,  payment_type integer,  fare_amount double,  extra double,  mta_tax double,  tip_amount double,  tolls_amount double,  improvement_surcharge double,  total_amount double,  congestion_surcharge double,  airport_fee double ) WITH (  external_location = 's3a://csnow-bucket/yellow_tripdata',  format = 'PARQUET' )"
