In [0]:
-- Session setup: pick the current catalog and schema. The queries in this
-- notebook use unqualified table names, so they depend on these two statements.
USE CATALOG `ridhima_workspace`; -- set the current catalog
USE SCHEMA `default`; -- set the current schema within that catalog

Sample Queries

In [0]:

-- Preview of the streaming table taxi_raw_records: five rows.
-- There is no ORDER BY, so which five rows come back is not guaranteed.
SELECT *
FROM taxi_raw_records
LIMIT 5;

tpep_pickup_datetime,tpep_dropoff_datetime,trip_distance,fare_amount,pickup_zip,dropoff_zip
2016-02-13T21:47:53.000Z,2016-02-13T21:57:15.000Z,1.4,8.0,10103,10110
2016-02-13T18:29:09.000Z,2016-02-13T18:37:23.000Z,1.31,7.5,10023,10023
2016-02-06T19:40:58.000Z,2016-02-06T19:52:32.000Z,1.8,9.5,10001,10018
2016-02-12T19:06:43.000Z,2016-02-12T19:20:54.000Z,2.3,11.5,10044,10111
2016-02-23T10:27:56.000Z,2016-02-23T10:58:33.000Z,2.6,18.5,10199,10022


In [0]:
-- Preview of the streaming table flagged_rides: five rows.
-- There is no ORDER BY, so which five rows come back is not guaranteed.
SELECT *
FROM flagged_rides
LIMIT 5;

week,zip,fare_amount,trip_distance
2016-01-25T00:00:00.000Z,11109,52.0,3.0
2016-02-15T00:00:00.000Z,7311,60.0,2.0
2016-01-25T00:00:00.000Z,10065,52.0,1.12
2016-02-08T00:00:00.000Z,11422,52.0,0.2
2016-01-11T00:00:00.000Z,11422,52.0,8.7


In [0]:
-- Preview of the materialized view weekly_stats: five rows.
-- There is no ORDER BY, so which five rows come back is not guaranteed.
SELECT *
FROM weekly_stats
LIMIT 5;

week,avg_amount,avg_distance
2016-02-01T00:00:00.000Z,11.990339116719245,2.74633675078864
2016-01-18T00:00:00.000Z,11.966793403573064,2.7421759047182723
2016-02-15T00:00:00.000Z,12.244146522870956,2.8944923763480825
2016-01-04T00:00:00.000Z,11.907765076862436,2.864603862830116
2016-02-08T00:00:00.000Z,12.20651356238698,2.751081374321874


In [0]:
-- Preview of the materialized view top_n: at most five rows.
-- There is no ORDER BY, so the row order shown is not guaranteed.
SELECT *
FROM top_n
LIMIT 5;

week,avg_amount,avg_distance,fare_amount,trip_distance,zip
2016-01-04T00:00:00.000Z,11.91,2.865,95.0,5.2,10009
2016-02-15T00:00:00.000Z,12.24,2.894,60.0,2.0,7311
2016-02-22T00:00:00.000Z,12.79,2.973,60.0,0.92,11422


Validation


In [0]:
-- Row counts per table, returned side by side in a single row, so that
-- Bronze > Silver (after bad rows are dropped) > Gold can be confirmed.
-- Each CTE is an aggregate without GROUP BY, so it yields exactly one row and
-- the cross joins below produce exactly one result row.
WITH
  bronze AS (SELECT COUNT(*) AS row_count FROM taxi_raw_records),
  flagged AS (SELECT COUNT(*) AS row_count FROM flagged_rides),
  weekly AS (SELECT COUNT(*) AS row_count FROM weekly_stats),
  gold AS (SELECT COUNT(*) AS row_count FROM top_n)
SELECT
  bronze.row_count AS bronze_count,
  flagged.row_count AS silver_count1,
  weekly.row_count AS silver_count2,
  gold.row_count AS gold_count
FROM bronze
CROSS JOIN flagged
CROSS JOIN weekly
CROSS JOIN gold;


bronze_count,silver_count1,silver_count2,gold_count
21856,19,10,3


In [0]:
-- Data-quality check: look for rows whose trip_distance is zero or negative.
-- The check passes when this query returns an empty result.
SELECT *
FROM taxi_raw_records
WHERE trip_distance <= 0;

tpep_pickup_datetime,tpep_dropoff_datetime,trip_distance,fare_amount,pickup_zip,dropoff_zip


In [0]:
-- Sample of up to ten rides from flagged_rides (the suspicious-ride table).
-- NOTE(review): presumably these are rides whose fare is high relative to the
-- distance travelled; the exact flagging rule is not visible here, so confirm.
-- There is no ORDER BY, so which rows come back is not guaranteed.
SELECT *
FROM flagged_rides
LIMIT 10;

week,zip,fare_amount,trip_distance
2016-01-25T00:00:00.000Z,11109,52.0,3.0
2016-02-15T00:00:00.000Z,7311,60.0,2.0
2016-01-25T00:00:00.000Z,10065,52.0,1.12
2016-02-08T00:00:00.000Z,11422,52.0,0.2
2016-01-11T00:00:00.000Z,11422,52.0,8.7
2015-12-28T00:00:00.000Z,10023,52.0,0.3
2016-01-18T00:00:00.000Z,10020,52.0,0.1
2016-02-22T00:00:00.000Z,10115,55.0,0.18
2016-02-22T00:00:00.000Z,11371,52.0,4.02
2016-02-22T00:00:00.000Z,10017,52.0,0.12


In [0]:
-- Inspect the weekly aggregates (average amount and distance), earliest
-- week first, capped at ten rows.
SELECT *
FROM weekly_stats
ORDER BY week ASC
LIMIT 10;


week,avg_amount,avg_distance
2015-12-28T00:00:00.000Z,12.178038379530918,3.1040618336886974
2016-01-04T00:00:00.000Z,11.907765076862436,2.864603862830116
2016-01-11T00:00:00.000Z,12.332039911308204,2.931263858093132
2016-01-18T00:00:00.000Z,11.966793403573064,2.7421759047182723
2016-01-25T00:00:00.000Z,12.981361426256075,2.8746961102106963
2016-02-01T00:00:00.000Z,11.990339116719245,2.74633675078864
2016-02-08T00:00:00.000Z,12.20651356238698,2.751081374321874
2016-02-15T00:00:00.000Z,12.244146522870956,2.8944923763480825
2016-02-22T00:00:00.000Z,12.79211403184006,2.9734727878563483
2016-02-29T00:00:00.000Z,12.60960960960961,2.973363363363364


In [0]:

-- Validate Gold: list top_n with the highest fares on top.
-- The ORDER BY is needed for this check: without it the row order is not
-- guaranteed, so "highest fares first" could appear to hold or fail by chance.
-- week is a tie-breaker so rides with equal fare_amount return in a stable order.
SELECT *
FROM top_n
ORDER BY fare_amount DESC, week ASC;

week,avg_amount,avg_distance,fare_amount,trip_distance,zip
2016-01-04T00:00:00.000Z,11.91,2.865,95.0,5.2,10009
2016-02-15T00:00:00.000Z,12.24,2.894,60.0,2.0,7311
2016-02-22T00:00:00.000Z,12.79,2.973,60.0,0.92,11422
