# Scheduled Execution: Stops at 30 Largest Airports in the US
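* Stops from `cuebiq.paas_cda.stop` intersected with the airport geometries (WKT) in `dedicated.airports.airport_wkt`; the matching stops are written to `dedicated.airports.airport_stops`, one processing date at a time.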

In [1]:
import pandas as pd
from keplergl import KeplerGl
import getpass
from pyhive import presto
import matplotlib.pyplot as plt
import plotly.express as px
import numpy as np
import os
import sys
import time
from datetime import datetime, timedelta

# Load (or reload) the extension that provides the %sql magic and the SqlMagic config class.
%reload_ext sql
# SqlMagic settings used by every %sql call below.
# NOTE(review): semantics per the ipython-sql docs — autolimit=0 means no cap on fetched rows,
# displaylimit caps rows shown in output, autopandas=True returns results as pandas DataFrames
# (the create/insert loop later indexes the result as res['rows'][0]).
%config SqlMagic.autocommit=False
%config SqlMagic.autolimit=0
%config SqlMagic.displaylimit=200
%config SqlMagic.autopandas=True

In [2]:
# SQL engine: PyHive Presto connection used by read_sql
presto_settings = {
    "host": "localhost",
    "port": 9090,
    "catalog": "cuebiq",
}
connection = presto.connect(**presto_settings)

def read_sql(query: str) -> pd.DataFrame:
    """Execute ``query`` on the notebook-level ``connection`` and return the rows as a DataFrame."""
    result_frame = pd.read_sql(sql=query, con=connection)
    return result_frame

In [3]:
# Point the %sql magic at the same Presto endpoint and catalog as the PyHive `connection`.
%sql presto://localhost:9090/cuebiq/

'Connected: @cuebiq/'

In [4]:
# Display settings: never hide columns, show at most 10 rows of any frame.
for _option, _value in (
    ('display.max_columns', None),
    ('display.max_rows', 10),
):
    pd.set_option(_option, _value)

In [5]:
# operations on dates
def get_dates_sequence(
    date_start: str,
    date_end: str,
    date_format: str
) -> list:
    """Return every day from ``date_start`` to ``date_end`` (both included) as strings.

    Parameters
    ----------
    date_start : str
        First day of the sequence, written in ``date_format``.
    date_end : str
        Last day of the sequence (included), written in ``date_format``.
    date_format : str
        ``strptime``/``strftime`` pattern used to parse both bounds and to
        render every element of the result.

    Returns
    -------
    list of str
        One string per day in ascending order. Empty when ``date_end`` is
        earlier than ``date_start``.

    Raises
    ------
    ValueError
        If either bound does not match ``date_format``.
    """
    # Parse each bound once; the start date is reused for every offset below.
    start = datetime.strptime(date_start, date_format)
    end = datetime.strptime(date_end, date_format)
    n_days = (end - start).days + 1  # +1 so the end date itself is included
    return [
        (start + timedelta(days=offset)).strftime(date_format)
        for offset in range(n_days)
    ]

In [6]:
# strptime/strftime pattern (YYYYMMDD) used for every processing date handled in this notebook.
date_format = "%Y%m%d"

In [7]:
# Start of the backfill: used as the first date to compute when the output table has no dates yet.
first_date = "20210401"

In [8]:
# Upper bound of the run: yesterday according to the kernel's local clock, formatted with date_format.
last_date_to_compute = (datetime.now() - timedelta(days=1)).strftime(date_format)

# Manage execution dates

In [9]:
# Most recent processing_date already present in the output table.
# .iloc[0, 0] reads the single result cell by position; the chained .iloc[0][0]
# depended on integer-key positional lookup on a label-indexed Series, which
# pandas has deprecated.
last_computed_date = read_sql(
    '''
    SELECT max(processing_date) 
    FROM dedicated.airports.airport_stops
    ''').iloc[0, 0]

# pd.isna is true for None as well as NaN/NaT, so an empty table is detected
# whichever missing-value marker the driver/pandas produces for a NULL max().
if pd.isna(last_computed_date):
    first_date_to_compute = first_date
    message = f"""
        No computed dates yet
        Computation will start from {first_date_to_compute}
        Computation will end at {last_date_to_compute}
        """
else:
    # Resume from the day after the last one already stored.
    next_date = datetime.strptime(str(last_computed_date), date_format) + timedelta(days=1)
    first_date_to_compute = next_date.strftime(date_format)
    # Compare real dates rather than formatted strings so the check does not
    # depend on date_format being lexicographically sortable.
    if next_date > datetime.strptime(last_date_to_compute, date_format):
        print("No dates to execute")
        # Raises SystemExit inside this cell, so nothing below it is reached.
        # NOTE(review): the scheduled run appears to rely on this to stop early — confirm
        # the scheduler treats exit code 0 as success.
        sys.exit(0)
    else:
        message = f"""
            Last computed date: {last_computed_date}
            Computation will start from {first_date_to_compute}
            Computation will end at {last_date_to_compute}
            """

print(message)

No dates to execute


SystemExit: 0

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [None]:
# Every processing date still to run, from first_date_to_compute to last_date_to_compute inclusive.
dates_to_compute = get_dates_sequence(first_date_to_compute, last_date_to_compute, date_format)

# Stops intersected with the top 30 Commercial Service Airports

In [15]:
# SQL template selecting one processing date of stops that fall inside an airport geometry.
# The %(i)s placeholder is filled by the caller with `% {'i': <processing date>}`.
# Adjacent string literals are concatenated by Python, so the result is a single
# line of SQL with no newlines.
query_stops_unfiltered_string = (
    # one day of stops
    "with stops as (select * from cuebiq.paas_cda.stop where processing_date = %(i)s), "
    # airport geometries (WKT text in the `geometry` column)
    "airport_geo as (select * from dedicated.airports.airport_wkt), "
    # stops whose point lies inside an airport geometry, with the airport's city and name
    "tmp as (select s.cuebiq_id, "
    "s.device_type_code, "
    "s.lat, "
    "s.lng, "
    "s.geohash_id, "
    "s.cluster_size, "
    "s.avg_distance_meters, "
    "s.avg_accuracy_meters, "
    "s.std_accuracy_meters, "
    "s.dwell_time_minutes, "
    "s.stop_zoned_datetime, "
    "substr(cast(from_iso8601_timestamp(s.stop_zoned_datetime) as varchar(10)),1,10) local_date, "
    "s.timezone_offset_seconds, "
    "s.processing_date, "
    "s.country_code, "
    "a.city, "
    "a.airport_name "
    "from stops s inner join airport_geo a on st_contains(st_geometryfromtext(a.geometry),st_point(s.lng, s.lat)) "
    "order by cuebiq_id asc, stop_zoned_datetime asc) "
    "select * from tmp"
)

In [16]:
# For every pending processing date, render the SQL for that day and store its rows in
# dedicated.airports.airport_stops.
for i in dates_to_compute:
    print(f"Executing for {i}...")
    # Fill the %(i)s placeholder of the template with the current processing date.
    query = query_stops_unfiltered_string % {'i':i}
    print(query)
    # CREATE TABLE IF NOT EXISTS ... AS: only creates and fills the table when it is missing.
    res = %sql create table if not exists dedicated.airports.airport_stops as {query}
    print(f"created {res['rows'][0]} rows")
    succedeed=int(res['rows'][0])
    if succedeed==0:
        # Zero rows created is taken to mean the table already existed, so this day's rows
        # are appended with a plain INSERT instead.
        # NOTE(review): a day with no matching stops also reports 0 and ends up here;
        # the INSERT then simply adds nothing.
        res= %sql insert into dedicated.airports.airport_stops {query}
        print(f"inserted {res['rows'][0]} rows")

with stops as (select * from cuebiq.paas_cda.stop where processing_date = 20210329), airport_geo as (select * from dedicated.airports.airport_wkt), tmp as (select s.cuebiq_id, s.device_type_code, s.lat, s.lng, s.geohash_id, s.cluster_size, s.avg_distance_meters, s.avg_accuracy_meters, s.std_accuracy_meters, s.dwell_time_minutes, s.stop_zoned_datetime, substr(cast(from_iso8601_timestamp(s.stop_zoned_datetime) as varchar(10)),1,10) local_date, s.timezone_offset_seconds, s.processing_date, s.country_code, a.city, a.airport_name, a.geometry from stops s inner join airport_geo a on st_contains(st_geometryfromtext(a.geometry),st_point(s.lng, s.lat)) order by cuebiq_id asc, stop_zoned_datetime asc) select * from tmp
 * presto://localhost:9090/cuebiq/
Done.
160500
with stops as (select * from cuebiq.paas_cda.stop where processing_date = 20210330), airport_geo as (select * from dedicated.airports.airport_wkt), tmp as (select s.cuebiq_id, s.device_type_code, s.lat, s.lng, s.geohash_id, s.cluster