# Pooled Trips Performance Experiments

In [1]:
import os

# Move from the notebook's folder to the repository root, but only once.
# A bare `os.chdir("../../")` climbs two more levels every time this cell is
# re-run, after which the relative paths used below no longer resolve.
# The `pipeline` directory (used by the build cell and the database path)
# serves as the marker for the root.
# NOTE(review): assumes the notebook's own folder has no `pipeline` directory.
if not os.path.isdir("pipeline"):
    os.chdir("../../")

In [2]:
%%bash
# Stop at the first failing command. Without this, a failed `cd pipeline`
# would run `make` in the wrong directory, and a failed build or a missing
# archive part would still fall through to the tar/rm steps.
set -euo pipefail

cd pipeline
make uncompressed
cd ..

# Concatenate the split archive parts, extract the result, then remove the
# temporary joined archive.
cat compressed/db-part-* > compressed.db.tgz
tar -xf compressed.db.tgz
rm compressed.db.tgz


In [3]:
import pandas as pd
import sqlite3
from api.utils.database import rows_to_dicts

In [4]:
# Notebook-wide SQLite connection; `run_query` and the metric helpers below use it.
# NOTE(review): sqlite3.connect() silently creates an empty database when the file
# does not exist, so a failed build step would only show up later as
# "no such table" errors — confirm the build cell above completed.
con = sqlite3.connect("./pipeline/database.db")

In [5]:
def run_query(query, params=()):
    """Execute one SQL statement on the notebook's shared connection ``con``.

    Args:
        query: SQL text to execute.
        params: Optional values bound to the placeholders in ``query``.
            Defaults to no parameters, so existing single-argument calls
            behave as before.

    Returns:
        A ``pandas.DataFrame`` built from ``rows_to_dicts(cur, rows)`` when the
        statement produces a result set, or ``None`` when it does not (the
        cursor has no ``description``, as with ``ALTER TABLE`` / ``CREATE VIEW``).
    """
    cur = con.cursor()
    try:
        rows = cur.execute(query, params).fetchall()
        if cur.description is None:
            return None
        # Build the frame before the cursor is closed, since rows_to_dicts
        # receives the cursor itself.
        return pd.DataFrame(rows_to_dicts(cur, rows))
    finally:
        # Release the cursor even when the statement raises.
        cur.close()

In [6]:
# List the tables and views in the database, hiding SQLite's internal
# `sqlite_*` objects.
# In SQL, AND binds tighter than OR, so `type = table OR type = view AND name ...`
# would apply the name filter to views only; `IN (...)` makes it cover both.
# String literals use single quotes (double quotes denote identifiers in SQL).
query_tables_views = """
SELECT name
FROM sqlite_master
WHERE type IN ('table', 'view')
AND name NOT LIKE 'sqlite_%';
"""
run_query(query_tables_views)

Unnamed: 0,name
0,community_area
1,population
2,income
3,covid_spread
4,rideshare


In [7]:
# Count every row in `rideshare` (daily_rows) and the subset of rows whose
# `ymd` equals `week` (weekly_rows).
# NOTE(review): presumably `week` holds the date of the week a row belongs to,
# so the `ymd == week` rows are one per week — confirm against the pipeline schema.
query_count_rideshare_rows = """
SELECT
    count(1) as daily_rows,
    sum(case when ymd == week then 1 else 0 end) as weekly_rows
FROM rideshare
"""
run_query(query_count_rideshare_rows)

Unnamed: 0,daily_rows,weekly_rows
0,3692997,523385


In [8]:
from api.metrics.community import CommunityMetrics
from api.questions.pooled_trips import PooledTripMetrics


def pooled_trips(con):
    """Assemble the five pooled-trip metrics through ``merge_metrics``.

    Args:
        con: Database connection handed to both ``CommunityMetrics`` and
            ``PooledTripMetrics``.

    Returns:
        Whatever ``CommunityMetrics.merge_metrics`` returns for the five
        named metric callables defined below.
    """
    community = CommunityMetrics(con)
    pooled = PooledTripMetrics(con)

    # The two date windows share the boundary date 2020-03-02.
    pre_start, pre_end = "2019-02-01", "2020-03-02"
    post_start, post_end = "2020-03-02", "2021-04-01"

    # Each entry is a zero-argument callable that merge_metrics can invoke.
    producers = {
        "rideshare_pooled_trip_rate_2019":
            lambda: community.rideshare_pooled_trip_rate(year=2019),
        "avg_cost_per_trip_cents_before":
            lambda: pooled.avg_cost_per_trip_cents_by_area(pre_start, pre_end),
        "avg_trips_per_day_before":
            lambda: pooled.avg_trips_per_day_by_area(pre_start, pre_end),
        "avg_cost_per_trip_cents_since":
            lambda: pooled.avg_cost_per_trip_cents_by_area(post_start, post_end),
        "avg_trips_per_day_since":
            lambda: pooled.avg_trips_per_day_by_area(post_start, post_end),
    }

    return community.merge_metrics(producers)

In [9]:
def pooled_trips_two_metrics(con):
    """Assemble only the first two pooled-trip metrics, for timing comparison.

    This is a reduced variant of ``pooled_trips``: it requests
    ``rideshare_pooled_trip_rate(year=2019)`` and the average cost per trip
    for the before-COVID window only.

    Args:
        con: Database connection handed to both ``CommunityMetrics`` and
            ``PooledTripMetrics``.

    Returns:
        Whatever ``CommunityMetrics.merge_metrics`` returns for the two
        named metric callables.
    """
    metric_community = CommunityMetrics(con)
    metric_pooled = PooledTripMetrics(con)

    # Only the before-COVID window is needed here; the unused since-COVID
    # tuple from the original was dropped.
    before_covid = ("2019-02-01", "2020-03-02")

    return metric_community.merge_metrics({
        "rideshare_pooled_trip_rate_2019":
            lambda: metric_community.rideshare_pooled_trip_rate(year=2019),
        "avg_cost_per_trip_cents_before":
            lambda: metric_pooled.avg_cost_per_trip_cents_by_area(*before_covid),
    })

In [10]:
%%time
# Baseline: time all five metrics while `rideshare` is still the original table.
m = pooled_trips(con)
print(f"Got metrics for {len(m)} records.")

Got metrics for 77 records.
CPU times: user 14.4 s, sys: 707 ms, total: 15.1 s
Wall time: 15.4 s


In [11]:
%%time
# Baseline: time the two-metric variant while `rideshare` is still the original table.
m = pooled_trips_two_metrics(con)
print(f"Got metrics for {len(m)} records.")

Got metrics for 77 records.
CPU times: user 6.9 s, sys: 309 ms, total: 7.21 s
Wall time: 7.31 s


In [12]:
# Move the original table out of the way so the name `rideshare` can be reused
# by the view created in the next cell.
# NOTE(review): not re-runnable — a second run fails once `rideshare` is no
# longer a table; the final cell rebuilds the database.
run_query("ALTER TABLE rideshare RENAME TO rideshare_daily")

In [13]:
# Recreate `rideshare` as a view over `rideshare_daily` that keeps only rows
# where `ymd` equals `week`, so code that queries `rideshare` by name now
# reads the smaller row set.
# NOTE(review): plain CREATE VIEW fails if `rideshare` already exists, so this
# cell cannot be re-run without rebuilding the database first.
query_weekly_rideshare_view = """
CREATE VIEW rideshare AS
SELECT *
FROM rideshare_daily
WHERE ymd == week
"""
run_query(query_weekly_rideshare_view)

In [14]:
# List the tables and views again to confirm the rename and the new view.
run_query(query_tables_views)

Unnamed: 0,name
0,community_area
1,population
2,income
3,covid_spread
4,rideshare_daily
5,rideshare


In [15]:
%%time
# Same five-metric measurement, now with `rideshare` resolving to the
# `ymd == week` view instead of the original table.
m = pooled_trips(con)
print(f"Got metrics for {len(m)} records.")

Got metrics for 77 records.
CPU times: user 6.74 s, sys: 347 ms, total: 7.08 s
Wall time: 7.12 s


In [16]:
%%time
# Same two-metric measurement, now with `rideshare` resolving to the
# `ymd == week` view instead of the original table.
m = pooled_trips_two_metrics(con)
print(f"Got metrics for {len(m)} records.")

Got metrics for 77 records.
CPU times: user 3.68 s, sys: 142 ms, total: 3.82 s
Wall time: 3.84 s


In [17]:
%%bash
# Re-run the build steps after the experiment, which renamed `rideshare` and
# added a view. Stop at the first failing command so a failed `cd`, build or
# missing archive part does not fall through to the tar/rm steps.
# NOTE(review): the notebook's sqlite3 connection may still be open while the
# database file is replaced — confirm that is acceptable.
set -euo pipefail

cd pipeline
make uncompressed
cd ..

# Concatenate the split archive parts, extract the result, then remove the
# temporary joined archive.
cat compressed/db-part-* > compressed.db.tgz
tar -xf compressed.db.tgz
rm compressed.db.tgz
