# Pooled Trips Performance Experiments

In [1]:
import os
# Relative chdir: this cell is not idempotent — every re-run moves the kernel's
# working directory up two more levels, so run it exactly once per kernel.
# NOTE(review): presumably this lands on the repository root that holds
# pipeline/, compressed/ and api/ — confirm.
os.chdir("../../")

## Performance Comparison

### Load Fresh Database State

In [2]:
%%bash
# Stop at the first failing command (and on unset variables) so that a failed
# `cd` or `make` is not followed by archive steps run in the wrong place or
# against stale files.
set -euo pipefail

# Run the `uncompressed` make target from inside pipeline/.
cd pipeline
make uncompressed
cd ..

# Reassemble the split archive parts, unpack the result, then remove the
# temporary archive.
cat compressed/db-part-* > compressed.db.tgz
tar -xf compressed.db.tgz
rm compressed.db.tgz


In [3]:
import pandas as pd
import sqlite3
from api.utils.database import rows_to_dicts

In [4]:
# Single module-level SQLite connection; run_query() reads it as a global and
# the timing cells pass it to the metric functions.
con = sqlite3.connect("./pipeline/database.db")

In [5]:
def run_query(query):
    """
    Execute ``query`` on the module-level connection ``con``.

    Returns a DataFrame built from the fetched rows when the statement
    produces a result set, and None when it does not (the cursor has no
    description, e.g. for DDL statements).
    """
    cursor = con.cursor()
    fetched = cursor.execute(query).fetchall()
    # A cursor without a description means the statement returned no columns.
    if cursor.description is not None:
        return pd.DataFrame(rows_to_dicts(cursor, fetched))
    return None

In [6]:
# List the user-defined tables and views in the database.
# The table/view test is a single IN (...) predicate so that the internal-name
# filter applies to both kinds: in a bare `a OR b AND c`, AND binds tighter
# than OR and the filter would only cover views.
# String literals use single quotes because SQLite resolves double-quoted
# tokens as identifiers first and only falls back to treating them as strings.
query_tables_views = """
SELECT name
FROM sqlite_master
WHERE type IN ('table', 'view')
AND name NOT LIKE 'sqlite_%';
"""
run_query(query_tables_views)

Unnamed: 0,name
0,community_area
1,population
2,income
3,covid_spread
4,rideshare


### Estimate Impact of Daily To Weekly Change

In [7]:
# Compare the total number of rideshare rows (daily_rows) with the number of
# rows whose ymd equals their week value (weekly_rows), i.e. the rows that a
# `ymd == week` filter would keep.
query_count_rideshare_rows = """
SELECT
    count(1) as daily_rows,
    sum(case when ymd == week then 1 else 0 end) as weekly_rows
FROM rideshare
"""
run_query(query_count_rideshare_rows)

Unnamed: 0,daily_rows,weekly_rows
0,3692997,523385


### Prepare Methods For Comparison

In [8]:
from api.metrics.community import CommunityMetrics
from api.questions.pooled_trips import PooledTripMetrics


def pooled_trips(con):
    """
    Build the full pooled-trips metric set, one separately computed metric
    per key.

    Both metric helpers are created from ``con``. The 2019 pooled trip rate,
    plus the average cost per trip and average trips per day for each of the
    two date ranges, are handed to ``CommunityMetrics.merge_metrics`` as
    zero-argument callables; whatever it returns is returned unchanged.
    """
    community = CommunityMetrics(con)
    pooled = PooledTripMetrics(con)

    # Date-string bounds of the two comparison periods.
    before_start, before_end = "2019-02-01", "2020-03-02"
    since_start, since_end = "2020-03-02", "2021-04-01"

    metric_loaders = {
        "rideshare_pooled_trip_rate_2019":
            lambda: community.rideshare_pooled_trip_rate(year=2019),
        "avg_cost_per_trip_cents_before":
            lambda: pooled.avg_cost_per_trip_cents_by_area(before_start, before_end),
        "avg_trips_per_day_before":
            lambda: pooled.avg_trips_per_day_by_area(before_start, before_end),
        "avg_cost_per_trip_cents_since":
            lambda: pooled.avg_cost_per_trip_cents_by_area(since_start, since_end),
        "avg_trips_per_day_since":
            lambda: pooled.avg_trips_per_day_by_area(since_start, since_end),
    }
    return community.merge_metrics(metric_loaders)

In [9]:
def pooled_trips_two_metrics(con):
    """
    Reduced variant of the pooled-trips metric set with only two metrics.

    Passes the 2019 pooled trip rate and the average cost per trip for the
    first date range to ``CommunityMetrics.merge_metrics`` as zero-argument
    callables and returns its result unchanged.
    """
    community = CommunityMetrics(con)
    pooled = PooledTripMetrics(con)

    # Date-string bounds of the only period this variant uses.
    before_start, before_end = "2019-02-01", "2020-03-02"

    metric_loaders = {
        "rideshare_pooled_trip_rate_2019":
            lambda: community.rideshare_pooled_trip_rate(year=2019),
        "avg_cost_per_trip_cents_before":
            lambda: pooled.avg_cost_per_trip_cents_by_area(before_start, before_end),
    }
    return community.merge_metrics(metric_loaders)

In [10]:
class CombinedPooledTripMetrics:
    """
    Computes all pooled-trips metrics for both periods with a single
    aggregate query over ``rideshare`` instead of one query per metric.
    """

    def __init__(self, con):
        """Store the database connection that every query runs on."""
        self.con = con

    def _fetch_dicts(self, query, params=()):
        """Execute ``query`` with ``params`` and return the rows via ``rows_to_dicts``."""
        cur = self.con.cursor()
        cur.execute(query, params)
        return rows_to_dicts(cur, cur.fetchall())

    def all_metrics_by_period(self, before, since):
        """
        Aggregate rideshare rows per pickup community area and period.

        Args:
            before: ``(start, end)`` date strings of the first period.
            since: ``(start, end)`` date strings of the second period.
                In both, ``start`` is inclusive and ``end`` is exclusive.
                A row that falls in both ranges is counted as "before".

        Returns:
            The result of ``rows_to_dicts`` for one row per
            (area_number, period) with the columns ``area_number``,
            ``period`` ("before" or "since"), ``pooled_trip_rate``,
            ``cost_per_trip`` and ``trips_per_day``.
        """
        before_start, before_end = before
        since_start, since_end = since
        # Named placeholders: every bound is used more than once in the query.
        query_vars = {
            "before_start": before_start,
            "before_end": before_end,
            "since_start": since_start,
            "since_end": since_end,
        }
        # The inner query classifies and aggregates; the outer query divides
        # the trip total by the length of the row's OWN period, so periods of
        # different lengths still yield a correct per-day average.
        # String literals are single-quoted: SQLite treats double-quoted
        # tokens as identifiers first.
        query = """
        SELECT
            area_number,
            period,
            pooled_trip_rate,
            cost_per_trip,
            CAST(
                total_trips
                / CASE period
                    WHEN 'before'
                        THEN JULIANDAY(:before_end) - JULIANDAY(:before_start)
                    ELSE JULIANDAY(:since_end) - JULIANDAY(:since_start)
                    END
                as INTEGER) as trips_per_day
        FROM (
            SELECT
                pickup_community_area as area_number,
                CASE
                    WHEN ymd >= :before_start AND ymd < :before_end THEN 'before'
                    WHEN ymd >= :since_start AND ymd < :since_end THEN 'since'
                    ELSE 'other'
                    END as period,
                CAST(SUM(n_trips_pooled) as REAL)
                    / CAST(SUM(n_trips) as REAL) as pooled_trip_rate,
                CAST(
                    SUM(n_trips * avg_cost_no_tip_cents)
                    / SUM(n_trips)
                    as INTEGER) as cost_per_trip,
                SUM(n_trips) as total_trips
            FROM rideshare
            GROUP BY area_number, period
            HAVING period != 'other'
        )
        """
        return self._fetch_dicts(query, query_vars)

    def community_areas(self):
        """
        Returns all of the community areas.
        """
        query = """
        SELECT
            area_number,
            name,
            part
        FROM community_area
        """
        return self._fetch_dicts(query)

    def to_metrics(self, rows):
        """
        Merge per-period rows into one record per community area.

        Starts from every row of ``community_areas()`` and adds the
        ``*_before`` / ``*_since`` keys from the matching ``rows``. Areas
        without a matching row keep only their base columns. Rows whose
        ``area_number`` is not a known community area (for example a NULL
        pickup area) are skipped instead of raising ``KeyError``.

        Returns:
            A list with one record per community area.
        """
        by_area = {area["area_number"]: area for area in self.community_areas()}
        for row in rows:
            record = by_area.get(row["area_number"])
            if record is None:
                # Not attributable to any community area; nothing to attach it to.
                continue
            period = row["period"]
            if period == "before":
                record["rideshare_pooled_trip_rate_before"] = row["pooled_trip_rate"]
                record["avg_cost_per_trip_cents_before"] = row["cost_per_trip"]
                record["avg_trips_per_day_before"] = row["trips_per_day"]
            elif period == "since":
                record["avg_cost_per_trip_cents_since"] = row["cost_per_trip"]
                record["avg_trips_per_day_since"] = row["trips_per_day"]
        return list(by_area.values())


def pooled_trips_combined(con):
    """
    Compute the pooled-trips metrics through ``CombinedPooledTripMetrics``.

    Fetches the per-period rows for the two fixed date ranges and returns
    them after conversion by ``to_metrics``.
    """
    combined = CombinedPooledTripMetrics(con)
    period_rows = combined.all_metrics_by_period(
        ("2019-02-01", "2020-03-02"),
        ("2020-03-02", "2021-04-01"),
    )
    return combined.to_metrics(period_rows)

### Time Methods On Daily Rideshares

In [11]:
%%time
# All five separately merged metrics, while `rideshare` is still the daily table.
m11 = pooled_trips(con)
print(f"Got metrics for {len(m11)} records.")

Got metrics for 77 records.
CPU times: user 15.1 s, sys: 2.01 s, total: 17.2 s
Wall time: 18.5 s


In [12]:
%%time
# Two-metric subset of the separate-metrics approach, on the daily table.
m12 = pooled_trips_two_metrics(con)
print(f"Got metrics for {len(m12)} records.")

Got metrics for 77 records.
CPU times: user 7.15 s, sys: 758 ms, total: 7.91 s
Wall time: 8.28 s


In [13]:
%%time
# Single combined aggregate query, on the daily table.
m13 = pooled_trips_combined(con)
print(f"Got metrics for {len(m13)} records.")

Got metrics for 77 records.
CPU times: user 9.76 s, sys: 2.2 s, total: 12 s
Wall time: 12.8 s


### Replace Daily Rideshare Table With Weekly Rideshare View

In [14]:
# Move the daily table aside so the name `rideshare` is free to be reused.
# Not idempotent: run once per fresh database state.
run_query("ALTER TABLE rideshare RENAME TO rideshare_daily")

In [15]:
# Recreate `rideshare` as a view over rideshare_daily that keeps only the rows
# whose ymd equals their week value, so code that queries `rideshare` runs
# unchanged against the smaller weekly row set.
query_weekly_rideshare_view = """
CREATE VIEW rideshare AS
SELECT *
FROM rideshare_daily
WHERE ymd == week
"""
run_query(query_weekly_rideshare_view)

In [16]:
# Re-list tables and views to check the result of the rename and view creation.
run_query(query_tables_views)

Unnamed: 0,name
0,community_area
1,population
2,income
3,covid_spread
4,rideshare_daily
5,rideshare


### Time Methods On Weekly Rideshares

In [17]:
%%time
# Same five separately merged metrics, now that `rideshare` is the weekly view.
m21 = pooled_trips(con)
print(f"Got metrics for {len(m21)} records.")

Got metrics for 77 records.
CPU times: user 7.35 s, sys: 648 ms, total: 8 s
Wall time: 8.78 s


In [18]:
%%time
# Two-metric subset, now that `rideshare` is the weekly view.
m22 = pooled_trips_two_metrics(con)
print(f"Got metrics for {len(m22)} records.")

Got metrics for 77 records.
CPU times: user 3.93 s, sys: 286 ms, total: 4.21 s
Wall time: 4.48 s


In [19]:
%%time
# Single combined aggregate query, now that `rideshare` is the weekly view.
m23 = pooled_trips_combined(con)
print(f"Got metrics for {len(m23)} records.")

Got metrics for 77 records.
CPU times: user 2.01 s, sys: 321 ms, total: 2.33 s
Wall time: 2.52 s


### Reset Database State

In [20]:
%%bash
# Stop at the first failing command (and on unset variables) so that a failed
# `cd` or `make` is not followed by archive steps run in the wrong place or
# against stale files.
set -euo pipefail

# Run the `uncompressed` make target from inside pipeline/.
cd pipeline
make uncompressed
cd ..

# Reassemble the split archive parts, unpack the result, then remove the
# temporary archive.
cat compressed/db-part-* > compressed.db.tgz
tar -xf compressed.db.tgz
rm compressed.db.tgz


## Conclusion

For the pooled trips question:

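- If we want to compute this metric online:
    - Reduce the rideshare table to weekly granularity instead of daily (18.5 s → 8.8 s wall time for the five separate metrics)
    - Combine the metrics into one query rather than running separate queries (8.8 s → 2.5 s wall time on the weekly view)
- Otherwise, we can compute this metric offline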

For other questions:

- Other datasets should be much smaller than the rideshare dataset
    - However, computing many metrics separately may still add up
- The taxi dataset should be as big as, if not bigger than, the rideshare dataset
    - We could also limit it by year