# New York City Taxi Analysis
In this example we show some analysis for yellow and green taxi trips originating in New York City in 2019. 

The original example can be found [here](https://github.com/toddwschneider/nyc-taxi-data).

The queries are originally in SQL format and here they are implemented using the pandas API.

### Notes on running these queries:

Bodo is used by default; it distributes data chunks across cores automatically.

Dataset size is ~8GB.

To run the code:
1. Make sure you [add your AWS account credentials to Saturn Cloud](https://saturncloud.io/docs/examples/python/load-data/qs-load-data-s3/#create-aws-credentials) to access the data.
2. If you want to run a query in regular pandas:
    1. Comment out the lines with the Jupyter parallel magic (`%%px`) and the Bodo decorator (`@bodo.jit`) in all the code cells.
    2. Then, re-run cells from the beginning.



### Start an IPyParallel cluster
Run the following code in a cell to start an IPyParallel cluster. One engine is started per physical core, up to a maximum of 8. 

In [None]:
import ipyparallel as ipp
import psutil

# One MPI engine per physical core, capped at 8. psutil returns None when the
# physical core count cannot be determined, which would make min() raise a
# TypeError, so fall back to the logical core count and finally to one engine.
n = min(psutil.cpu_count(logical=False) or psutil.cpu_count() or 1, 8)
rc = ipp.Cluster(engines="mpi", n=n).start_and_connect_sync(activate=True)

### Verifying your setup
Run the following code to verify that your IPyParallel cluster is set up correctly:

In [None]:
%%px
import bodo

# Sanity check for the cluster: meant to run under the %%px magic so that each
# engine prints its own rank together with the total number of ranks.
print(f"Hello World from rank {bodo.get_rank()}. Total ranks={bodo.get_size()}")

In [None]:
%%px
import time

import bodo
import numpy as np
import pandas as pd

<a id="loading_data"></a>
## Loading data

In this section, we load the 2019 yellow and green taxi trips and the Central Park weather observations into pandas DataFrames.

In [None]:
%%px
@bodo.jit(distributed=["trips"], cache=True)
def get_trips():
    """Load the 2019 yellow and green taxi trips into a single DataFrame.

    Five columns are read from each CSV file (selected by position) and a
    ``cab_type_id`` column is added: 0 for yellow cabs, 1 for green cabs.
    The yellow pickup timestamp column is renamed to ``lpep_pickup_datetime``
    so both frames use the same column name before they are concatenated.

    Prints the elapsed wall-clock time and returns the combined DataFrame.
    """
    t0 = time.time()

    # Yellow cabs: tag with id 0 and adopt the green pickup column name.
    yellow = pd.read_csv(
        "s3://bodo-example-data/nyc-taxi/yellow_tripdata_2019.csv",
        usecols=[0, 1, 4, 7, 8],
        parse_dates=["tpep_pickup_datetime"],
    )
    yellow["cab_type_id"] = 0
    yellow = yellow.rename(
        columns={"tpep_pickup_datetime": "lpep_pickup_datetime"}, copy=False
    )

    # Green cabs: tag with id 1.
    green = pd.read_csv(
        "s3://bodo-example-data/nyc-taxi/green_tripdata_2019.csv",
        usecols=[0, 1, 5, 6, 8],
        parse_dates=["lpep_pickup_datetime"],
    )
    green["cab_type_id"] = 1

    # Stack both fleets, green frame first.
    trips = pd.concat([green, yellow])

    elapsed = time.time() - t0
    print("Time: ", elapsed)
    return trips


trips = get_trips()
# Show a preview from one rank only to avoid duplicated output.
if bodo.get_rank() == 0:
    print(trips.head())

In [None]:
%%px
@bodo.jit(distributed=["central_park_weather_observations"], cache=True)
def get_cp_weather():
    """Load the Central Park weather observations.

    The ``date`` column is parsed as a timestamp while reading and then
    replaced by its ``.dt.date`` value. Prints the elapsed wall-clock time
    and returns the resulting DataFrame.
    """
    t0 = time.time()

    central_park_weather_observations = pd.read_csv(
        "s3://bodo-example-data/nyc-taxi/central_park_weather.csv",
        parse_dates=["date"],
    )

    # Keep only the calendar date of each observation.
    observation_dates = central_park_weather_observations["date"]
    central_park_weather_observations["date"] = observation_dates.dt.date

    elapsed = time.time() - t0
    print("Time: ", elapsed)
    return central_park_weather_observations


central_park_weather_observations = get_cp_weather()
# Show a preview from one rank only to avoid duplicated output.
if bodo.get_rank() == 0:
    print(central_park_weather_observations.head())

## Query Definitions

This section includes some of the queries using Python (Pandas)

### Q1: Display pickups by geography
This query reports the number of trips per day for each pickup location and cab type.

In [None]:
%%px
@bodo.jit(distributed=["daily_pickups_taxi", "trips"], cache=True)
def get_daily_pickups(trips):
    """Count trips per cab type, pickup location and pickup date.

    Args:
        trips: DataFrame with at least the ``cab_type_id``, ``PULocationID``
            and ``lpep_pickup_datetime`` columns.

    Returns:
        DataFrame with the columns ``cab_type_id``, ``pickup_location_id``,
        ``date`` and ``trips``, sorted ascending by the first three columns
        and descending by ``trips``. The elapsed wall-clock time is printed.
    """
    t0 = time.time()

    daily_pickups_taxi = trips.loc[
        :, ["cab_type_id", "PULocationID", "lpep_pickup_datetime"]
    ]
    pickup_times = daily_pickups_taxi["lpep_pickup_datetime"]
    daily_pickups_taxi["pickup_date"] = pickup_times.dt.date

    # One group per (cab type, pickup zone, day); count the pickups in each.
    per_day = daily_pickups_taxi.groupby(
        ["cab_type_id", "PULocationID", "pickup_date"], as_index=False
    )
    daily_pickups_taxi = (
        per_day["lpep_pickup_datetime"]
        .count()
        .rename(
            columns={
                "PULocationID": "pickup_location_id",
                "pickup_date": "date",
                "lpep_pickup_datetime": "trips",
            },
            copy=False,
        )
        .sort_values(
            by=["cab_type_id", "pickup_location_id", "date", "trips"],
            ascending=[True, True, True, False],
        )
    )

    elapsed = time.time() - t0
    print("Time: ", elapsed)
    return daily_pickups_taxi


daily_pickups = get_daily_pickups(trips)
# Show a preview from one rank only to avoid duplicated output.
if bodo.get_rank() == 0:
    print(daily_pickups.head())

### Q2: JFK Hourly Pickups
This query reports the number of pickups at JFK airport for each hour of the day and each cab type.

In [None]:
%%px
@bodo.jit(distributed=["jfk_hourly", "trips"], cache=True)
def get_jfk_hourly_pickups(trips):
    """Count pickups at location 132 per cab type and hour of day.

    Args:
        trips: DataFrame with at least the ``cab_type_id``, ``PULocationID``
            and ``lpep_pickup_datetime`` columns.

    Returns:
        DataFrame with the columns ``cab_type_id``, ``pickup_hour``,
        ``pickup_location_id`` and ``trips``, sorted ascending by the first
        three columns and descending by ``trips``. The elapsed wall-clock
        time is printed.
    """
    t0 = time.time()

    jfk_hourly = trips.loc[:, ["cab_type_id", "PULocationID", "lpep_pickup_datetime"]]
    pickup_times = jfk_hourly["lpep_pickup_datetime"]
    jfk_hourly["pickup_hour"] = pickup_times.dt.hour

    # Keep only pickups in the zone this query treats as JFK airport.
    is_jfk = jfk_hourly["PULocationID"] == 132
    jfk_hourly = jfk_hourly.loc[is_jfk]

    per_hour = jfk_hourly.groupby(
        ["cab_type_id", "pickup_hour", "PULocationID"], as_index=False
    )
    jfk_hourly = (
        per_hour["lpep_pickup_datetime"]
        .count()
        .rename(
            columns={
                "lpep_pickup_datetime": "trips",
                "PULocationID": "pickup_location_id",
            },
            copy=False,
        )
        .sort_values(
            by=["cab_type_id", "pickup_hour", "pickup_location_id", "trips"],
            ascending=[True, True, True, False],
        )
    )

    elapsed = time.time() - t0
    print("Time: ", elapsed)
    return jfk_hourly


jfk_hourly = get_jfk_hourly_pickups(trips)
# Show a preview from one rank only to avoid duplicated output.
if bodo.get_rank() == 0:
    print(jfk_hourly.head())

### Q3: Weekday trips
This query reports how many trips are done during weekdays.

In [None]:
%%px
@bodo.jit(distributed=["trips", "trips_weekdays"], cache=True)
def get_weekday_trips(trips):
    """Count weekday (Monday-Friday) trips per pickup/drop-off location pair.

    Only rows with ``cab_type_id`` 0 or 1 and a pickup timestamp in
    [2018-07-01, 2020-07-01) are considered.

    Args:
        trips: DataFrame with at least the ``cab_type_id``,
            ``lpep_pickup_datetime``, ``PULocationID`` and ``DOLocationID``
            columns.

    Returns:
        DataFrame with the columns ``pickup_location_id``,
        ``dropoff_location_id`` and ``trips``, sorted ascending by the two
        location ids and descending by ``trips``. The elapsed wall-clock
        time is printed.
    """
    start = time.time()
    trips_weekdays = trips.loc[
        :, ["cab_type_id", "lpep_pickup_datetime", "PULocationID", "DOLocationID"]
    ]
    trips_weekdays["pickup_dow"] = trips_weekdays["lpep_pickup_datetime"].dt.dayofweek
    trips_weekdays = trips_weekdays[
        (trips_weekdays["cab_type_id"].isin([0, 1]))
        & (trips_weekdays["lpep_pickup_datetime"] >= pd.to_datetime("2018-07-01"))
        & (trips_weekdays["lpep_pickup_datetime"] < pd.to_datetime("2020-07-01"))
        # pandas numbers days Monday=0 .. Sunday=6, so Monday-Friday is 0-4.
        # (1-5 would select Tuesday-Saturday.)
        & (trips_weekdays["pickup_dow"].isin([0, 1, 2, 3, 4]))
    ]
    # count() yields per-column non-null counts; the pickup timestamp column
    # is kept below as the trip count.
    trips_weekdays = trips_weekdays.groupby(
        ["PULocationID", "DOLocationID"], as_index=False
    ).count()
    trips_weekdays = trips_weekdays.loc[
        :, ["PULocationID", "DOLocationID", "lpep_pickup_datetime"]
    ]
    trips_weekdays = trips_weekdays.rename(
        columns={
            "PULocationID": "pickup_location_id",
            "DOLocationID": "dropoff_location_id",
            "lpep_pickup_datetime": "trips",
        },
        copy=False,
    )
    trips_weekdays = trips_weekdays.sort_values(
        by=["pickup_location_id", "dropoff_location_id", "trips"],
        ascending=[True, True, False],
    )
    end = time.time()
    print("Time: ", end - start)
    return trips_weekdays


wd_trips = get_weekday_trips(trips)
# Show a preview from one rank only to avoid duplicated output.
if bodo.get_rank() == 0:
    print(wd_trips.head())

### Q4: Monthly Trips and Weather in Central Park
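This query joins the trips with the Central Park weather observations and reports, for weekdays with precipitation, the number of trips and the average trip distance per pickup/drop-off location pair, month, weekday, precipitation value, and time of day.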


In [None]:
%%px
@bodo.jit(
    distributed=["trips", "central_park_weather_observations", "monthly_trips_weather"],
    cache=True,
)
def get_monthly_travels_weather(trips, central_park_weather_observations):
    """Aggregate weekday trips on days with precipitation above 0.1.

    Trips are inner-joined with the weather observations on ``date``,
    restricted to Monday-Friday and to rows whose ``precipitation`` exceeds
    0.1, and each pickup hour is mapped to a time bucket. The result is
    grouped by pickup location, drop-off location, month, weekday,
    precipitation and time bucket.

    Args:
        trips: DataFrame with at least the ``VendorID``,
            ``lpep_pickup_datetime``, ``PULocationID``, ``DOLocationID`` and
            ``trip_distance`` columns.
        central_park_weather_observations: DataFrame with at least the
            ``date`` and ``precipitation`` columns.

    Returns:
        DataFrame with one row per group, holding the trip count (``trips``)
        and the mean trip distance (``avg_distance``); the precipitation
        column is renamed to ``date_with_precipitation``. Rows are sorted
        ascending by the group keys and descending by trip count. The elapsed
        wall-clock time is printed.
    """
    start = time.time()
    trips = trips.loc[
        :,
        [
            "VendorID",
            "lpep_pickup_datetime",
            "PULocationID",
            "DOLocationID",
            "trip_distance",
        ],
    ]
    trips["date"] = trips["lpep_pickup_datetime"].dt.date
    trips["month"] = trips["lpep_pickup_datetime"].dt.month
    trips["hour"] = trips["lpep_pickup_datetime"].dt.hour
    trips["weekday"] = trips["lpep_pickup_datetime"].dt.dayofweek
    monthly_trips_weather = trips.merge(
        central_park_weather_observations, on="date", how="inner"
    )
    monthly_trips_weather = monthly_trips_weather[
        # pandas numbers days Monday=0 .. Sunday=6, so Monday-Friday is 0-4.
        # (1-5 would select Tuesday-Saturday.)
        (monthly_trips_weather["weekday"].isin([0, 1, 2, 3, 4]))
        & (monthly_trips_weather["precipitation"] > 0.1)
    ]
    # 0: morning, 1:midday, 2:afternoon, 3:evening, 4:other
    # Every hour 0-23 has exactly one entry, so no hour is left unmapped.
    monthly_trips_weather["time_bucket"] = monthly_trips_weather.hour.replace(
        {
            8: 0,
            9: 0,
            10: 0,
            11: 1,
            12: 1,
            13: 1,
            14: 1,
            15: 1,
            16: 2,
            17: 2,
            18: 2,
            19: 3,
            20: 3,
            21: 3,
            22: 4,
            23: 4,
            0: 4,
            1: 4,
            2: 4,
            3: 4,
            4: 4,
            5: 4,
            6: 4,
            7: 4,
        }
    )
    # VendorID is only used as a column to count rows per group.
    monthly_trips_weather = monthly_trips_weather.groupby(
        [
            "PULocationID",
            "DOLocationID",
            "month",
            "weekday",
            "precipitation",
            "time_bucket",
        ],
        as_index=False,
    ).agg({"VendorID": "count", "trip_distance": "mean"})
    monthly_trips_weather = monthly_trips_weather.sort_values(
        by=[
            "PULocationID",
            "DOLocationID",
            "month",
            "weekday",
            "precipitation",
            "time_bucket",
            "VendorID",
        ],
        ascending=[True, True, True, True, True, True, False],
    )
    monthly_trips_weather = monthly_trips_weather.rename(
        columns={
            "VendorID": "trips",
            "trip_distance": "avg_distance",
            "precipitation": "date_with_precipitation",
        },
        copy=False,
    )
    end = time.time()
    print("Time: ", end - start)
    return monthly_trips_weather


monthly_trips_weather = get_monthly_travels_weather(
    trips, central_park_weather_observations
)
# Show a preview from one rank only to avoid duplicated output.
if bodo.get_rank() == 0:
    print(monthly_trips_weather.head())

In [None]:
# To stop the cluster, run the following command. `rc` is the client returned
# when the cluster was started; the call is made on its `cluster` attribute.
rc.cluster.stop_cluster_sync()