<a href="https://colab.research.google.com/github/samjurassic/datascience-demo/blob/main/workshop/DS_Skills_Lab_Colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Skills Lab: Exploring NYC Taxi Data with Pandas

**Goal:** Learn how to load, explore, clean, merge, and analyze real-world tabular data in Python using `pandas`.

**Learning objectives:**
- Load data from URLs into DataFrames.
- Inspect and summarize data using `head()`, `info()`, and `describe()`.
- Merge datasets and compute aggregates using `groupby()`.
- Visualize results with `seaborn` and `matplotlib`.


Links:

- We will be using NYC taxi data. The code will automatically download the files, but you can find the files and other links here: https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page
- Read the Yellow Taxi data dictionary: https://www.nyc.gov/assets/tlc/downloads/pdf/data_dictionary_trip_records_yellow.pdf


In [None]:
# Core libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Display tweaks: show up to 20 columns when a DataFrame is rendered,
# and use seaborn's "whitegrid" theme for the plots in this notebook.
pd.options.display.max_columns = 20
sns.set_theme(style="whitegrid")

# Print the library versions this run is using
print(f"pandas {pd.__version__}, seaborn {sns.__version__}")


In [None]:
# Remote locations of the two input files (pandas readers accept URLs as well as file paths).
# The trip file holds the January 2024 data.
taxi_zone_url = "https://d37ci6vzurychx.cloudfront.net/misc/taxi_zone_lookup.csv"
taxi_url = "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-01.parquet"

# Each format has its own pd.read_* function: CSV for the zone lookup, Parquet for the trips.
taxi_zones = pd.read_csv(taxi_zone_url)
trips = pd.read_parquet(taxi_url, engine="pyarrow")

In [None]:
# .head(n) displays the first n rows of the dataframe (n defaults to 5)
trips.head(n=5)

In [None]:
# first rows of the zone lookup table
taxi_zones.head(n=5)

In [None]:
# info() shows us the metadata and types of the dataframe:
# one line per column with its dtype and non-null count, plus memory usage.
# It prints directly, so it does not need to be the last expression in the cell.
trips.info()

In [None]:
# Derive new columns from the pickup/dropoff timestamps via the .dt accessor.
pickup_ts = trips["tpep_pickup_datetime"]

trips = trips.assign(
    # elapsed time between pickup and dropoff, as a timedelta
    trip_timedelta=trips["tpep_dropoff_datetime"] - pickup_ts,
    # calendar parts of the pickup timestamp
    pickup_date=pickup_ts.dt.date,
    pickup_day=pickup_ts.dt.day,
    pickup_dow=pickup_ts.dt.day_name(),
    pickup_dow_num=pickup_ts.dt.day_of_week,
    pickup_hour=pickup_ts.dt.hour,
)

# preview the source timestamp next to the parts extracted from it
trips.loc[
    :,
    [
        "trip_timedelta",
        "tpep_pickup_datetime",
        "pickup_date",
        "pickup_day",
        "pickup_dow",
        "pickup_dow_num",
    ],
].head()


In [None]:
# Trip duration in minutes. The vectorized .dt.total_seconds() converts the whole
# timedelta column at once instead of calling a Python lambda once per row.
trips["trip_minutes"] = trips["trip_timedelta"].dt.total_seconds() / 60.0

In [None]:
# compare the raw timestamps with the derived duration columns
trips.loc[
    :,
    ["trip_timedelta", "tpep_pickup_datetime", "tpep_dropoff_datetime", "trip_minutes"],
].head()

## Exploring data with seaborn

In [None]:
# data quality check: rows whose pickup timestamp is before 2024-01-01
# (boolean-mask form of the SQL-like trips.query(...) expression)
trips[trips["tpep_pickup_datetime"] < "2024-01-01"]

In [None]:
# Total passengers per pickup date.
# The filter uses the pickup timestamp, the same column pickup_date is derived
# from. Filtering on the dropoff timestamp would keep rides that were picked up
# before 2024-01-01 and add a stray bar for the earlier date.
passengers_per_day = (
    trips
    .query("tpep_pickup_datetime >= '2024-01-01'")
    .groupby("pickup_date", as_index=False)
    .agg({"passenger_count": "sum"})
)

# seaborn plot syntax is generally: sns.barplot(data=df, x="column_a", y="column_b")
sns.barplot(data=passengers_per_day, x="pickup_date", y="passenger_count")

# rotate ticks so the date labels stay readable
plt.xticks(rotation=85)
plt.show()

In [None]:
# let's look at average total passengers by day of the week
passengers_per_day_of_week = (
    trips
    # filter on the pickup timestamp: every grouping key below is derived from it
    .query("tpep_pickup_datetime >= '2024-01-01'")
    # step 1: sum all passengers for each pickup date
    # (the day-of-week columns are kept in the key so they survive as labels)
    .groupby(["pickup_dow_num", "pickup_dow", "pickup_date"], as_index=False)
    .agg({"passenger_count": "sum"})
    # outlier filter: keep only dates with more than 10,000 passengers in total
    .query("passenger_count > 10000")
    # step 2: average over the days of week in a month (e.g. all Fridays)
    .groupby(["pickup_dow_num", "pickup_dow"], as_index=False)
    .agg({"passenger_count": "mean"})
    # order the rows by weekday number rather than alphabetically by name
    .sort_values(by="pickup_dow_num")
)

sns.barplot(data=passengers_per_day_of_week, x="pickup_dow", y="passenger_count")

# rotate ticks
plt.xticks(rotation=45)
plt.show()

In [None]:
# step 1: total passengers for every pickup date, labelled with its day of week
# (e.g. one row for each Monday in the month)
(
    trips
    .groupby(["pickup_dow_num", "pickup_dow", "pickup_date"])["passenger_count"]
    .sum()
    .reset_index()
    .sort_values("pickup_date")
)


In [None]:
# step 2: average over the days of week in a month (e.g. all Fridays)
# The step-1 daily totals are rebuilt explicitly here instead of being read from
# IPython's `_` (the last displayed output). `_` points at something else as soon
# as any other cell is run in between, so the result depended on execution order.
temp_df = (
    trips
    .groupby(["pickup_dow_num", "pickup_dow", "pickup_date"], as_index=False)
    .agg({"passenger_count": "sum"})
    .sort_values(by="pickup_date")
)

(temp_df
 .groupby(["pickup_dow_num", "pickup_dow"], as_index=False)
 .agg({"passenger_count": "mean"})
 .sort_values(by="pickup_dow_num"))

### 1 - How many pickups happened at each airport?

We'll merge the taxi zone reference table with trip data to map location IDs to airport names, then group and count.


In [None]:
# Attach zone names to each trip via its pickup location ID, keep only the
# airport zones, then count pickups per zone (largest first).
result_1 = (
    trips
    .merge(taxi_zones, how="left", left_on="PULocationID", right_on="LocationID")
    .loc[lambda d: d["Zone"].isin(["JFK Airport", "LaGuardia Airport", "Newark Airport"])]
    .groupby("Zone")["PULocationID"]
    .count()
    .reset_index(name="pickup_count")
    .sort_values("pickup_count", ascending=False)
)

result_1

In [None]:
# one bar per airport zone, height = number of pickups
sns.barplot(data=result_1, x="Zone", y="pickup_count")

### 2 - How many dropoffs happened at each NYC airport?

In [None]:
# Same idea as the pickup count, but joined on the dropoff ID (DOLocationID)
# so each trip is labelled with the zone where it ended.
result_2 = (
    trips
    .merge(taxi_zones, how="left", left_on="DOLocationID", right_on="LocationID")
    .loc[lambda d: d["Zone"].isin(["JFK Airport", "LaGuardia Airport", "Newark Airport"])]
    .groupby("Zone")["DOLocationID"]
    .count()
    .reset_index(name="dropoff_count")
    .sort_values("dropoff_count", ascending=False)
)

result_2

In [None]:
# one bar per airport zone, height = number of dropoffs
sns.barplot(data=result_2, x="Zone", y="dropoff_count")

### 3 - What is the total amount of airport fees collected at each NYC airport? (JFK and LaGuardia)

Tip: the airport fee is collected by the taxi meter when a trip is picked up at an airport.

In [None]:
# Sum the airport fee and count pickups per airport zone, joining on the pickup
# location ID. Named aggregation sets the output column names directly, so no
# separate rename step is needed.
result_3 = (
    trips
    .merge(taxi_zones, how="left", left_on="PULocationID", right_on="LocationID")
    .query("Zone in ['JFK Airport', 'LaGuardia Airport', 'Newark Airport']")
    .groupby("Zone", as_index=False)
    .agg(
        airport_fee_sum=("Airport_fee", "sum"),
        pickup_count=("PULocationID", "count"),
    )
)

result_3

In [None]:
# one bar per airport zone, height = total airport fees
sns.barplot(data=result_3, x="Zone", y="airport_fee_sum")

### 4 - Which destination borough had the highest tips per mile?

In [None]:
# Tip and distance summaries per destination borough (joined on the dropoff ID).
borough_metrics = (
    trips
    .merge(
        taxi_zones,
        left_on="DOLocationID",
        right_on="LocationID",
        how="left"
    )
    .groupby("Borough", as_index=False)
    .agg(
        total_tips=('tip_amount', 'sum'),
        average_tips=('tip_amount', 'mean'),
        dropoff_count=('DOLocationID', 'count'),
        avg_trip_distance=('trip_distance', 'mean'),
        total_distance=('trip_distance', 'sum')
    )
    .assign(
        # assign() evaluates its keyword arguments in order and later ones see the
        # earlier results, so the ratio is computed FIRST, from the exact totals.
        # Placed after the rounding below, it would divide the rounded tip total.
        tip_per_mile=lambda d: d["total_tips"] / d["total_distance"],
        # rounding is for display only
        avg_trip_distance=lambda d: d['avg_trip_distance'].round(2),
        total_tips=lambda d: d['total_tips'].round(0),
    )
    .sort_values("total_tips", ascending=False)
)


In [None]:
# check: number of lookup-table rows (zones) for each borough value
taxi_zones["Borough"].value_counts()

In [None]:
# display the per-borough summary table (sorted by total tips, largest first)
borough_metrics

In [None]:
# lookup rows whose borough is recorded as 'Unknown'
taxi_zones[taxi_zones["Borough"] == "Unknown"]  # looks like missing information

In [None]:
# tips per mile by destination borough, leaving out the 'Unknown' placeholder
sns.barplot(
    data=borough_metrics.query("Borough != 'Unknown'"),
    x="Borough",
    y="tip_per_mile",
    hue="Borough",
)

### 5 - What were the top 10 pickup locations by number of passengers?

In [None]:
# Total passengers per pickup zone, keeping the ten largest.
result_5 = (
    trips
    .merge(taxi_zones, how="left", left_on="PULocationID", right_on="LocationID")
    .groupby("Zone", as_index=False)
    .agg(passenger_count=("passenger_count", "sum"))
    .sort_values(by="passenger_count", ascending=False)
    .head(10)
)

In [None]:
ax = sns.barplot(data=result_5, x="Zone", y="passenger_count")

# tilt the zone names so the long labels do not overlap
ax.tick_params(axis="x", labelrotation=-65)
plt.show()

## ✅ Wrap-Up

In this lab, you practiced:
- Loading real data into pandas
- Extracting features from datetime columns
- Merging datasets and computing aggregates
- Visualizing simple summaries

**Next Steps**
- Explore averages (`.mean()`, `.median()`) for trip durations or fares.
- Try grouping by hour or day of week.
- Experiment with `pd.to_datetime()` and `.dt` accessors for other date parts.


## Bonus: Machine Learning

### Predicting Trip Duration with a Decision Tree

Now that we've explored taxi trip data, let's take it a step further and **predict trip duration** (in minutes)  
from a few simple features:
- **Day of week**
- **Borough (dropoff location)**
- **Hour of day**
- **Weather** (rain vs. no rain)

We'll use `scikit-learn`'s `DecisionTreeRegressor` to build a simple model.


In [None]:
# Install dependencies (Colab safe)
# !pip install scikit-learn requests
import requests
from datetime import datetime

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.metrics import mean_absolute_error, r2_score

In [None]:
# Label each trip with its dropoff borough, then keep trips picked up on or
# after 2023-12-31 whose borough is not the 'Unknown' placeholder.
trips_b = (
    trips
    .merge(
        taxi_zones[["LocationID", "Borough"]],
        how="left",
        left_on="DOLocationID",
        right_on="LocationID",
    )
    .loc[lambda d: (d["tpep_pickup_datetime"] >= "2023-12-31") & (d["Borough"] != "Unknown")]
)

trips_b.loc[:, ["trip_minutes", "pickup_hour", "pickup_dow", "Borough"]].head()

In [None]:
# Pick approximate date range from dataset
start_date = trips_b.pickup_date.min()
end_date = trips_b.pickup_date.max()

# A day counts as rainy when its total precipitation (in mm) exceeds this value.
RAIN_THRESHOLD_MM = 6.0

# Query daily weather summary for NYC from open-meteo
url = (
    f"https://archive-api.open-meteo.com/v1/archive?"
    f"latitude=40.7&longitude=-74.0&start_date={start_date}&end_date={end_date}"
    f"&daily=precipitation_sum&timezone=America%2FNew_York"
)

# The timeout stops a stalled connection from hanging the notebook, and
# raise_for_status() surfaces an HTTP error here instead of as a confusing
# KeyError on "daily" below.
resp = requests.get(url, timeout=30)
resp.raise_for_status()
data = resp.json()["daily"]

# precipitation is in mm
precipitation_mm = np.array(data["precipitation_sum"])
weather_df = pd.DataFrame({
    "date": pd.to_datetime(data["time"]),
    "rain_total": precipitation_mm,
    # 1 = rainy day, 0 = not rainy
    "rain": (precipitation_mm > RAIN_THRESHOLD_MM).astype(int)
})

weather_df.tail()


In [None]:
# summary statistics of daily precipitation totals
weather_df["rain_total"].describe()

In [None]:
# Attach the daily weather to every trip: convert pickup_date to a datetime
# column named "date" so it can be joined to weather_df's "date" column.
trips_w = (
    trips_b
    .assign(date=pd.to_datetime(trips_b["pickup_date"]))
    .merge(weather_df, on="date", how="left")
)


In [None]:
# average trip duration (minutes) on non-rainy (0) vs rainy (1) days
trips_w.groupby("rain")["trip_minutes"].agg("mean")

In [None]:
# Build the modelling table: select the needed columns, drop rows with any
# missing value, then one-hot encode the two categorical columns.
model_df = pd.get_dummies(
    trips_w[["tpep_pickup_datetime", "trip_minutes", "pickup_hour", "pickup_dow", "Borough", "rain"]].dropna(),
    columns=["pickup_dow", "Borough"],
    drop_first=True,
)

# Time-based split: the earliest `train_frac` share of trips (by pickup time)
# is used for training, the remainder for testing.
# X is your input variables, y is what you are trying to predict.
train_frac = 0.75
df = model_df.sort_values("tpep_pickup_datetime").reset_index(drop=True)
split_point = df["tpep_pickup_datetime"].quantile(train_frac).strftime("%Y-%m-%d")

print(f"Splitting data on {split_point}")

# Evaluate the date condition once and reuse it for all four pieces.
is_train = df["tpep_pickup_datetime"] < split_point
non_feature_cols = ["trip_minutes", "tpep_pickup_datetime"]

X_train = df[is_train].drop(columns=non_feature_cols)
X_test = df[~is_train].drop(columns=non_feature_cols)

y_train = df.loc[is_train, "trip_minutes"]
y_test = df.loc[~is_train, "trip_minutes"]


In [None]:
# Fit a shallow tree (fit() returns the estimator itself, so it can be chained)
tree = DecisionTreeRegressor(max_depth=3, random_state=42).fit(X_train, y_train)

# Evaluate by using the model to predict the held-out test data
y_pred = tree.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"MAE: {mae:.2f}")
print(f"R²: {r2:.2f}")

Even a simple tree gives us a quick sense of which features matter most.
We’re not trying for high accuracy — the goal is to **connect data features with model reasoning.**


In [None]:
# Draw the fitted tree on an explicit axes object; nodes are labelled with the
# feature column names and filled with colour.
fig, ax = plt.subplots(figsize=(14, 6))
plot_tree(tree, feature_names=X_test.columns, filled=True, fontsize=7, ax=ax)
ax.set_title("Decision Tree: Predicting Trip Minutes")
plt.show()
