In [1]:
import pl from "npm:nodejs-polars";
import Plot from "https://deno.land/x/plot/mod.ts";
// Load the combined BCycle parquet file into a polars DataFrame and preview
// three randomly sampled rows as the cell output.
const bcycle_parquet_path = "../bcycle_data_combined.parquet";
let bcycle_df = await pl.readParquet(bcycle_parquet_path);
bcycle_df.sample(3);

lon,lat,_bcycle_station_type,region_id,address,name,station_id,is_returning,is_renting,is_installed,num_docks_available,num_bikes_available,last_reported
-121.97804,36.95705,3.0 Dock Station,bcycle_santacruz_region_81,Parking Lot,Pleasure Point - Moran Lake Park,bcycle_santacruz_7700,1,1,1,4,1,Tue Sep 03 2024 04:25:17 GMT-0500 (Central Daylight Time)
-122.06114,36.99329,3.0 Dock Station,bcycle_santacruz_region_49,Parking Lot 126,UCSC – Lot 126 (Arts),bcycle_santacruz_7520,1,1,1,6,0,Fri Mar 15 2024 09:01:36 GMT-0500 (Central Daylight Time)
-122.04326,36.96803,3.0 Dock Station,bcycle_santacruz_region_72,King St,Santa Cruz – Bay & King,bcycle_santacruz_7527,1,1,1,1,7,Thu Feb 29 2024 17:13:57 GMT-0600 (Central Standard Time)


In [None]:
// Print the frame's schema, followed by its row count and column count.
const frame_overview = [
  bcycle_df.schema,
  `Number of rows: ${bcycle_df.height}`,
  `Number of columns: ${bcycle_df.width}`,
];
for (const overview_entry of frame_overview) {
  console.log(overview_entry);
}

In [None]:
// Summary statistics for the frame, rendered as text for the cell output.
const summary_stats = bcycle_df.describe();
summary_stats.toString();

In [None]:
// Per-column null counts, rendered as text for the cell output.
const null_counts = bcycle_df.nullCount();
null_counts.toString();

In [None]:
// Histogram of `num_bikes_available` over every row of the frame, with a
// `curve: "basis"` line drawn through the same bin counts.
const bikes_available_data = bcycle_df.select("num_bikes_available").toRecords();

// Bars: one rectangle per bin, height = number of rows in the bin.
const bikes_histogram_bars = Plot.rectY(
  bikes_available_data,
  Plot.binX({ y: "count" }, { x: "num_bikes_available" }),
);

// Line: same binning, rendered on top of the bars.
const bikes_histogram_line = Plot.lineY(
  bikes_available_data,
  Plot.binX({ y: "count" }, { x: "num_bikes_available", curve: "basis" }),
);

Plot.plot({
  title: "Distribution of Number of Bikes Available",
  x: { label: "Number of Bikes Available" },
  y: { label: "Frequency" },
  marks: [bikes_histogram_bars, bikes_histogram_line],
});

In [None]:
// Histogram of `num_docks_available` over every row of the frame, with a
// `curve: "basis"` line drawn through the same bin counts.
const docks_available_data = bcycle_df.select("num_docks_available").toRecords();

// Bars: one rectangle per bin, height = number of rows in the bin.
const docks_histogram_bars = Plot.rectY(
  docks_available_data,
  Plot.binX({ y: "count" }, { x: "num_docks_available" }),
);

// Line: same binning, rendered on top of the bars.
const docks_histogram_line = Plot.lineY(
  docks_available_data,
  Plot.binX({ y: "count" }, { x: "num_docks_available", curve: "basis" }),
);

Plot.plot({
  title: "Distribution of Number of Docks Available",
  x: { label: "Number of Docks Available" },
  y: { label: "Frequency" },
  marks: [docks_histogram_bars, docks_histogram_line],
});

In [None]:
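// Note to self: when comparing a column against a string with `.eq(...)`,
// the string must be wrapped in `pl.lit`.
// Without `pl.lit` (does not work):
//   bcycle_df.filter(pl.col("station_id").eq("bcycle_santacruz_7431"))
// With `pl.lit`:
//   bcycle_df.filter(pl.col("station_id").eq(pl.lit("bcycle_santacruz_7431")));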

In [None]:
// Number of bikes available over time for a specific station.
//
// The readable id is kept separately from its `pl.lit` wrapper: the filter
// needs the expression, while the chart title needs the plain string
// (interpolating the expression object would not print the bare id).
const station_id_value = "bcycle_santacruz_7431";
const station_id = pl.lit(station_id_value);
const station_data = bcycle_df
  .filter(pl.col("station_id").eq(station_id))
  // num_docks_available is carried along so these same records can also back
  // a docks-over-time chart.
  .select(["last_reported", "num_bikes_available", "num_docks_available"])
  // A line mark joins points in input order, so order the rows by time first.
  .sort("last_reported")
  .toRecords();

Plot.plot({
  marks: [
    Plot.line(station_data, { x: "last_reported", y: "num_bikes_available" }),
  ],
  x: { label: "Time", tickRotate: 45 },
  y: { label: "Number of Bikes Available" },
  title: `Number of Bikes Available Over Time for Station ID: ${station_id_value}`,
});

In [None]:
// Number of docks available over time for a specific station.
//
// The docks series is queried here directly instead of relying on
// `station_data` carrying a `num_docks_available` column; a missing column
// would leave the line without y values.
const station_docks_data = bcycle_df
  .filter(pl.col("station_id").eq(station_id))
  .select(["station_id", "last_reported", "num_docks_available"])
  // A line mark joins points in input order, so order the rows by time first.
  .sort("last_reported")
  .toRecords();

// `station_id` is a `pl.lit` expression, not a string, so read the printable
// id from the filtered rows; fall back to the expression's string form only
// when the filter matched nothing.
const station_docks_label =
  station_docks_data[0]?.station_id ?? String(station_id);

Plot.plot({
  marks: [
    Plot.line(station_docks_data, {
      x: "last_reported",
      y: "num_docks_available",
    }),
  ],
  x: { label: "Time", tickRotate: 45 },
  y: { label: "Number of Docks Available" },
  title: `Number of Docks Available Over Time for Station ID: ${station_docks_label}`,
});

In [None]:
// Average number of bikes available per station
const avg_bikes_per_station = bcycle_df
  .groupBy("station_id")
  .agg(pl.col("num_bikes_available").mean().alias("avg_bikes"))
  .sort("avg_bikes", false)
  .toRecords();

Plot.plot({
  marks: [
    Plot.barX(avg_bikes_per_station, { x: "avg_bikes", y: "station_id" }),
  ],
  x: { label: "Average Number of Bikes Available" },
  y: { label: "Station ID" },
  title: "Average Number of Bikes Available per Station",
});

In [None]:
// Average number of docks available per station
const avg_docks_per_station = bcycle_df
  .groupBy("station_id")
  .agg(pl.col("num_docks_available").mean().alias("avg_docks"))
  .sort("avg_docks", false)
  .toRecords();

Plot.plot({
  marks: [
    Plot.barX(avg_docks_per_station, { x: "avg_docks", y: "station_id" }),
  ],
  x: { label: "Average Number of Docks Available" },
  y: { label: "Station ID" },
  title: "Average Number of Docks Available per Station",
});

In [None]:

// Build the frame used by the day-of-week / hour heatmaps.
//
// `last_reported` is cast to a polars Datetime, and the two columns the
// heatmap cells group by (`day_of_week`, `hour`) are derived from it.
// `withColumns` takes expressions only; each output name comes from `alias`.
// All three expressions are evaluated against the input frame, so the derived
// columns are built from the cast expression itself, not from the replaced
// column.
// NOTE(review): hours and weekdays are taken in whatever zone the stored
// timestamps use (presumably UTC, not Santa Cruz local time) - confirm.
// NOTE(review): the numeric weekday convention (which number Monday maps to)
// depends on the polars version - confirm before labelling the axis.
const last_reported_ts = pl.col("last_reported").cast(pl.Datetime("ms"));
let df_with_time = bcycle_df.withColumns(
  last_reported_ts.alias("last_reported"),
  last_reported_ts.date.weekday().alias("day_of_week"),
  last_reported_ts.date.hour().alias("hour"),
);

In [None]:
// Average number of bikes available by day of week and hour
const heatmap_data_bikes = df_with_time
  .groupBy(["day_of_week", "hour"])
  .agg(pl.col("num_bikes_available").mean().alias("avg_bikes"))
  .sort(["day_of_week", "hour"])
  .toRecords();

Plot.plot({
  color: { scheme: "viridis" },
  marks: [
    Plot.cell(heatmap_data_bikes, {
      x: "hour",
      y: "day_of_week",
      fill: "avg_bikes",
      inset: 0.5,
    }),
    Plot.text(heatmap_data_bikes, {
      x: "hour",
      y: "day_of_week",
      text: (d) => d.avg_bikes.toFixed(1),
      fill: "white",
    }),
  ],
  x: { label: "Hour of Day" },
  y: { label: "Day of Week" },
  title: "Average Number of Bikes Available by Day of Week and Hour",
});

In [None]:
// Average number of docks available by day of week and hour
const heatmap_data_docks = df_with_time
  .groupBy(["day_of_week", "hour"])
  .agg(pl.col("num_docks_available").mean().alias("avg_docks"))
  .sort(["day_of_week", "hour"])
  .toRecords();

Plot.plot({
  color: { scheme: "viridis" },
  marks: [
    Plot.cell(heatmap_data_docks, {
      x: "hour",
      y: "day_of_week",
      fill: "avg_docks",
      inset: 0.5,
    }),
    Plot.text(heatmap_data_docks, {
      x: "hour",
      y: "day_of_week",
      text: (d) => d.avg_docks.toFixed(1),
      fill: "white",
    }),
  ],
  x: { label: "Hour of Day" },
  y: { label: "Day of Week" },
  title: "Average Number of Docks Available by Day of Week and Hour",
});