In [30]:
import numpy as np
import pandas as pd

def _to_num(s):
    return pd.to_numeric(s, errors="coerce")

# Sanitize nodes
nodes["lat"] = _to_num(nodes["lat"])
nodes["lng"] = _to_num(nodes["lng"])
nodes = nodes.dropna(subset=["lat", "lng"])
nodes = nodes[(nodes["lat"].between(-90, 90)) & (nodes["lng"].between(-180, 180))]

# Sanitize OD
od["start_lat"] = _to_num(od["start_lat"])
od["start_lng"] = _to_num(od["start_lng"])
od["end_lat"]   = _to_num(od["end_lat"])
od["end_lng"]   = _to_num(od["end_lng"])
od["count"]     = _to_num(od.get("count", 1))

od = od.dropna(subset=["start_lat","start_lng","end_lat","end_lng","count"])
od = od[
    od["start_lat"].between(-90, 90)  & od["start_lng"].between(-180, 180) &
    od["end_lat"].between(-90, 90)    & od["end_lng"].between(-180, 180)
]


In [32]:
# Create an OD dataframe (origin-destination flows)
df["trip"] = 1   # add column with 1 trip per row

od = (
    df.groupby(
        ["start_station_name", "start_lat", "start_lng",
         "end_station_name", "end_lat", "end_lng"]
    )["trip"]
    .count()
    .reset_index()
    .rename(columns={"trip": "count"})
)


In [33]:
import pandas as pd

# adjust to one of your extracted CSVs
df = pd.read_csv("extracted/csvs/202201-citibike-tripdata_1.csv")

print(df.columns[:10])   # quick check
print(len(df))           # number of rows


  df = pd.read_csv("extracted/csvs/202201-citibike-tripdata_1.csv")


Index(['ride_id', 'rideable_type', 'started_at', 'ended_at',
       'start_station_name', 'start_station_id', 'end_station_name',
       'end_station_id', 'start_lat', 'start_lng'],
      dtype='object')
1000000


In [34]:
df = pd.read_csv("extracted/csvs/202201-citibike-tripdata_1.csv", low_memory=False)


## OD - Dataframe

In [35]:
# Add trip counter column
df["trip"] = 1

# Aggregate by start & end stations
od = (
    df.groupby(
        ["start_station_name", "start_lat", "start_lng",
         "end_station_name", "end_lat", "end_lng"]
    )["trip"]
    .count()
    .reset_index()
    .rename(columns={"trip": "count"})
)

print(od.head())


  start_station_name  start_lat  start_lng  end_station_name    end_lat  \
0   1 Ave & E 110 St  40.792327   -73.9383  1 Ave & E 110 St  40.792327   
1   1 Ave & E 110 St  40.792327   -73.9383   1 Ave & E 44 St  40.750020   
2   1 Ave & E 110 St  40.792327   -73.9383   1 Ave & E 68 St  40.765005   
3   1 Ave & E 110 St  40.792327   -73.9383   1 Ave & E 78 St  40.771404   
4   1 Ave & E 110 St  40.792327   -73.9383   1 Ave & E 94 St  40.781721   

     end_lng  count  
0 -73.938300     27  
1 -73.969053      2  
2 -73.958185      1  
3 -73.953517      3  
4 -73.945940     15  


## Nodes Dataframe

In [36]:
# Collect unique station locations
nodes_start = df[["start_station_name", "start_lat", "start_lng"]].rename(
    columns={"start_station_name":"station", "start_lat":"lat", "start_lng":"lng"}
)
nodes_end = df[["end_station_name", "end_lat", "end_lng"]].rename(
    columns={"end_station_name":"station", "end_lat":"lat", "end_lng":"lng"}
)

nodes = pd.concat([nodes_start, nodes_end]).drop_duplicates().reset_index(drop=True)

print(nodes.head())
print(len(nodes), "unique stations")


                   station        lat        lng
0  West End Ave & W 107 St  40.802117 -73.968181
1             4 Ave & 3 St  40.673746 -73.985649
2          1 Ave & E 62 St  40.761227 -73.960940
3          2 Ave & E 96 St  40.783964 -73.947167
4          6 Ave & W 34 St  40.749640 -73.988050
2263 unique stations


## Add a trip flag and build the OD table (plus a nodes table)

In [37]:
# 1 trip per row
df["trip"] = 1

# clean coordinates
for c in ["start_lat","start_lng","end_lat","end_lng"]:
    df[c] = pd.to_numeric(df[c], errors="coerce")
df = df.dropna(subset=["start_lat","start_lng","end_lat","end_lng"])
df = df[
    df["start_lat"].between(-90,90) & df["end_lat"].between(-90,90) &
    df["start_lng"].between(-180,180) & df["end_lng"].between(-180,180)
]

# OD (origin→destination) counts
od = (
    df.groupby(
        ["start_station_name","start_lat","start_lng",
         "end_station_name","end_lat","end_lng"],
        dropna=False
    )["trip"]
    .count()
    .reset_index()
    .rename(columns={"trip":"count"})
)

# Nodes (unique stations)
nodes_start = df[["start_station_name","start_lat","start_lng"]].rename(
    columns={"start_station_name":"station","start_lat":"lat","start_lng":"lng"}
)
nodes_end = df[["end_station_name","end_lat","end_lng"]].rename(
    columns={"end_station_name":"station","end_lat":"lat","end_lng":"lng"}
)
nodes = pd.concat([nodes_start, nodes_end], ignore_index=True).drop_duplicates()

len(od), len(nodes)


(209851, 2260)

## Initialize a Kepler.gl map

In [38]:
from keplergl import KeplerGl

m = KeplerGl(height=650)
m


User Guide: https://docs.kepler.gl/docs/keplergl-jupyter


KeplerGl(height=650)

## Add layers & styling (points + arcs)

In [39]:
# Add data
m.add_data(data=nodes, name="Stations")
m.add_data(data=od,    name="OD Trips")

# Configure: point layer for stations, arc layer for flows
config = {
  "version": "v1",
  "config": {
    "visState": {
      "filters": [],
      "layers": [
        {
          "id": "stations-layer",
          "type": "point",
          "config": {
            "dataId": "Stations",
            "label": "Stations",
            "color": [30, 144, 255],  # DodgerBlue
            "columns": {"lat": "lat", "lng": "lng", "altitude": None},
            "isVisible": True,
            "visConfig": {
              "radius": 4,
              "opacity": 0.8,
              "outline": False
            }
          }
        },
        {
          "id": "od-arc-layer",
          "type": "arc",
          "config": {
            "dataId": "OD Trips",
            "label": "OD Trips",
            "color": [255, 99, 71],   # Tomato
            "columns": {
              "lat0": "start_lat", "lng0": "start_lng",
              "lat1": "end_lat",   "lng1": "end_lng"
            },
            "isVisible": True,
            "visConfig": {
              "opacity": 0.6,
              "thickness": 2,
              "colorRange": {"name":"ColorBrewer Reds-6", "type":"sequential"},
              "sizeRange": [1, 8],
            }
          },
          "visualChannels": {
            "sizeField": {"name": "count", "type": "integer"},
            "sizeScale": "sqrt"
          }
        }
      ],
      "interactionConfig": {
        "tooltip": {
          "fieldsToShow": {
            "Stations": [{"name":"station","format":None}],
            "OD Trips": [
              {"name":"start_station_name","format":None},
              {"name":"end_station_name","format":None},
              {"name":"count","format":","}
            ]
          }
        }
      }
    },
    "mapState": {"latitude":40.75,"longitude":-73.97,"zoom":11,"pitch":45,"bearing":0},
    "mapStyle": {"styleType":"dark"}
  }
}

m.config = config
m


Out of range float values are not JSON compliant
Supporting this message is deprecated in jupyter-client 7, please make sure your message is JSON-compliant
  content = self.pack(content)
Out of range float values are not JSON compliant
Supporting this message is deprecated in jupyter-client 7, please make sure your message is JSON-compliant
  content = self.pack(content)


KeplerGl(config={'version': 'v1', 'config': {'visState': {'filters': [], 'layers': [{'id': 'stations-layer', '…

## Clean the data for Kepler (no NaN / no ±Inf / reasonable dtypes)

In [40]:
import numpy as np
import pandas as pd

def kepler_safe(df, float_cols, int_cols=(), dropna_cols=()):
    """Return a copy of df that is safe to JSON-serialize for Kepler."""
    out = df.copy()

    # Replace infs -> NaN then drop required NaNs
    out = out.replace([np.inf, -np.inf], np.nan)
    if dropna_cols:
        out = out.dropna(subset=list(dropna_cols))

    # Ensure pure float64 for coordinates (json-friendly) and round a bit
    for c in float_cols:
        out[c] = pd.to_numeric(out[c], errors="coerce").astype("float64")
    if float_cols:
        out[float_cols] = out[float_cols].round(6)

    # Int columns (e.g., counts)
    for c in int_cols:
        out[c] = pd.to_numeric(out[c], errors="coerce").fillna(0).astype("int32")

    # Replace remaining NaN with None (valid JSON null)
    out = out.where(pd.notnull(out), None)
    return out


In [41]:
# For safety, keep only the columns Kepler needs
nodes_k = nodes[["station", "lat", "lng"]].copy()
od_k = od[[
    "start_station_name","start_lat","start_lng",
    "end_station_name","end_lat","end_lng","count"
]].copy()

nodes_k = kepler_safe(
    nodes_k,
    float_cols=["lat","lng"],
    dropna_cols=["lat","lng"]
)

od_k = kepler_safe(
    od_k,
    float_cols=["start_lat","start_lng","end_lat","end_lng"],
    int_cols=["count"],
    dropna_cols=["start_lat","start_lng","end_lat","end_lng"]
)

len(nodes_k), len(od_k), od_k["count"].max()


(2260, 209851, 428)

In [42]:
# show only the top N flows to keep the widget light
TOP = 5000
od_k = od_k.sort_values("count", ascending=False).head(TOP)


## Rebuild the map from clean data

In [43]:
from keplergl import KeplerGl

m = KeplerGl(height=650)
m.add_data(nodes_k, name="Stations")
m.add_data(od_k,    name="OD Trips")

# (re)apply your config (same as you used before)
m.config = config   # reuse the config dict you set earlier
m


User Guide: https://docs.kepler.gl/docs/keplergl-jupyter


KeplerGl(config={'version': 'v1', 'config': {'visState': {'filters': [], 'layers': [{'id': 'stations-layer', '…

## Build & customize the Kepler map (points + arcs, with colors)

In [23]:
from keplergl import KeplerGl

# OPTIONAL: pick a map center you like
MAP_CENTER = dict(longitude=-73.98, latitude=40.73, zoom=11.5, pitch=45, bearing=0)

# A small helper so the filter upper bound matches your data
max_count = int(od["count"].max()) if "count" in od.columns else 100

# Kepler configuration: one Point layer (stations) + one Arc layer (OD flows)
kepler_config = {
    "version": "v1",
    "config": {
        "visState": {
            "layers": [
                {
                    "id": "stations_layer",
                    "type": "point",
                    "config": {
                        "dataId": "Stations",
                        "label": "Stations",
                        "color": [34, 63, 154],      # deep blue
                        "columns": {"lat": "lat", "lng": "lng", "altitude": None},
                        "isVisible": True,
                        "visConfig": {
                            "radius": 6,
                            "opacity": 0.7,
                            "outline": False
                        }
                    }
                },
                {
                    "id": "od_arcs_layer",
                    "type": "arc",
                    "config": {
                        "dataId": "OD Trips",
                        "label": "OD Trips",
                        "color": [230, 85, 13],      # warm orange
                        "columns": {
                            "lat0": "start_lat",
                            "lng0": "start_lng",
                            "lat1": "end_lat",
                            "lng1": "end_lng"
                        },
                        "isVisible": True,
                        "visConfig": {
                            "opacity": 0.35,
                            "thickness": 1.0,
                            "colorRange": {
                                "name": "OrRd",
                                "type": "sequential",
                                "category": "ColorBrewer",
                                "colors": ["#fee8c8","#fdbb84","#e34a33"]
                            }
                        }
                    }
                }
            ],
            # We’ll add a count filter in Step 6; leaving empty for now keeps it simple
            "filters": [],
        },
        "mapState": MAP_CENTER,
        "mapStyle": {"styleType": "dark"}  # try "light", "dark", "satellite", etc.
    }
}

# Initialize map and add data
m = KeplerGl(height=700, config=kepler_config)
m.add_data(data=nodes, name="Stations")
m.add_data(data=od,    name="OD Trips")

m  # ← renders the Kepler widget in Jupyter


User Guide: https://docs.kepler.gl/docs/keplergl-jupyter


Out of range float values are not JSON compliant
Supporting this message is deprecated in jupyter-client 7, please make sure your message is JSON-compliant
  content = self.pack(content)
Out of range float values are not JSON compliant
Supporting this message is deprecated in jupyter-client 7, please make sure your message is JSON-compliant
  content = self.pack(content)


KeplerGl(config={'version': 'v1', 'config': {'visState': {'layers': [{'id': 'stations_layer', 'type': 'point',…

In [28]:
od["count"] = od["count"].astype(int)


In [47]:
from keplergl import KeplerGl

# Build a base config with a point layer (stations) and an arc layer (OD flows).
# You can still tweak colors, radii, etc. in the GUI after this.
base_config = {
  "version": "v1",
  "config": {
    "visState": {
      "layers": [
        {
          "id": "stations_layer",
          "type": "point",
          "config": {
            "dataId": "Stations",
            "label": "Stations",
            "color": [34, 150, 243],
            "columns": {"lat": "lat", "lng": "lng", "altitude": None},
            "isVisible": True,
            "visConfig": {
              "radius": 4,
              "opacity": 0.8,
              "outline": False
            }
          }
        },
        {
          "id": "od_arcs",
          "type": "arc",
          "config": {
            "dataId": "OD Trips",
            "label": "OD Trips",
            "color": [255, 99, 71],   # tomato
            "columns": {
              "lat0": "start_lat", "lng0": "start_lng",
              "lat1": "end_lat",   "lng1": "end_lng"
            },
            "isVisible": True,
            "visConfig": {
              "opacity": 0.5,
              "thickness": 2
            }
          }
        }
      ],
      "filters": []
    },
    "mapState": {"latitude": 40.73, "longitude": -73.98, "zoom": 11},
    "mapStyle": {"styleType": "dark"}
  }
}

m = KeplerGl(height=650, config=base_config)
m.add_data(data=nodes, name="Stations")
m.add_data(data=od,    name="OD Trips")
m


User Guide: https://docs.kepler.gl/docs/keplergl-jupyter


Out of range float values are not JSON compliant
Supporting this message is deprecated in jupyter-client 7, please make sure your message is JSON-compliant
  content = self.pack(content)
Out of range float values are not JSON compliant
Supporting this message is deprecated in jupyter-client 7, please make sure your message is JSON-compliant
  content = self.pack(content)


KeplerGl(config={'version': 'v1', 'config': {'visState': {'layers': [{'id': 'stations_layer', 'type': 'point',…

 I pre-set a point layer for stations with small blue markers and an arc layer for OD flows in a contrasting warm color. This helps arcs stand out over the station cloud. I also set transparency (opacity) and thickness to reduce clutter when all flows are visible.

## Add a numeric filter on count from Python
- inject a numeric range filter into the map config so you can slide to “top trips” right away.

In [48]:
import numpy as np

# Make sure 'count' is numeric and finite
od["count"] = pd.to_numeric(od["count"], errors="coerce")
od = od.replace([np.inf, -np.inf], np.nan).dropna(subset=["count"])
od["count"] = od["count"].astype(int)

# Choose a sensible default range: e.g., from the 80th percentile to max
low  = int(od["count"].quantile(0.80))
high = int(od["count"].max())

# Grab current config (which has your layers) and add a numeric range filter
cfg = m.config
cfg["config"]["visState"]["filters"] = [{
    "id": "count_range_filter",
    "dataId": ["OD Trips"],            # dataId is the name you used in add_data
    "name": ["count"],                 # the field to filter on
    "type": "range",                   # <-- force NUMERIC range (not time)
    "enlarged": True,
    "plotType": "histogram",
    "yAxis": None,
    "value": [low, high],              # default selection
    "animationWindow": "free",
    "speed": 1
}]

# Apply the updated config to the map
m.config = cfg
m


KeplerGl(config={'version': 'v1', 'config': {'visState': {'layers': [{'id': 'stations_layer', 'type': 'point',…

## Add a station filter (by name, id, etc.)


In [49]:
# Example: add a category filter on 'start_station_name'
station_name_example = od["start_station_name"].mode().iloc[0]  # most common start
cfg = m.config

cfg["config"]["visState"]["filters"].append({
    "id": "start_station_filter",
    "dataId": ["OD Trips"],
    "name": ["start_station_name"],
    "type": "select",
    "enlarged": False,
    "value": [station_name_example],    # default selection; you can remove to start empty
    "plotType": "histogram"
})

m.config = cfg
m


KeplerGl(config={'version': 'v1', 'config': {'visState': {'layers': [{'id': 'stations_layer', 'type': 'point',…

In [50]:
out_html = "citibike_kepler_map.html"
m.save_to_html(file_name=out_html)
out_html


Map saved to citibike_kepler_map.html!


'citibike_kepler_map.html'