In [382]:
import pandas as pd
import plotly.express as px
from great_tables import (
    GT, md, google_font, style, loc
)

df = pd.read_csv(
    'data/sample_citibike_2023.csv',
    parse_dates=["started_at", "ended_at"]
)

display(df.head())
display(df.dtypes)


# Remove docked bike entries
df = df[df["rideable_type"] != "docked_bike"]

# Format membership status values
df["member_casual"] = df["member_casual"].replace({
    "member": "Member",
    "casual": "Casual"
})

# Format bike type values
df["rideable_type"] = df["rideable_type"].replace({
    "classic_bike": "Classic Bike",
    "electric_bike": "Electric Bike"
})


Columns (6) have mixed types. Specify dtype option on import or set low_memory=False.



Unnamed: 0.1,Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,12536574,6A1558E5EB8E9B3B,classic_bike,2023-06-25 06:44:47,2023-06-25 07:02:43,Walton Ave & E 168 St,8179.03,Icahn Stadium,7514.01,40.836655,-73.918324,40.79346,-73.92389,member
1,20766033,F5DAABF8EAD39B32,classic_bike,2023-08-15 18:05:37,2023-08-15 18:12:06,Grand Army Plaza & Central Park S,6839.1,2 Ave & E 72 St,6925.09,40.764004,-73.973974,40.768762,-73.958408,member
2,17246346,CC4D3C1C193EBCDD,classic_bike,2023-07-05 15:00:28,2023-07-05 15:06:34,Knickerbocker Ave & Thames St,5018.06,Melrose St & Broadway,4801.04,40.705446,-73.929975,40.697481,-73.935877,casual
3,3860641,541D7A53817AF238,classic_bike,2023-03-05 11:29:02,2023-03-05 11:36:36,DeKalb Ave & Vanderbilt Ave,4461.04,Pacific St & Classon Ave,4148.07,40.689425,-73.968898,40.679194,-73.95879,member
4,27285265,EACA2B802BFED979,classic_bike,2023-10-17 15:02:23,2023-10-17 15:06:40,Liberty St & Broadway,5105.01,West Thames St,5114.06,40.708164,-74.010369,40.708347,-74.017134,member


Unnamed: 0                     int64
ride_id                       object
rideable_type                 object
started_at            datetime64[ns]
ended_at              datetime64[ns]
start_station_name            object
start_station_id              object
end_station_name              object
end_station_id                object
start_lat                    float64
start_lng                    float64
end_lat                      float64
end_lng                      float64
member_casual                 object
dtype: object

In [383]:
df["rideable_type"].value_counts()

rideable_type
Classic Bike     897184
Electric Bike    100879
Name: count, dtype: int64

In [384]:
df["month_num"] = df["started_at"].dt.month
df["Month"] = df["started_at"].dt.strftime("%b")

df_monthly_counts = (
    df.
    groupby(["month_num", "Month", "member_casual"])
    .agg(
        ride_count=("ride_id", "count")
    )
    .reset_index()
    .rename(
        columns={
            "ride_count": "Ride Count",
            "member_casual": "Membership Status"
        }
    )
)
df_monthly_counts

Unnamed: 0,month_num,Month,Membership Status,Ride Count
0,1,Jan,Casual,6576
1,1,Jan,Member,44459
2,2,Feb,Casual,5557
3,2,Feb,Member,42537
4,3,Mar,Casual,7878
5,3,Mar,Member,52685
6,4,Apr,Casual,14269
7,4,Apr,Member,64116
8,5,May,Casual,18592
9,5,May,Member,79845


In [None]:
fig = px.line(
    df_monthly_counts,
    x="Month",
    y="Ride Count",
    template="plotly_white",
    color="Membership Status",
    title="<b><span style='color:#0072ce;'>Citi Bike</span> usage throughout 2023</b>",
    subtitle="Monthly ride counts peaked at 113K in August, with Members consistently<br>riding more than Casual users due to routine commuting.",
    color_discrete_sequence=["#d4d4d4", "#0072ce"],
    width=700,
    height=500,
    markers=True # markers on data points
)

# set font colors, sizes, etc
fig.update_layout(
    font_color = "#0a0a0a",
    title_font_family = 'Lora',
    title_font_size = 24,
    font_family = 'Sora',
    font_size = 12,
    # top margin to prevent subtitle overlapping with graph
    margin = {'t': 200},
)

fig.update_traces(
    # line width
    line=dict(width=4),
    # marker size
    marker=dict(size=8)
)

fig.show()

In [386]:
df["Hour num"] = df["started_at"].dt.hour

df_hourly_counts = (
    df.
    groupby(["Hour num"])
    .agg(
        ride_count=("ride_id", "count")
    )
    .rename(
        columns={
            "ride_count": "Ride Count",
        }
    )
    .reset_index()
)

df_hourly_counts["Hour"] = pd.to_datetime(df_hourly_counts["Hour num"], format="%H").dt.strftime("%-I %p")

df_hourly_counts.loc[
    df_hourly_counts["Hour"] == "5 PM",
    "flag"
] = True

df_hourly_counts.loc[
    df_hourly_counts["Hour"] != "5 PM",
    "flag"
] = False

df_hourly_counts

Unnamed: 0,Hour num,Ride Count,Hour,flag
0,0,13812,12 AM,False
1,1,8661,1 AM,False
2,2,5625,2 AM,False
3,3,3707,3 AM,False
4,4,3265,4 AM,False
5,5,7004,5 AM,False
6,6,19266,6 AM,False
7,7,39432,7 AM,False
8,8,60389,8 AM,False
9,9,50769,9 AM,False


In [396]:
fig = px.bar(
    df_hourly_counts,
    x="Hour",
    y="Ride Count",
    template="plotly_white",
    title="<b>When is <span style='color:#0072ce;'>Citi Bike</span> used most during the day?</b>",
    subtitle="Ride counts peak around 5 PM, reflecting <span style='color:#0072ce;'>Citi Bike</span>'s<br>role as a key option for evening commuters.",
    color_discrete_sequence=["#aad0f0", "#0072ce"],
    width=550,
    category_orders={"Hour": df_hourly_counts["Hour"].tolist()},
    color="flag",
)

fig.update_layout(
    xaxis_tickvals=["6 AM", "12 PM", "6 PM", "11 PM"],
    font_color = "#0a0a0a",
    title_font_family = 'Lora',
    title_font_size = 24,
    font_family = 'Sora',
    font_size = 12,
    margin = {'t': 150},
    showlegend=False
)

fig.show()

In [388]:
station_counts = (
    df.
    groupby(["end_station_id"])
    .agg(
        ride_count=("ride_id", "count"),
        end_lat=("end_lat", "first"),
        end_lng=("end_lng", "first")
    )
    .sort_values(by="ride_count", ascending=False)
    .reset_index()
    # .rename(
    #     columns={
    #         "month": "Month",
    #         "ride_count": "Ride Count",
    #         "member_casual": "Membership Status"
    #     }
    # )
)
station_counts.head(10)

Unnamed: 0,end_station_id,ride_count,end_lat,end_lng
0,6140.05,3996,40.74174,-73.994156
1,5329.03,3162,40.717548,-74.013221
2,6726.01,3063,40.760301,-73.998842
3,5905.14,2988,40.734814,-73.992085
4,6173.08,2915,40.742869,-73.989186
5,6331.01,2888,40.749156,-73.9916
6,6822.09,2879,40.765005,-73.958185
7,5980.07,2840,40.73705,-73.990093
8,6364.07,2717,40.749013,-73.988484
9,6602.03,2708,40.756405,-73.990026


In [389]:
fig = px.scatter_map(
    station_counts,
    lat="end_lat",
    lon="end_lng",
    color="ride_count",   # Tourist vs commuter
    size="ride_count",
    zoom=11,
    width=700,
    height=800,
    size_max=12,
    range_color=[0, 2750],
    opacity=0.75,
    color_continuous_scale="blues",
    
    # mapbox_style="carto-positron",  # Clean, minimal map
    title="<b>Where Casual vs Member Riders Start Their Trips in NYC</b>",
)

fig.show()

In [390]:
print(df["rideable_type"].value_counts())

rideable_type
Classic Bike     897184
Electric Bike    100879
Name: count, dtype: int64


In [None]:
df["duration_min"] = (df["ended_at"] - df["started_at"]).dt.total_seconds() / 60

df_avg_duration = (
    df
    .groupby(["member_casual", "rideable_type"])
    .agg(
        {
            "duration_min": "mean"
        }
    )
    .reset_index()
    .rename(
        columns = {
            "duration_min": "duration",
            "member_casual": "Membership Status",
            "rideable_type": "Bike Type",
        }
    )
)

display(df_avg_duration)

df_avg_duration["Average Ride Duration (m)"] = df_avg_duration["duration"].apply(lambda x: f"{x:.3g}m")

fig = px.bar(
    df_avg_duration,
    y="Membership Status",
    x="duration",
    color="Bike Type",
    template="plotly_white",
    barmode="group",
    title="<b>How long are <span style='color:#0072ce;'>Citi Bike</span> rides?</b>",
    subtitle="Average ride durations are consistently longer for casual users<br>across bike types, driven by tourist-oriented trips and pricing<br>models favoring short rides for members.",
    color_discrete_sequence=["#0072ce", "#66b3e1"],
    width=600,
    text="Average Ride Duration (m)",
)

# set font colors, sizes, etc
fig.update_layout(
    font_color = "#0a0a0a",
    title_font_family = 'Lora',
    title_font_size = 24,
    font_family = 'Sora',
    font_size = 12,
    # top margin to prevent subtitle from overlapping
    margin = {'t': 200},
)


fig.update_traces(
    # set text labels outside bars
    textposition = 'outside',
    # prevent text label from extending outside box
    cliponaxis=False
)

# disable x axis
fig.update_xaxes(visible = False)

fig.show()

Unnamed: 0,Membership Status,Bike Type,duration
0,Casual,Classic Bike,23.527075
1,Casual,Electric Bike,22.438904
2,Member,Classic Bike,12.682397
3,Member,Electric Bike,12.054162


In [394]:
df_top_10_stations = (
    df.groupby("start_station_id")
    .agg(
        start_station_name=("start_station_name", "first"),
        total_rides=("ride_id", "count"),
    )
    .sort_values(by="total_rides", ascending=False)
    .rename(
        columns={
            "start_station_name": "Station Name",
            "total_rides": "Ride Count"
        }
    )
    .head(10)
    .reset_index()
)

# display(df_top_10_stations)

table = (
    GT(df_top_10_stations[["Station Name", "Ride Count"]])
    # set title and subtitle
    .tab_header(
        title=md(
            "**NYC's Top 10 <span style='color:#0072ce;'>Citi Bike</span> Stations**"
        ),
        subtitle=md(
            "Students at NYU help fuel <span style='color:#0072ce;'>Citi Bike</span>’s popularity,<br>with 8 of the top 10 stations located within<br>blocks of the campus."
        )
    )
    # left align header
    .opt_align_table_header('left')
    # format ride counts
    .fmt_integer(columns="Ride Count")
    # set table font
    .opt_table_font(google_font('Sora'))
    # set title font
    .tab_style(
        style=style.text(font=google_font('Lora')),
        locations=loc.title()
    )
    # set font sizes
    .tab_options(
        heading_title_font_size='26px',
        heading_subtitle_font_size='16px',
        column_labels_font_size='14px',
        table_font_size = '14px',
        data_row_padding = 1
    )
    # set heatmap
    .data_color(
        columns="Ride Count",
        palette = 'Blues',
        domain=[2000, 4000]
    )
)

table.show()

NYC's Top 10 Citi Bike Stations,NYC's Top 10 Citi Bike Stations
"Students at NYU help fuel Citi Bike’s popularity, with 8 of the top 10 stations located within blocks of the campus.","Students at NYU help fuel Citi Bike’s popularity, with 8 of the top 10 stations located within blocks of the campus."
Station Name,Ride Count
W 21 St & 6 Ave,3982
University Pl & E 14 St,3036
West St & Chambers St,3027
1 Ave & E 68 St,2929
11 Ave & W 41 St,2902
Broadway & W 25 St,2877
W 31 St & 7 Ave,2856
6 Ave & W 33 St,2775
E 17 St & Broadway,2715
Ave A & E 14 St,2689
