In [7]:
# @title Setup
from google.cloud import bigquery
from google.colab import data_table
import bigframes.pandas as bpd

project = 'possible-coast-463614-i9' # Project ID inserted based on the query results selected to explore
location = 'US' # Location inserted based on the query results selected to explore
client = bigquery.Client(project=project, location=location)
data_table.enable_dataframe_formatter()

# I/ Market Demand & Seasonality


## Question 1: How does the demand for yellow taxis fluctuate over time (daily, weekly, monthly, and seasonally)?

In [14]:
# Query the BigQuery View
query_demand_over_time = """
SELECT *
FROM `possible-coast-463614-i9.views_fordashboard.demand_over_time`;
"""

# Run the query and store the result in a DataFrame
demand_over_time_df = client.query(query_demand_over_time).to_dataframe()
demand_over_time_df

Unnamed: 0,trip_date,year,month,week,weekday,total_trips
0,2001-01-01,2001,1,0,2,11
1,2001-08-23,2001,8,33,5,1
2,2002-10-21,2002,10,42,2,36
3,2002-10-22,2002,10,42,3,53
4,2002-10-23,2002,10,42,4,53
...,...,...,...,...,...,...
1261,2025-05-29,2025,5,21,5,112240
1262,2025-05-30,2025,5,21,6,106374
1263,2025-05-31,2025,5,21,7,102949
1264,2025-06-01,2025,6,22,1,3


In [9]:
type(demand_over_time_df)

pandas.core.frame.DataFrame

In [15]:
demand_over_time_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1266 entries, 0 to 1265
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   trip_date    1266 non-null   dbdate
 1   year         1266 non-null   Int64 
 2   month        1266 non-null   Int64 
 3   week         1266 non-null   Int64 
 4   weekday      1266 non-null   Int64 
 5   total_trips  1266 non-null   Int64 
dtypes: Int64(5), dbdate(1)
memory usage: 65.7 KB


In [16]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

In [17]:
# Convert trip_date to datetime format
demand_over_time_df['trip_date'] = pd.to_datetime(demand_over_time_df['trip_date'])
demand_over_time_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1266 entries, 0 to 1265
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   trip_date    1266 non-null   datetime64[ns]
 1   year         1266 non-null   Int64         
 2   month        1266 non-null   Int64         
 3   week         1266 non-null   Int64         
 4   weekday      1266 non-null   Int64         
 5   total_trips  1266 non-null   Int64         
dtypes: Int64(5), datetime64[ns](1)
memory usage: 65.7 KB


In [18]:
demand_over_time_df.year.value_counts(normalize=True)

Unnamed: 0_level_0,proportion
year,Unnamed: 1_level_1
2024,0.2891
2022,0.28831
2023,0.28831
2025,0.120063
2002,0.007109
2001,0.00158
2008,0.00079
2003,0.00079
2007,0.00079
2021,0.00079


In [19]:
# Extract the current year
from datetime import datetime
current_year = datetime.now().year
current_year

2025

In [20]:
# Filter rows where the year is between 2020 and the current year (inclusive)
filtered_demand_over_time_df = demand_over_time_df[(demand_over_time_df['year'] >= 2020) & (demand_over_time_df['year'] <= current_year)]
filtered_demand_over_time_df.year.value_counts(normalize=True)

Unnamed: 0_level_0,proportion
year,Unnamed: 1_level_1
2024,0.293034
2023,0.292234
2022,0.292234
2025,0.121697
2021,0.000801


In [21]:
# 1. Daily Demand Trend
fig_daily = px.line(
    filtered_demand_over_time_df,
    x='trip_date',
    y='total_trips',
    title='Daily Taxi Demand Over Time',
    labels={'trip_date': 'Date', 'total_trips': 'Number of Trips'},
    template='plotly_dark'
)
fig_daily.show()

In [22]:
# 2. Weekly Demand Trend
weekly_demand = filtered_demand_over_time_df.groupby('week')['total_trips'].sum().reset_index()
fig_weekly = px.line(
    weekly_demand,
    x='week',
    y='total_trips',
    title='Weekly Taxi Demand Trend',
    labels={'week': 'Week Number', 'total_trips': 'Number of Trips'},
    template='plotly_dark'
)
fig_weekly.show()

In [23]:
# 3. Monthly Demand Trend
monthly_demand = filtered_demand_over_time_df.groupby('month')['total_trips'].sum().reset_index()
fig_monthly = px.bar(
    monthly_demand,
    x='month',
    y='total_trips',
    title='Monthly Taxi Demand Trend',
    labels={'month': 'Month', 'total_trips': 'Number of Trips'},
    template='plotly_dark'
)
fig_monthly.show()

In [24]:
filtered_demand_over_time_df['quarter'] = filtered_demand_over_time_df['trip_date'].dt.to_period("Q")
filtered_demand_over_time_df



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,trip_date,year,month,week,weekday,total_trips,quarter
16,2021-12-31,2021,12,52,6,23,2021Q4
17,2022-01-01,2022,1,0,7,58857,2022Q1
18,2022-01-02,2022,1,1,1,54917,2022Q1
19,2022-01-03,2022,1,1,2,67990,2022Q1
20,2022-01-04,2022,1,1,3,70308,2022Q1
...,...,...,...,...,...,...,...
1260,2025-05-28,2025,5,21,4,112378,2025Q2
1261,2025-05-29,2025,5,21,5,112240,2025Q2
1262,2025-05-30,2025,5,21,6,106374,2025Q2
1263,2025-05-31,2025,5,21,7,102949,2025Q2


In [25]:
seasonal_demand = filtered_demand_over_time_df.groupby('quarter')['total_trips'].sum().reset_index()
seasonal_demand

Unnamed: 0,quarter,total_trips
0,2021Q4,23
1,2022Q1,8455467
2,2022Q2,9969918
3,2022Q3,8831679
4,2022Q4,9561129
5,2023Q1,8808178
6,2023Q2,9445156
7,2023Q3,7947309
8,2023Q4,9413471
9,2024Q1,8480258


In [26]:
# Convert 'quarter' column to string
seasonal_demand['quarter'] = seasonal_demand['quarter'].astype(str)

In [27]:
# 4. Seasonal Demand Trend (Grouping by Quarters)
fig_seasonal = px.line(
    seasonal_demand,
    x='quarter',
    y='total_trips',
    title='Seasonal Taxi Demand Trend',
    labels={'quarter': 'Quarter', 'total_trips': 'Number of Trips'},
    template='plotly_dark'
)
fig_seasonal.show()

## Question 2: What are the peak hours for yellow taxi trips in different boroughs and zones?

In [None]:
# Function to execute a BigQuery query and return a DataFrame

def query_to_dataframe(query: str) -> pd.DataFrame:
    """
    Executes a SQL query in BigQuery and returns a Pandas DataFrame.

    Parameters:
    - query (str): The SQL query to execute.

    Return:
    - pd.DataFrame : The DataFrame containing the results of the query.
    """
    try:
        df = client.query(query).to_dataframe()
        print(f"Query executed successfully. Retrieved {df.shape[0]} rows.")
        return df
    except Exception as e:
        print(f"Error executing query: {e}")
        return pd.DataFrame()


In [None]:
query_peak_hours_by_zone = """
SELECT *
FROM `nyc-yellow-trips.views_fordashboard.peak_hours_by_zone`
"""
peak_hours_by_zone_df = query_to_dataframe(query_peak_hours_by_zone)
peak_hours_by_zone_df.head()

Query executed successfully. Retrieved 6214 rows.


Unnamed: 0,pickup_hour,Borough,Zone,total_trips
0,14,Manhattan,Upper East Side South,603599
1,15,Manhattan,Upper East Side South,601469
2,17,Manhattan,Midtown Center,589463
3,18,Manhattan,Midtown Center,586912
4,15,Manhattan,Upper East Side North,580087


In [None]:
peak_hours_by_zone_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6214 entries, 0 to 6213
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   pickup_hour  6214 non-null   Int64 
 1   Borough      6214 non-null   object
 2   Zone         6214 non-null   object
 3   total_trips  6214 non-null   Int64 
dtypes: Int64(2), object(2)
memory usage: 206.4+ KB


In [None]:
fig_borough = px.line(
    peak_hours_by_zone_df.groupby(["pickup_hour", "Borough"])["total_trips"].sum().reset_index(),
    x="pickup_hour",
    y="total_trips",
    color="Borough",
    title="Viewing peak hours by Borough",
    labels={"pickup_hour": "Heure", "total_trips": "Nombre de courses"},
    template="plotly_dark"
)

fig_borough.show()

In [None]:
# Pivot table pour avoir une matrice heures x Borough
pivot_df = peak_hours_by_zone_df.pivot_table(
    values="total_trips",
    index="pickup_hour",
    columns="Borough",
    aggfunc="sum"
)

pivot_df.head()

Borough,Bronx,Brooklyn,EWR,Manhattan,N/A,Queens,Staten Island,Unknown
pickup_hour,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,3455,38095,21,3472772,1473,459153,253,35003
1,2463,27519,18,2411930,1155,184706,201,19127
2,1924,19443,22,1634402,962,70299,144,11836
3,1583,14854,27,1068674,758,40662,114,7921
4,3472,14936,86,679716,692,44291,192,5639


In [None]:
# Remplacement des NaN par 0
#pivot_df = pivot_df.fillna(0)

# Heatmap of the busiest areas by hour
fig_zone = go.Figure(
    data=go.Heatmap(
        z=pivot_df.values,
        x=pivot_df.columns,
        y=pivot_df.index,
        colorscale="YlGnBu"
    )
)

fig_zone.update_layout(
    title="Heatmap des Heures de Pointe par Arrondissement",
    xaxis_title="Borough",
    yaxis_title="Hour",
    template="plotly_dark"
)

fig_zone.show()

In [None]:
# Aggregate trips per borough and hour
borough_hourly_demand = peak_hours_by_zone_df.groupby(["Borough", "pickup_hour"])["total_trips"].sum().reset_index()
borough_hourly_demand.head()

Unnamed: 0,Borough,pickup_hour,total_trips
0,Bronx,0,3455
1,Bronx,1,2463
2,Bronx,2,1924
3,Bronx,3,1583
4,Bronx,4,3472


In [None]:
# Create a facet bar chart
fig_facet = px.bar(
    borough_hourly_demand,
    x="pickup_hour",
    y="total_trips",
    color="Borough",
    facet_col="Borough",
    facet_col_wrap=3,  # Arrange in rows of 3
    title="Peak Hours for Yellow Taxi Trips by Borough",
    labels={"pickup_hour": "Hour of the Day", "total_trips": "Total Trips"},
    template="plotly_dark"
)

fig_facet.update_layout(
    showlegend=False,
    height=600,
    width=1200
)

fig_facet.show()

# II/ Customer Behavior & Ride Characteristics

## Question 4: What are the most popular pickup and drop-off locations, and how do they change over time?

In [None]:
query_popular_pickup_dropoff = """
SELECT *
FROM `nyc-yellow-trips.views_fordashboard.popular_pickup_dropoff`
"""
popular_pickup_dropoff_df = query_to_dataframe(query_popular_pickup_dropoff)
popular_pickup_dropoff_df.head()

Query executed successfully. Retrieved 11099188 rows.


Unnamed: 0,trip_date,year,month,week,weekday,pickup_borough,pickup_zone,dropoff_borough,dropoff_zone,total_trips
0,2023-12-31,2023,12,53,1,Manhattan,Midtown North,Manhattan,Garment District,16
1,2023-12-31,2023,12,53,1,Manhattan,TriBeCa/Civic Center,Queens,JFK Airport,6
2,2023-12-31,2023,12,53,1,Queens,Jamaica,Manhattan,Murray Hill,1
3,2023-12-31,2023,12,53,1,Manhattan,West Village,Brooklyn,Bushwick South,2
4,2023-12-31,2023,12,53,1,Manhattan,East Harlem South,Manhattan,Upper West Side South,13


In [None]:
popular_pickup_dropoff_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11099188 entries, 0 to 11099187
Data columns (total 10 columns):
 #   Column           Dtype 
---  ------           ----- 
 0   trip_date        dbdate
 1   year             Int64 
 2   month            Int64 
 3   week             Int64 
 4   weekday          Int64 
 5   pickup_borough   object
 6   pickup_zone      object
 7   dropoff_borough  object
 8   dropoff_zone     object
 9   total_trips      Int64 
dtypes: Int64(5), dbdate(1), object(4)
memory usage: 899.7+ MB


In [None]:
popular_pickup_dropoff_df.year.value_counts(normalize=True)

Unnamed: 0_level_0,proportion
year,Unnamed: 1_level_1
2022,0.219346
2023,0.210555
2024,0.204484
2021,0.198744
2020,0.166783
2002,3.3e-05
2009,2.6e-05
2008,1.4e-05
2019,1.1e-05
2003,2e-06


In [None]:
# Filter rows where the year is between 2020 and the current year (inclusive)
filtered_popular_pickup_dropoff_df = popular_pickup_dropoff_df[(popular_pickup_dropoff_df['year'] >= 2020) & (popular_pickup_dropoff_df['year'] <= current_year)]

In [None]:
# Aggregate total trips by month
monthly_trips = (
    filtered_popular_pickup_dropoff_df.groupby(["year", "month"])["total_trips"]
    .sum()
    .reset_index()
)
monthly_trips.head()

Unnamed: 0,year,month,total_trips
0,2020,1,6142292
1,2020,2,6050544
2,2020,3,2865488
3,2020,4,205218
4,2020,5,269698


In [None]:

# Convert to datetime
monthly_trips["date"] = pd.to_datetime(
    monthly_trips["year"].astype(str) + "-" + monthly_trips["month"].astype(str)
)

# Plot
fig = px.line(
    monthly_trips,
    x="date",
    y="total_trips",
    markers=True,
    title="Monthly Trends in Taxi Demand",
    labels={"total_trips": "Total Trips", "date": "Date"},
)

fig.update_traces(line=dict(width=3))
fig.update_layout(template="plotly_dark", xaxis_tickangle=-45)
fig.show()


In [None]:
# Aggregate trips by pickup zone
top_pickups = (
    filtered_popular_pickup_dropoff_df.groupby("pickup_zone")["total_trips"]
    .sum()
    .nlargest(10)
    .reset_index()
)

top_pickups.head()

Unnamed: 0,pickup_zone,total_trips
0,Upper East Side South,7646809
1,JFK Airport,6990503
2,Upper East Side North,6865658
3,Midtown Center,6712201
4,Midtown East,5423663


In [None]:
# Plot
fig = px.bar(
    top_pickups,
    x="total_trips",
    y="pickup_zone",
    orientation="h",
    text="total_trips",
    title="Top 10 Pickup Locations",
    labels={"total_trips": "Total Trips", "pickup_zone": "Pickup Zone"},
)

fig.update_traces(marker_color="royalblue", texttemplate="%{text:,}", textposition="inside")
fig.update_layout(template="plotly_dark", xaxis_title="Total Trips")
fig.show()

In [None]:
# Aggregate total trips by borough and weekday
heatmap_data = (
    filtered_popular_pickup_dropoff_df.groupby(["pickup_borough", "weekday"])["total_trips"]
    .sum()
    .unstack()
)

heatmap_data.head()

weekday,1,2,3,4,5,6,7
pickup_borough,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Bronx,21600,29201,31870,33272,32786,34584,24393
Brooklyn,131909,114268,125246,135549,139546,146886,146512
EWR,1183,735,532,658,917,921,706
Manhattan,16311120,17501378,20516034,21787393,22277466,21612440,20720059
,5809,4162,3793,4059,4568,4702,4871


In [None]:

# Create heatmap
fig = go.Figure(
    data=go.Heatmap(
        z=heatmap_data.values,
        x=heatmap_data.columns,
        y=heatmap_data.index,
        colorscale="viridis",
        colorbar=dict(title="Total Trips"),
    )
)

fig.update_layout(
    title="Taxi Demand Heatmap by Borough & Weekday",
    xaxis_title="Weekday",
    yaxis_title="Pickup Borough",
    template="plotly_dark",
)
fig.show()


## Question 5: What is the average trip distance, and how does it vary by borough, time of day, and season?

In [None]:
query_avg_trip_distance_analysis = """
SELECT *
FROM `nyc-yellow-trips.views_fordashboard.avg_trip_distance_analysis`
"""
avg_trip_distance_analysis_df = query_to_dataframe(query_avg_trip_distance_analysis)
avg_trip_distance_analysis_df.head()

Query executed successfully. Retrieved 906483 rows.


Unnamed: 0,trip_date,year,month,season,pickup_hour,pickup_borough,dropoff_borough,avg_trip_distance
0,2001-01-01,2001,1,Winter,0,Manhattan,Brooklyn,10.23
1,2001-01-01,2001,1,Winter,0,Queens,Manhattan,19.225
2,2001-01-01,2001,1,Winter,0,Manhattan,Manhattan,3.323333
3,2001-01-01,2001,1,Winter,0,Queens,Unknown,18.63
4,2001-01-01,2001,1,Winter,1,Manhattan,Unknown,6.95


In [None]:
# Filter rows where the year is between 2020 and the current year (inclusive)
filtered_avg_trip_distance_analysis_df = avg_trip_distance_analysis_df[(avg_trip_distance_analysis_df['year'] >= 2020) & (avg_trip_distance_analysis_df['year'] <= current_year)]
filtered_avg_trip_distance_analysis_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 906121 entries, 357 to 906477
Data columns (total 8 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   trip_date          906121 non-null  dbdate 
 1   year               906121 non-null  Int64  
 2   month              906121 non-null  Int64  
 3   season             906121 non-null  object 
 4   pickup_hour        906121 non-null  Int64  
 5   pickup_borough     906121 non-null  object 
 6   dropoff_borough    906121 non-null  object 
 7   avg_trip_distance  906121 non-null  float64
dtypes: Int64(3), dbdate(1), float64(1), object(3)
memory usage: 64.8+ MB


In [None]:
# Convert trip_date to datetime
filtered_avg_trip_distance_analysis_df["trip_date"] = pd.to_datetime(filtered_avg_trip_distance_analysis_df["trip_date"])

# Aggregate average trip distance by date
daily_avg_distance = (
    filtered_avg_trip_distance_analysis_df.groupby("trip_date")["avg_trip_distance"]
    .mean()
    .reset_index()
)

daily_avg_distance.head()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,trip_date,avg_trip_distance
0,2020-01-01,10.179849
1,2020-01-02,10.524574
2,2020-01-03,10.514377
3,2020-01-04,10.343253
4,2020-01-05,9.904315


In [None]:
# Plot
fig = px.line(
    daily_avg_distance,
    x="trip_date",
    y="avg_trip_distance",
    title="Average Trip Distance Over Time",
    labels={"avg_trip_distance": "Avg Trip Distance (miles)", "trip_date": "Date"},
    markers=True,
)

fig.update_traces(line=dict(width=3))
fig.update_layout(template="plotly_dark", xaxis_tickangle=-45)
fig.show()


## Question 6: How many trips have only one passenger versus multiple passengers, and does this change seasonally?

In [None]:
query_passenger_trends_by_season = """
SELECT *
FROM `nyc-yellow-trips.views_fordashboard.passenger_trends_by_season`
"""
passenger_trends_by_season_df = query_to_dataframe(query_passenger_trends_by_season)
passenger_trends_by_season_df.head()

Query executed successfully. Retrieved 3630 rows.


Unnamed: 0,trip_date,year,month,season,passenger_category,total_trips
0,2001-01-01,2001,1,Winter,Single Passenger,9
1,2001-01-01,2001,1,Winter,Multiple Passengers,2
2,2001-08-23,2001,8,Summer,Single Passenger,1
3,2002-10-21,2002,10,Fall,Single Passenger,31
4,2002-10-21,2002,10,Fall,Multiple Passengers,5


In [None]:
# Filter rows where the year is between 2020 and the current year (inclusive)
filtered_passenger_trends_by_season_df = passenger_trends_by_season_df[(passenger_trends_by_season_df['year'] >= 2020) & (passenger_trends_by_season_df['year'] <= current_year)]
filtered_passenger_trends_by_season_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3593 entries, 33 to 3625
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   trip_date           3593 non-null   dbdate
 1   year                3593 non-null   Int64 
 2   month               3593 non-null   Int64 
 3   season              3593 non-null   object
 4   passenger_category  3593 non-null   object
 5   total_trips         3593 non-null   Int64 
dtypes: Int64(3), dbdate(1), object(2)
memory usage: 207.0+ KB


In [None]:
# Aggregate total trips by passenger category
passenger_counts = filtered_passenger_trends_by_season_df.groupby("passenger_category")["total_trips"].sum().reset_index()
passenger_counts

Unnamed: 0,passenger_category,total_trips
0,Multiple Passengers,37482648
1,Single Passenger,118592201


In [None]:
# Create a donut chart
fig = px.pie(
    passenger_counts,
    names="passenger_category",
    values="total_trips",
    title="Proportion of Single vs. Multiple Passenger Trips",
    hole=0.4,  # This makes it a donut chart
    color="passenger_category",
    color_discrete_map={"Single Passenger": "red", "Multiple Passengers": "blue"}
)

fig.update_traces(textinfo="percent+label")

# Show the figure
fig.show()

In [None]:
# Aggregate data
treemap_data = filtered_passenger_trends_by_season_df.groupby(["season", "passenger_category"])["total_trips"].sum().reset_index()

# Define a seasonal color palette
season_colors = {
    "Winter": "blue",
    "Spring": "green",
    "Summer": "yellow",
    "Fall": "orange",
}

# Create the treemap
fig = px.treemap(
    treemap_data,
    path=["season", "passenger_category"],
    values="total_trips",
    color="season",
    color_discrete_map=season_colors,
    title="Passenger Distribution by Season (Treemap)",
)

fig.update_traces(textinfo="label+percent entry")

# Show the plot
fig.show()


In [None]:
# Convert trip_date to datetime
filtered_passenger_trends_by_season_df["trip_date"] = pd.to_datetime(filtered_passenger_trends_by_season_df["trip_date"])

In [None]:
# Stacked Area Chart: Passenger Distribution Over Time

  # This helps visualize the proportion of single vs. multiple passengers over time.

fig = px.area(
    filtered_passenger_trends_by_season_df,
    x="trip_date",
    y="total_trips",
    color="passenger_category",
    title="Proportion of Single vs. Multiple Passenger Trips Over Time",
    labels={"total_trips": "Total Trips", "trip_date": "Date", "passenger_category": "Passenger Category"},
)

fig.update_layout(template="plotly_dark")
fig.show()
