# NYC Taxi Data Visualizations

This notebook creates interactive and static visualizations from the NYC Yellow Taxi analysis.

In [1]:
import polars as pl
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Set style for matplotlib
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')

print('Libraries loaded successfully!')

Libraries loaded successfully!


## Load Data

In [2]:
# Load the analysis results
df = pl.read_parquet('outputs/result.parquet')
print(f'Loaded {len(df)} months of data')
print(f'Columns: {df.columns}')
print(f'Shape: {df.shape}')

# Convert to pandas for easier plotting
df_pd = df.to_pandas()

# Create datetime column
df_pd['date'] = df['year_month'].str.to_date(format='%Y-%m').to_pandas()

print('\nData loaded and processed!')

FileNotFoundError: The system cannot find the file specified. (os error 2): outputs/result.parquet

This error occurred with the following context stack:
	[1] 'parquet scan'
	[2] 'sink'


In [None]:
# Display summary statistics
print('Summary Statistics')
print('=' * 60)
print(f'Date range: {df_pd["year_month"].min()} to {df_pd["year_month"].max()}')
print(f'Total trips: {df_pd["trip_count"].sum():,.0f}')
print(f'Total revenue: ${df_pd["total_revenue"].sum():,.2f}')
print(f'Average monthly trips: {df_pd["trip_count"].mean():,.0f}')
print(f'Average monthly revenue: ${df_pd["total_revenue"].mean():,.2f}')
print(f'Average fare: ${df_pd["avg_fare"].mean():.2f}')
print(f'Average distance: {df_pd["avg_distance"].mean():.2f} miles')

## 1. Trip Count Over Time

In [None]:
fig = px.line(
    df_pd,
    x='date',
    y='trip_count',
    title='NYC Yellow Taxi Trips Over Time (2020-2025)',
    labels={'date': 'Date', 'trip_count': 'Number of Trips'},
    template='plotly_white'
)
fig.update_traces(line_color='#FFD700', line_width=3)
fig.update_layout(
    hovermode='x unified',
    font=dict(size=12),
    height=500,
    showlegend=False
)
fig.show()

# Save to outputs
Path('outputs').mkdir(exist_ok=True)
fig.write_html('outputs/trips_over_time.html')
print('Saved: trips_over_time.html')

## 2. Revenue Over Time

In [None]:
fig = go.Figure()
fig.add_trace(go.Scatter(
    x=df_pd['date'],
    y=df_pd['total_revenue'] / 1_000_000,
    mode='lines',
    name='Total Revenue',
    line=dict(color='#2ECC71', width=3),
    fill='tozeroy',
    fillcolor='rgba(46, 204, 113, 0.2)'
))
fig.update_layout(
    title='NYC Yellow Taxi Revenue Over Time',
    xaxis_title='Date',
    yaxis_title='Revenue (Million $)',
    template='plotly_white',
    hovermode='x unified',
    height=500,
    showlegend=False
)
fig.show()

fig.write_html('outputs/revenue_over_time.html')
print('Saved: revenue_over_time.html')

## 3. Average Fare and Distance (Dual Axis)

In [None]:
fig = make_subplots(specs=[[{'secondary_y': True}]])

fig.add_trace(
    go.Scatter(
        x=df_pd['date'],
        y=df_pd['avg_fare'],
        name='Avg Fare ($)',
        line=dict(color='#3498DB', width=3)
    ),
    secondary_y=False
)

fig.add_trace(
    go.Scatter(
        x=df_pd['date'],
        y=df_pd['avg_distance'],
        name='Avg Distance (mi)',
        line=dict(color='#E74C3C', width=3)
    ),
    secondary_y=True
)

fig.update_layout(
    title='Average Fare and Trip Distance Over Time',
    template='plotly_white',
    hovermode='x unified',
    height=500
)
fig.update_xaxes(title_text='Date')
fig.update_yaxes(title_text='Average Fare ($)', secondary_y=False)
fig.update_yaxes(title_text='Average Distance (miles)', secondary_y=True)

fig.show()

fig.write_html('outputs/avg_fare_distance.html')
print('Saved: avg_fare_distance.html')

## 4. Monthly Heatmap

In [None]:
# Pivot data for heatmap
heatmap_data = df_pd.pivot_table(
    values='trip_count',
    index='month',
    columns='year',
    aggfunc='mean'
)

fig, ax = plt.subplots(figsize=(14, 8))
sns.heatmap(
    heatmap_data,
    annot=True,
    fmt='.0f',
    cmap='YlOrRd',
    cbar_kws={'label': 'Trip Count'},
    ax=ax,
    linewidths=0.5
)
ax.set_title('Monthly Trip Count Heatmap by Year', fontsize=16, pad=20)
ax.set_xlabel('Year', fontsize=12)
ax.set_ylabel('Month', fontsize=12)
plt.tight_layout()
plt.savefig('outputs/monthly_heatmap.png', dpi=300, bbox_inches='tight')
plt.show()

print('Saved: monthly_heatmap.png')

## 5. Year-over-Year Comparison

In [None]:
yearly_totals = df_pd.groupby('year').agg({
    'trip_count': 'sum',
    'total_revenue': 'sum'
}).reset_index()

fig = go.Figure()
fig.add_trace(go.Bar(
    x=yearly_totals['year'],
    y=yearly_totals['trip_count'] / 1_000_000,
    marker_color='#9B59B6',
    text=yearly_totals['trip_count'] / 1_000_000,
    texttemplate='%{text:.1f}M',
    textposition='outside'
))

fig.update_layout(
    title='Annual Trip Count Comparison',
    xaxis_title='Year',
    yaxis_title='Total Trips (Millions)',
    template='plotly_white',
    height=500,
    showlegend=False
)
fig.show()

fig.write_html('outputs/yearly_comparison.html')
print('Saved: yearly_comparison.html')

## 6. Trip Distribution by Year (Box Plot)

In [None]:
fig = go.Figure()

for year in sorted(df_pd['year'].unique()):
    year_data = df_pd[df_pd['year'] == year]
    fig.add_trace(go.Box(
        y=year_data['trip_count'] / 1000,
        name=str(year),
        boxmean='sd'
    ))

fig.update_layout(
    title='Monthly Trip Count Distribution by Year',
    yaxis_title='Trips (Thousands)',
    xaxis_title='Year',
    template='plotly_white',
    height=500
)
fig.show()

fig.write_html('outputs/trip_distribution.html')
print('Saved: trip_distribution.html')

## 7. Interactive Dashboard

In [None]:
fig = make_subplots(
    rows=2, cols=2,
    subplot_titles=(
        'Monthly Trips',
        'Monthly Revenue (Million $)',
        'Average Fare ($)',
        'Average Distance (mi)'
    ),
    vertical_spacing=0.12,
    horizontal_spacing=0.1
)

# Trips
fig.add_trace(
    go.Scatter(
        x=df_pd['date'],
        y=df_pd['trip_count']/1000,
        mode='lines',
        line=dict(color='#3498DB', width=2),
        name='Trips (K)'
    ),
    row=1, col=1
)

# Revenue
fig.add_trace(
    go.Scatter(
        x=df_pd['date'],
        y=df_pd['total_revenue']/1_000_000,
        mode='lines',
        line=dict(color='#2ECC71', width=2),
        name='Revenue (M)'
    ),
    row=1, col=2
)

# Avg Fare
fig.add_trace(
    go.Scatter(
        x=df_pd['date'],
        y=df_pd['avg_fare'],
        mode='lines',
        line=dict(color='#E67E22', width=2),
        name='Avg Fare'
    ),
    row=2, col=1
)

# Avg Distance
fig.add_trace(
    go.Scatter(
        x=df_pd['date'],
        y=df_pd['avg_distance'],
        mode='lines',
        line=dict(color='#E74C3C', width=2),
        name='Avg Dist'
    ),
    row=2, col=2
)

fig.update_layout(
    title_text='NYC Yellow Taxi Dashboard (2020-2025)',
    showlegend=False,
    template='plotly_white',
    height=900
)

fig.update_yaxes(title_text='Trips (Thousands)', row=1, col=1)
fig.update_yaxes(title_text='Revenue (Million $)', row=1, col=2)
fig.update_yaxes(title_text='Fare ($)', row=2, col=1)
fig.update_yaxes(title_text='Distance (mi)', row=2, col=2)

fig.show()

fig.write_html('outputs/dashboard.html')
print('Saved: dashboard.html')

## Summary

All visualizations have been created and saved to the `outputs/` directory:

**Interactive HTML Files:**
- `trips_over_time.html` - Time series of trip counts
- `revenue_over_time.html` - Revenue trends over time
- `avg_fare_distance.html` - Dual-axis chart of fare and distance
- `yearly_comparison.html` - Annual trip count comparison
- `trip_distribution.html` - Distribution of trips by year
- `dashboard.html` - Combined dashboard with all metrics

**Static PNG Files:**
- `monthly_heatmap.png` - Heatmap of monthly patterns