In [2]:
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
import os
import plotly.io as pio

In [19]:
def load_data(file_path):
    """Load the data from a CSV file."""
    return pd.read_csv(file_path)

def filter_data_by_service_mode(df, selected_routes):
    """Filter the data to include only the selected routes."""
    return df[df['service_mode'].isin(selected_routes)]

def clean_data(df):
    """Clean the data by removing rows with missing values."""
    return df.dropna()

# Define file path
file_path = os.path.join('data', 'MBTA_2023_System-Wide_Passenger_Survey_Data.csv')

# Load and filter data
selected_routes = ['Bus', 'Silver Line BRT', 'All Modes', 'Rapid Transit or Bus Rapid Transit']
df = load_data(file_path)
df = filter_data_by_service_mode(df, selected_routes)
df = clean_data(df)

# MBTA Usage by Income Level

In [None]:
def preprocess_income_data(df):
    """Preprocess the income data, categorizing income levels."""
    income_order = [
        'Less than 30% of Area Median Income',
        '30% to 60% of Area Median Income',
        '60% to 80% of Area Median Income',
        '80% to 100% of Area Median Income',
        '100% to 140% of Area Median Income',
        '140% to 200% of Area Median Income',
        '200% or more of Area Median Income',
        'Prefer not to say'
    ]
    income_data = df[df['measure'] == 'Household Income']
    income_data['category'] = pd.Categorical(income_data['category'], categories=income_order, ordered=True)
    return income_data


def plot_income_data(income_data):
    """Plot aggregated weighted_percent by category for Bus mode only."""
    # Filter for Bus mode
    bus_data = income_data[income_data['service_mode'] == 'Bus']
    # Aggregate data by category
    agg_data = bus_data.groupby('category', as_index=False)['weighted_percent'].sum()
    # Plot
    fig = px.bar(
        agg_data,
        x='category',
        y='weighted_percent',
        title='MBTA Bus Ridership by Household Income',
        labels={'weighted_percent': 'Weighted Percentage', 'category': 'Income Category'}
    )
    fig.update_layout(xaxis_title='Income Category', yaxis_title='Weighted Percentage')
    fig.show()
    pio.write_html(fig, "./Plots/Ridership/Income_demographics_bus.html")

# Plot ridership by income
income_data = preprocess_income_data(df)
plot_income_data(income_data)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy





Based on the above graph, majority of the MBTA riders seems to be in the lower income range (upto 60% of Area Median Income)

# Race and Ethnicity

In [None]:
def preprocess_race_data(df):
    """Preprocess the race data for plotting."""
    return df[df['measure'] == 'Race']

def plot_race_data(race_data):
    """Plot aggregated weighted_percent by category for Bus mode only."""
    # Filter for Bus mode
    bus_data = race_data[race_data['service_mode'] == 'Bus']
    # Aggregate data by category
    agg_data = bus_data.groupby('category', as_index=False)['weighted_percent'].sum()
    # Plot
    fig = px.bar(
        agg_data,
        x='category',
        y='weighted_percent',
        title='MBTA Bus Ridership by Race and Ethnicity',
        labels={'weighted_percent': 'Weighted Percentage', 'category': 'Race/Ethnicity'}
    )
    fig.update_layout(xaxis_title='Race/Ethnicity', yaxis_title='Weighted Percentage')
    fig.show()
    pio.write_html(fig, "./Plots/Ridership/Race_demographics_bus.html")

# Plot ridership by race
race_data = preprocess_race_data(df)
plot_race_data(race_data)

# Age

In [None]:
def preprocess_age_data(df):
    """Preprocess the age data for plotting."""
    return df[df['measure'] == 'Age']

def plot_age_data(age_data):
    """Plot aggregated weighted_percent by category for Bus mode only."""
    # Filter for Bus mode
    bus_data = age_data[age_data['service_mode'] == 'Bus']
    # Aggregate data by category
    agg_data = bus_data.groupby('category', as_index=False)['weighted_percent'].sum()
    # Plot
    fig = px.bar(
        agg_data,
        x='category',
        y='weighted_percent',
        title='MBTA Bus Ridership by Age Group',
        labels={'weighted_percent': 'Weighted Percentage', 'category': 'Age Group'}
    )
    fig.update_layout(xaxis_title='Age Group', yaxis_title='Weighted Percentage')
    fig.show()
    pio.write_html(fig, "./Plots/Ridership/Age_demographics_bus.html")

age_data = preprocess_age_data(df)
plot_age_data(age_data)
    

In [29]:
def preprocess_gender_data(df):
    """Preprocess the gender data for plotting."""
    return df[df['measure'] == 'Gender']

def plot_gender_data(gender_data):
    """Plot aggregated weighted_percent by category for Bus mode only."""
    # Filter for Bus mode
    bus_data = gender_data[gender_data['service_mode'] == 'Bus']
    # Aggregate data by category
    agg_data = bus_data.groupby('category', as_index=False)['weighted_percent'].sum()
    # Plot
    fig = px.bar(
        agg_data,
        x='category',
        y='weighted_percent',
        title='MBTA Bus Ridership by Gender',
        labels={'weighted_percent': 'Weighted Percentage', 'category': 'Gender'}
    )
    fig.update_layout(xaxis_title='Gender', yaxis_title='Weighted Percentage')
    fig.show()
    pio.write_html(fig, "./Plots/Ridership/Gender_demographics_bus.html")

gender = preprocess_gender_data(df)
plot_gender_data(gender)


From the above we can see that the majority of people who use the MBTA are regular commuters between the ages of 18-44, and that a higher percentage of women use the MBTA compared to Men

In [30]:
def preprocess_reduced_fare_data(df):
    """Preprocess the reduced fare data for plotting."""
    return df[df['measure'] == 'Reduced Fares']

def plot_reduced_fare_data(reduced_fare_data):
    """Plot aggregated weighted_percent by category for Bus mode only."""
    # Filter for Bus mode
    bus_data = reduced_fare_data[reduced_fare_data['service_mode'] == 'Bus']
    # Aggregate by category
    agg_data = bus_data.groupby('category', as_index=False)['weighted_percent'].sum()
    # Plot
    fig = px.bar(
        agg_data,
        x='category',
        y='weighted_percent',
        title='MBTA Bus Ridership by Reduced Fare Category',
        labels={'weighted_percent': 'Weighted Percentage', 'category': 'Category'}
    )
    fig.update_layout(xaxis_title='Category', yaxis_title='Weighted Percentage')
    fig.show()
    pio.write_html(fig, "./Plots/Ridership/Fare_demographics_bus.html")

rf = preprocess_reduced_fare_data(df)
plot_reduced_fare_data(rf)


Seems like most people pay the full fare for the MBTA, though around 10% use subsidized fares from their employers

In [31]:
def preprocess_trip_purpose_data(df):
    """Preprocess the trip purpose data for plotting."""
    return df[df['measure'] == 'Trip Purpose']

def plot_trip_purpose_data(trip_purpose_data):
    """Plot aggregated weighted_percent by category for Bus mode only."""
    # Filter for Bus mode
    bus_data = trip_purpose_data[trip_purpose_data['service_mode'] == 'Bus']
    # Aggregate data by category
    agg_data = bus_data.groupby('category', as_index=False)['weighted_percent'].sum()
    # Plot
    fig = px.bar(
        agg_data,
        x='category',
        y='weighted_percent',
        title='MBTA Bus Ridership by Trip Purpose',
        labels={'weighted_percent': 'Weighted Percentage', 'category': 'Trip Purpose'}
    )
    fig.update_layout(xaxis_title='Trip Purpose', yaxis_title='Weighted Percentage')
    fig.show()
    pio.write_html(fig, "./Plots/Ridership/Trip_Purpose_demographics_bus.html")

tp = preprocess_trip_purpose_data(df)
plot_trip_purpose_data(tp)

## Usage by Car Owners

In [32]:
def preprocess_usage_by_car_owners(df):
    """Preprocess data for usage by car owners."""
    car_owners = df[(df['measure'] == 'Usable Cars') & (df['category'].isin(['1', '2', '3 or more']))]
    car_owner_groups = car_owners['reporting_group'].unique()
    return df[(df['measure'] == 'Frequency') & (df['reporting_group'].isin(car_owner_groups))]

def plot_usage_by_car_owners(frequency_data):
    """Plot aggregated weighted_percent by frequency for car owners using Bus mode only."""
    # Filter for Bus mode
    bus_data = frequency_data[frequency_data['service_mode'] == 'Bus']
    # Aggregate data by frequency category
    agg_data = bus_data.groupby('category', as_index=False)['weighted_percent'].sum()
    # Plot
    fig = px.bar(
        agg_data,
        x='category',
        y='weighted_percent',
        title='Usage Frequency of MBTA Bus Services by Car Owners',
        labels={'weighted_percent': 'Weighted Percentage', 'category': 'Frequency of Use'}
    )
    fig.update_layout(xaxis_title='Frequency of Use', yaxis_title='Weighted Percentage')
    fig.show()
    pio.write_html(fig, "./Plots/Ridership/Car_Owners_Frequency_bus.html")

car_owners = preprocess_usage_by_car_owners(df)
plot_usage_by_car_owners(car_owners)

    

The above graph shows that even though people have usable cars, there are still many people who use the MBTA around 5 days a week. It would be reasonable to assume this is for work. Most of these people seem to opt for the bus