In [94]:
import pandas as pd
import altair as alt
from vega_datasets import data 

In [95]:
# Install vega_datasets with the same interpreter that runs this kernel (sys.executable),
# so the package lands in the environment the notebook actually imports from.
# NOTE(review): this cell runs after the first cell has already imported vega_datasets;
# on a fresh environment that import would fail before this install executes.
import sys
!"{sys.executable}" -m pip install vega_datasets




[notice] A new release of pip is available: 25.0.1 -> 25.3
[notice] To update, run: C:\Users\Raabiyaal Ishaq\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [96]:
# Data Loading and Cleaning
# Forward slashes keep this relative path valid on Windows, macOS and Linux alike.
FILE_PATH = "../data/Main Data Compiled.xlsx"
df = pd.read_excel(FILE_PATH, sheet_name='Table 1.1')

df['Year'] = pd.to_numeric(df['Years'])
# .copy() gives the filtered rows their own frame, so the column assignments
# below do not raise pandas' SettingWithCopyWarning.
df = df[df['Year'] >= 1980].copy()

# Find relevant figures & rescale them (every figure is divided by 1000).
# NOTE(review): the y-axis is labelled in trillions, which presumes the sheet
# stores billions of dollars — confirm against the workbook.
# I asked ChatGPT how to separate positive and negative values from my data,
# and it suggested using the .where() method.
balance = df['Surplus or Deficit (-)']
df['Surplus'] = balance.where(balance > 0, 0) / 1000   # positive balances, 0 elsewhere
df['Deficit'] = balance.where(balance < 0, 0) / 1000   # negative balances, 0 elsewhere
df['Receipts'] = df['Receipts'] / 1000
df['Outlays'] = df['Outlays'] / 1000

# Convert wide data to Long format
# Prompt: how to prepare data for plotting multiple series in Altair.
# Output: Use pd.melt() into long format.
lines_df = df.melt(
    id_vars=['Year'],
    value_vars=['Receipts', 'Outlays'],
    var_name='Category',
    value_name='Amount'
)

# Create base chart: shared x encoding; only every 4th year gets a tick label.
base = alt.Chart(df).encode(
    x=alt.X(
        'Year:O',
        title='Fiscal Year',
        axis=alt.Axis(
            labelAngle=90,
            labelFontSize=10,
            titleFontSize=12,
            labelExpr="datum.label % 4 == 0 ? datum.label : ''"
        )
    )
)

# One colour scale shared by all three layers so each series keeps a fixed colour.
color_scale = alt.Scale(
    domain=['Surplus', 'Deficit', 'Receipts', 'Outlays'],
    range=['#4CAF50', '#E53935', '#1E88E5', '#FB8C00']
)

# Create green shaded area for budget surplus values (above zero line).
# transform_calculate adds a constant 'Category' field so the area can use the colour scale.
surplus_area = (
    base.transform_calculate(Category="'Surplus'")
    .mark_area(opacity=0.5)
    .encode(
        y=alt.Y('Surplus:Q', title='Trillions of U.S. Dollars'),
        y2=alt.datum(0),
        color=alt.Color('Category:N', scale=color_scale, legend=alt.Legend(title='Federal Budget Components'))
    )
)

# Create red shaded area for budget deficit values (below zero line).
deficit_area = (
    base.transform_calculate(Category="'Deficit'")
    .mark_area(opacity=0.5)
    .encode(
        y='Deficit:Q',
        y2=alt.datum(0),
        color=alt.Color('Category:N', scale=color_scale, legend=None)
    )
)

# Plot Receipts and Outlays as lines for comparison over time.
lines = alt.Chart(lines_df).mark_line(strokeWidth=2).encode(
    x='Year:O',
    y='Amount:Q',
    color=alt.Color('Category:N', scale=color_scale, legend=alt.Legend(title=None))
)

# Combine all parts
chart1 = (surplus_area + deficit_area + lines).properties(
    title='U.S. Federal Budget: Surplus (Green) vs. Deficit (Red), 1980–2024',
    width=900,
    height=450
)

chart1


In [97]:

# Load and clean data from OMB Table 1.2
df = pd.read_excel(FILE_PATH, sheet_name='Table 1.2')
df['Year'] = pd.to_numeric(df['Years'])
df = df.dropna(subset=['Year'])
# .copy() detaches the filtered rows so the per-column assignments below do not
# raise pandas' SettingWithCopyWarning.
df = df[df['Year'] >= 1980].copy()

# Remove '%' and convert relevant columns to numeric.
# The y-axis below uses the '.0%' format, which expects fractions (0.18 -> 18%).
# A value written with a literal '%' (e.g. "18.5%") is therefore divided by 100;
# values without a '%' sign are passed through unchanged.
for col in ['Receipts', 'Outlays', 'Surplus or Deficit (-)']:
    as_text = df[col].astype(str)
    written_as_percent = as_text.str.contains('%', regex=False)
    numeric = pd.to_numeric(as_text.str.replace('%', '', regex=False))
    df[col] = numeric.where(~written_as_percent, numeric / 100)

# Reshape to long format
df_long = df.melt(
    id_vars='Year',
    value_vars=['Receipts', 'Outlays', 'Surplus or Deficit (-)'],
    var_name='Category',
    value_name='Percent_of_GDP'
)

# Plot budget categories as percent of GDP
chart = (
    alt.Chart(df_long)
    .mark_line(strokeWidth=2)
    .encode(
        x=alt.X(
            'Year:O',
            title='Fiscal Year',
            axis=alt.Axis(
                labelAngle=90,
                labelFontSize=10,
                titleFontSize=12,
                # Label only the years divisible by 4 to keep the axis readable.
                labelExpr="datum.label % 4 == 0 ? datum.label : ''"
            )
        ),
        y=alt.Y('Percent_of_GDP:Q', title='Percent of GDP', axis=alt.Axis(format='.0%')),
        color=alt.Color(
            'Category:N',
            scale=alt.Scale(
                domain=['Receipts', 'Outlays', 'Surplus or Deficit (-)'],
                range=['#1E88E5', '#FB8C00', 'black']
            ),
            legend=alt.Legend(title=None),
        )
    )
)

# Add horizontal zero line for reference
zero_line = alt.Chart(pd.DataFrame({'y': [0]})).mark_rule(
    color='black', strokeDash=[5,5]
).encode(y='y:Q')

# Combine both layers into final chart (rule first so the lines draw on top of it)
chart2 = (
    (zero_line + chart)
    .properties(
        title='U.S. Federal Budget as Percent of GDP (1980–2024)',
        width=900,
        height=450
    )
)

chart2


In [98]:
# Read and clean the data
df = pd.read_excel(FILE_PATH, sheet_name='Table 2.1')
df = df[df['Year'] >= 1980]

# Reshape data to long format for Altair: one row per (year, revenue source)
df_melted = df.melt(
    id_vars=['Year'],
    value_vars=[
        'Individual Income Taxes',
        'Corporation Income Taxes',
        'Social Ins. and Retirement Receipts',
        'Excise Taxes',
        'Other'
    ],
    var_name='Revenue Source',
    value_name='Amount'
)

# Build stacked bar chart
chart_bar = (
    alt.Chart(df_melted)
    .mark_bar()
    .encode(
        x=alt.X(
            'Year:O',
            title='Fiscal Year',
            axis=alt.Axis(
                labelAngle=90,
                # ChatGPT prompt: how to show the label only if the year is evenly
                #   divisible by 4; otherwise, leave it blank in Altair
                # ChatGPT: The expression datum.label % 4 == 0 checks division
                labelExpr="datum.label % 4 == 0 ? datum.label : ''"
            )
        ),
        y=alt.Y(
            'Amount:Q',
            # 'normalize' rescales every bar to a 0–1 total, so segments are shares.
            stack='normalize',
            title='Share of Total Receipts (%)',
            # Render the 0–1 shares as percentages so the ticks match the "(%)" title.
            axis=alt.Axis(format='%')
        ),
        color=alt.Color(
            'Revenue Source:N',
            legend=alt.Legend(title=None),
            scale=alt.Scale(
                domain=[
                    'Individual Income Taxes',
                    'Corporation Income Taxes',
                    'Social Ins. and Retirement Receipts',
                    'Excise Taxes',
                    'Other'
                ],
                range=['#1E88E5', '#FB8C00', '#43A047', '#8E24AA', '#FDD835']
            )
        )
    )
    .properties(
        title='Share of Federal Receipts by Source (1980–2024)',
        width=900,
        height=450
    )
)

chart_bar


In [99]:
# Load Table 2.2 and keep only the years that are multiples of five
df = pd.read_excel(FILE_PATH, sheet_name='Table 2.2')
df = df.loc[df['Year'] % 5 == 0]

# Long format: every non-Year column becomes a (Subcategory, Percentage) pair
df_melted = pd.melt(
    df,
    id_vars=['Year'],
    var_name='Subcategory',
    value_name='Percentage'
)

# Create heatmap visualization: one cell per (year, subcategory), shaded by percentage
heatmap = alt.Chart(df_melted).mark_rect(stroke='white', strokeWidth=0.5)
heatmap = heatmap.encode(
    x=alt.X('Year:O', title='Year'),
    y=alt.Y('Subcategory:N', title='Category'),
    color=alt.Color(
        'Percentage:Q',
        scale=alt.Scale(scheme='tealblues'),
        title='Percentage (%)'
    )
)
heatmap = heatmap.properties(
    title='Percentage Contribution of Subcategories Over Time (Every 5 Years)',
    width=800,
    height=250
)
# ChatGPT prompt: How can I increase font size of axis labels and chart title in Altair?
# ChatGPT suggested .configure
heatmap = heatmap.configure_axis(labelFontSize=10, titleFontSize=14)
heatmap = heatmap.configure_title(fontSize=16)

heatmap


In [100]:
# Read and clean data
df = pd.read_excel(FILE_PATH, sheet_name='Table 3.2A')

# The first column holds the category names; every other column header is treated as a year.
df_long = (
    df.melt(
        id_vars=[df.columns[0]],
        var_name='Year',
        value_name='Outlays'
    )
    .rename(columns={df.columns[0]: 'Subfunction'})
)

# Convert to numeric and filter valid years
# (errors='coerce' turns non-numeric headers/cells into NaN, which are then dropped)
df_long['Year'] = pd.to_numeric(df_long['Year'], errors='coerce')
df_long['Outlays'] = pd.to_numeric(df_long['Outlays'], errors='coerce')
df_long = df_long.dropna(subset=['Year', 'Outlays'])
# .copy() detaches the filtered rows so adding the 'order' column below does not
# raise pandas' SettingWithCopyWarning.
df_long = df_long[df_long['Year'] >= 1980].copy()


# Customizing color for each category
category_config = [
    {'name': 'Defense and Veterans Affairs', 'color': "#D06359"},
    {'name': 'International Affairs', 'color': '#F28E2B'},
    {'name': 'Science, Energy & Environment', 'color': "#80E157"},
    {'name': 'Commerce and Infrastructure', 'color': '#76B7B2'},
    {'name': 'Agri. & Community Development', 'color': '#59A14F'},
    {'name': 'Education & Labor', 'color': '#EDC948'},
    {'name': 'Justice and Government', 'color': '#B07AA1'},
    {'name': 'Net Interest', 'color': '#FF9DA7'},
    {'name': 'Health and Medicare', 'color': "#52BFEB"},
    {'name': 'Social Insurance', 'color': "#1884AE"},
]


# Extract domain and color range
color_domain = [item['name'] for item in category_config]
color_range = [item['color'] for item in category_config]

# Create order column for stacking (position of each category in category_config)
# NOTE(review): a Subfunction that is not listed in category_config gets a NaN order and
# no colour, yet still counts toward the normalized total — confirm the sheet has no such rows.
subfunction_order = {name: i for i, name in enumerate(color_domain)}
df_long['order'] = df_long['Subfunction'].map(subfunction_order)

# Create normalized stacked bar chart
chart_bar = (
    alt.Chart(df_long)
    .mark_bar()
    .encode(
        x=alt.X(
            'Year:O',
            title='Fiscal Year',
            axis=alt.Axis(
                labelAngle=90,
                labelExpr="datum.value % 4 == 0 ? datum.value : ''"  # show every 4th year
            )
        ),
        y=alt.Y(
            'Outlays:Q',
            stack='normalize',
            title='Share of Total Outlays (%)',
            axis=alt.Axis(format='%')
        ),
        color=alt.Color(
            'Subfunction:N',
            legend=alt.Legend(title=None),
            scale=alt.Scale(domain=color_domain, range=color_range)
        ),
        order=alt.Order('order:Q')
    )
    .properties(
        title='Share of Federal Outlays by Function (1980–2024)',
        width=900,
        height=450
    )
    .configure_axis(
        labelFontSize=11,
        titleFontSize=13
    )
    .configure_title(
        fontSize=16
    )
)

chart_bar


In [101]:
# Read Data
df = pd.read_excel(FILE_PATH, sheet_name='Table 7.1')

# Clean column names (trim whitespace, flatten line breaks inside headers)
df.columns = df.columns.str.strip().str.replace('\n', ' ', regex=False)

# Column groups used throughout this cell, declared once to avoid drift between copies
debt_holders = ['Government Accounts', 'Federal Reserve', 'Public']
debt_columns = ['Gross Federal Debt'] + debt_holders
numeric_columns = ['Year'] + debt_columns

# Convert to numeric (unparseable cells become NaN)
df[numeric_columns] = df[numeric_columns].apply(pd.to_numeric, errors='coerce')

# Filter for 1980+
# .copy() detaches the filtered rows so the in-place rescale below does not raise
# pandas' SettingWithCopyWarning.
df = df[df['Year'] >= 1980].copy()

# Convert from millions to trillions
df[debt_columns] /= 1_000_000

# Reshape for stacked bars
# NOTE(review): the stack assumes the three holder columns are mutually exclusive
# (i.e. 'Public' excludes Federal Reserve holdings) — confirm against the sheet.
df_long = df.melt(
    id_vars='Year',
    value_vars=debt_holders,
    var_name='Debt Component',
    value_name='Amount'
)

# Base chart (stacked bars)
base = alt.Chart(df_long).encode(
    x=alt.X(
        'Year:O',
        title='Fiscal Year',
        axis=alt.Axis(labelAngle=90, labelFontSize=9)
    ),
    y=alt.Y(
        'Amount:Q',
        stack='zero',
        title='Debt (Trillions of Dollars)'
    ),
    color=alt.Color(
        'Debt Component:N',
        legend=alt.Legend(title=None),
        scale=alt.Scale(
            domain=debt_holders,
            range=['#fdae61', '#4575b4', '#91cf60']
        )
    )
)

# Add stacked bars
bars = base.mark_bar(opacity=0.85)

# Overlay total debt line (drawn from the wide frame, which still has the total column)
line = (
    alt.Chart(df)
    .mark_line(color='black', strokeWidth=2)
    .encode(
        x='Year:O',
        y='Gross Federal Debt:Q',
    )
)

# Combine
final_chart = (bars + line).properties(
    title='Gross Federal Debt and Its Composition by Holder (1980–2024)',
    width=900,
    height=450
)

final_chart


In [102]:
# Reading and Preparing Data
df = pd.read_excel(FILE_PATH, sheet_name='Debt Holding')
# Drop rows and columns that are completely empty
df = df.dropna(how='all').dropna(axis=1, how='all')

# Long format: each column header becomes a Category, each cell an Amount.
# Built as a single chain so no column is assigned on a filtered slice
# (the filter-then-assign form raises pandas' SettingWithCopyWarning).
df_long = (
    df.melt(var_name='Category', value_name='Amount')
    # Exclude the aggregate columns so they are not drawn next to their own components
    .loc[lambda d: ~d['Category'].isin(['Total Public Debt', 'Total Privately Held'])]
    .assign(Amount=lambda d: pd.to_numeric(d['Amount'], errors='coerce'))
    .dropna(subset=['Amount'])
)
# Sum of all plotted amounts (not referenced by the chart below)
total = df_long['Amount'].sum()

# Donut chart
chart = (
    alt.Chart(df_long)
    .mark_arc(innerRadius=60)
    .encode(
        theta='Amount:Q',
        color=alt.Color(
            'Category:N',
            legend=alt.Legend(title=None),
            scale=alt.Scale(scheme='tableau20')
        )
    )
    .properties(
        title='U.S. Federal Debt Ownership by Holder (2024)',
        width=400,
        height=400
    )
)

chart


In [None]:
# Read and prepare data
df = pd.read_excel(FILE_PATH, sheet_name='State-wise')
# ChatGPT prompt: "I want to pull text inside parentheses from each value in a pandas column"
# ChatGPT suggested str.extract and regex
# expand=False returns a Series (one capture group), which is what a single column needs.
df['State Code'] = df['State or Territory Name'].str.extract(r'\((\w{2})\)', expand=False)
df['Awarded Amount (FY)'] = pd.to_numeric(df['Awarded Amount (FY)'], errors='coerce')
# Rescale the awarded amount by 1e9 (the chart labels it in billions of dollars)
df['Awarded Amount (FY)'] = df['Awarded Amount (FY)'] / 1_000_000_000

# Load U.S. states geometry from Vega Datasets
# ChatGPT prompt: How can I plot a map of the United States in Altair?
# ChatGPT suggested data.us_10m.url
states = alt.topo_feature(data.us_10m.url, 'states')

# Map state abbreviations to FIPS codes
state_fips_map = {
    'AL': 1, 'AK': 2, 'AZ': 4, 'AR': 5, 'CA': 6, 'CO': 8, 'CT': 9, 'DE': 10, 'DC': 11, 'FL': 12,
    'GA': 13, 'HI': 15, 'ID': 16, 'IL': 17, 'IN': 18, 'IA': 19, 'KS': 20, 'KY': 21, 'LA': 22,
    'ME': 23, 'MD': 24, 'MA': 25, 'MI': 26, 'MN': 27, 'MS': 28, 'MO': 29, 'MT': 30, 'NE': 31,
    'NV': 32, 'NH': 33, 'NJ': 34, 'NM': 35, 'NY': 36, 'NC': 37, 'ND': 38, 'OH': 39, 'OK': 40,
    'OR': 41, 'PA': 42, 'RI': 44, 'SC': 45, 'SD': 46, 'TN': 47, 'TX': 48, 'UT': 49, 'VT': 50,
    'VA': 51, 'WA': 53, 'WV': 54, 'WI': 55, 'WY': 56
}
# Codes missing from the map (and rows without a parenthesised code) become NaN here
df['id'] = df['State Code'].map(state_fips_map)

# Only rows with a FIPS id can be joined to a state shape, and only these columns are
# read by the chart; passing this trimmed frame keeps unmatched rows and unused columns
# out of the embedded chart data and gives the join clean integer keys.
lookup_fields = ['State or Territory Name', 'Awarded Amount (FY)', 'Percent of Total']
lookup_df = (
    df.dropna(subset=['id'])
    .loc[:, ['id'] + lookup_fields]
    .astype({'id': int})
)

# Build choropleth
chart = (
    alt.Chart(states)
    .mark_geoshape(stroke='white')
    .encode(
        color=alt.Color(
            'Awarded Amount (FY):Q',
            title='Awarded Amount (Billions $)',
            scale=alt.Scale(scheme='blues')
        ),
        tooltip=[
            # ChatGPT prompt: I want info displayed when I hover over the map
            # ChatGPT suggested tooltip
            alt.Tooltip('State or Territory Name:N', title='State'),
            alt.Tooltip('Awarded Amount (FY):Q', format=',.2f', title='Awarded Amount ($B)'),
            alt.Tooltip('Percent of Total:Q', format='.2f', title='Share (%)')
        ]
    )
    # Join each state shape (keyed by its 'id') to the matching row of the awards table
    .transform_lookup(
        lookup='id',
        from_=alt.LookupData(
            lookup_df,
            'id',
            lookup_fields
        )
    )
    .project(type='albersUsa')
    .properties(
        title='Federal Awards by State (Latest Fiscal Year, in Billions $)',
        width=700,
        height=450
    )
)

chart
