In [None]:
# Importing libraries
import pandas as pd
from sodapy import Socrata
import plotly.graph_objects as go
#from plotly.subplots import make_subplots

# API authentication: public datasets do not require an app token, so None is
# passed in place of one.
client = Socrata("data.cdc.gov", None)

# Fetch birth rate data.
# The dataset ID "yt7u-eiyg" is the CDC dataset for Birth Rate by Age Range.
socrata_dataset_identifier = "yt7u-eiyg"

# SoQL query. Syntax reference: https://dev.socrata.com/docs/queries/
# Example of a narrower query:
#   query = """ SELECT * WHERE year >= 2010 LIMIT 100 """
# NOTE(review): no explicit LIMIT is given here; the Socrata API may cap the
# number of rows returned by default -- confirm the whole dataset comes back.
query = """ SELECT * """

# The API returns JSON, which sodapy converts to a Python list of dictionaries.
# Close the client in `finally` so the HTTP session is released even when the
# request fails (network error, bad dataset ID, malformed query, ...).
try:
    results = client.get(
        socrata_dataset_identifier,
        query=query,
    )
finally:
    client.close()

# Convert to a pandas DataFrame (one row per returned record).
results_df = pd.DataFrame.from_records(results)

print(f'shape of data: {results_df.shape}')
results_df.tail()


In [None]:
# Importing necessary libraries
import pandas as pd
import plotly.graph_objects as go

# Builds a Sankey diagram in which every age group gets one node per snapshot
# year and a link from each snapshot to the next; the link width is the birth
# rate recorded for the earlier of the two years.
#
# Assumes 'results_df' has already been loaded by the previous cell.

# Snapshot years shown in the diagram. Defined once so that the row filter,
# the node list and the figure title cannot drift apart.
years = [2000, 2005, 2010, 2015]

# Cast the columns to numeric types: the year filter below compares against
# ints, and the link values must be numbers.
results_df['year'] = results_df['year'].astype(int)
results_df['birth_rate'] = results_df['birth_rate'].astype(float)

# Keep only the snapshot years and the columns the diagram needs.
filtered_df = results_df.loc[
    results_df['year'].isin(years),
    ['year', 'age_group', 'birth_rate'],
]

# Pivot to age groups (rows) x years (columns). pivot_table averages duplicate
# (age_group, year) entries by default. Reindexing the columns guarantees one
# column per requested year, so a year absent from the data shows up as NaN
# instead of raising KeyError further down.
pivot_df = filtered_df.pivot_table(index='age_group', columns='year', values='birth_rate')
pivot_df = pivot_df.reindex(columns=years)

age_groups = filtered_df['age_group'].unique().tolist()

# One node per (year, age group) pair; the node's position in this list is the
# index the Sankey trace uses to refer to it.
nodes = [f'{age_group} {year}' for year in years for age_group in age_groups]
node_indices = {node: i for i, node in enumerate(nodes)}

# Sankey links are described by three parallel lists: source node index,
# target node index and flow value.
sources = []
targets = []
values = []

# Walk consecutive pairs of snapshot years and link each age group to itself.
for from_year, to_year in zip(years, years[1:]):
    rates = pivot_df[from_year]
    for age_group in age_groups:
        value = rates.get(age_group)
        # Skip links with no usable rate (age group missing from the pivot, or
        # no data for this year) rather than handing NaN to Plotly.
        if pd.isna(value):
            continue

        sources.append(node_indices[f'{age_group} {from_year}'])
        targets.append(node_indices[f'{age_group} {to_year}'])
        values.append(value)

# Node labels are the node names themselves.
labels = nodes

# Create the Sankey diagram
fig = go.Figure(go.Sankey(
    node=dict(
        pad=15,
        thickness=20,
        line=dict(color="black", width=0.5),
        label=labels,
    ),
    link=dict(
        source=sources,  # Indices of the source nodes
        target=targets,  # Indices of the target nodes
        value=values,    # Flow values (birth rate of the source year)
    ),
))

# Set title and layout; the year range in the title follows `years`.
fig.update_layout(
    title_text=f"Birth Rate Flow Across Age Groups ({years[0]} to {years[-1]})",
    font_size=10,
)
fig.show()