In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Read the four yearly "all students" district files, stack them into a single
# frame, write that frame to merged_data.csv, and load the written file as `df`.
data_2019, data_2020, data_2021, data_2022 = (
    pd.read_csv(csv_path)
    for csv_path in (
        'District2019AllStudents.csv',
        'District2020AllStudents.csv',
        'District2021AllStudents.csv',
        'District2022AllStudents.csv',
    )
)

# Stack the yearly frames on top of each other and replace the repeated
# per-file row labels with one fresh 0..n-1 index.
school = pd.concat([data_2019, data_2020, data_2021, data_2022]).reset_index(drop=True)

# Persist the merged data (without the index column) ...
school.to_csv('merged_data.csv', index=False)

# ... and work from the on-disk copy for the rest of the notebook.
df = pd.read_csv('merged_data.csv')

# Normalise column names to snake_case: trim surrounding whitespace, turn
# spaces into underscores, remove every remaining character that is not a
# letter, digit or underscore, then lower-case.
# regex= is passed explicitly: without regex=True, pandas >= 2.0 treats the
# character-class pattern as a literal string and the stripping step silently
# does nothing.
df.columns = (
    df.columns.str.strip()
    .str.replace(' ', '_', regex=False)
    .str.replace(r'[^A-Za-z0-9_]+', '', regex=True)
    .str.lower()
)

# Reduce "school_year" to the part after the hyphen of the year range
# (values are assumed to look like '2018-19' or '2018-2019' -- TODO confirm
# against the source files).
df['school_year'] = df['school_year'].str.split('-').str[1]

# Two-character years get a '20' century prefix; anything else is kept as is.
# Vectorised with .mask instead of a row-wise apply.
df['school_year'] = df['school_year'].mask(
    df['school_year'].str.len() == 2,
    '20' + df['school_year'],
)

# Integer years sort and compare numerically in the later cells.
df['school_year'] = df['school_year'].astype(int)

df


FileNotFoundError: [Errno 2] No such file or directory: 'District2019AllStudents.csv'

In [None]:
# Discard the report-date, district-identifier and subgroup columns.
# Reassigning (rather than inplace=True) keeps the step a plain expression.
df = df.drop(
    columns=[
        'date_of_report',
        'state_district_id',
        'district_name',
        'subgroup_code',
        'subgroup_name',
    ]
)

In [None]:
# Sum pk12_total for every (county, school_year) pair, then order the rows by
# county and year so that consecutive rows within a county are consecutive
# years present in the data.
county_agg = (
    df.groupby(['county', 'school_year'])['pk12_total']
    .sum()
    .reset_index()
    .sort_values(by=['county', 'school_year'])
)

# Change in pk12_total relative to the previous row of the same county;
# the first row of each county has no predecessor and gets NaN.
county_agg['diff_pk12_total'] = county_agg.groupby('county')['pk12_total'].diff()

# Show up to the first 1000 rows to check the diff_pk12_total values.
county_agg.head(1000)


In [None]:
# pk12_total of the preceding row within the same county (NaN for the first
# row of each county).
county_agg['prev_year_pk12_total'] = county_agg.groupby('county')['pk12_total'].shift(1)

# Percentage change: year-over-year difference divided by the previous total,
# expressed in percent.
county_agg['percent_diff_pk12_total'] = (
    county_agg['diff_pk12_total'].div(county_agg['prev_year_pk12_total']).mul(100)
)

# One slice of county_agg per school year, now carrying the percentage column.
# NOTE: these names rebind the data_2020/2021/2022 variables that earlier held
# the raw yearly files.
data_2020, data_2021, data_2022 = (
    county_agg[county_agg['school_year'] == year] for year in (2020, 2021, 2022)
)

In [None]:
# One figure with three vertically stacked axes, one per percentage-difference
# chart. The y-axis labels are thinned to every 5th entry after plotting.
fig, axes = plt.subplots(nrows=3, ncols=1, figsize=(10, 15))

# Shared helper: thin out the y-axis labels of a plot.
def set_every_fifth_label(ax):
    """Blank out every y-tick label on ``ax`` except each fifth one.

    Labels at positions 0, 5, 10, ... are handed back to the axes unchanged;
    all other positions are replaced by an empty string, so the number of
    labels stays the same.
    """
    thinned = []
    for position, tick_label in enumerate(ax.get_yticklabels()):
        thinned.append(tick_label if position % 5 == 0 else '')
    ax.set_yticklabels(thinned)

# Draw one horizontal bar chart per school year (percentage change in
# pk12_total by county), each on its own axes. The three panels differ only in
# the year, the data slice and the target axes, so they share a single loop
# instead of three copy-pasted stanzas.
# NOTE(review): recent seaborn releases warn when `palette` is given without
# `hue` -- confirm against the installed version.
for year, year_data, ax in zip((2020, 2021, 2022), (data_2020, data_2021, data_2022), axes):
    sns.barplot(x='percent_diff_pk12_total', y='county', data=year_data, ax=ax, palette='coolwarm')
    ax.set_title(f'Percentage Difference in PK12 Total by County for {year}')
    ax.set_xlabel('Percentage Difference in PK12 Total')
    ax.set_ylabel('County')
    # Keep only every 5th county label so the y-axis stays legible.
    set_every_fifth_label(ax)

# Adjust the layout and show the plots
plt.tight_layout()
plt.show()
