# Education data prep

### Standardizing county names, and general cleaning/refactoring

- We have 3 education datasets:
  - graduation rates
  - student mobility
  - student mobility by demographics

Fortunately, just ONE of them (grad rate) has a county column. Using the district column present in all 3 datasets, we will merge the county column with the other two datasets

In [6]:
import pandas as pd
import df_util
from df_util import head
input_path = lambda name: f'../input-data/{name}.csv'
work_path = lambda name: f'../working-data/{name}.csv'

df_grad_rate = pd.read_csv(input_path('dist_grad_rate__cfyh-6xxg'))
df_mobility = pd.read_csv(input_path('dist_student_mobility__6wcd-ysh5'))
df_mobility_demo = pd.read_csv(input_path('dist_mobility_demographics__rg84-k4d3'))

## Manual edits of county name

In [7]:
df = df_grad_rate.copy()[['organization_name', 'county_name']]

org_name_to_county_name_map = {
    'STATE TOTAL': 'STATE TOTAL',
    'CHARTER SCHOOL INSTITUTE': 'DENVER',
    'MOUNTAIN BOCES': 'CHAFFEE',
    'CENTENNIAL BOCES': 'WELD',
    'SAN JUAN BOCES': 'LA PLATA',
    'EXPEDITIONARY BOCES': 'DENVER',
}
for org_name, new_county_name in org_name_to_county_name_map.items():
    df.loc[df.organization_name == org_name, 'county_name'] = new_county_name

county = df
head(county)

2 cols x 185 rows


Unnamed: 0,organization_name,county_name
0,STATE TOTAL,STATE TOTAL
1,MAPLETON 1,ADAMS
2,ADAMS 12 FIVE STAR SCHOOLS,ADAMS


### Do the stuff
1. Merge county column
2. Refactor county and bring to front.
3. Drop unneeded columns
4. Drop duplicates (for some reason the 'STATE TOTAL' row is duplicated 2-4 times on some of the datasets. Weird.)
5. Repeat for the other two datasets

In [8]:
df = df_mobility.copy()
df = (county
    .merge(df, how='left')
    .drop(columns=['category', 'school_year', 'org_code'])
    .drop_duplicates()
    .move_col('county_name', 0)
    .rename(columns={'organization_name': 'dist', 'county_name':'county'})
)
df.to_csv(work_path('dist_mobility_rate'), index=False)
head(df)

58 cols x 184 rows


Unnamed: 0,county,dist,total_pupil_count_all_students,total_stable_pupil_count_all_students,total_stability_rate_all_students,total_mobile_student_count_all_students,total_student_mobility_rate_all_students,total_instances_of_mobility_all_students,total_mobility_incidence_rate_all_students,students_with_disabilities_pupil_count,...,homeless_student_mobility_rate,homeless_instances_of_mobility,homeless_mobility_incidence_rate,gifted_talented_pupil_count,gifted_talented_stable_student_count,gifted_talented_stability_rate,gifted_talented_mobile_student_count,gifted_talented_student_mobility_rate,gifted_talented_instances_of_mobility,gifted_talented_mobility_incidence_rate
0,STATE TOTAL,STATE TOTAL,939283.0,705064.0,75.1,231706.0,24.7,253577.0,27.0,84121.0,...,45.3,11558.0,54.2,73344.0,66620.0,90.8,6641.0,9.1,7366.0,10.0
1,ADAMS,MAPLETON 1,9037.0,5077.0,56.2,3919.0,43.4,4133.0,45.7,735.0,...,32.7,79.0,36.9,250.0,205.0,82.0,44.0,17.6,47.0,18.8
2,ADAMS,ADAMS 12 FIVE STAR SCHOOLS,49889.0,34283.0,68.7,15424.0,30.9,16854.0,33.8,4339.0,...,57.2,481.0,68.2,3590.0,3225.0,89.8,361.0,10.1,404.0,11.3


In [9]:
df = df_mobility_demo.copy()
df = (county
    .merge(df, how='left')
    .drop(columns=['category', 'school_year', 'org_code'])
    .drop_duplicates()
    .move_col('county_name', 0)
    .rename(columns={'organization_name': 'dist', 'county_name':'county'})
)
df.to_csv(work_path('dist_mobility_rate_demographics'), index=False)
head(df)

72 cols x 184 rows


Unnamed: 0,county,dist,total_pupil_count,total_stable_student_count,total_stability_rate,total_mobile_student_count,total_student_mobility_rate,total_instances_of_mobility,total_mobility_incidence_rate,total_female_pupil_count,...,total_native_hawaiian_or_other_pacific_islander_student_mobility_rate,total_native_hawaiian_or_other_pacific_islander_instances_of_mobility,total_native_hawaiian_or_other_pacific_islander_mobility_incidence_rate,total_two_or_more_races_pupil_count,total_two_or_more_races_stable_student_count,total_two_or_more_races_stability_rate,total_two_or_more_races_mobile_student_count,total_two_or_more_races_student_mobility_rate,total_two_or_more_races_instances_of_mobility,total_two_or_more_races_mobility_incidence_rate
0,STATE TOTAL,STATE TOTAL,939283.0,705064.0,75.1,231706.0,24.7,253577.0,27.0,458512.0,...,34.8,840.0,38.0,29329.0,21501.0,73.3,7718.0,26.3,8433.0,28.8
2,ADAMS,MAPLETON 1,9037.0,5077.0,56.2,3919.0,43.4,4133.0,45.7,4450.0,...,70.8,17.0,70.8,219.0,129.0,58.9,90.0,41.1,91.0,41.6
3,ADAMS,ADAMS 12 FIVE STAR SCHOOLS,49889.0,34283.0,68.7,15424.0,30.9,16854.0,33.8,24340.0,...,45.3,42.0,48.8,662.0,455.0,68.7,203.0,30.7,222.0,33.5


In [10]:
df = df_grad_rate.copy()
df = df.drop(columns='county_name')
df = (county
    .merge(df, how='left')
    .drop(columns=['organization_code'])
    .drop_duplicates()
    .move_col('county_name', 0)
    .rename(columns={'organization_name': 'dist', 'county_name':'county'})
)
df.to_csv(work_path('dist_grad_rate'), index=False)
head(df)

37 cols x 184 rows


Unnamed: 0,county,dist,students_with_disabilities_final_grad_base,students_with_disabilities_graduates_total,students_with_disabilities_graduation_rate,students_with_disabilities_completers_total,students_with_disabilities_completion_rate,limited_english_proficient_final_grad_base,limited_english_proficient_graduates_total,limited_english_proficient_graduation_rate,...,homeless_final_grad_base,homeless_graduates_total,homeless_graduation_rate,homeless_completers_total,homeless_completion_rate,gifted_talented_final_grad_base,gifted_talented_graduates_total,gifted_talented_graduation_rate,gifted_talented_completers_total,gifted_talented_completion_rate
0,STATE TOTAL,STATE TOTAL,5775.0,3099.0,53.7,3222.0,55.8,6171.0,3289.0,53.3,...,2394.0,1175.0,49.1,1262.0,52.7,6604.0,6048.0,91.6,6156.0,93.2
2,ADAMS,MAPLETON 1,49.0,18.0,36.7,19.0,38.8,219.0,73.0,33.3,...,41.0,12.0,29.3,16.0,39.0,44.0,27.0,61.4,27.0,61.4
3,ADAMS,ADAMS 12 FIVE STAR SCHOOLS,250.0,118.0,47.2,127.0,50.8,379.0,257.0,67.8,...,106.0,62.0,58.5,65.0,61.3,227.0,201.0,88.5,208.0,91.6
