# Education data prep

### Standardizing county names, and general cleaning/refactoring

- We have 3 education datasets:
  - graduation rates
  - student mobility
  - student mobility by demographics

Only ONE of them (grad rate) has a county column. Fortunately, all 3 datasets share a district column (`organization_name`), so we can use it to merge the county column into the other two datasets.

In [1]:
import pandas as pd
# Load the three district-level education datasets in one pass; the order of
# the paths below matches the order of the names being assigned.
df_grad_rate, df_mobility, df_mobility_demo = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input-data/dist_grad_rate__cfyh-6xxg.csv',             # graduation rates
        '../input-data/dist_student_mobility__6wcd-ysh5.csv',      # student mobility
        '../input-data/dist_mobility_demographics__rg84-k4d3.csv', # mobility by demographics
    )
)

## Manual edits of county name

In [2]:
# Hand-assigned county names, keyed by `organization_name`. Any row whose
# organization appears here gets its `county_name` replaced by the mapped value.
# NOTE(review): presumably these organizations have no usable county in the
# source data -- confirm against the raw graduation-rate file.
org_name_to_county_name_map = {
    'STATE TOTAL': 'STATE TOTAL',
    'CHARTER SCHOOL INSTITUTE': 'DENVER',
    'MOUNTAIN BOCES': 'CHAFFEE',
    'CENTENNIAL BOCES': 'WELD',
    'SAN JUAN BOCES': 'LA PLATA',
    'EXPEDITIONARY BOCES': 'DENVER',
}

# District -> county lookup, taken from the only dataset that has a county column.
county = df_grad_rate[['organization_name', 'county_name']].copy()

# Vectorised override: `map` yields NaN for organizations that are not in the
# dict, and `fillna` puts the original county back for those rows.
county_overrides = county['organization_name'].map(org_name_to_county_name_map)
county['county_name'] = county_overrides.fillna(county['county_name'])

# Keep one row per (organization, county) pair. A repeated row in this lookup
# (e.g. 'STATE TOTAL') would make every later merge many-to-many and multiply
# the matching rows on the other side.
county = county.drop_duplicates().reset_index(drop=True)
county

Unnamed: 0,organization_name,county_name
0,STATE TOTAL,STATE TOTAL
1,MAPLETON 1,ADAMS
2,ADAMS 12 FIVE STAR SCHOOLS,ADAMS
3,ADAMS COUNTY 14,ADAMS
4,BRIGHTON 27J,ADAMS
...,...,...
180,MOUNTAIN BOCES,CHAFFEE
181,CENTENNIAL BOCES,WELD
182,SAN JUAN BOCES,LA PLATA
183,EXPEDITIONARY BOCES,DENVER


### Merge, clean, and export each dataset
1. Merge county column
2. Rename the county column and move it to the front.
3. Drop unneeded columns
4. Drop duplicates (for an unknown reason, the 'STATE TOTAL' row appears 2-4 times in some of the datasets).
5. Repeat for the other two datasets

In [11]:
# Attach each district's county to the student-mobility data, tidy it, and
# export it to the working-data folder.
# NOTE(review): this is a left join from `county`, so a district that exists
# only in the mobility file is dropped, and a district missing from the
# mobility file is kept as an all-NaN row -- confirm that this is intended.
df = (
    county
    # Name the join key explicitly. Without `on=`, pandas joins on every column
    # the two frames share, so a new column in the source file could silently
    # change the join. `merge` returns a new frame, so no `.copy()` is needed.
    .merge(df_mobility, how='left', on='organization_name')
    .rename(columns={'organization_name': 'school_dist', 'county_name': 'county'})
    .drop(columns=['category', 'school_year', 'org_code'])
    # Rows that differed only in the dropped columns collapse into one here.
    .drop_duplicates()
)
# Move `county` to the front; every other column keeps its relative order.
df = df[['county'] + [col for col in df.columns if col != 'county']]
df.to_csv('../working-data/dist_mobility_rate.csv', index=False)
df

Unnamed: 0,county,school_dist,total_pupil_count_all_students,total_stable_pupil_count_all_students,total_stability_rate_all_students,total_mobile_student_count_all_students,total_student_mobility_rate_all_students,total_instances_of_mobility_all_students,total_mobility_incidence_rate_all_students,students_with_disabilities_pupil_count,...,homeless_student_mobility_rate,homeless_instances_of_mobility,homeless_mobility_incidence_rate,gifted_talented_pupil_count,gifted_talented_stable_student_count,gifted_talented_stability_rate,gifted_talented_mobile_student_count,gifted_talented_student_mobility_rate,gifted_talented_instances_of_mobility,gifted_talented_mobility_incidence_rate
0,STATE TOTAL,STATE TOTAL,939283.0,705064.0,75.1,231706.0,24.7,253577.0,27.0,84121.0,...,45.3,11558.0,54.2,73344.0,66620.0,90.8,6641.0,9.1,7366.0,10.0
1,ADAMS,MAPLETON 1,9037.0,5077.0,56.2,3919.0,43.4,4133.0,45.7,735.0,...,32.7,79.0,36.9,250.0,205.0,82.0,44.0,17.6,47.0,18.8
2,ADAMS,ADAMS 12 FIVE STAR SCHOOLS,49889.0,34283.0,68.7,15424.0,30.9,16854.0,33.8,4339.0,...,57.2,481.0,68.2,3590.0,3225.0,89.8,361.0,10.1,404.0,11.3
3,ADAMS,ADAMS COUNTY 14,8265.0,5510.0,66.7,3038.0,36.8,3397.0,41.1,876.0,...,49.7,529.0,59.7,377.0,317.0,84.1,75.0,19.9,89.0,23.6
4,ADAMS,BRIGHTON 27J,17152.0,13109.0,76.4,3982.0,23.2,4294.0,25.0,1405.0,...,67.9,287.0,74.9,703.0,630.0,89.6,70.0,10.0,76.0,10.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
179,DENVER,CHARTER SCHOOL INSTITUTE,12020.0,5323.0,44.3,6660.0,55.4,6800.0,56.6,708.0,...,72.3,69.0,73.4,343.0,172.0,50.1,169.0,49.3,174.0,50.7
180,CHAFFEE,MOUNTAIN BOCES,184.0,76.0,41.3,113.0,61.4,116.0,63.0,36.0,...,57.1,21.0,60.0,1.0,1.0,100.0,0.0,0.0,1.0,100.0
181,WELD,CENTENNIAL BOCES,146.0,0.0,0.0,146.0,100.0,146.0,100.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
182,LA PLATA,SAN JUAN BOCES,84.0,0.0,0.0,84.0,100.0,84.0,100.0,5.0,...,0.0,0.0,0.0,3.0,0.0,0.0,3.0,100.0,3.0,100.0


In [12]:
# Attach each district's county to the mobility-by-demographics data, tidy it,
# and export it to the working-data folder.
# NOTE(review): this is a left join from `county`, so a district that exists
# only in the demographics file is dropped, and a district missing from that
# file is kept as an all-NaN row -- confirm that this is intended.
df = (
    county
    # Name the join key explicitly. Without `on=`, pandas joins on every column
    # the two frames share, so a new column in the source file could silently
    # change the join. `merge` returns a new frame, so no `.copy()` is needed.
    .merge(df_mobility_demo, how='left', on='organization_name')
    .rename(columns={'organization_name': 'school_dist', 'county_name': 'county'})
    .drop(columns=['category', 'school_year', 'org_code'])
    # Rows that differed only in the dropped columns collapse into one here.
    .drop_duplicates()
)
# Move `county` to the front; every other column keeps its relative order.
df = df[['county'] + [col for col in df.columns if col != 'county']]
df.to_csv('../working-data/dist_mobility_rate_demographics.csv', index=False)
df

Unnamed: 0,county,school_dist,total_pupil_count,total_stable_student_count,total_stability_rate,total_mobile_student_count,total_student_mobility_rate,total_instances_of_mobility,total_mobility_incidence_rate,total_female_pupil_count,...,total_native_hawaiian_or_other_pacific_islander_student_mobility_rate,total_native_hawaiian_or_other_pacific_islander_instances_of_mobility,total_native_hawaiian_or_other_pacific_islander_mobility_incidence_rate,total_two_or_more_races_pupil_count,total_two_or_more_races_stable_student_count,total_two_or_more_races_stability_rate,total_two_or_more_races_mobile_student_count,total_two_or_more_races_student_mobility_rate,total_two_or_more_races_instances_of_mobility,total_two_or_more_races_mobility_incidence_rate
0,STATE TOTAL,STATE TOTAL,939283.0,705064.0,75.1,231706.0,24.7,253577.0,27.0,458512.0,...,34.8,840.0,38.0,29329.0,21501.0,73.3,7718.0,26.3,8433.0,28.8
2,ADAMS,MAPLETON 1,9037.0,5077.0,56.2,3919.0,43.4,4133.0,45.7,4450.0,...,70.8,17.0,70.8,219.0,129.0,58.9,90.0,41.1,91.0,41.6
3,ADAMS,ADAMS 12 FIVE STAR SCHOOLS,49889.0,34283.0,68.7,15424.0,30.9,16854.0,33.8,24340.0,...,45.3,42.0,48.8,662.0,455.0,68.7,203.0,30.7,222.0,33.5
4,ADAMS,ADAMS COUNTY 14,8265.0,5510.0,66.7,3038.0,36.8,3397.0,41.1,3966.0,...,0.0,0.0,0.0,55.0,28.0,50.9,26.0,47.3,28.0,50.9
5,ADAMS,BRIGHTON 27J,17152.0,13109.0,76.4,3982.0,23.2,4294.0,25.0,8452.0,...,30.6,12.0,33.3,423.0,304.0,71.9,117.0,27.7,127.0,30.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
180,DENVER,CHARTER SCHOOL INSTITUTE,12020.0,5323.0,44.3,6660.0,55.4,6800.0,56.6,6311.0,...,51.9,15.0,55.6,187.0,92.0,49.2,94.0,50.3,96.0,51.3
181,CHAFFEE,MOUNTAIN BOCES,184.0,76.0,41.3,113.0,61.4,116.0,63.0,103.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
182,WELD,CENTENNIAL BOCES,146.0,0.0,0.0,146.0,100.0,146.0,100.0,64.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
183,LA PLATA,SAN JUAN BOCES,84.0,0.0,0.0,84.0,100.0,84.0,100.0,36.0,...,0.0,0.0,0.0,3.0,0.0,0.0,3.0,100.0,3.0,100.0


In [13]:
# Rebuild the graduation-rate data with the manually corrected county names,
# tidy it, and export it to the working-data folder.
df = (
    county
    # Drop the raw `county_name` first so the corrected one from `county` is the
    # only county column after the merge. The join key is named explicitly:
    # without `on=`, pandas joins on every column the two frames share.
    .merge(df_grad_rate.drop(columns='county_name'), how='left', on='organization_name')
    .rename(columns={'organization_name': 'school_dist', 'county_name': 'county'})
    .drop(columns=['organization_code'])
    # Rows that differed only in the dropped column collapse into one here.
    .drop_duplicates()
)
# Move `county` to the front; every other column keeps its relative order.
df = df[['county'] + [col for col in df.columns if col != 'county']]
df.to_csv('../working-data/dist_grad_rate.csv', index=False)
df

Unnamed: 0,county,school_dist,students_with_disabilities_final_grad_base,students_with_disabilities_graduates_total,students_with_disabilities_graduation_rate,students_with_disabilities_completers_total,students_with_disabilities_completion_rate,limited_english_proficient_final_grad_base,limited_english_proficient_graduates_total,limited_english_proficient_graduation_rate,...,homeless_final_grad_base,homeless_graduates_total,homeless_graduation_rate,homeless_completers_total,homeless_completion_rate,gifted_talented_final_grad_base,gifted_talented_graduates_total,gifted_talented_graduation_rate,gifted_talented_completers_total,gifted_talented_completion_rate
0,STATE TOTAL,STATE TOTAL,5775.0,3099.0,53.7,3222.0,55.8,6171.0,3289.0,53.3,...,2394.0,1175.0,49.1,1262.0,52.7,6604.0,6048.0,91.6,6156.0,93.2
2,ADAMS,MAPLETON 1,49.0,18.0,36.7,19.0,38.8,219.0,73.0,33.3,...,41.0,12.0,29.3,16.0,39.0,44.0,27.0,61.4,27.0,61.4
3,ADAMS,ADAMS 12 FIVE STAR SCHOOLS,250.0,118.0,47.2,127.0,50.8,379.0,257.0,67.8,...,106.0,62.0,58.5,65.0,61.3,227.0,201.0,88.5,208.0,91.6
4,ADAMS,ADAMS COUNTY 14,59.0,32.0,54.2,32.0,54.2,170.0,86.0,50.6,...,99.0,52.0,52.5,57.0,57.6,30.0,27.0,90.0,27.0,90.0
5,ADAMS,BRIGHTON 27J,66.0,33.0,50.0,35.0,53.0,110.0,63.0,57.3,...,41.0,21.0,51.2,21.0,51.2,63.0,51.0,81.0,52.0,82.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
180,DENVER,CHARTER SCHOOL INSTITUTE,111.0,16.0,14.4,17.0,15.3,141.0,31.0,22.0,...,121.0,11.0,9.1,11.0,9.1,47.0,23.0,48.9,27.0,57.4
181,CHAFFEE,MOUNTAIN BOCES,19.0,14.0,73.7,16.0,84.2,17.0,10.0,58.8,...,14.0,9.0,64.3,10.0,71.4,2.0,1.0,50.0,1.0,50.0
182,WELD,CENTENNIAL BOCES,6.0,1.0,16.7,1.0,16.7,4.0,1.0,25.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
183,LA PLATA,SAN JUAN BOCES,1.0,0.0,0.0,0.0,0.0,1.0,1.0,100.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,100.0
