#### Set Up

In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import numpy as np
# Display every row and every column when a DataFrame is rendered.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

# Load the cleaned ranking tables, one per school level.
elem, middle, high = (
    pd.read_csv(csv_path)
    for csv_path in (
        "output/clean_elementary_schools.csv",
        "output/clean_middle_schools.csv",
        "output/clean_high_schools.csv",
    )
)

# Remember the starting sizes so the effect of the NYC filter can be reported later.
original_elem_rows, original_middle_rows, original_high_rows = (
    elem.shape[0],
    middle.shape[0],
    high.shape[0],
)

print(f'Original number of rows in the elementary school rankings dataset before filtering out any counties outside NYC: {original_elem_rows}')
print(f'Original number of rows in the middle school rankings dataset before filtering out any counties outside NYC: {original_middle_rows}')
print(f'Original number of rows in the high school rankings dataset before filtering out any counties outside NYC: {original_high_rows}')

Original number of rows in the elementary school rankings dataset before filtering out any counties outside NYC: 2404
Original number of rows in the middle school rankings dataset before filtering out any counties outside NYC: 1430
Original number of rows in the high school rankings dataset before filtering out any counties outside NYC: 1094


#### Drop all rows where the county is not one of the five NYC counties: Bronx, Kings (Brooklyn), New York, Queens, or Richmond

In [2]:
# Keep only rows whose county text names an NYC county or borough.
# `na=False` drops rows with a missing county instead of failing on a NaN mask,
# and `.copy()` makes the result an independent frame, so the assignments below
# (and columns added in later cells) do not write into a slice of the original.
elem = elem[
    elem['county'].str.contains(
        "Richmond County|New York|Brooklyn|Kings County|Queens|Bronx", na=False
    )
].copy()

# Some county values carry a number rather than a county name (presumably a
# school-district number -- TODO confirm against the raw data). Relabel those
# rows with the county each number is assigned to below.
# The lookarounds force each alternative to match as a whole number: a bare
# '7' would also match inside '17' and '27', relabelling those rows
# 'Bronx County' before the 'Kings County' rule could ever apply to them.
for county_name, numbers in (
    ('Bronx County', '7|8|9|10|11|12'),
    ('Kings County', '13|14|17|21|22|24|27'),
    ('New York County', '3|4|5|6'),
):
    whole_number = rf'(?<!\d)(?:{numbers})(?!\d)'
    elem.loc[elem['county'].str.contains(whole_number, na=False), 'county'] = county_name

In [3]:
# Keep only rows whose county text names an NYC county or borough.
# `na=False` drops rows with a missing county instead of failing on a NaN mask,
# and `.copy()` makes the result an independent frame, so the assignments below
# (and columns added in later cells) do not write into a slice of the original.
middle = middle[
    middle['county'].str.contains(
        "Richmond County|New York|Brooklyn|Kings County|Queens|Bronx", na=False
    )
].copy()

# Normalise the remaining values to official county names. A row is relabelled
# when its county text contains one of the listed numbers (presumably
# school-district numbers -- TODO confirm) or the borough/county alias.
# The lookarounds force each number to match as a whole number: a bare '7'
# would also match inside '17' and '27', relabelling those rows 'Bronx County'
# before the 'Kings County' rule could apply. The order is kept as written:
# the generic 'New York' alias runs last so it only catches rows that the
# Bronx and Kings rules did not already relabel.
for county_name, numbers, alias in (
    ('Bronx County', '7|8|9|10|11|12', 'Bronx'),
    ('Kings County', '13|14|17|20|21|22|24|27', 'Brooklyn'),
    ('New York County', '1|2|3|4|5|6', 'New York'),
):
    county_pattern = rf'(?<!\d)(?:{numbers})(?!\d)|{alias}'
    middle.loc[middle['county'].str.contains(county_pattern, na=False), 'county'] = county_name

In [4]:
# Keep only rows whose county text names an NYC county or borough.
# `na=False` drops rows with a missing county instead of failing on a NaN mask,
# and `.copy()` makes the result an independent frame, so columns added to
# `high` in later cells do not write into a slice of the original.
# NOTE(review): unlike the elementary and middle cells, no county relabelling
# is applied here -- confirm the high-school county values need none.
high = high[
    high['county'].str.contains(
        "Richmond County|New York|Brooklyn|Kings County|Queens|Bronx", na=False
    )
].copy()

In [5]:
# Row counts that remain once non-NYC counties have been dropped.
filtered_elem_rows, filtered_middle_rows, filtered_high_rows = (
    len(frame) for frame in (elem, middle, high)
)

print(f'Final number of rows in the elementary school rankings dataset after filtering out counties outside NYC: {filtered_elem_rows}')
print(f'Final number of rows in the middle school rankings dataset after filtering out counties outside NYC: {filtered_middle_rows}')
print(f'Final number of rows in the high school rankings dataset after filtering out counties outside NYC: {filtered_high_rows}')

Final number of rows in the elementary school rankings dataset after filtering out counties outside NYC: 911
Final number of rows in the middle school rankings dataset after filtering out counties outside NYC: 574
Final number of rows in the high school rankings dataset after filtering out counties outside NYC: 394


#### Use `pd.cut()` for School Rankings - Divide Schools into 4 Equal-Width Ranking Bins

In [6]:
# Elementary schools: label each rank with one of four equal-width bands.
# Edges are multiples of 597 -> 0, 597, 1194, 1791, 2388. Each band includes
# its upper edge; ranks outside (0, 2388] get a missing label from pd.cut.
cut_bins = [597 * step for step in range(5)]
cut_labels_4 = [
    'Top 25th Percentile',
    '50th Percentile',
    '75th Percentile',
    'Bottom 25th Percentile',
]
elem['ranking_quartile'] = pd.cut(x=elem['ranking'], bins=cut_bins, labels=cut_labels_4)

In [7]:
# Middle schools: label each rank with one of four equal-width bands.
cut_labels_4 = ['Top 25th Percentile', '50th Percentile', '75th Percentile', 'Bottom 25th Percentile']
# Edges are multiples of 355 (1420 / 4). The first inner edge was previously
# 35, which made the top band cover only ranks 1-35 and the second band
# ranks 36-710, so the four bands were not evenly sized.
# Each band includes its upper edge; ranks outside (0, 1420] get a missing
# label from pd.cut.
cut_bins = [0, 355, 710, 1065, 1420]
middle['ranking_quartile'] = pd.cut(middle['ranking'], bins=cut_bins, labels=cut_labels_4)

In [8]:
# High schools: label each rank with one of four equal-width bands.
# Edges are multiples of 275 -> 0, 275, 550, 825, 1100. Each band includes
# its upper edge; ranks outside (0, 1100] get a missing label from pd.cut.
cut_bins = [275 * step for step in range(5)]
cut_labels_4 = [
    'Top 25th Percentile',
    '50th Percentile',
    '75th Percentile',
    'Bottom 25th Percentile',
]
high['ranking_quartile'] = pd.cut(x=high['ranking'], bins=cut_bins, labels=cut_labels_4)

#### Create a Classification by Generating Binary Values for School Ranks


In [9]:
# Elementary Schools
# Build one 0/1 indicator column per ranking quartile ("type_is_<label>").
# `dtype=int` pins the 0/1 encoding; newer pandas versions otherwise emit
# True/False for dummy columns.
dum_elem = pd.get_dummies(elem, columns=["ranking_quartile"], prefix=["type_is"], dtype=int)

# Attach only the new indicator columns to elem (ranking_quartile is kept).
# Both frames share the same index, so a column-wise concat lines rows up
# one-to-one. The earlier self-merge joined on every shared column, which
# multiplies any rows that happen to be identical across those columns.
# reset_index keeps the fresh 0..n-1 index that the merge used to produce.
indicator_cols = [col for col in dum_elem.columns if col not in elem.columns]
elem = pd.concat([elem, dum_elem[indicator_cols]], axis=1).reset_index(drop=True)


In [10]:
# Middle Schools
# Build one 0/1 indicator column per ranking quartile ("type_is_<label>").
# `dtype=int` pins the 0/1 encoding; newer pandas versions otherwise emit
# True/False for dummy columns.
dum_middle = pd.get_dummies(middle, columns=["ranking_quartile"], prefix=["type_is"], dtype=int)

# Attach only the new indicator columns to middle (ranking_quartile is kept).
# Both frames share the same index, so a column-wise concat lines rows up
# one-to-one. The earlier self-merge joined on every shared column, which
# multiplies any rows that happen to be identical across those columns.
# reset_index keeps the fresh 0..n-1 index that the merge used to produce.
indicator_cols = [col for col in dum_middle.columns if col not in middle.columns]
middle = pd.concat([middle, dum_middle[indicator_cols]], axis=1).reset_index(drop=True)

In [11]:
# High Schools
# Build one 0/1 indicator column per ranking quartile ("type_is_<label>").
# `dtype=int` pins the 0/1 encoding; newer pandas versions otherwise emit
# True/False for dummy columns.
dum_high = pd.get_dummies(high, columns=["ranking_quartile"], prefix=["type_is"], dtype=int)

# Attach only the new indicator columns to high (ranking_quartile is kept).
# Both frames share the same index, so a column-wise concat lines rows up
# one-to-one. The earlier self-merge joined on every shared column, which
# multiplies any rows that happen to be identical across those columns.
# reset_index keeps the fresh 0..n-1 index that the merge used to produce.
indicator_cols = [col for col in dum_high.columns if col not in high.columns]
high = pd.concat([high, dum_high[indicator_cols]], axis=1).reset_index(drop=True)

#### Create a Classification by Generating Binary Values for Percentage Free Lunch Recipient Status
* Within NYC, 72% of students are eligible for discounted or free lunch
* Binary value indicating whether more than (>) 72% of a school's student population is eligible for reduced or free lunch

In [12]:
# Add a boolean column to each table: True when more than 72% of the school's
# students receive free lunch, False otherwise (including when the
# percentage is missing, since a NaN comparison is False).
for frame in (elem, middle, high):
    above_threshold = frame['free_lunch_recipient'].gt(72)
    frame['lunch_eligibity_high'] = above_threshold.to_numpy()

In [13]:
def _attach_new_columns(frame, encoded):
    """Return `frame` plus the columns of `encoded` that `frame` lacks.

    Both frames are expected to share the same index, so the columns are
    aligned row-for-row. The result gets a fresh 0..n-1 index.
    """
    new_cols = [col for col in encoded.columns if col not in frame.columns]
    return pd.concat([frame, encoded[new_cols]], axis=1).reset_index(drop=True)


# Generate 0/1 indicator columns (lunch_eligibity_high_False / _True) for the
# elementary, middle and high school tables. `dtype=int` pins the 0/1
# encoding; newer pandas versions otherwise emit True/False.
dum_e = pd.get_dummies(elem, columns=["lunch_eligibity_high"], dtype=int)
dum_m = pd.get_dummies(middle, columns=["lunch_eligibity_high"], dtype=int)
dum_h = pd.get_dummies(high, columns=["lunch_eligibity_high"], dtype=int)

# Attach the indicator columns to each main df, keeping lunch_eligibity_high.
# A direct column-wise concat replaces the earlier self-merge, which joined on
# every shared column and multiplies rows that are identical across them.
elem = _attach_new_columns(elem, dum_e)
middle = _attach_new_columns(middle, dum_m)
high = _attach_new_columns(high, dum_h)

In [14]:
# Preview the first five elementary-school rows with the new columns.
elem.head(n=5)

Unnamed: 0,ranking,school_name,school_type,address,city,zipcode,county,district,is_charter,is_magnet,is_title_i,total_students,full_time_teachers,student_teacher_ratio,free_lunch_recipient,white,black,hispanic,asian,total_expenditure_per_pupil,ranking_quartile,type_is_Top 25th Percentile,type_is_50th Percentile,type_is_75th Percentile,type_is_Bottom 25th Percentile,lunch_eligibity_high,lunch_eligibity_high_False,lunch_eligibity_high_True
0,1,Anderson School (The),Public,100 W 77th St,New York,10024,New York County,New York City Geographic District # 3,No,No,No,521,26.0,20.0,15.0,250 (48.0%),13 (2.5%),47 (9.0%),152 (29.2%),"$10,823",Top 25th Percentile,1,0,0,0,False,1,0
1,2,New Explorations Into Sciencetech And Math Hig...,"Public, Alternative",111 Columbia St,New York,10002,New York County,New York City Geographic District # 1,No,No,No,1752,95.0,18.4,23.5,710 (40.5%),123 (7.0%),202 (11.5%),578 (33.0%),"$10,124",Top 25th Percentile,1,0,0,0,False,1,0
2,3,Ps 77 Lower Lab School,Public,1700 3rd Ave,New York,10128,New York County,New York City Geographic District # 2,No,No,No,360,18.0,20.0,10.8,182 (50.6%),1 (0.3%),17 (4.7%),117 (32.5%),"$11,769",Top 25th Percentile,1,0,0,0,False,1,0
3,4,Success Academy Charter School-Union Square,"Public, Charter",40 Irving Pl-2nd Fl,New York,10003,New York County,Success Academy Charter School-Union Square,Yes,No,Yes,720,48.0,14.9,44.3,183 (25.4%),150 (20.8%),203 (28.2%),140 (19.4%),"$18,938",Top 25th Percentile,1,0,0,0,False,1,0
4,5,Tag Young Scholars,Public,240 E 109th St,New York,10029,New York County,New York City Geographic District # 4,No,No,No,594,24.0,24.7,34.3,101 (17.0%),99 (16.7%),90 (15.2%),231 (38.9%),"$9,029",Top 25th Percentile,1,0,0,0,False,1,0


In [15]:
# Preview the first five middle-school rows with the new columns.
middle.head(n=5)

Unnamed: 0,ranking,school_name,school_type,address,city,zipcode,county,district,is_charter,is_magnet,is_title_i,total_students,fullt_time_teachers,student_teacher_ratio,free_lunch_recipient,white,black,hispanic,asian,total_expenditure_per_pupil,ranking_quartile,type_is_Top 25th Percentile,type_is_50th Percentile,type_is_75th Percentile,type_is_Bottom 25th Percentile,lunch_eligibity_high,lunch_eligibity_high_False,lunch_eligibity_high_True
0,1,Baccalaureate School For Global Education,Public,34-12 36th Ave,Long Island City,11106,Queens County,New York City Geographic District #30,No,No,No,536,30.0,17.9,35.6,144 (26.9%),21 (3.9%),101 (18.8%),247 (46.1%),"$11,568",Top 25th Percentile,1,0,0,0,False,1,0
1,2,Success Academy Charter School-Bronx 2,"Public, Charter",450 St Paul's Pl-5th Fl,Bronx,10456,Bronx County,Success Academy Charter School-Bronx 2,Yes,No,Yes,754,41.0,18.3,87.3,10 (1.3%),474 (62.9%),257 (34.1%),6 (0.8%),"$16,154",Top 25th Percentile,1,0,0,0,True,0,1
2,3,Ps 122 Mamie Fay,Public,21-21 Ditmars Blvd,Astoria,11105,Queens County,New York City Geographic District #30,No,No,No,1362,75.0,18.1,47.6,623 (45.7%),53 (3.9%),262 (19.2%),346 (25.4%),"$10,762",Top 25th Percentile,1,0,0,0,False,1,0
3,4,Anderson School (The),Public,100 W 77th St,New York,10024,New York County,New York City Geographic District # 3,No,No,No,521,26.0,20.0,15.0,250 (48.0%),13 (2.5%),47 (9.0%),152 (29.2%),"$10,823",Top 25th Percentile,1,0,0,0,False,1,0
4,5,Success Academy Charter School-Harlem 3,"Public, Charter, Alternative",410 E 100th St-4th Fl,New York,10029,New York County,Success Academy Charter School-Harlem 3,Yes,No,Yes,1111,67.0,16.7,74.5,19 (1.7%),584 (52.6%),366 (32.9%),12 (1.1%),"$24,480",Top 25th Percentile,1,0,0,0,True,0,1


In [16]:
# Preview the first five high-school rows with the new columns.
high.head(n=5)

Unnamed: 0,ranking,school_name,school_type,address,city,zipcode,county,district,is_charter,is_magnet,is_title_i,total_students,fullt_time_teachers,student_teacher_ratio,free_lunch_recipient,white,black,hispanic,asian,total_expenditure_per_pupil,ranking_quartile,type_is_Top 25th Percentile,type_is_50th Percentile,type_is_75th Percentile,type_is_Bottom 25th Percentile,lunch_eligibity_high,lunch_eligibity_high_False,lunch_eligibity_high_True
0,1,Bronx High School Of Science (The),Public,75 W 205th St,Bronx,10468,Bronx County,New York City Geographic District #10,No,No,No,3020,141.0,21.4,42.6,673 (22.3%),80 (2.6%),214 (7.1%),"1,934 (64.0%)","$9,289",Top 25th Percentile,1,0,0,0,False,1,0
1,2,Stuyvesant High School,Public,345 Chambers St,New York,10282,New York County,New York City Geographic District # 2,No,No,No,3319,158.0,20.9,43.0,626 (18.9%),29 (0.9%),99 (3.0%),"2,409 (72.6%)","$10,131",Top 25th Percentile,1,0,0,0,False,1,0
2,3,Queens High School For The Sciences At York Co...,Public,94-50 159th St,Jamaica,11433,Queens County,New York City Geographic District #28,No,No,No,475,27.0,17.5,60.8,29 (6.1%),15 (3.2%),19 (4.0%),387 (81.5%),"$11,832",Top 25th Percentile,1,0,0,0,False,1,0
3,5,High School Of American Studies At Lehman College,Public,2925 Goulden Ave,Bronx,10468,Bronx County,New York City Geographic District #10,No,No,No,412,25.0,16.4,23.8,233 (56.6%),16 (3.9%),40 (9.7%),85 (20.6%),"$13,250",Top 25th Percentile,1,0,0,0,False,1,0
4,6,Townsend Harris High School,Public,149-11 Melbourne Ave,Flushing,11367,Queens County,New York City Geographic District #25,No,No,No,1183,56.0,21.1,47.5,229 (19.4%),61 (5.2%),146 (12.3%),654 (55.3%),"$11,244",Top 25th Percentile,1,0,0,0,False,1,0


In [17]:
# Write each finished table to its own CSV file.
# NOTE(review): the DataFrame index is written as an extra leading column;
# pass index=False if downstream readers do not need it.
for frame, csv_path in (
    (elem, "output/final_elementary.csv"),
    (middle, "output/final_middle.csv"),
    (high, "output/final_high.csv"),
):
    frame.to_csv(csv_path)