In [4]:
%matplotlib inline

from __future__ import print_function
from statsmodels.compat import lzip
import statsmodels
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
import statsmodels.stats.api as sms
import matplotlib.pyplot as plt
from scipy import stats
import pickle

from statsmodels.stats.outliers_influence import OLSInfluence

### Processing Functions

In [15]:
# Remove white space from around the entries

def strip_entries(df, cols):
    strip = lambda x : x.strip()
    for col in cols:
        df[col] = df[col].apply(strip)
        
# Remove empty strings

def replace_empty_values(df):
    df.replace(r'\s+', np.nan, inplace=True, regex=True)
    df.replace('', np.nan, inplace=True, regex=True)
    df.replace('–', np.nan, inplace=True, regex=True)
    df.replace('†', np.nan, inplace=True, regex=True)
    df.replace('N/A', np.nan, inplace=True, regex=True)

def replace_zeros(df,cols):
    for col in cols:
        df[col] = df[col].replace('="0.00"','0')
        df[col] = df[col].replace('="0"','0')       

def cast_as_float(df, cols):
    for col in cols:
        df[col] = df[col].astype(float, copy=False)
        
def cast_as_int(df, cols):
    for col in cols:
        df[col] = df[col].astype(int, copy=False)

### Import and process raw ELSI public school data

In [23]:
# Import raw public schools csv, part 1

raw_elsi_df1 = pd.read_csv('../data/raw/ELSI_csv_columns_cleaned.csv')
print(raw_elsi_df1.shape)
raw_elsi_df1.head()

(65499, 36)


Unnamed: 0,School Name,State Name [Public School] Latest available year,State Abbr [Public School] Latest available year,School Name [Public School] 2014-15,School ID - NCES Assigned [Public School] Latest available year,Agency ID - NCES Assigned [Public School] Latest available year,County Name [Public School] 2014-15,County Number [Public School] 2014-15,ANSI/FIPS State Code [Public School] Latest available year,School Level Code [Public School] 2014-15,...,Total Students All Grades (Includes AE) [Public School] 2014-15,Free Lunch Eligible [Public School] 2014-15,Reduced-price Lunch Eligible Students [Public School] 2014-15,Pupil/Teacher Ratio [Public School] 2014-15,Full-Time Equivalent (FTE) Teachers [Public School] 2014-15,Grade 13 Students [Public School] 2014-15,Grade 12 Students [Public School] 2014-15,Grade 11 Students [Public School] 2014-15,Grade 10 Students [Public School] 2014-15,Grade 9 Students [Public School] 2014-15
0,1 LT CHARLES W. WHITCOMB SCHOOL,Massachusetts,MA,1 LT Charles W. Whitcomb School,250732002639,2507320,MIDDLESEX COUNTY,25017,25,2-Middle,...,1325,627,90,11.38,116.44,†,†,†,†,†
1,1050 ADAIR CO. HIGH,Missouri,MO,ADAIR CO. HIGH,290579000125,2905790,ADAIR COUNTY,29001,29,3-High,...,92,37,11,7.59,12.12,†,12,18,13,17
2,112 ALC AFTER SCHOOL & SUMMER SCH,Minnesota,MN,112 ALC AFTER SCHOOL & SUMMER SCH,270819004415,2708190,CARVER COUNTY,27019,27,3-High,...,12,"=""0""","=""0""",–,–,†,12,"=""0""","=""0""","=""0"""
3,112 ALC MIDDLE SCHOOL,Minnesota,MN,112 ALC MIDDLE SCHOOL,270819004622,2708190,CARVER COUNTY,27019,27,2-Middle,...,159,76,15,184.88,"=""0.86""",†,†,†,†,†
4,12TH STREET ELEMENTARY,Michigan,MI,12th Street Elementary,262895007802,2628950,KALAMAZOO COUNTY,26077,26,1-Primary,...,564,91,24,18.5,30.49,†,†,†,†,†


In [24]:
# Import raw public schools csv, part 2

raw_elsi_df2 = pd.read_csv('../data/raw/ELSI_csv2_columns_cleaned.csv')
print(raw_elsi_df2.shape)
raw_elsi_df2.head()

(65499, 24)


Unnamed: 0,School Name,State Name [Public School] Latest available year,School ID - NCES Assigned [Public School] Latest available year,Agency ID - NCES Assigned [Public School] Latest available year,County Name [Public School] 2014-15,ANSI/FIPS State Code [Public School] Latest available year,Location Address 1 [Public School] 2014-15,Location City [Public School] 2014-15,Location ZIP [Public School] 2014-15,American Indian/Alaska Native Students [Public School] 2014-15,...,Hawaiian Nat./Pacific Isl. Students [Public School] 2014-15,Two or More Races Students [Public School] 2014-15,Total Race/Ethnicity [Public School] 2014-15,Male Students [Public School] 2014-15,Female Students [Public School] 2014-15,Agency Type [District] 2014-15,School Type [Public School] 2014-15,State School ID [Public School] 2014-15,Congressional Code [Public School] 2014-15,Reconstituted flag [Public School] 2014-15
0,1 LT CHARLES W. WHITCOMB SCHOOL,Massachusetts,250732002639,2507320,MIDDLESEX COUNTY,25,25 UNION STREET,MARLBOROUGH,"=""01752""",1,...,"=""0""",39,†,681,644,1-Regular local school district that is NOT a ...,1-Regular school,"=""01700045""",2503,2-No
1,100 BLACK MEN OF THE BAY AREA COMMUNITY,California,"=""062805013190""","=""0628050""",ALAMEDA COUNTY,"=""06""",3400 MALCOLM AVE.,OAKLAND,94607,†,...,†,†,†,†,†,1-Regular local school district that is NOT a ...,1-Regular school,"=""0125856""","=""0613""",2-No
2,1050 ADAIR CO. HIGH,Missouri,290579000125,2905790,ADAIR COUNTY,29,205 W DEWEY,BRASHEAR,63533,"=""0""",...,"=""0""",2,†,41,51,1-Regular local school district that is NOT a ...,1-Regular school,1050001092,2906,2-No
3,10TH STREET SCHOOL,Washington,530486002475,5304860,SNOHOMISH COUNTY,53,7204 27TH AVE NE,MARYSVILLE,98271,4,...,"=""0""",16,†,82,85,1-Regular local school district that is NOT a ...,1-Regular school,1656,5302,2-No
4,112 ALC AFTER SCHOOL & SUMMER SCH,Minnesota,270819004415,2708190,CARVER COUNTY,27,11 PEAVEY RD,CHASKA,55317,"=""0""",...,"=""0""",1,†,10,2,1-Regular local school district that is NOT a ...,4-Alternative/other school,"=""010112067""",2703,2-No


In [25]:
# Combine raw public school csvs

raw_elsi_df = pd.merge(raw_elsi_df1, raw_elsi_df2, how='inner', left_on='School Name', right_on='School Name')
raw_elsi_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 132924 entries, 0 to 132923
Data columns (total 59 columns):
School Name                                                          132924 non-null object
State Name [Public School] Latest available year_x                   132924 non-null object
State Abbr [Public School] Latest available year                     132924 non-null object
School Name [Public School] 2014-15                                  132924 non-null object
School ID - NCES Assigned [Public School] Latest available year_x    132924 non-null object
Agency ID - NCES Assigned [Public School] Latest available year_x    132924 non-null object
County Name [Public School] 2014-15_x                                132924 non-null object
County Number [Public School] 2014-15                                132924 non-null object
ANSI/FIPS State Code [Public School] Latest available year_x         132924 non-null object
School Level Code [Public School] 2014-15                    

In [26]:
new_cols = ['school_name',
       'state_name',\
       'state',\
       'school_name_cl',\
       'school_id',\
       'agency_id',\
       'county_name',\
       'county_number',\
       'fips_state_code',\
       'school_level_code',\
       'charter_school',\
       'magnet_school',\
       'shared_time_school',\
       'urban_centric_locale',\
       'start_of_year_status',\
       'agency_type_dis',\
       'school_type',\
       'school_wide_title_I',\
       'title_I_eligible_school',\
       'longitude',\
       'latitude',\
       'state_school_id',\
       'congressional_code',\
       'virtual_school_status',\
       'national_school_lunch_program',\
       'total_students_all_grades_excl_ae',\
       'total_students_all_grades_incl_ae',\
       'free_lunch_eligible',\
       'reduced_price_lunch_eligible_students',\
       'pupil_teacher_ratio',\
       'fte_teachers',\
       'grade_13_students',\
       'grade_12_students',\
       'grade_11_students',\
       'grade_10_students',\
       'grade_9_students',\
       'state_name_x',\
       'school_id_x',\
       'agency_id_x',\
       'county_name_x',\
       'ansi_fips_state_code',\
       'address',\
       'city',\
       'zip',\
       'american_indian_students',\
       'asian_or_asian_pacif_isl_students',\
       'hispanic_students',\
       'black_students',\
       'white_students',\
       'hawaiian_nat_pacific_isl_students',\
       'two_or_more_races_students',\
       'total_race_ethnicity',\
       'male_students',\
       'female_students',\
       'agency_type_dis_x',\
       'school_type_x',\
       'state_school_id_x',\
       'congressional_code_x',\
       'reconstituted_flag']

In [27]:
# Rename public school columns
raw_elsi_df.columns = new_cols

In [28]:
strip_entries(raw_elsi_df, new_cols)

In [29]:
# Extract only the high schools

elsi_df = raw_elsi_df[raw_elsi_df['school_level_code'] == '3-High']
elsi_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18114 entries, 1 to 132919
Data columns (total 59 columns):
school_name                              18114 non-null object
state_name                               18114 non-null object
state                                    18114 non-null object
school_name_cl                           18114 non-null object
school_id                                18114 non-null object
agency_id                                18114 non-null object
county_name                              18114 non-null object
county_number                            18114 non-null object
fips_state_code                          18114 non-null object
school_level_code                        18114 non-null object
charter_school                           18114 non-null object
magnet_school                            18114 non-null object
shared_time_school                       18114 non-null object
urban_centric_locale                     18114 non-null object
start_of

In [31]:
with open('../data/processed/0216_elsi_public_schools.pkl', 'wb') as picklefile:
    pickle.dump(elsi_df, picklefile)

### Import and process raw ELSI private school data

In [9]:
# Import raw private schools csv

raw_elsi_private_df = pd.read_csv('../data/raw/ELSI_csv_private_schools.csv')
print(raw_elsi_private_df.shape)
raw_elsi_private_df.head()

(26524, 24)


Unnamed: 0,Private School Name,State Name [Private School] Latest available year,School ID - NCES Assigned [Private School] Latest available year,County Name [Private School] 2011-12,City [Private School] 2011-12,Mailing Address [Private School] 2011-12,State Abbr [Private School] Latest available year,ZIP [Private School] 2011-12,School Type [Private School] 2011-12,Urban-centric Locale [Private School] 2011-12,...,Grades 9-12 Students [Private School] 2011-12,Percentage of Black Students [Private School] 2011-12,Percentage of Hispanic Students [Private School] 2011-12,Percentage of Asian/Pacific Islander Students [Private School] 2011-12,Percentage of American Indian/Alaska Native Students [Private School] 2011-12,Percentage of White Students [Private School] 2011-12,Percentage of Hawaiian Nat./Pacific Isl. Students [Private School] 2011-12,Percentage of Two or More Races Students [Private School] 2011-12,Pupil/Teacher Ratio [Private School] 2011-12,Full-Time Equivalent (FTE) Teachers [Private School] 2011-12
0,123 YOU N ME PRESCHOOL,ILLINOIS,A0103186,PEORIA,PEORIA,809 W DETWEILLER DR STE A,IL,61615,7-Early Childhood Program/child care center,12-City: Mid-size,...,–,10,30,6.67,13.33,10,"=""0.00""",30,18.75,1.6
1,1ST BAPTIST CHURCH PRESCHOOL,ARKANSAS,A0970060,BRADLEY,WARREN,310 S MAIN ST,AR,71671,7-Early Childhood Program/child care center,33-Town: Remote,...,–,"=""0.00""","=""0.00""","=""0.00""","=""0.00""",100,"=""0.00""","=""0.00""","=""0.80""",5.0
2,1ST CEREBRAL PALSY OF NJ,NEW JERSEY,"=""02043767""",ESSEX,BELLEVILLE,7 SANFORD AVE,NJ,"=""07109""",4-Special Education,21-Suburb: Large,...,14,44.44,35.19,11.11,"=""0.00""",9.26,"=""0.00""","=""0.00""",4.5,12.0
3,1ST CLASS MONTESSORI,TENNESSEE,A1102054,SHELBY,CORDOVA,1725 APPLING RD,TN,38016,2-Montessori,11-City: Large,...,–,–,–,–,–,–,–,–,5,2.0
4,1ST CLASS MONTESSORI PRESCHOOL,TENNESSEE,K9305911,SHELBY,MEMPHIS,1336 PEABODY AVE,TN,38104,2-Montessori,11-City: Large,...,–,100,"=""0.00""","=""0.00""","=""0.00""","=""0.00""","=""0.00""","=""0.00""",3.16,1.9


In [10]:
raw_elsi_private_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26524 entries, 0 to 26523
Data columns (total 24 columns):
Private School Name                                                              26522 non-null object
State Name [Private School] Latest available year                                26518 non-null object
School ID - NCES Assigned [Private School] Latest available year                 26518 non-null object
County Name [Private School] 2011-12                                             26518 non-null object
City [Private School] 2011-12                                                    26518 non-null object
Mailing Address [Private School] 2011-12                                         26518 non-null object
State Abbr [Private School] Latest available year                                26518 non-null object
ZIP [Private School] 2011-12                                                     26518 non-null object
School Type [Private School] 2011-12                               

In [11]:
private_columns = ['school_name',\
    'state_name',\
    'school_id',\
    'county_name',\
    'city',\
    'Address',\
    'state',\
    'zip',\
    'school_type',\
    'urban_centric_locale',\
    'school_level',\
    'religious_affiliation',\
    'school_community_type',\
    'religious_orientation',\
    'grades_9_12_students',\
    'black_students',\
    'hispanic_students',\
    'asian_or_asian_pacif_isl_students',\
    'american_indian_students',\
    'white_students',\
    'hawaiian_nat_pacific_isl_students',\
    'two_or_more_races_students',\
    'pupil_teacher_ratio',\
    'fte_teachers']

In [12]:
# Rename private school columns
raw_elsi_private_df.columns = private_columns

In [13]:
raw_elsi_private_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26524 entries, 0 to 26523
Data columns (total 24 columns):
school_name                          26522 non-null object
state_name                           26518 non-null object
school_id                            26518 non-null object
county_name                          26518 non-null object
city                                 26518 non-null object
Address                              26518 non-null object
state                                26518 non-null object
zip                                  26518 non-null object
school_type                          26518 non-null object
urban_centric_locale                 26518 non-null object
school_level                         26518 non-null object
religious_affiliation                26518 non-null object
school_community_type                26518 non-null object
religious_orientation                26518 non-null object
grades_9_12_students                 26518 non-null object
black_

In [17]:
# strip_entries(raw_elsi_private_df, raw_elsi_private_df.columns)

In [19]:
# Drop duplicates

raw_elsi_private_df.drop_duplicates(subset=['school_name', 'state_name', 'school_id'], keep='first', inplace=True)

In [20]:
# Extract only high schools

private_df= raw_elsi_private_df[raw_elsi_private_df['school_level']=="2-Secondary (school has one or more of grades 7-12 and does not have any grade lower than 7th grade)."]

In [21]:
private_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2378 entries, 59 to 26516
Data columns (total 24 columns):
school_name                          2378 non-null object
state_name                           2378 non-null object
school_id                            2378 non-null object
county_name                          2378 non-null object
city                                 2378 non-null object
Address                              2378 non-null object
state                                2378 non-null object
zip                                  2378 non-null object
school_type                          2378 non-null object
urban_centric_locale                 2378 non-null object
school_level                         2378 non-null object
religious_affiliation                2378 non-null object
school_community_type                2378 non-null object
religious_orientation                2378 non-null object
grades_9_12_students                 2378 non-null object
black_students       

In [22]:
with open('../data/processed/0216_elsi_private_schools.pkl', 'wb') as picklefile:
    pickle.dump(private_df, picklefile)

### Import and process US News scraped data

In [261]:
# Import scraped data from US News

import pickle
with open("../data/interim/0208_pages_1_to_97_schools.pkl", 'rb') as picklefile: 
    dict1 = pickle.load(picklefile)

with open("../data/interim/0208_pages_98_to_133_schools.pkl", 'rb') as picklefile: 
    dict2 = pickle.load(picklefile)

with open("../data/interim/0208_page_134_schools.pkl", 'rb') as picklefile: 
    dict3 = pickle.load(picklefile)

In [262]:
# Convert dictionaries to dataframes

raw_usnews_df1 = pd.DataFrame.from_dict(dict1, orient="index")
raw_usnews_df2 = pd.DataFrame.from_dict(dict2, orient="index")
raw_usnews_df3 = pd.DataFrame.from_dict(dict3, orient="index")

In [263]:
print(len(raw_usnews_df1))
print(len(raw_usnews_df2))
print(len(raw_usnews_df3))

1888
710
13
2576


In [264]:
# Concat frames into single dataframe

frames = [raw_usnews_df1, raw_usnews_df2, raw_usnews_df3]
usnews_df = pd.concat(frames)

In [265]:
usnews_df.head()

Unnamed: 0,district,college_readiness,medal_award,school_name,rank,city,state,graduation_rate
A and M Cons High School,College Station Independent School District,32.0,Silver,A and M Cons High School,#1699,College Station,TX,95%
A. Crawford Mosley High School,Bay,40.9,Silver,A. Crawford Mosley High School,#1117,Lynn Haven,FL,82%
Abby Kelley Foster Charter Public School,Abby Kelley Foster Charter Public (District),37.2,Silver,Abby Kelley Foster Charter Public School,#1326,Worcester,MA,98%
Aberdeen High,Harford County Public Schools,28.6,Silver,Aberdeen High,#1933,Aberdeen,MD,87%
Abington Heights High School,Abington Heights SD,35.1,Silver,Abington Heights High School,#1481,Clarks Summit,PA,93%


In [608]:
usnews_df['medal_award'].value_counts()

Silver    2125
Gold       486
Name: medal_award, dtype: int64

In [266]:
strip_entries(usnews_df, usnews_df.columns)

### Join ELSI and US News datasets

In [288]:
import urllib
import json
import requests

gmaps_base_url = 'https://maps.googleapis.com/maps/api/geocode/json?'
api_key = 'AIzaSyA0VE3TYAIK3PHhX3WnL_lJMp9OR7SQTdE'
scontext = None

In [302]:
# Method to get exact address from Google API to be used as join field

def get_join_address(df):
    exceptions = 0
    for index, row in df.iterrows():
        if row['join_address'] != '':
            print("Present: " + row['join_address'])
            exceptions = 0
        else:
            full_address = row['school_name'] + "," + row['city'] + "," + row['state']
            full_address_cl = full_address.replace(" ", "+")
            try:
                search_criteria = {'address': full_address_cl, 'key' : api_key}
                url = gmaps_base_url + urllib.parse.urlencode(search_criteria)
                response = requests.get(url).json()
                street_num = response['results'][0]['address_components'][0]['short_name']
                street = response['results'][0]['address_components'][1]['short_name']
                city = response['results'][0]['address_components'][3]['short_name']
                state = response['results'][0]['address_components'][5]['short_name']
                join_address = street_num + " " + street + ", " + city + ", " + state
                print("Success: " + join_address)
                df.loc[index, 'join_address'] = join_address
                exceptions = 0
            except:
                print("Exception: " + row['school_name'])
                exceptions = exceptions + 1
                if exceptions > 25:
                    return "Google API limit reached."

In [294]:
# Filling address with info from Google API

get_join_address(usnews_df)

Present: 1801 Harvey Mitchell Pkwy S, College Station, TX
Present: 501 Mosley Dr, Lynn Haven, FL
Present: # 9 10, Indian Lake East, Worcester County
Present: 251 Paradise Rd, 2, Halls Cross Roads, MD
Present: 222 Noble Rd, South Abington Township, PA
Present: 900 Highland Ave, Abington Township, PA
Present: 555 Dana Ave, San Jose, CA
Present: 2162 24th Ave, SF, CA
Present: 5109 W Enterprise St, Charleston County, US
Present: 1100 Catharine St, Philadelphia, PA
Present: 1776 Raritan Rd, Union County, US
Present: 1776 Raritan Rd, Union County, US
Present: 5715 S 1300 E, Salt Lake County, US
Present: 74 A Van Nu Po, Santa Fe County, US
Present: 1101 Kennedy Rd, Hartford County, US
Present: 2325 Heck Ave, Neptune Township, NJ
Present: 28-04 41st Ave, Queens, NY
Present: 380 Edison Way, Washoe County, US
Present: 30-20 Thomson Ave, Queens, NY
Present: 10720 E 22nd St, Pima County, US
Present: 1 Westinghouse Plaza, Boston, MA
Present: 1200 Pleasant Hill Rd, Contra Costa County, US
Present: 3

'Google API limit reached.'

In [295]:
# Number of missing addresses

len(usnews_df[usnews_df['join_address']==''])

30

In [296]:
len(elsi_df)

18114

In [422]:
# Number of missing addresses

len(elsi_df[elsi_df['join_address'] == ''])

507

In [303]:
# Filling address with info from Google API

get_join_address(elsi_df)

Present: 205 Dewey St, Salt River Township, MO
Exception: 112 ALC AFTER SCHOOL & SUMMER SCH
Exception: 21ST CENTURY ACADEMY
Present: 174 Brush Hill Ave, Hampden County, US
Exception: 270 - HAP - IS
Present: 2400 Lindbergh Dr, Hennepin County, US
Present: 1001 MN-7, Hennepin County, US
Present: 10700 Cedar Lake Rd, Hennepin County, US
Present: 2575 W 88th St, Bloomington, MN
Present: 2575 W 88th St, Bloomington, MN
Present: 18301 MN-7, Hennepin County, US
Present: 18301 MN-7, Hennepin County, US
Present: 18301 MN-7, Hennepin County, US
Present: 5901 Sunnyfield Rd E, Hennepin County, US
Present: 9400 Cedar Lake Rd, Minneapolis, MN
Present: 9400 Cedar Lake Rd, Minneapolis, MN
Present: 9400 Cedar Lake Rd, Minneapolis, MN
Present: 1820 Xenium Ln N, Hennepin County, US
Exception: 4092 - WATERSHED HIGH SCHOOL ALC
Present: 11111 Bren Rd W, Hennepin County, US
Present: 6200 W Broadway Ave, Hennepin County, US
Present: 173 54th St SW, Grand Rapids, MI
Present: 2737 County Rd D, Ramsey County, US

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Exception: AVENAL HIGH
Present: 146 Gettle Rd, Sand Lake, NY
Present: Newland Newland No. 2, NC, 28657
Present: Newland Newland No. 2, NC, 28657
Present: Newland Newland No. 2, NC, 28657
Present: 45-30 36th St, Queens, NY
Present: 257 State Rd, London Grove, PA
Present: 245 Clinton St, Avon, NY
Present: 245 Clinton St, Avon, NY
Present: 245 Clinton St, Avon, NY
Present: 245 Clinton St, Avon, NY
Present: Detroit Rd Avon, OH, 44011
Present: Detroit Rd Avon, OH, 44011
Present: Detroit Rd Avon, OH, 44011
Present: Detroit Rd Avon, OH, 44011
Present: 510 W Avon Rd, Hartford County, US
Present: 510 W Avon Rd, Hartford County, US
Present: 510 W Avon Rd, Hartford County, US
Present: 510 W Avon Rd, Hartford County, US
Present: 210 N Pine St, Bon Homme County, US
Present: 175 Avon Belden Rd, Lorain County, US
Present: 287 W Main St, Norfolk County, US
Present: 700 E Main St, Highlands County, US
Present: 242 South Blvd, Highlands County, US
Present: 1435 W Auburn Rd, Rochester Hills, MI
Present: 

In [423]:
# Drop duplicates from ELSI dataset

elsi_df.drop_duplicates(subset=['school_name', 'state_name', 'school_id'], keep='first', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return func(*args, **kwargs)


In [424]:
elsi_df.shape

(11533, 61)

In [425]:
# Checking to see how many addresses failed

len(elsi_df[elsi_df['join_address']==''])

507

In [581]:
private_df['join_address']=''

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [582]:
# Filling address with info from Google API
get_join_address(private_df)

Success: 42 E 30th St, New York, NY


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Success: 20 West End Ave, New York, NY
Success: # 6 10058, South Mountain, Franklin County
Success: 121 W 14th St, Erie County, US
Success: 948 N 1300 W, Washington County, US
Success: 146 New St, Dekalb County, US
Success: 23123 N State Rd 7, Palm Beach County, US
Success: STE D210 2495, Bridle Trails, King County
Success: 1904 W Gordon Ave, Davis County, US
Success: 151 South St, Hampshire County, US
Success: 13440 Crewe St, Los Angeles, CA
Success: 6600 Nicollet Ave, Hennepin County, US
Success: 408 Covered Bridge Rd, Clayton, NC
Success: 330 Bedford Park Blvd, Bronx, NY
Success: 5501 Westbank Expy, 2, LA
Success: 200 High St, New Haven County, US
Success: 4860 Oregon St, San Diego, CA
Success: 2 Convent Rd, Morris County, US
Success: 315 Hillside Ave, Bergen County, US
Success: 4920 Strathmore Ave, 4, MD
Success: 54 W Main St, Sprague, CT
Success: 2815 Benade Cir, Bryn Athyn, PA
Success: 2815 Benade Cir, Bryn Athyn, PA
Success: # 201 1999, Avondale, Sarasota County
Success: 9717 NE

In [397]:
usnews_df['school_id'] = ''

In [398]:
usnews_df

Unnamed: 0,district,college_readiness,medal_award,school_name,rank,city,state,graduation_rate,join_address,school_name_lower,school_id
A and M Cons High School,College Station Independent School District,32.0,Silver,A and M Cons High School,#1699,College Station,TX,95%,"1801 Harvey Mitchell Pkwy S, College Station, TX",a and m cons high school,
A. Crawford Mosley High School,Bay,40.9,Silver,A. Crawford Mosley High School,#1117,Lynn Haven,FL,82%,"501 Mosley Dr, Lynn Haven, FL",a. crawford mosley high school,
Abby Kelley Foster Charter Public School,Abby Kelley Foster Charter Public (District),37.2,Silver,Abby Kelley Foster Charter Public School,#1326,Worcester,MA,98%,"# 9 10, Indian Lake East, Worcester County",abby kelley foster charter public school,
Aberdeen High,Harford County Public Schools,28.6,Silver,Aberdeen High,#1933,Aberdeen,MD,87%,"251 Paradise Rd, 2, Halls Cross Roads, MD",aberdeen high,
Abington Heights High School,Abington Heights SD,35.1,Silver,Abington Heights High School,#1481,Clarks Summit,PA,93%,"222 Noble Rd, South Abington Township, PA",abington heights high school,
Abington High School,Abington SD,33.0,Silver,Abington High School,#1624,Abington,PA,93%,"900 Highland Ave, Abington Township, PA",abington high school,
Abraham Lincoln High,San Jose Unified School District,47.1,Silver,Abraham Lincoln High,#810,San Jose,CA,94%,"555 Dana Ave, San Jose, CA",abraham lincoln high,
Abraham Lincoln High School,San Francisco Unified School District,55.4,Silver,Abraham Lincoln High School,#508,San Francisco,CA,88%,"2162 24th Ave, SF, CA",abraham lincoln high school,
Academic Magnet High School,Charleston County School District,100,Gold,Academic Magnet High School,#8,North Charleston,SC,100%,"5109 W Enterprise St, Charleston County, US",academic magnet high school,
Academy at Palumbo,School District of Philadelphia,40.1,Silver,Academy at Palumbo,#1160,Philadelphia,PA,94%,"1100 Catharine St, Philadelphia, PA",academy at palumbo,


In [319]:
lower = lambda x : x.lower()

In [320]:
usnews_df['school_name_lower'] = usnews_df['school_name'].apply(lower)

In [322]:
elsi_df['school_name_lower'] = elsi_df['school_name'].apply(lower)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [413]:
# Address join failed due to many one to many matches.
# Try 3: The tricky join

import difflib

def find_best_guess_school_id(usnews_row, elsidf):
    school_name = usnews_row['school_name_lower']
    school_id = usnews_row['school_id']
    if school_id != '':
        print("School ID present for " + school_name + " : " + school_id)
    else:
        location = usnews_row['join_address']
        got_it = False

        name_matches = elsidf[elsidf['school_name_lower'] == school_name]
        if len(name_matches) == 1:
            school_id = name_matches.iloc[0]['school_id']
            got_it = True
            print("Name match for " + school_name + " : " + school_id)
        elif len(name_matches) == 0:
            print("No name match for " + school_name)
        else:
            print("Duplicate name matches for " + school_name + ": " + str(len(name_matches)))

        if got_it == False:
            location_matches = elsidf[elsidf['join_address'] == location]
            if len(location_matches) == 1:
                school_id = location_matches.iloc[0]['school_id']
                got_it = True
                print("Location match for " + school_name + " : " + school_id)
            elif len(location_matches) == 0:
                print("No location match for " + school_name)
            else:
                print("Duplicate location matches for " + school_name)
                diffs = []
                for index, row in location_matches.iterrows():
                    seq=difflib.SequenceMatcher(a=school_name, b=row['school_name_lower'])
                    diffs.append(seq.ratio())
                print(diffs)
                match_index = diffs.index(max(diffs))
                school_id = location_matches.iloc[match_index]['school_id']
                print("Location match for " + school_name + " : " + school_id)
    print('\n')
    return school_id

In [583]:
private_df['school_name_lower'] = private_df['school_name'].apply(lower)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [415]:
# Trying to the tricky join

for index, row in usnews_df.iterrows():
    usnews_df.loc[index, 'school_id'] = find_best_guess_school_id(row, elsi_df)

No name match for a and m cons high school
No location match for a and m cons high school


School ID present for a. crawford mosley high school : 120009000067


No name match for abby kelley foster charter public school
No location match for abby kelley foster charter public school


School ID present for aberdeen high : 240039000679


School ID present for abington heights high school : 420201005091


School ID present for abington high school : 420204003241


School ID present for abraham lincoln high : ="063459005696"


School ID present for abraham lincoln high school : ="063441005581"


School ID present for academic magnet high school : 450144000168


School ID present for academy at palumbo : 421899000962


School ID present for academy for allied health sciences : 341804000584


School ID present for academy for information technology : 341804006134


School ID present for academy for math engineering and science (ames) : 490001700904


School ID present for academy for tech a

In [419]:
# Trying to the tricky join

for index, row in usnews_df.iterrows():
    usnews_df.loc[index, 'school_id'] = find_best_guess_school_id(row, elsi_df)

No name match for a and m cons high school
No location match for a and m cons high school


School ID present for a. crawford mosley high school : 120009000067


No name match for abby kelley foster charter public school
No location match for abby kelley foster charter public school


School ID present for aberdeen high : 240039000679


School ID present for abington heights high school : 420201005091


School ID present for abington high school : 420204003241


School ID present for abraham lincoln high : ="063459005696"


School ID present for abraham lincoln high school : ="063441005581"


School ID present for academic magnet high school : 450144000168


School ID present for academy at palumbo : 421899000962


School ID present for academy for allied health sciences : 341804000584


School ID present for academy for information technology : 341804006134


School ID present for academy for math engineering and science (ames) : 490001700904


School ID present for academy for tech a

In [587]:
# Trying to the tricky join

for index, row in usnews_df.iterrows():
    usnews_df.loc[index, 'school_id'] = find_best_guess_school_id(row, private_df)

No name match for a and m cons high school
No location match for a and m cons high school


School ID present for a. crawford mosley high school : 120009000067


No name match for abby kelley foster charter public school
No location match for abby kelley foster charter public school


School ID present for aberdeen high : 240039000679


School ID present for abington heights high school : 420201005091


School ID present for abington high school : 420204003241


School ID present for abraham lincoln high : ="063459005696"


School ID present for abraham lincoln high school : ="063441005581"


School ID present for academic magnet high school : 450144000168


School ID present for academy at palumbo : 421899000962


School ID present for academy for allied health sciences : 341804000584


School ID present for academy for information technology : 341804006134


School ID present for academy for math engineering and science (ames) : 490001700904


School ID present for academy for tech a

In [350]:
# The moment I realized my join was hopeless.

len((set(usnews_df['school_name_lower'])).difference(set(elsi_df['school_name_lower'])))

1315

In [304]:
with open('../data/processed/0212_usnews_w_join_add.pkl', 'wb') as picklefile:
    pickle.dump(usnews_df, picklefile)

In [341]:
with open('../data/processed/0212_elsi_w_join_add.pkl', 'wb') as picklefile:
    pickle.dump(elsi_df, picklefile)

### Join ELSCI data

In [599]:
join_id_df = pd.merge(usnews_df, elsi_df, how='inner', on='school_id')

### Join Private DB

In [588]:
private_join_id_df = pd.merge(usnews_df, private_df, how='inner', on='school_id')

In [592]:
private_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2378 entries, 59 to 26516
Data columns (total 26 columns):
school_name                          2378 non-null object
state_name                           2378 non-null object
school_id                            2378 non-null object
county_name                          2378 non-null object
city                                 2378 non-null object
Address                              2378 non-null object
state                                2378 non-null object
zip                                  2378 non-null object
school_type                          2378 non-null object
urban_centric_locale                 2378 non-null object
school_level                         2378 non-null object
religious_affiliation                2378 non-null object
school_community_type                2378 non-null object
religious_orientation                2378 non-null object
grades_9_12_students                 2378 non-null object
black_students       

In [591]:
# Three matches!!? 
# The moment I realized none of the private school data was in the US News dataset
# and I wasnt' going to be able to tell my story. Toss private school data and did 
# not use it in this iteration of my project.

private_join_id_df

Unnamed: 0,district,college_readiness,medal_award,school_name_x,rank,city_x,state_x,graduation_rate,join_address_x,school_name_lower_x,...,hispanic_students,asian_or_asian_pacif_isl_students,american_indian_students,white_students,hawaiian_nat_pacific_isl_students,two_or_more_races_students,pupil_teacher_ratio,fte_teachers,join_address_y,school_name_lower_y
0,Laredo Independent School District,86.6,Gold,Early College High School,#72,Laredo,TX,100%,"5241 University Blvd, Webb County, US",early college high school,...,13.48,"=""0.71""","=""0.00""",2.13,"=""0.00""","=""0.00""",8.65,16.3,"1135 N Cleaver St, Chicago, IL",early college high school
1,Franklin Academy,42.4,Silver,Franklin Academy,#1042,Wake Forest,NC,100%,"648 Flaherty Ave, Wake Forest, NC",franklin academy,...,1.22,"=""0.00""","=""0.00""",97.56,"=""0.00""","=""0.00""",3.37,24.3,"140 River Rd, Middlesex County, US",franklin academy
2,Oregon City SD 62,33.8,Silver,Oregon City Senior High School,#1579,Oregon City,OR,88%,"S Beavercreek Rd Caufield, Clackamas County, US",oregon city senior high school,...,3.23,"=""0.00""","=""0.00""",90.32,"=""0.00""",6.45,7.21,4.3,"S Beavercreek Rd Caufield, Clackamas County, US",hera community school


In [428]:
with open('../data/processed/0212_all_join_on_id.pkl', 'wb') as picklefile:
    pickle.dump(join_id_df, picklefile)

In [596]:
join_id_df = replace_empty_values(join_id_df)

In [601]:
join_id_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1368 entries, 0 to 1367
Data columns (total 71 columns):
district                                 1368 non-null object
college_readiness                        1368 non-null object
medal_award                              1368 non-null object
school_name_x                            1368 non-null object
rank                                     1368 non-null object
city_x                                   1368 non-null object
state_x                                  1368 non-null object
graduation_rate                          1368 non-null object
join_address_x                           1368 non-null object
school_name_lower_x                      1368 non-null object
school_id                                1368 non-null object
school_name_y                            1368 non-null object
state_name                               1368 non-null object
state_y                                  1368 non-null object
school_name_cl           

In [602]:
float_cols = ['fte_teachers',\
       'longitude',\
       'latitude',\
       'pupil_teacher_ratio',\
       'graduation_rate']

int_cols = ['total_students_all_grades_excl_ae',\
       'total_students_all_grades_incl_ae',\
       'free_lunch_eligible',\
       'reduced_price_lunch_eligible_students',\
       'american_indian_students',\
       'asian_or_asian_pacif_isl_students',\
       'hispanic_students',\
       'black_students',\
       'white_students',\
       'hawaiian_nat_pacific_isl_students',\
       'two_or_more_races_students',\
       'male_students',\
       'female_students']

drop_cols = ['school_name_y',\
       'state_name_x',\
       'virtual_school_status',\
       'grade_13_students',\
       'grade_12_students',\
       'grade_11_students',\
       'grade_10_students',\
       'grade_9_students',\
       'agency_type_dis_x',\
       'school_type_x',\
       'state_school_id_x',\
       'congressional_code_x',\
       'reconstituted_flag',\
       'join_address_y',\
       'school_name_lower_y',\
       'school_name_lower_x',\
       'school_name_cl',\
       'state_y',\
       'city_y',\
       'rank']

In [470]:
join_id_df['graduation_rate'].replace('%','', inplace=True, regex=True)

In [539]:
replace_zeros(join_id_df, join_id_df.columns)

In [535]:
# Convert rows to float

cast_as_float(join_id_df, float_cols)

In [551]:
# Convert rows to int

cast_as_int(join_id_df, int_cols)

In [543]:
# Get rid of dash characters 

join_id_df.replace('–',np.nan, inplace=True, regex=True)

In [603]:
# Drop cols

join_id_df.drop(drop_cols, axis=1, inplace=True)

In [604]:
join_id_df.drop(['total_race_ethnicity'], axis=1, inplace=True)

In [605]:
join_id_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1368 entries, 0 to 1367
Data columns (total 50 columns):
district                                 1368 non-null object
college_readiness                        1368 non-null object
medal_award                              1368 non-null object
school_name_x                            1368 non-null object
city_x                                   1368 non-null object
state_x                                  1368 non-null object
graduation_rate                          1368 non-null object
join_address_x                           1368 non-null object
school_id                                1368 non-null object
state_name                               1368 non-null object
agency_id                                1368 non-null object
county_name                              1368 non-null object
county_number                            1368 non-null object
fips_state_code                          1368 non-null object
school_level_code        

In [606]:
with open('../data/processed/0215_all_cleaned.pkl', 'wb') as picklefile:
    pickle.dump(join_id_df, picklefile)