In [2]:
import pandas as pd
import numpy as np

from itertools import combinations

# Merging STAR, NCES, and Diveristy data 

Read in previous processed data files as a pandas dataframe

In [3]:

star = pd.read_csv("../Dataset/final_data/star.csv")
nces = pd.read_csv("../Dataset/final_data/nces.csv")
diversity = pd.read_csv("../Dataset/final_data/diversity.csv")


In [4]:
all_df = star.merge(nces, left_on='School_Code', right_on='schoolid_stateassigned')
all_df = all_df.merge(diversity.add_prefix('diversity_'), left_on='School_Code', right_on='diversity_School_Code')

[[i, j] for i,j in combinations(all_df, 2) if all_df[i].equals(all_df[j])]

[['School_Code', 'schoolid_stateassigned'],
 ['School_Code', 'diversity_School_Code'],
 ['schoolid_stateassigned', 'diversity_School_Code'],
 ['schtype_raw_DC', 'diversity_Entity_Type'],
 ['distid_stateassigned', 'diversity_LEA_Code']]

In [5]:
final_drop_cols = ['schoolid_stateassigned', 'diversity_School_Code', 'diversity_Entity_Type', 'diversity_LEA_Code', \
                   'diversity_Not_at-risk_(%)', 'year_1819', 'year_1920', 'diversity_LEA_Name', 'diversity_School_Name',\
                   'diversity_Most_socioeconomically_diverse_flag', 'diversity_Majority_group']
all_df = all_df.drop(final_drop_cols, axis=1)
all_df.rename(columns = {'distid_stateassigned':'Dist_Code', 
                         'diversity_Total_Count_of_Students': 'Total_Count_of_Students', 
                         'diversity_Ward': 'Ward'}, inplace = True)
all_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 231 entries, 0 to 230
Data columns (total 43 columns):
 #   Column                                                                                         Non-Null Count  Dtype  
---  ------                                                                                         --------------  -----  
 0   Unnamed: 0_x                                                                                   231 non-null    int64  
 1   School_Code                                                                                    231 non-null    int64  
 2   STAR_Score_18                                                                                  231 non-null    float64
 3   STAR_Rating_18                                                                                 231 non-null    int64  
 4   STAR_Score_19                                                                                  231 non-null    int64  
 5   STAR_Rating_19        

In [6]:
all_df['STAR_Score_inc'] = all_df.STAR_Score_18 - all_df.STAR_Score_19
all_df['STAR_Score_inc(%)'] = all_df[['STAR_Score_18', 'STAR_Score_19']].pct_change(axis='columns', periods=-1)['STAR_Score_18']
all_df['STAR_Rating_inc'] = all_df.STAR_Rating_18 - all_df.STAR_Rating_19
all_df['STAR_Rating_inc(%)'] = all_df[['STAR_Rating_18', 'STAR_Rating_19']].pct_change(axis='columns', periods=-1)['STAR_Rating_18']

In [7]:
all_df.columns

Index(['Unnamed: 0_x', 'School_Code', 'STAR_Score_18', 'STAR_Rating_18',
       'STAR_Score_19', 'STAR_Rating_19', 'STAR_Score_inc',
       'STAR_Score_change', 'STAR_Rating_change', 'Unnamed: 0_y', 'schoolname',
       'level', 'schtype_raw_DC', 'Dist_Code', 'distname',
       'pp_total_raw_DC_1819', 'pp_site_raw_DC_1819',
       'pp_centshare_raw_DC_1819', 'schoolstloc_raw_DC_1819',
       'enroll_raw_DC_1819', 'pp_total_raw_DC_1920', 'pp_site_raw_DC_1920',
       'pp_centshare_raw_DC_1920', 'schoolstloc_raw_DC_1920',
       'enroll_raw_DC_1920', 'pp_total_raw_DC_inc(%)', 'pp_total_raw_DC_inc',
       'pp_site_raw_DC_inc(%)', 'pp_site_raw_DC_inc',
       'pp_centshare_raw_DC_inc(%)', 'pp_centshare_raw_DC_inc',
       'schoolstloc_raw_DC_inc(%)', 'schoolstloc_raw_DC_inc',
       'enroll_raw_DC_inc(%)', 'enroll_raw_DC_inc', 'diversity_Unnamed: 0',
       'Ward', 'Total_Count_of_Students', 'diversity_At-Risk_(%)',
       'diversity_Socieconomic_Diversity_Score,_2018-19_(%)',
       'div

In [8]:
all_df.head()

Unnamed: 0,Unnamed: 0_x,School_Code,STAR_Score_18,STAR_Rating_18,STAR_Score_19,STAR_Rating_19,STAR_Score_inc,STAR_Score_change,STAR_Rating_change,Unnamed: 0_y,...,Ward,Total_Count_of_Students,diversity_At-Risk_(%),"diversity_Socieconomic_Diversity_Score,_2018-19_(%)","diversity_Socieconomic_Diversity_Score,_2016-17_(%)","diversity_Change_in_Socieconomic_Diversity_Score,_from_2016-17_to_2018-19_(percentage_points)",diversity_risk_level,STAR_Score_inc(%),STAR_Rating_inc,STAR_Rating_inc(%)
0,1,102,35.63,2,38,2,-2.37,-0.062368,0.0,24,...,7,133,75,25,28.417265,-3.417265,4,-0.062368,0,0.0
1,2,109,47.68,3,40,2,7.68,0.192,0.5,23,...,7,360,66,34,37.325907,-3.325907,4,0.192,1,0.5
2,3,113,35.96,2,57,3,-21.04,-0.369123,-0.333333,170,...,8,415,73,27,28.752261,-1.752261,4,-0.369123,-1,-0.333333
3,4,114,51.4,3,36,2,15.4,0.427778,0.5,179,...,5,485,60,40,46.167558,-6.167558,4,0.427778,1,0.5
4,5,114,51.4,3,53,3,-1.6,-0.030189,0.0,179,...,5,485,60,40,46.167558,-6.167558,4,-0.030189,0,0.0


In [9]:
#converting back to a csv file
all_df.to_csv("../Dataset/final_data/final_data.csv")

# Diagnosis of which schools are missing

## diagnosis of which schools are missing due to missing STAR or spending data

In [10]:
DC_finance_2018=pd.read_excel("../Dataset/NCES/DC_1819.xlsx")
DC_STAR_2018=pd.read_csv("../Dataset/OSSEDC/2018/2018_STAR_Score.csv")
DC_finance_2019=pd.read_excel("../Dataset/NCES/DC_1920.xlsx")
DC_STAR_2019=pd.read_csv("../Dataset/OSSEDC/2019/2019_STAR_Score.csv")

DC_STAR_2019.columns = DC_STAR_2019.columns.str.replace(" ", "_")
DC_STAR_2019.columns = DC_STAR_2019.columns.str.replace("Framework_", "")

DC_STAR_2018.columns = DC_STAR_2018.columns.str.replace(" ", "_")

In [11]:
DC_full_STAR_2018=pd.merge(DC_finance_2018,DC_STAR_2018,
                           left_on="schoolid_stateassigned",right_on="School_Code",how="inner")
DC_full_STAR_2019=pd.merge(DC_finance_2019,DC_STAR_2019,
                           left_on="schoolid_stateassigned",right_on="School_Code",how="inner")

In [12]:
miss_financial_2018 = DC_STAR_2018[-DC_STAR_2018["School_Code"].isin(DC_finance_2018.schoolid_stateassigned)]
miss_financial_2019 = DC_STAR_2019[-DC_STAR_2019["School_Code"].isin(DC_finance_2019.schoolid_stateassigned)]

miss_star_2018 = DC_finance_2018[-DC_finance_2018.schoolid_stateassigned.isin(DC_STAR_2018["School_Code"])]
miss_star_2019 = DC_finance_2019[-DC_finance_2018.schoolid_stateassigned.isin(DC_STAR_2019["School_Code"])]

In [13]:
print("Number of STAR without finanicial data is: ",
      miss_financial_2018.shape[0])
print("Number of schools doesn't have a STAR score is :",
      miss_star_2018.shape[0])

print("Number of STAR without finanicial data is: ",
      miss_financial_2019.shape[0])
print("Number of schools doesn't have a STAR score is :",
      miss_star_2019.shape[0])

Number of STAR without finanicial data is:  6
Number of schools doesn't have a STAR score is : 45
Number of STAR without finanicial data is:  10
Number of schools doesn't have a STAR score is : 40


### analysis school type

In [14]:

print("The school type with the most missing star data in 2018 was", miss_star_2018['schtype_raw_DC'].value_counts().index[0], 
      ',', miss_star_2018['schtype_raw_DC'].value_counts()[0], "schools of this type miss star data")
print("The school type with the most missing financial data in 2018 was", miss_financial_2018['School_Type'].value_counts().index[0], 
      ',', miss_financial_2018['School_Type'].value_counts()[0], "schools of this type miss financial data")


print("The school type with the most missing star data in 2019 was", miss_star_2019['schtype_raw_DC'].value_counts().index[0], 
      ',', miss_star_2019['schtype_raw_DC'].value_counts()[0], "schools of this type miss star data")
print("The school type with the most missing financial data in 2019 was", miss_financial_2019['School_Type'].value_counts().index[0], 
      ',', miss_financial_2019['School_Type'].value_counts()[0], "schools of this type miss financial data")

The school type with the most missing star data in 2018 was Charter , 28 schools of this type miss star data
The school type with the most missing financial data in 2018 was Charter , 6 schools of this type miss financial data
The school type with the most missing star data in 2019 was Charter , 31 schools of this type miss star data
The school type with the most missing financial data in 2019 was Charter , 10 schools of this type miss financial data


### analysis school size

In [15]:
print("In 2018, out of", len(miss_star_2018), "schools that miss star score,", len(miss_star_2018[miss_star_2018['enroll_raw_DC']=='NRD']), "schools don't have enrollment data.")
print("The average students number of schools that miss star score is", round(miss_star_2018[miss_star_2018['enroll_raw_DC']!='NRD'].enroll_raw_DC.mean()),
      "Compare to the average students number of every school in DC is", round(DC_finance_2018[DC_finance_2018['enroll_raw_DC']!='NRD'].enroll_raw_DC.mean()))

print("In 2019, out of", len(miss_star_2019), "schools that miss star score,", len(miss_star_2019[miss_star_2019['enroll_raw_DC']=='NRD']), "schools don't have enrollment data.")
print("The average students number of schools that miss star score is", round(miss_star_2019[miss_star_2019['enroll_raw_DC']!='NRD'].enroll_raw_DC.mean()),
      "Compare to the average students number of every school in DC is", round(DC_finance_2019[DC_finance_2019['enroll_raw_DC']!='NRD'].enroll_raw_DC.mean()))

In 2018, out of 45 schools that miss star score, 11 schools don't have enrollment data.
The average students number of schools that miss star score is 326 Compare to the average students number of every school in DC is 394
In 2019, out of 40 schools that miss star score, 1 schools don't have enrollment data.
The average students number of schools that miss star score is 324 Compare to the average students number of every school in DC is 395


## analysis school missing between 2 years

In [None]:
School_Code_2018 = DC_finance_2018[]