In [205]:
#Importing useful packages
import pandas as pd
import numpy as np

In [206]:

# Data cleaning function that drops every row with a missing value in any relevant (x or y) variable
def drop_missing_rows(df, variables):
    '''
    FUNCTION
    Returns a new data frame in which every observation (row) that has a
    missing value in at least one of the given variables is dropped.
    The data frame passed in is left unchanged.

    PARAMETERS:
    df (type df), the dataframe to be cleaned
    variables is a list of column names (any x and y variables) that are
    checked for missing values; columns that are not listed are ignored

    RETURNS:
    A new dataframe with the same columns as df, containing only the rows
    that are complete in every listed variable.

    RAISES:
    KeyError if a name in variables is not a column of df

    IMPORTANT - Data must be indexed as the user intends,
    the index itself is never checked for missing values.
    Rows are filtered by position, not by index label, so a complete row
    is kept even when it shares its index label with an incomplete row.
    '''
    columns_to_check = list(variables)
    if not columns_to_check:
        # Nothing to check: return an independent copy so the caller
        # always receives a new data frame.
        return df.copy()
    # dropna(subset=...) only inspects the listed columns and works
    # row-by-row, which is safe for duplicated index labels.
    return df.dropna(subset=columns_to_check)

# Data cleaning function that eliminates any columns that are not relevant to our research
def drop_irrelevant_columns(df, variables):
    '''
    FUNCTION
    Returns a new data frame that keeps only the columns named in
    variables; every other column of the original data frame is dropped.
    The remaining columns keep their original order.

    PARAMETERS:
    df (type df), the dataframe to be cleaned
    variables is a list of the column names (any x and y variables) to keep


    IMPORTANT - Data must be indexed as the user intends,
    the index is not a column and is therefore never dropped.
    This function does not remove rows with missing values.
    '''
    # Every column label that was not asked for is considered irrelevant.
    unwanted = [label for label in df.columns if label not in variables]
    return df.drop(columns=unwanted)




In [225]:
# Our x variables that we will be using to analyze our data
x_variables = [
    "Average Math Proficiency", 
    "Student Achievement Rating", 
    "Percent of Students Chronically Absent",
    "School Income Estimate",
    "Strong Family-Community Ties %"
]

#Our Y Variable
y_variable = "Community School?"

#Our X and Y variables
# Build a NEW list here: `all_variables = x_variables` followed by .append()
# would alias the list and silently add the y variable to x_variables too.
all_variables = x_variables + [y_variable]

# Importing our dataset, the 2016 School Explorer from PASSNYC
# We are setting the index to "School Name"
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs where this file exists. Consider a path relative to the notebook.
schoolexp = pd.read_csv(r'C:\Users\12265\Downloads\2016 School Explorer.csv').set_index("School Name")

# Running our unclean dataframe through the cleaning functions, one stage at a time:
# first drop incomplete rows, then drop the columns we do not analyze.
# (The second step must receive the output of the first step.)
schoolexp_complete = drop_missing_rows(schoolexp, all_variables)
schoolexp_cleaned = drop_irrelevant_columns(schoolexp_complete, all_variables)

# Columns that hold numbers stored as text, with the symbols that must be
# removed before the text can be converted to float.
numeric_text_columns = {
    "School Income Estimate": ("$", ","),
    "Strong Family-Community Ties %": ("%",),
    "Percent of Students Chronically Absent": ("%",),
}

cleaned_columns = {}
for column, symbols in numeric_text_columns.items():
    text = schoolexp_cleaned[column]
    for symbol in symbols:
        # regex=False: "$" is a regex anchor and must be treated literally.
        text = text.str.replace(symbol, "", regex=False)
    # Converting the whole column at once gives it a real float dtype.
    cleaned_columns[column] = text.astype(float)

# Collapse the four achievement ratings into two categories:
# "Exceeding Target" counts as "Meeting Target" and
# "Approaching Target" counts as "Not Meeting Target".
rating = schoolexp_cleaned["Student Achievement Rating"]
rating = rating.str.replace("Exceeding Target", "Meeting Target", regex=False)
rating = rating.str.replace("Approaching Target", "Not Meeting Target", regex=False)
cleaned_columns["Student Achievement Rating"] = rating

# Replace all cleaned columns in one step; whole-column assignment is
# unaffected by schools that share the same name in the index.
schoolexp_cleaned = schoolexp_cleaned.assign(**cleaned_columns)

    
    

Unnamed: 0_level_0,Community School?,School Income Estimate,Percent of Students Chronically Absent,Strong Family-Community Ties %,Student Achievement Rating,Average Math Proficiency
School Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
P.S. 015 ROBERTO CLEMENTE,Yes,31141.72,18.0,85.0,Not Meeting Target,2.17
P.S. 019 ASHER LEVY,No,56462.88,30.0,86.0,Meeting Target,2.98
P.S. 020 ANNA SILVER,No,44342.61,20.0,80.0,Not Meeting Target,2.54
P.S. 034 FRANKLIN D. ROOSEVELT,No,31454.0,28.0,89.0,Meeting Target,2.47
THE STAR ACADEMY - P.S.63,No,46435.59,23.0,89.0,Meeting Target,2.54
P.S. 064 ROBERT SIMON,No,39415.45,33.0,88.0,Meeting Target,2.48
P.S. 110 FLORENCE NIGHTINGALE,No,43706.73,13.0,87.0,Meeting Target,3.2
P.S. 134 HENRIETTA SZOLD,No,28820.67,36.0,79.0,Meeting Target,2.73
P.S. 140 NATHAN STRAUS,No,34889.24,27.0,83.0,Meeting Target,2.27
P.S. 142 AMALIA CASTRO,No,35545.1,27.0,89.0,Meeting Target,2.31


In [241]:
#Here We Will Calculate Our Summary Statistics

# Split the cleaned schools by community-school status (grouping is built once and reused).
schools_by_type = schoolexp_cleaned.groupby("Community School?")
community_schools = schools_by_type.get_group("Yes")
non_community_schools = schools_by_type.get_group("No")

# Column-wise mean of every numeric column in each group.
# Note: each result is a Series with one entry per numeric column, not a single number.
avg_math_cs = community_schools.mean(axis=0, numeric_only=True)
avg_math_ncs = non_community_schools.mean(numeric_only=True)

# Number of non-missing values per column in each group (again a Series per group).
n_community_schools = community_schools.count()
n_non_community_schools = non_community_schools.count()

# Schools in each group whose achievement rating is "Meeting Target".
cs_by_rating = community_schools.groupby("Student Achievement Rating")
ncs_by_rating = non_community_schools.groupby("Student Achievement Rating")
n_cs_meets_target = cs_by_rating.get_group("Meeting Target").count()
n_ncs_meets_target = ncs_by_rating.get_group("Meeting Target").count()

# Share of each group that meets the target, computed column by column.
pct_cs_meets_target = n_cs_meets_target / n_community_schools
pct_ncs_meets_target = n_ncs_meets_target / n_non_community_schools

# Median school income estimate per group.
cs_med_income = community_schools["School Income Estimate"].median()
ncs_med_income = non_community_schools["School Income Estimate"].median()

# Median percentage of chronically absent students per group.
median_absenteeism_cs = community_schools["Percent of Students Chronically Absent"].median()
median_absenteeism_ncs = non_community_schools["Percent of Students Chronically Absent"].median()
