# Imports

In [1]:
## imports
import pandas as pd
import re
import numpy as np

## print multiple things from same cell: with ast_node_interactivity set to "all",
## every top-level expression in a cell is displayed, not only the last one
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Load data and show examples

In [2]:
## read in data (the Excel export is fetched from frac.org each time this cell runs)
cep_optin = pd.read_excel("https://frac.org/wp-content/uploads/2021SY-CEP_Database_Export.xlsx")

## clean up colnames: lowercase each one, then drop every run of characters that is
## not a letter or digit, so the columns become plain attribute-friendly identifiers
non_alnum_run = re.compile("[^A-Za-z0-9]+")
new_colnames = []
for raw_col in cep_optin.columns:
    new_colnames.append(non_alnum_run.sub("", raw_col.lower()))
cep_optin.columns = new_colnames

## lowercase copy of the school name, used for all the pattern matching below
cep_optin['schoolname_lower'] = cep_optin['schoolname'].str.lower()

In [3]:
## columns carried into each exercise subset built from cep_optin
cols_keep = [
    "schoolname_lower",
    "individualispjune2020",
    "participatingincepsy2021",
]


In [4]:
## things to grab 1: elementary schools
## Flag names where whitespace is followed by "elem".
## - The pattern is a raw string so "\s" reaches the regex engine as written instead
##   of being parsed as an (invalid) Python string escape.
## - na = False makes a missing school name come out as False. Wrapping the
##   str.contains result in np.where(..., True, False) treated the NaN returned for
##   a missing name as truthy and flagged that row True.
cep_optin['is_elem'] = cep_optin.schoolname_lower.str.contains(r"\s+elem", regex = True,
                na = False)

## Pool of names to sample from.
## NOTE(review): `&` binds tighter than `|`, so is_elem only constrains the "elem\\."
## term; the grouping is spelled out here with explicit parentheses. As written the
## pool is every name containing "elem" or "esd" (the other two terms are subsets of
## the "elem" match). If the intent was is_elem & (any of the patterns), regrouping
## will change which schools the seeded sample below returns -- confirm before changing.
name_lower = cep_optin.schoolname_lower
in_elem_pool = (((cep_optin.is_elem) & (name_lower.str.contains("elem\\.", na = False))) |
               (name_lower.str.contains("elem", na = False)) |
               (name_lower.str.contains("elementary", na = False)) |
               (name_lower.str.contains("esd", na = False)))

## seeded draw of 30 names; positions 13 through 22 of that draw (10 schools) are
## the ones kept, identified by their index labels in cep_optin
test_schools = cep_optin.loc[in_elem_pool, 'schoolname_lower'].sample(n = 30, random_state = 470)
test_schools_show_ids = test_schools.iloc[13:23].index

## pull the kept rows (in cep_optin's own row order) and tag them for the
## elementary exercise only
elem_torbind = cep_optin.loc[cep_optin.index.isin(test_schools_show_ids),
                       cols_keep].copy().rename(columns =
                    {'schoolname_lower': 'schoolname'})
elem_torbind['is_elem_exercise'] = True
elem_torbind['is_charter_exercise'] = False
elem_torbind['is_highschool_exercise'] = False

## things to grab 2: charter schools
## Draw 8 names that contain "charter" and 8 that do not, so the subset holds both
## matching and non-matching examples. astype(str) turns non-string / missing names
## into text so str.contains yields a plain True/False mask; the mask is computed
## once and negated for the second draw.
name_has_charter = cep_optin.schoolname_lower.astype(str).str.contains("charter")
charter_sample = cep_optin.schoolname_lower[name_has_charter].sample(n = 8,
                    random_state = 422)
other_sample = cep_optin.schoolname_lower[~name_has_charter].sample(n = 8,
                    random_state = 422)
charter_examples = charter_sample.to_list()
other_examples = other_sample.to_list()
combined_examples = charter_examples + other_examples

## Select the sampled rows by index label rather than by name. Matching on the name
## with isin(combined_examples) also pulls in every other row that happens to share
## a sampled name, so the subset could hold rows that were never drawn. Index-based
## selection is the same approach the elementary subset uses.
charter_show_ids = charter_sample.index.tolist() + other_sample.index.tolist()
charter_torbind = cep_optin.loc[cep_optin.index.isin(charter_show_ids),
                       cols_keep].copy().rename(columns =
                    {'schoolname_lower': 'schoolname'})

## every row in this subset, charter or not, is tagged as part of the charter exercise
charter_torbind['is_elem_exercise'] = False
charter_torbind['is_charter_exercise'] = True
charter_torbind['is_highschool_exercise'] = False

## things to grab 3: high schools for self directed activities
## NOTE(review): "high|hs" is an unanchored match, so it is True for any name that
## merely contains "high" or "hs" (e.g. "highland ...") -- confirm that is intended.
hs_name_match = cep_optin.schoolname_lower.astype(str).str.contains("high|hs")
hs_candidates = cep_optin.loc[hs_name_match, cols_keep]

## seeded draw of 15 rows, renamed and tagged for the high school exercise only
hs_torbind = (
    hs_candidates
    .sample(n = 15, random_state = 422)
    .rename(columns = {'schoolname_lower': 'schoolname'})
    .assign(is_elem_exercise = False,
            is_charter_exercise = False,
            is_highschool_exercise = True)
)


In [16]:
## rowbind the three using pd.concat, then keep only the first row seen for each
## schoolname (a name present in more than one subset keeps the flags of the
## earliest subset in the list)
exercise_frames = [elem_torbind, charter_torbind, hs_torbind]
schools_stacked = pd.concat(exercise_frames)
schools_df = schools_stacked.drop_duplicates(subset = 'schoolname')

## write to public_data, without the index column
## NOTE(review): confirm this export is meant to land in the public directory
schools_df.to_csv("../../../public_data/schools_df.csv", index = False)

