In [4]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

In [5]:
# Load the table of all US school districts.
lea = pd.read_csv("lea_characteristics.csv", sep=",")

# Load the banned-book records (book + district), one file per reporting period.
df_school_yr_1 = pd.read_csv("PEN_July_1_2021_June_30_2022.csv", sep=",")
df_school_yr_2 = pd.read_csv("PEN_July_1_2022_December_31_2022.csv", sep=",")

# Stack both periods into a single frame. DataFrame.append was deprecated in
# pandas 1.4 and removed in pandas 2.0; pd.concat is the supported equivalent
# and, like append, keeps each file's original row labels.
books = pd.concat([df_school_yr_1, df_school_yr_2])

In [11]:
# Build the join keys used to match book-ban districts to district records:
# a lower-cased district name on both frames and an upper-cased state on the
# book-ban frame.
books['district_name'] = books['District'].str.lower()
lea['district_name'] = lea['LEA_NAME'].str.lower()
books["STATE"] = books["State"].str.upper()

# Substrings removed from the district names before matching. They are applied
# in list order, so longer phrases come before the shorter tokens they contain
# (e.g. "city school district" before "school" and "district").
common_words = ["city school district", "schools", "school", "district", "public", "county",
                "independent", "cisd", "isd", " sd", " co ", "pblc", "schs", "high", "of",
                "community", "comm", "area", "the", "department", "consolidated", "  no",
                "r-iii", "r-vii", "r-ix", "r-iv", "r-1", "r-i",
                "0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "#", "-", "."]

for word in common_words:
    # regex=False: every entry is meant as a literal substring. The default of
    # `regex` differs between pandas versions (True before 2.0, False from 2.0),
    # and the list contains ".", so the literal interpretation is made explicit.
    # Each pass replaces the token with a space and then trims both ends.
    books['district_name'] = books['district_name'].str.replace(word, " ", regex=False).str.strip()
    lea['district_name'] = lea['district_name'].str.replace(word, " ", regex=False).str.strip()

In [13]:
# Match every book-ban district to a district record in `lea`.
#
# A match requires BOTH the same state and the same normalized district name.
# Results are stored per (STATE, District) so that two districts sharing a name
# in different states do not overwrite each other's match.

# First LEA_NAME seen for each (state, normalized name) pair. Rows with a
# missing state or normalized name can never match and are dropped up front.
lea_candidates = lea.dropna(subset=["LEA_STATE_NAME", "district_name"]).drop_duplicates(
    subset=["LEA_STATE_NAME", "district_name"]
)
lea_lookup = dict(
    zip(
        zip(lea_candidates["LEA_STATE_NAME"], lea_candidates["district_name"]),
        lea_candidates["LEA_NAME"],
    )
)

pairs = {}           # District -> LEA_NAME (name-only view, first match wins)
pairs_by_state = {}  # (STATE, District) -> LEA_NAME (used for the actual mapping)

pen_keys = books[["STATE", "District", "district_name"]].drop_duplicates()
for state, pen_name, normalized_name in pen_keys.itertuples(index=False, name=None):
    lookup_key = (state, normalized_name)
    if lookup_key in lea_lookup:
        lea_name = lea_lookup[lookup_key]
        pairs_by_state[(state, pen_name)] = lea_name
        pairs.setdefault(pen_name, lea_name)

# Hand-curated matches for districts whose names the normalization cannot align.
outliers_pen_to_lea = {
    "Medford School District 549C": "Medford SD 549C", 
    "Virginia Beach City Public Schools": "VA BEACH CITY PBLC SCHS",
    "Harlem School District 122": "Harlem UD 122", 
    "Regional School Unit No. 56": "RSU 56", 
    "ROWVA CUSD 208": "R O W V A CUSD 208", 
    "Cherry Creek School District": "Cherry Creek School District No. 5 in the County of Arapaho", 
    "Raytown Quality Schools": "RAYTOWN C-2", 
    "Williston Basin School District #7": "WILLISTON 1", 
    "Lexington-Richland School District 5": "School District 5 of Lexington and Richland Counties", 
    "Madison County Schools": "MADISON CO SCHOOL DIST", 
}

# Name-only mapping; manual outliers take precedence over automatic matches.
matching_district_names = pairs | outliers_pen_to_lea

# Per-row match: a manual outlier wins, otherwise the state-aware automatic
# match, otherwise None.
books['matched_name'] = [
    outliers_pen_to_lea.get(pen_name, pairs_by_state.get((state, pen_name)))
    for state, pen_name in zip(books["STATE"], books["District"])
]

0                            INDIAN RIVER
1                         Central York SD
2                            INDIAN RIVER
3        MARLBORO CENTRAL SCHOOL DISTRICT
4                      FREDERICKSBURG ISD
                      ...                
1472            SPOTSYLVANIA CO PBLC SCHS
1473            SPOTSYLVANIA CO PBLC SCHS
1474            SPOTSYLVANIA CO PBLC SCHS
1475    Natrona County School District #1
1476    Natrona County School District #1
Name: matched_name, Length: 4009, dtype: object


In [21]:
# Instead of merging, add a boolean column marking districts with a book ban.
#
# A district is flagged only when its (state, name) pair appears among the
# matched book-ban rows. Testing the name and the state with two independent
# `isin` calls would also flag a same-named district in any other state that
# has a ban. Rows without a match (null matched_name) are ignored.
matched_rows = books.loc[books["matched_name"].notna(), ["STATE", "matched_name"]]
banned_pairs = set(zip(matched_rows["STATE"], matched_rows["matched_name"]))

lea["banned_books"] = np.array(
    [pair in banned_pairs for pair in zip(lea["LEA_STATE_NAME"], lea["LEA_NAME"])],
    dtype=bool,
)

# To inspect the flagged districts: lea.loc[lea["banned_books"]]

      LEA_STATE LEA_STATE_NAME    LEAID  \
174          AK         ALASKA   200180   
956          AR       ARKANSAS   504590   
1083         AR       ARKANSAS   512450   
2056         CA     CALIFORNIA   642510   
2992         CO       COLORADO   802910   
...         ...            ...      ...   
17195        WI      WISCONSIN  5502550   
17373        WI      WISCONSIN  5510170   
17392        WI      WISCONSIN  5510890   
17512        WI      WISCONSIN  5515780   
17581        WY        WYOMING  5604510   

                                                LEA_NAME  \
174                            Anchorage School District   
956                               CONWAY SCHOOL DISTRICT   
1083                      SILOAM SPRINGS SCHOOL DISTRICT   
2056                          William S. Hart Union High   
2992   Cherry Creek School District No. 5 in the Coun...   
...                                                  ...   
17195        Chippewa Falls Area Unified School District   
173

In [27]:
# Now we can merge in additional dataframes of interest.
#
# Columns of interest in the academic-success file:
#   TOTD912 - Dropout Count
#   EBS912  - Enrollment Base
#   DRP912  - Dropout Rate
#   TOTDPL  - Diploma Counts
#   AFGEB   - AFGR Enrollment Base Counts
#
# TODO: need a better/more recent dataset.
# NOTE(review): the displayed output shows negative values (-1, -2, -3, -4, -9)
# in the count and rate columns; these look like missing-data codes -- confirm
# before aggregating.

# sep=r"\s+" is the documented equivalent of delim_whitespace=True, which is
# deprecated as of pandas 2.2. The two bookkeeping columns are dropped without
# inplace mutation so the load reads as a single expression.
lea_academic_success = pd.read_csv("lea_academic_success.txt", sep=r"\s+").drop(
    columns=["SURVYEAR", "FIPST"]
)

         LEAID  TOTD912  EBS912  DRP912  TOTDPL  AFGEB  AFGR  TOTOHC
0       100002       -4       0    -4.0      -2      5  -2.0      -2
1       100005       -3    1052    -3.0     193    278  69.4       9
2       100006       -1    1648    -1.0     338    469  72.1      13
3       100007       -3    3935    -3.0     900    965  93.3      12
4       100008       -3    2886    -3.0     672    674  99.7      13
...        ...      ...     ...     ...     ...    ...   ...     ...
18434  6600040       -1      -1    -1.0      -1     -1  -1.0      -1
18435  6900030       -1      -1    -1.0      -1     -1  -1.0      -1
18436  7200030       -9  145172    -9.0   25514  41984  60.8      -2
18437  7800002      144    2591     5.6     463     -2  -2.0       0
18438  7800030      136    2474     5.5     495   1269  39.0      21

[18439 rows x 8 columns]


In [35]:
# Left-join the academic-success columns onto the district table by LEAID, so
# every row of `lea` is kept. indicator=True adds a `_merge` column recording
# whether each district found a partner row.
final_df = pd.merge(
    lea,
    lea_academic_success,
    how="left",
    on="LEAID",
    indicator=True,
)

# Write the combined table to disk.
final_df.to_csv("test1.csv")

In [29]:
# Row count of the district table.
lea.shape[0]


17604

In [30]:
# Row count of the academic-success table.
lea_academic_success.shape[0]

18439