In [2]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

In [3]:
# Load the characteristics table for all US school districts.
lea = pd.read_csv("lea_characteristics.csv", sep=",")

# Load the banned-book records (one row per record, with its district) for
# the two reporting periods named in the file names.
df_school_yr_1 = pd.read_csv("PEN_July_1_2021_June_30_2022.csv", sep=",")
df_school_yr_2 = pd.read_csv("PEN_July_1_2022_December_31_2022.csv", sep=",")

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way
# to stack the two periods. ignore_index=True gives the combined frame a
# unique RangeIndex instead of repeating each file's own row labels.
books = pd.concat([df_school_yr_1, df_school_yr_2], ignore_index=True)

In [4]:
# Build a normalized `district_name` key on both frames so that district names
# from the ban records can be compared with LEA names despite differences in
# capitalization and boilerplate words. Also add an upper-cased STATE column
# to the ban records.
books['district_name'] = books['District'].str.lower()
lea['district_name'] = lea['LEA_NAME'].str.lower()
books["STATE"] = books["State"].str.upper()

# Tokens stripped from the names. They are removed in list order as plain
# substrings, so short entries such as "of" or "the" also match inside longer
# words; the same rule is applied to both frames, so the keys stay comparable.
common_words = ["city school district", "schools", "school", "district", "public", "county", 
                "independent", "cisd", "isd", " sd", " co ", "pblc", "schs", "high", "of", 
                "community", "comm", "area", "the", "department", "consolidated", "  no",
                "r-iii", "r-vii", "r-ix", "r-iv", "r-1", "r-i", 
                "0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "#", "-", "."]

for word in common_words:
    # regex=False: every entry is meant literally. Without it the result
    # depends on the pandas version's default, and "." interpreted as a regex
    # would match every character.
    books['district_name'] = books['district_name'].str.replace(word, " ", regex=False).str.strip()
    lea['district_name'] = lea['district_name'].str.replace(word, " ", regex=False).str.strip()

In [5]:
# Match every district in the ban records to an LEA name.
#
# A match requires the same state AND the same normalized district name.
# Matches are stored per (state, district) so that identically named districts
# in different states do not overwrite each other.

# First LEA name for each (state, normalized name). Built once so the loop
# below is a dictionary lookup instead of re-filtering `lea` per district.
lea_by_state_and_name = (
    lea.dropna(subset=["LEA_STATE_NAME", "district_name"])
       .drop_duplicates(subset=["LEA_STATE_NAME", "district_name"])
       .set_index(["LEA_STATE_NAME", "district_name"])["LEA_NAME"]
       .to_dict()
)

# Distinct (state, normalized name, original name) combinations in the ban records.
pen_districts = (
    books[["STATE", "district_name", "District"]]
    .dropna()
    .drop_duplicates()
)

pairs_by_state = {}  # (STATE, District) -> LEA_NAME, used for the row-level match
pairs = {}           # District -> LEA_NAME, name-keyed view (first match wins)
for state, normalized, pen_name in pen_districts.itertuples(index=False, name=None):
    lea_name = lea_by_state_and_name.get((state, normalized))
    if lea_name is None:
        continue
    pairs_by_state[(state, pen_name)] = lea_name
    pairs.setdefault(pen_name, lea_name)

# Hand-curated matches for districts whose names are not matched automatically.
outliers_pen_to_lea = {
    "Medford School District 549C": "Medford SD 549C", 
    "Virginia Beach City Public Schools": "VA BEACH CITY PBLC SCHS",
    "Harlem School District 122": "Harlem UD 122", 
    "Regional School Unit No. 56": "RSU 56", 
    "ROWVA CUSD 208": "R O W V A CUSD 208", 
    "Cherry Creek School District": "Cherry Creek School District No. 5 in the County of Arapaho", 
    "Raytown Quality Schools": "RAYTOWN C-2", 
    "Williston Basin School District #7": "WILLISTON 1", 
    "Lexington-Richland School District 5": "School District 5 of Lexington and Richland Counties", 
    "Madison County Schools": "MADISON CO SCHOOL DIST", 
}

# Name-keyed lookup; the manual entries take precedence over automatic ones.
matching_district_names = pairs | outliers_pen_to_lea

# Row-level match: manual override first, otherwise the state-aware automatic
# match; None when the district could not be matched.
books['matched_name'] = [
    outliers_pen_to_lea.get(district, pairs_by_state.get((state, district)))
    for state, district in zip(books["STATE"], books["District"])
]

In [47]:
# Start a fresh working table that keeps only the identifying columns.
# `lea` itself is deliberately left untouched: other cells still read columns
# such as LEA_STATE from it, and overwriting it here breaks a top-to-bottom run.
cols_to_keep = ["LEA_STATE_NAME", "LEAID", "LEA_NAME"]
merged = lea[cols_to_keep].copy()

# Flag a district only when its (state, name) PAIR appears in the matched ban
# records. Testing name and state with two independent isin() calls would also
# flag a same-named district in any other state that has at least one ban.
matched_books = books.dropna(subset=["matched_name"])
banned_pairs = set(zip(matched_books["STATE"], matched_books["matched_name"]))
merged["banned_books"] = np.array(
    [pair in banned_pairs for pair in zip(merged["LEA_STATE_NAME"], merged["LEA_NAME"])],
    dtype=bool,
)

# Restrict the working table to Texas; .copy() makes it an independent frame
# so later column assignments do not act on a slice.
merged = merged.loc[merged["LEA_STATE_NAME"] == "TEXAS"].copy()


Unnamed: 0,LEA_STATE_NAME,LEAID,LEA_NAME,banned_books
15027,TEXAS,4800001,CROSBYTON CISD,False
15028,TEXAS,4800002,SPUR ISD,False
15029,TEXAS,4800003,ROCKSPRINGS ISD,False
15030,TEXAS,4800004,POR VIDA ACADEMY,False
15031,TEXAS,4800005,BLACKWELL CISD,False
...,...,...,...,...
16224,TEXAS,4846680,YSLETA ISD,False
16225,TEXAS,4846710,ZAPATA COUNTY ISD,False
16226,TEXAS,4846740,ZAVALLA ISD,False
16227,TEXAS,4846770,ZEPHYR ISD,False


In [69]:
# Add total enrollment per district to the working table.
enrollment_raw = pd.read_csv("Enrollment.csv")

# Total enrollment per row = male + female counts, then summed across all
# rows (schools) that share a district id.
# NOTE(review): if the source file encodes missing counts as negative sentinel
# values they are summed as-is here — confirm against the data dictionary.
enrollment = (
    enrollment_raw
    .assign(Total_enrollment=enrollment_raw["TOT_ENR_M"] + enrollment_raw["TOT_ENR_F"])
    .groupby("LEAID")["Total_enrollment"]
    .sum()
    .reset_index()
)

# Cast LEAID to numeric on both sides or else the merge doesn't work correctly.
enrollment["LEAID"] = pd.to_numeric(enrollment["LEAID"], errors="coerce")

# Dropping a Total_enrollment column left over from a previous run keeps this
# cell idempotent: re-running it no longer yields Total_enrollment_x / _y.
merged = (
    merged
    .drop(columns=["Total_enrollment"], errors="ignore")
    .assign(LEAID=lambda frame: pd.to_numeric(frame["LEAID"], errors="coerce"))
    .merge(enrollment, on="LEAID", how="left")
)

merged

Unnamed: 0,LEA_STATE_NAME,LEAID,LEA_NAME,banned_books,Total_enrollment
0,TEXAS,4800001,CROSBYTON CISD,False,365
1,TEXAS,4800002,SPUR ISD,False,261
2,TEXAS,4800003,ROCKSPRINGS ISD,False,313
3,TEXAS,4800004,POR VIDA ACADEMY,False,273
4,TEXAS,4800005,BLACKWELL CISD,False,158
...,...,...,...,...,...
1198,TEXAS,4846680,YSLETA ISD,False,41189
1199,TEXAS,4846710,ZAPATA COUNTY ISD,False,3521
1200,TEXAS,4846740,ZAVALLA ISD,False,368
1201,TEXAS,4846770,ZEPHYR ISD,False,202


In [73]:
# Load the school support table that the teacher-to-student ratio will be
# derived from. No ratio is computed in this cell yet.
teacher_student_ratio = pd.read_csv("School_Support.csv", encoding='latin-1')

# Preview the first rows rather than rendering the whole frame.
teacher_student_ratio.head()

In [25]:
# Instead of merging, add a boolean column to `lea` marking districts that
# have at least one matched ban record.
# The (state, name) pair must match as a unit: checking name and state with two
# independent isin() calls would also flag a same-named district in any other
# state that has at least one ban.
matched_books = books.dropna(subset=["matched_name"])
banned_pairs = set(zip(matched_books["STATE"], matched_books["matched_name"]))
lea["banned_books"] = np.array(
    [pair in banned_pairs for pair in zip(lea["LEA_STATE_NAME"], lea["LEA_NAME"])],
    dtype=bool,
)

In [8]:
# Now we can merge in additional dataframes of interest.
#
# Fields of interest in the academic-success file:
#   Dropout Count (TOTD912)
#   Enrollment Base (EBS912)
#   Dropout Rate (DRP912)
#   Diploma Counts (TOTDPL)
#   AFGR Enrollment Base Counts (AFGEB)
#
# TODO: Need a better/more recent dataset

# The file is whitespace-delimited. sep=r"\s+" is the documented equivalent of
# delim_whitespace=True, which is deprecated in recent pandas releases.
# SURVYEAR and FIPST are dropped without inplace=True so the cell can be
# re-run safely.
lea_academic_success = (
    pd.read_csv("lea_academic_success.txt", sep=r"\s+")
    .drop(columns=["SURVYEAR", "FIPST"])
)

In [9]:
# Left-join the academic success table onto the district table by LEAID.
# indicator=True adds a `_merge` column recording, for each row, whether a
# match was found on the right-hand side.
final_df = pd.merge(
    lea,
    lea_academic_success,
    how='left',
    on='LEAID',
    indicator=True,
)

# Persist the joined table for inspection.
final_df.to_csv("test1.csv")

In [10]:
# Show the state abbreviation column of the district table.
lea.loc[:, "LEA_STATE"]

0        AL
1        AL
2        AL
3        AL
4        AL
         ..
17599    WY
17600    WY
17601    WY
17602    WY
17603    PR
Name: LEA_STATE, Length: 17604, dtype: object

In [11]:
# Texas districts that are flagged as having banned books.
is_texas = lea["LEA_STATE"].eq("TX")
has_ban = lea["banned_books"].eq(True)
print(lea.loc[is_texas & has_ban])

      LEA_STATE LEA_STATE_NAME    LEAID               LEA_NAME  \
15281        TX          TEXAS  4809860             BELTON ISD   
15287        TX          TEXAS  4810230          BIRDVILLE ISD   
15353        TX          TEXAS  4812780          CANUTILLO ISD   
15391        TX          TEXAS  4814280        CLEAR CREEK ISD   
15413        TX          TEXAS  4815000             CONROE ISD   
15442        TX          TEXAS  4816110  CYPRESS-FAIRBANKS ISD   
15460        TX          TEXAS  4816740             DENTON ISD   
15531        TX          TEXAS  4819650          FORT BEND ISD   
15537        TX          TEXAS  4819840     FREDERICKSBURG ISD   
15540        TX          TEXAS  4820010             FRISCO ISD   
15577        TX          TEXAS  4821390           GRANBURY ISD   
15649        TX          TEXAS  4823910             HUMBLE ISD   
15684        TX          TEXAS  4825170               KATY ISD   
15687        TX          TEXAS  4825260             KELLER ISD   
15698     

In [12]:
# All Texas districts. The statement appeared twice in this cell; one copy
# is enough to show the table.
print(lea.loc[(lea["LEA_STATE"] == "TX")])

      LEA_STATE LEA_STATE_NAME    LEAID           LEA_NAME        LEA_ADDRESS  \
15027        TX          TEXAS  4800001     CROSBYTON CISD  204 S HARRISON ST   
15028        TX          TEXAS  4800002           SPUR ISD        P O BOX 550   
15029        TX          TEXAS  4800003    ROCKSPRINGS ISD        P O BOX 157   
15030        TX          TEXAS  4800004   POR VIDA ACADEMY    1135 MISSION RD   
15031        TX          TEXAS  4800005     BLACKWELL CISD        P O BOX 505   
...         ...            ...      ...                ...                ...   
16224        TX          TEXAS  4846680         YSLETA ISD       9600 SIMS DR   
16225        TX          TEXAS  4846710  ZAPATA COUNTY ISD        P O BOX 158   
16226        TX          TEXAS  4846740        ZAVALLA ISD      431 E MAIN ST   
16227        TX          TEXAS  4846770         ZEPHYR ISD       11625 CR 281   
16228        TX          TEXAS  4899130      BENAVIDES ISD        PO DRAWER P   

          LEA_CITY LEA_ZIP 