In [1]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Get all school districts in the US 
lea = pd.read_csv("lea_characteristics.csv", sep=",")

# Get our banned books + districts
df_school_yr_1 = pd.read_csv("PEN_July_1_2021_June_30_2022.csv", sep=",")
df_school_yr_2 = pd.read_csv("PEN_July_1_2022_December_31_2022.csv", sep=",")
books = df_school_yr_1.append(df_school_yr_2)

In [3]:
# Create a new binary variable that is "book bans" and set true given our "data" df
books['district_name'] = books['District'].str.lower()
lea['district_name'] = lea['LEA_NAME'].str.lower()
books["STATE"] = books["State"].str.upper()

# Strip of common words
common_words = ["city school district", "schools", "school", "district", "public", "county", 
                "independent", "cisd", "isd", " sd", " co ", "pblc", "schs", "high", "of", 
                "community", "comm", "area", "the", "department", "consolidated", "  no",
                "r-iii", "r-vii", "r-ix", "r-iv", "r-1", "r-i", 
                "0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "#", "-", "."]

for word in common_words: 
    books['district_name'] = books['district_name'].str.replace(word, " ").str.strip()
    lea['district_name'] = lea['district_name'].str.replace(word, " ").str.strip()

In [4]:
# make sure they are unique
pairs = {}
for district in pd.unique(books["district_name"]): 
    pen_names = books[books["district_name"]==district][["STATE", "District"]]
    lea_names = lea[lea["district_name"]==district][["LEA_STATE_NAME", "LEA_NAME"]]
    
    for pen_name in pd.unique(pen_names["District"]): 
        state = pen_names[pen_names["District"] == pen_name]["STATE"].values[0]
        lea_name = lea_names[lea_names["LEA_STATE_NAME"] == state]["LEA_NAME"].values
        if len(lea_name) > 0: 
            lea_name = lea_name[0]
            pairs[pen_name] = lea_name

outliers_pen_to_lea = {
    "Medford School District 549C": "Medford SD 549C", 
    "Virginia Beach City Public Schools": "VA BEACH CITY PBLC SCHS",
    "Harlem School District 122": "Harlem UD 122", 
    "Regional School Unit No. 56": "RSU 56", 
    "ROWVA CUSD 208": "R O W V A CUSD 208", 
    "Cherry Creek School District": "Cherry Creek School District No. 5 in the County of Arapaho", 
    "Raytown Quality Schools": "RAYTOWN C-2", 
    "Williston Basin School District #7": "WILLISTON 1", 
    "Lexington-Richland School District 5": "School District 5 of Lexington and Richland Counties", 
    "Madison County Schools": "MADISON CO SCHOOL DIST", 
}

matching_district_names = pairs | outliers_pen_to_lea

books['matched_name'] = [matching_district_names.get(d, None) for d in books['District']]

In [5]:
# Only keep specific cols of a df so that we can start fresh with a LEAIID
cols_to_keep = ["LEA_STATE_NAME", "LEAID", "LEA_NAME", "LEA_ENR"]
lea = lea[cols_to_keep]
merged = lea.copy()
merged["banned_books"] = np.where((lea['LEA_NAME'].isin(books['matched_name'])) & (lea['LEA_STATE_NAME'].isin(books['STATE'])), True, False)
merged = merged.loc[merged["LEA_STATE_NAME"] == "TEXAS"]


In [247]:
### TLDR; I'm dumb and didn't realize there was an LEA_ENR col. But verified that both ways of caluclating achieves the same result!
'''
# Add enrollment to our table
enrollment = pd.read_csv("Enrollment.csv")
enrollment["Total_enrollment"] = enrollment["TOT_ENR_M"] + enrollment["TOT_ENR_F"]
cols = ["LEAID", "Total_enrollment"]
enrollment = enrollment[cols]
enrollment = enrollment.groupby('LEAID')["Total_enrollment"].sum().reset_index() # Sum enrollment across all schools in a district

# Merge with our merged df
# Cast our LEAID to a numeric or else the merge doesn't work correctly
merged['LEAID'] = pd.to_numeric(merged['LEAID'], errors='coerce')
enrollment['LEAID'] = pd.to_numeric(enrollment['LEAID'], errors='coerce')
merged = pd.merge(merged, enrollment, on='LEAID', how='left')

merged
'''

'\n# Add enrollment to our table\nenrollment = pd.read_csv("Enrollment.csv")\nenrollment["Total_enrollment"] = enrollment["TOT_ENR_M"] + enrollment["TOT_ENR_F"]\ncols = ["LEAID", "Total_enrollment"]\nenrollment = enrollment[cols]\nenrollment = enrollment.groupby(\'LEAID\')["Total_enrollment"].sum().reset_index() # Sum enrollment across all schools in a district\n\n# Merge with our merged df\n# Cast our LEAID to a numeric or else the merge doesn\'t work correctly\nmerged[\'LEAID\'] = pd.to_numeric(merged[\'LEAID\'], errors=\'coerce\')\nenrollment[\'LEAID\'] = pd.to_numeric(enrollment[\'LEAID\'], errors=\'coerce\')\nmerged = pd.merge(merged, enrollment, on=\'LEAID\', how=\'left\')\n\nmerged\n'

In [6]:
# Calculate teacher to student ratio
teacher_student_ratio = pd.read_csv("School_Support.csv", encoding='latin-1')
cols = ["LEAID", "SCH_FTETEACH_TOT"]

teacher_student_ratio = teacher_student_ratio[cols]
teacher_student_ratio = teacher_student_ratio.groupby('LEAID')["SCH_FTETEACH_TOT"].sum().reset_index() # Sum enrollment across all schools in a district
# Cast our LEAID to numeric
merged['LEAID'] = pd.to_numeric(merged['LEAID'], errors='coerce')
teacher_student_ratio['LEAID'] = pd.to_numeric(teacher_student_ratio['LEAID'], errors='coerce')
merged = pd.merge(merged, teacher_student_ratio, on='LEAID', how='left')

merged["SCH_FTETEACH_TOT"] = merged["SCH_FTETEACH_TOT"] / merged["LEA_ENR"]

In [8]:
# Retention table looks quite... sparse
retention = pd.read_csv("Retention.csv", encoding='latin-1')
retention = retention.loc[retention["LEA_STATE"] == "TX"]
retention

Unnamed: 0,LEA_STATE,LEA_STATE_NAME,LEAID,LEA_NAME,SCHID,SCH_NAME,COMBOKEY,JJ,SCH_RET_KG_IND,SCH_RET_G01_IND,...,SCH_RET_G12_TR_M,SCH_RET_G12_TR_F,TOT_RET_G12_M,TOT_RET_G12_F,SCH_RET_G12_LEP_M,SCH_RET_G12_LEP_F,SCH_RET_G12_IDEA_M,SCH_RET_G12_IDEA_F,SCH_RET_G12_504_M,SCH_RET_G12_504_F
78831,TX,TEXAS,4800001,CROSBYTON CISD,1145,CROSBYTON EL,480000101145,No,Yes,Yes,...,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9
78832,TX,TEXAS,4800001,CROSBYTON CISD,1146,CROSBYTON SECONDARY,480000101146,No,-9,-9,...,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9
78833,TX,TEXAS,4800001,CROSBYTON CISD,3621,SP ED CO-OP,480000103621,No,-9,-9,...,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9
78834,TX,TEXAS,4800002,SPUR ISD,4732,SPUR SCHOOL,480000204732,No,No,No,...,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9
78835,TX,TEXAS,4800003,ROCKSPRINGS ISD,4219,ROCKSPRINGS K-12,480000304219,No,No,Yes,...,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87584,TX,TEXAS,4846740,ZAVALLA ISD,5353,ZAVALLA EL,484674005353,No,Yes,Yes,...,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9
87585,TX,TEXAS,4846740,ZAVALLA ISD,5354,ZAVALLA H S,484674005354,No,-9,-9,...,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9
87586,TX,TEXAS,4846770,ZEPHYR ISD,5356,ZEPHYR SCHOOL,484677005356,No,Yes,No,...,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9
87587,TX,TEXAS,4899130,BENAVIDES ISD,5627,BENAVIDES SECONDARY,489913005627,No,-9,-9,...,0,0,1,0,0,0,0,0,0,0


In [9]:
# Merge in county information
county_info = pd.read_excel("county_info.xlsx")

In [10]:
# Add in county information for all LEAIDs
cols = ["LEAID", "NMCNTY"]
county_info = county_info[cols]
county_info['LEAID'] = pd.to_numeric(county_info['LEAID'], errors='coerce')

# Get the most common county for each school district
county_info = county_info.groupby('LEAID')['NMCNTY'].agg(lambda x: x.mode().iloc[0])

merged = pd.merge(merged, county_info, on='LEAID', how='left')

In [11]:
merged = merged.dropna() # only 9 instances of NA

# Calculate the Political Leaning per county
tpi = pd.read_csv("TPI.csv")
pattern1 = r'(?P<Prefix1>[A-Za-z]+)-(?P<Percentage1>\d+)%' # REGEX for the pattern
pattern2 = r'(?P<Prefix2>[A-Za-z]+)-(?P<Percentage2>\d+)%' # REGEX for the pattern

df_extracted1 = tpi['Current TPI (20-22)'].str.extract(pattern1)
df_extracted2 = tpi['Previous TPI (18-20)'].str.extract(pattern2)

# Concatenate the extracted columns with the original DataFrame
df_result = pd.concat([tpi, df_extracted1, df_extracted2], axis=1)
df_result["Prefix1"] = np.where(df_result["Prefix1"] == "R", 1, -1).astype(float)
df_result["Prefix2"] = np.where(df_result["Prefix2"] == "R", 1, -1).astype(float)

# Case our columns to perform our calculation
df_result["Percentage1"] = df_result["Percentage1"].astype(float)
df_result["Percentage2"] = df_result["Percentage2"].astype(float)

df_result["overall_TPI"] = ((df_result["Prefix1"] * df_result["Percentage1"] * 0.01) + (df_result["Prefix2"] * df_result["Percentage2"] * 0.01)) / 2

In [12]:
merged.sort_values(by="NMCNTY")

Unnamed: 0,LEA_STATE_NAME,LEAID,LEA_NAME,LEA_ENR,banned_books,SCH_FTETEACH_TOT,NMCNTY
334,TEXAS,4813200,CAYUGA ISD,576,False,0.085938,Anderson County
1039,TEXAS,4840500,SLOCUM ISD,423,False,0.131560,Anderson County
877,TEXAS,4834050,PALESTINE ISD,3419,False,0.075168,Anderson County
835,TEXAS,4832250,NECHES ISD,360,False,0.093889,Anderson County
1113,TEXAS,4843290,WESTWOOD ISD,1564,False,0.095358,Anderson County
...,...,...,...,...,...,...,...
846,TEXAS,4832700,NEWCASTLE ISD,231,False,0.082251,Young County
549,TEXAS,4821360,GRAHAM ISD,2373,False,0.069954,Young County
1198,TEXAS,4846710,ZAPATA COUNTY ISD,3521,False,0.062482,Zapata County
688,TEXAS,4826250,LA PRYOR ISD,468,False,0.091880,Zavala County


In [13]:
# Merge our books df with our county leaning df
merged["NMCNTY"] = merged["NMCNTY"].str.lower()
merged["NMCNTY"] = merged["NMCNTY"].str.replace('county', '', case=False).str.strip()
df_result["County Name"] = df_result["County Name"].str.lower().str.strip()
df_result = df_result[["County Name", "overall_TPI"]]
df_result["NMCNTY"] = df_result["County Name"].astype(str)
merged["NMCNTY"] = merged["NMCNTY"].astype(str)

merged = merged.merge(df_result, on='NMCNTY', how='left')

In [14]:
merged.dropna()

Unnamed: 0,LEA_STATE_NAME,LEAID,LEA_NAME,LEA_ENR,banned_books,SCH_FTETEACH_TOT,NMCNTY,County Name,overall_TPI
0,TEXAS,4800001,CROSBYTON CISD,365,False,0.101370,crosby,crosby,0.740
1,TEXAS,4800002,SPUR ISD,261,False,0.107280,dickens,dickens,0.865
2,TEXAS,4800003,ROCKSPRINGS ISD,313,False,0.076677,edwards,edwards,0.850
3,TEXAS,4800004,POR VIDA ACADEMY,273,False,0.091575,bexar,bexar,-0.570
4,TEXAS,4800005,BLACKWELL CISD,155,False,0.129032,mitchell,mitchell,0.855
...,...,...,...,...,...,...,...,...,...
1188,TEXAS,4846680,YSLETA ISD,41198,False,0.066469,el paso,el paso,-0.685
1189,TEXAS,4846710,ZAPATA COUNTY ISD,3521,False,0.062482,zapata,zapata,-0.585
1190,TEXAS,4846740,ZAVALLA ISD,368,False,0.092391,angelina,angelina,0.750
1191,TEXAS,4846770,ZEPHYR ISD,202,False,0.103960,brown,brown,0.875


In [301]:
merged.to_csv("Final_DF.csv")

In [179]:
'''
Advanced_Placement.csv: 
    SCH_APCOURSES ??
    TOT_APENR_M + TOT_APENR_F
    
Gifted_and_Talented.csv:
    SCH_GT_IND
    TOT_GTENR_M + TOT_GTENR_F
    
SAT_and_ACT.csv: 
    TOT_SATACT_M + TOT_SATACT_F
'''

sat = pd.read_csv("SAT_and_ACT.csv", encoding='latin-1')

sat_cleaned = sat[['LEA_STATE_NAME','LEAID', 'LEA_NAME', 'SCH_NAME', 'TOT_SATACT_M', 'TOT_SATACT_F']]
sat_cleaned['LEAID'] = pd.to_numeric(sat_cleaned['LEAID'], errors='coerce')

sat_cleaned = sat_cleaned.replace({-9: None})
sat_cleaned = sat_cleaned.dropna(subset=['TOT_SATACT_M', 'TOT_SATACT_F'])

sat_cleaned = sat_cleaned.groupby(['LEA_STATE_NAME', 'LEAID', 'LEA_NAME'], as_index=False).agg('sum')

In [180]:
texas_sat = sat_cleaned.loc[sat_cleaned['LEA_STATE_NAME'] == 'TEXAS']
final_sat = merged.merge(texas_sat, on=['LEA_NAME', 'LEAID', 'LEA_STATE_NAME',], how='left')
final_sat['SATACT_P'] = (final_sat['TOT_SATACT_M'] + final_sat['TOT_SATACT_F']) / final_sat['LEA_ENR'] * 100

In [181]:
ap = pd.read_csv("Advanced_Placement.csv", encoding='latin-1')

In [182]:
ap_cleaned = ap[['LEA_STATE_NAME', 'LEAID', 'TOT_APENR_M', 'TOT_APENR_F']]
ap_cleaned['LEAID'] = pd.to_numeric(ap_cleaned['LEAID'], errors='coerce')
ap_cleaned = ap_cleaned.replace({-9: None})
ap_cleaned = ap_cleaned.dropna(subset=['TOT_APENR_M', 'TOT_APENR_F'])
ap_cleaned = ap_cleaned.groupby(['LEA_STATE_NAME', 'LEAID'], as_index=False).agg('sum')
ap_texas = ap_cleaned.loc[ap_cleaned['LEA_STATE_NAME'] == 'TEXAS']

In [183]:
final_sat_ap = final_sat.merge(ap_texas, on=['LEAID', 'LEA_STATE_NAME'], how='left')
final_sat_ap['APENR_P'] = (final_sat_ap['TOT_APENR_M'] + final_sat_ap['TOT_APENR_F']) / final_sat['LEA_ENR'] * 100

In [184]:
## Disciplinary Action 
'''
Corporal_Punishment.csv
    SCH_CORPINSTANCES_IND
    
Expulsions.csv
    TOT_DISCWDIS_EXPWOE_IDEA_F + TOT_DISCWDIS_EXPWOE_IDEA_M + TOT_DISCWDIS_EXPWE_IDEA_F 
    + TOT_DISCWDIS_EXPWE_IDEA_M + TOT_DISCWODIS_EXPWOE_F + TOT_DISCWODIS_EXPWOE_M + 
    TOT_DISCWODIS_EXPWE_F + TOT_DISCWODIS_EXPWE_M
    
Harassment_and_Bullying.csv
    Disability: TOT_HBDISCIPLINED_DIS_M + TOT_HBDISCIPLINED_DIS_F
    Ethnicity: TOT_HBDISCIPLINED_RAC_M + TOT_HBDISCIPLINED_RAC_F
    Sex: TOT_HBDISCIPLINED_SEX_M + TOT_HBDISCIPLINED_SEX_F
'''

expulsions = pd.read_csv("Expulsions.csv", encoding='latin-1')
hb = pd.read_csv("Harassment_and_Bullying.csv", encoding='latin-1')
cp = pd.read_csv("Corporal_Punishment.csv", encoding='latin-1')

In [185]:
clean_e = expulsions.replace({-9: None})
clean_e = clean_e.dropna(subset=['TOT_DISCWDIS_EXPWOE_IDEA_F', 
                                 'TOT_DISCWDIS_EXPWOE_IDEA_M', 
                                 'TOT_DISCWDIS_EXPWE_IDEA_F', 
                                 'TOT_DISCWDIS_EXPWE_IDEA_M', 
                                 'TOT_DISCWODIS_EXPWOE_F', 
                                 'TOT_DISCWODIS_EXPWOE_M', 
                                 'TOT_DISCWODIS_EXPWE_F', 
                                'TOT_DISCWODIS_EXPWE_M'])
clean_e['TOT_EXP'] = (clean_e['TOT_DISCWDIS_EXPWOE_IDEA_F'] + 
                      clean_e['TOT_DISCWDIS_EXPWOE_IDEA_M'] + 
                      clean_e['TOT_DISCWDIS_EXPWE_IDEA_F'] + 
                      clean_e['TOT_DISCWDIS_EXPWE_IDEA_M'] + 
                      clean_e['TOT_DISCWODIS_EXPWOE_F'] + 
                      clean_e['TOT_DISCWODIS_EXPWOE_M'] + 
                      clean_e['TOT_DISCWODIS_EXPWE_F'] + 
                      clean_e['TOT_DISCWODIS_EXPWE_M'])
clean_e = clean_e[['LEA_STATE_NAME', 'LEAID', 'TOT_EXP']] 
clean_e['LEAID'] = pd.to_numeric(clean_e['LEAID'], errors='coerce')
clean_e = clean_e.groupby(['LEA_STATE_NAME', 'LEAID'], as_index=False).agg('sum')
expulsions_texas = clean_e.loc[clean_e['LEA_STATE_NAME'] == 'TEXAS']

In [202]:
final_sat_ap

Unnamed: 0,LEA_STATE_NAME,LEAID,LEA_NAME,LEA_ENR,banned_books,SCH_FTETEACH_TOT,NMCNTY,County Name,overall_TPI,SCH_NAME,TOT_SATACT_M,TOT_SATACT_F,SATACT_P,TOT_APENR_M,TOT_APENR_F,APENR_P
0,TEXAS,4800001,CROSBYTON CISD,365,False,0.101370,crosby,crosby,0.740,CROSBYTON SECONDARY,11.0,14.0,6.849315,,,
1,TEXAS,4800002,SPUR ISD,261,False,0.107280,dickens,dickens,0.865,SPUR SCHOOL,16.0,18.0,13.026820,,,
2,TEXAS,4800003,ROCKSPRINGS ISD,313,False,0.076677,edwards,edwards,0.850,ROCKSPRINGS K-12,7.0,9.0,5.111821,,,
3,TEXAS,4800004,POR VIDA ACADEMY,273,False,0.091575,bexar,bexar,-0.570,POR VIDA ACADEMY CHARTER H SCORPUS CHRISTI COL...,0.0,0.0,0.000000,,,
4,TEXAS,4800005,BLACKWELL CISD,155,False,0.129032,mitchell,mitchell,0.855,BLACKWELL SCHOOL,0.0,0.0,0.000000,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1188,TEXAS,4846680,YSLETA ISD,41198,False,0.066469,el paso,el paso,-0.685,CESAR CHAVEZ ACADEMYBEL AIR H SEASTWOOD H SEAS...,301.0,492.0,1.924851,1263.0,1669.0,7.116850
1189,TEXAS,4846710,ZAPATA COUNTY ISD,3521,False,0.062482,zapata,zapata,-0.585,ZAPATA H S,58.0,31.0,2.527691,138.0,178.0,8.974723
1190,TEXAS,4846740,ZAVALLA ISD,368,False,0.092391,angelina,angelina,0.750,ZAVALLA H S,0.0,0.0,0.000000,,,
1191,TEXAS,4846770,ZEPHYR ISD,202,False,0.103960,brown,brown,0.875,ZEPHYR SCHOOL,6.0,8.0,6.930693,,,


In [203]:
final_sat_ap_exp = final_sat_ap.merge(expulsions_texas, on=['LEAID', 'LEA_STATE_NAME'], how='left')
final_sat_ap_exp['P_EXPULSIONS'] = final_sat_ap_exp['TOT_EXP'] / final_sat_ap_exp['LEA_ENR'] * 100
final_sat_ap_exp = final_sat_ap_exp[['LEA_STATE_NAME', 'LEAID', 'LEA_NAME', 'County Name', 'LEA_ENR', 'SCH_FTETEACH_TOT', 'overall_TPI', 'SATACT_P', 'APENR_P', 'P_EXPULSIONS', 'banned_books']]

In [204]:
hb_clean = hb.replace({-9: None})
hb_clean = hb_clean.dropna(subset=['TOT_HBDISCIPLINED_DIS_M', 
                        'TOT_HBDISCIPLINED_DIS_F', 
                       'TOT_HBDISCIPLINED_RAC_M', 
                       'TOT_HBDISCIPLINED_RAC_F',
                       'TOT_HBDISCIPLINED_SEX_M', 
                       'TOT_HBDISCIPLINED_SEX_F'])
hb_clean['TOT_DIS_HB'] = hb_clean['TOT_HBDISCIPLINED_DIS_M'] + hb_clean['TOT_HBDISCIPLINED_DIS_F']
hb_clean['TOT_RAC_HB'] = hb_clean['TOT_HBDISCIPLINED_RAC_M'] + hb_clean['TOT_HBDISCIPLINED_RAC_F']
hb_clean['TOT_SEX_HB'] = hb_clean['TOT_HBDISCIPLINED_SEX_M'] + hb_clean['TOT_HBDISCIPLINED_SEX_F']
hb_clean = hb_clean[['LEA_STATE_NAME', 'LEAID', 'TOT_DIS_HB', 'TOT_RAC_HB', 'TOT_SEX_HB']]
hb_clean['LEAID'] = pd.to_numeric(hb_clean['LEAID'], errors='coerce')
hb_clean = hb_clean.groupby(['LEA_STATE_NAME', 'LEAID'], as_index=False).agg('sum')
hb_texas = hb_clean.loc[hb_clean['LEA_STATE_NAME'] == 'TEXAS']

In [205]:
final_sat_ap_exp_hb = final_sat_ap_exp.merge(hb_texas, on=['LEAID', 'LEA_STATE_NAME'], how='left')
final_sat_ap_exp_hb

Unnamed: 0,LEA_STATE_NAME,LEAID,LEA_NAME,County Name,LEA_ENR,SCH_FTETEACH_TOT,overall_TPI,SATACT_P,APENR_P,P_EXPULSIONS,banned_books,TOT_DIS_HB,TOT_RAC_HB,TOT_SEX_HB
0,TEXAS,4800001,CROSBYTON CISD,crosby,365,0.101370,0.740,6.849315,,0.000000,False,0,0,0
1,TEXAS,4800002,SPUR ISD,dickens,261,0.107280,0.865,13.026820,,0.000000,False,0,0,0
2,TEXAS,4800003,ROCKSPRINGS ISD,edwards,313,0.076677,0.850,5.111821,,0.000000,False,0,0,0
3,TEXAS,4800004,POR VIDA ACADEMY,bexar,273,0.091575,-0.570,0.000000,,0.000000,False,0,0,0
4,TEXAS,4800005,BLACKWELL CISD,mitchell,155,0.129032,0.855,0.000000,,0.000000,False,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1188,TEXAS,4846680,YSLETA ISD,el paso,41198,0.066469,-0.685,1.924851,7.116850,0.014564,False,0,0,0
1189,TEXAS,4846710,ZAPATA COUNTY ISD,zapata,3521,0.062482,-0.585,2.527691,8.974723,0.000000,False,0,0,0
1190,TEXAS,4846740,ZAVALLA ISD,angelina,368,0.092391,0.750,0.000000,,0.000000,False,0,0,0
1191,TEXAS,4846770,ZEPHYR ISD,brown,202,0.103960,0.875,6.930693,,0.000000,False,0,0,0


In [201]:
final_sat_ap_exp_hb.to_csv("final_table.csv")

In [127]:
# Instead of merging, create a binary variable where a column exists
lea["banned_books"] = np.where((lea['LEA_NAME'].isin(books['matched_name'])) & (lea['LEA_STATE_NAME'].isin(books['STATE'])), True, False)

# print(lea.loc[lea["banned_books"] == True])

Unnamed: 0,LEA_STATE_NAME,LEAID,LEA_NAME,LEA_ENR,banned_books
0,ALABAMA,100002,Alabama Youth Services,280,False
1,ALABAMA,100005,Albertville City,5445,False
2,ALABAMA,100006,Marshall County,5698,False
3,ALABAMA,100007,Hoover City,14372,False
4,ALABAMA,100008,Madison City,10810,False
...,...,...,...,...,...
17599,WYOMING,5680251,Wyoming Department of Family Services,162,False
17600,WYOMING,5680252,Youth Emergency Services Inc. - Administration...,34,False
17601,WYOMING,5680254,Saint Stephen's Indian School Admin Office,243,False
17602,WYOMING,5680255,Rite of Passage,31,False


In [16]:
# Now we can merge in additional dataframes of interest

'''
Dropout Count (TOTD912)
Enrollment Base (EBS912)
Dropout Rate (DRP912)
Diploma Counts (TOTDPL)
AFGR Enrollment Base Counts (AFGEB)
'''

# Need a better/more recent dataset

lea_academic_success = pd.read_csv("lea_academic_success.txt", delim_whitespace=True)
lea_academic_success.drop(columns=["SURVYEAR", "FIPST"], inplace=True)

In [256]:
# Merge our new academic success DF with our other data on LEAID
final_df = lea.merge(lea_academic_success, on='LEAID', how='left', indicator=True)

final_df.to_csv("test1.csv")

In [19]:
# final_df = lea.merge(lea_academic_success, on='LEAID', how='left', indicator=True)
lea_academic_success

Unnamed: 0,LEAID,TOTD912,EBS912,DRP912,TOTDPL,AFGEB,AFGR,TOTOHC
0,100002,-4,0,-4.0,-2,5,-2.0,-2
1,100005,-3,1052,-3.0,193,278,69.4,9
2,100006,-1,1648,-1.0,338,469,72.1,13
3,100007,-3,3935,-3.0,900,965,93.3,12
4,100008,-3,2886,-3.0,672,674,99.7,13
...,...,...,...,...,...,...,...,...
18434,6600040,-1,-1,-1.0,-1,-1,-1.0,-1
18435,6900030,-1,-1,-1.0,-1,-1,-1.0,-1
18436,7200030,-9,145172,-9.0,25514,41984,60.8,-2
18437,7800002,144,2591,5.6,463,-2,-2.0,0


In [None]:
print(lea.loc[(lea["LEA_STATE"] == "TX") & (lea["banned_books"] == True)])

KeyError: 'LEA_STATE'

In [None]:
print(lea.loc[(lea["LEA_STATE"] == "TX")])

print(lea.loc[(lea["LEA_STATE"] == "TX")])

      LEA_STATE LEA_STATE_NAME    LEAID           LEA_NAME        LEA_ADDRESS  \
15027        TX          TEXAS  4800001     CROSBYTON CISD  204 S HARRISON ST   
15028        TX          TEXAS  4800002           SPUR ISD        P O BOX 550   
15029        TX          TEXAS  4800003    ROCKSPRINGS ISD        P O BOX 157   
15030        TX          TEXAS  4800004   POR VIDA ACADEMY    1135 MISSION RD   
15031        TX          TEXAS  4800005     BLACKWELL CISD        P O BOX 505   
...         ...            ...      ...                ...                ...   
16224        TX          TEXAS  4846680         YSLETA ISD       9600 SIMS DR   
16225        TX          TEXAS  4846710  ZAPATA COUNTY ISD        P O BOX 158   
16226        TX          TEXAS  4846740        ZAVALLA ISD      431 E MAIN ST   
16227        TX          TEXAS  4846770         ZEPHYR ISD       11625 CR 281   
16228        TX          TEXAS  4899130      BENAVIDES ISD        PO DRAWER P   

          LEA_CITY LEA_ZIP 