In [32]:
import numpy as np
import pandas as pd
import warnings
from fuzzywuzzy import process

warnings.filterwarnings("ignore")

In [33]:
# Get all school districts in the US
lea = pd.read_csv("lea_characteristics.csv", sep=",")

# Get our banned books + districts. The records come in two files whose names
# cover July 2021-June 2022 and July-December 2022.
df_school_yr_1 = pd.read_csv("PEN_July_1_2021_June_30_2022.csv", sep=",")
df_school_yr_2 = pd.read_csv("PEN_July_1_2022_December_31_2022.csv", sep=",")
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way to
# stack frames. As with append, each file keeps its own row labels, so labels
# repeat across the two halves.
books = pd.concat([df_school_yr_1, df_school_yr_2])

In [34]:
# Preview the stacked ban records from both PEN files.
books

Unnamed: 0,Author,Title,Type of Ban,Secondary Author(s),Illustrator(s),Translator(s),State,District,Date of Challenge/Removal,Origin of Challenge,Series Name,Ban Status
0,"Àbíké-Íyímídé, Faridah",Ace of Spades,Banned in Libraries and Classrooms,,,,Florida,Indian River County School District,Nov-21,Administrator,,
1,"Acevedo, Elizabeth",Clap When You Land,Banned in Classrooms,,,,Pennsylvania,Central York School District,Aug-21,Administrator,,
2,"Acevedo, Elizabeth",The Poet X,Banned in Libraries,,,,Florida,Indian River County School District,Nov-21,Administrator,,
3,"Acevedo, Elizabeth",The Poet X,Banned in Libraries and Classrooms,,,,New York,Marlboro Central School District,Feb-22,Administrator,,
4,"Acevedo, Elizabeth",The Poet X,Banned Pending Investigation,,,,Texas,Fredericksburg Independent School District,Mar-22,Administrator,,
...,...,...,...,...,...,...,...,...,...,...,...,...
1472,"Lyga, Barry",Boy Toy,,,,,Virginia,Spotsylvania County Public Schools,Oct-22,Formal Challenge,,Banned in Libraries and Classrooms
1473,"Arnold, Elana K.",Red Hood,,,,,Virginia,Spotsylvania County Public Schools,Oct-22,Formal Challenge,,Banned in Libraries and Classrooms
1474,"McQuiston, Casey","Red, White, and Royal Blue",,,,,Virginia,Spotsylvania County Public Schools,Oct-22,Formal Challenge,,Banned in Libraries and Classrooms
1475,"Kobabe, Maia",Gender Queer: A Memoir,,,,,Wyoming,Natrona County Schools,Oct-22,Formal Challenge,,Banned Pending Investigation


In [35]:
# Count the distinct districts that appear in the ban records for each state.
district_col_by_state = books.groupby("State")["District"]
gb = district_col_by_state.nunique()

# Show the last 20 states of the per-state counts.
gb.tail(20)

State
Missouri          16
New Jersey         4
New York           6
North Carolina     6
North Dakota       1
Ohio               1
Oklahoma           4
Oregon             1
Pennsylvania      12
Rhode Island       1
South Carolina     6
South Dakota       1
Tennessee          9
Texas             28
Utah               4
Vermont            1
Virginia          10
Washington         3
Wisconsin          6
Wyoming            1
Name: District, dtype: int64

In [36]:
# Spot check: recount the distinct districts for a single state (New York).
test = books[books["State"].eq("New York")]
test["District"].unique().size

6

In [37]:
# Build normalised name keys for matching PEN districts to LEA records:
# lower-case both name columns, then blank out generic tokens so only the
# distinguishing part of each name remains. STATE is the upper-cased PEN state.
books['district_name'] = books['District'].str.lower()
lea['district_name'] = lea['LEA_NAME'].str.lower()
books["STATE"] = books["State"].str.upper()

# Strip of common words. Entries are removed one after another as plain
# substrings, so order matters (e.g. "schools" is handled before "school").
common_words = ["city school district", "schools", "school", "district", "public", "county", 
                "independent", "cisd", "isd", " sd", " co ", "pblc", "schs", "high", "of", 
                "community", "comm", "area", "the", "department", "consolidated", "  no",
                "r-iii", "r-vii", "r-ix", "r-iv", "r-1", "r-i", 
                "0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "#", "-", "."]

for word in common_words: 
    # regex=False: every entry is meant literally. Left implicit, the outcome
    # depends on the installed pandas (the default changed in 2.0), and "."
    # read as a regular expression would match every character.
    books['district_name'] = books['district_name'].str.replace(word, " ", regex=False).str.strip()
    lea['district_name'] = lea['district_name'].str.replace(word, " ", regex=False).str.strip()

In [38]:
# Pair every PEN district with the LEA record that shares its normalised name
# and its state. Only the first LEA candidate is used when several qualify.
pairs = {}
for norm_name in books["district_name"].unique():
    pen_rows = books.loc[books["district_name"] == norm_name, ["STATE", "District"]]
    lea_rows = lea.loc[lea["district_name"] == norm_name, ["LEA_STATE_NAME", "LEA_NAME"]]

    for pen_name in pen_rows["District"].unique():
        # State of the first PEN row carrying this district name.
        state = pen_rows.loc[pen_rows["District"] == pen_name, "STATE"].values[0]
        candidates = lea_rows.loc[lea_rows["LEA_STATE_NAME"] == state, "LEA_NAME"].values
        if len(candidates) == 0:
            continue
        pairs[pen_name] = candidates[0]

# Hand-written pairs for districts the normalised-name lookup cannot connect.
outliers_pen_to_lea = {
    "Medford School District 549C": "Medford SD 549C", 
    "Virginia Beach City Public Schools": "VA BEACH CITY PBLC SCHS",
    "Harlem School District 122": "Harlem UD 122", 
    "Regional School Unit No. 56": "RSU 56", 
    "ROWVA CUSD 208": "R O W V A CUSD 208", 
    "Cherry Creek School District": "Cherry Creek School District No. 5 in the County of Arapaho", 
    "Raytown Quality Schools": "RAYTOWN C-2", 
    "Williston Basin School District #7": "WILLISTON 1", 
    "Lexington-Richland School District 5": "School District 5 of Lexington and Richland Counties", 
    "Madison County Schools": "MADISON CO SCHOOL DIST", 
}

# Manual entries take precedence over automatic ones when both exist.
matching_district_names = {**pairs, **outliers_pen_to_lea}

# Districts with no pairing at all receive None.
books['matched_name'] = [matching_district_names.get(pen_district) for pen_district in books['District']]

In [39]:
# Preview the ban records with the district_name, STATE and matched_name columns added.
books

Unnamed: 0,Author,Title,Type of Ban,Secondary Author(s),Illustrator(s),Translator(s),State,District,Date of Challenge/Removal,Origin of Challenge,Series Name,Ban Status,district_name,STATE,matched_name
0,"Àbíké-Íyímídé, Faridah",Ace of Spades,Banned in Libraries and Classrooms,,,,Florida,Indian River County School District,Nov-21,Administrator,,,indian river,FLORIDA,INDIAN RIVER
1,"Acevedo, Elizabeth",Clap When You Land,Banned in Classrooms,,,,Pennsylvania,Central York School District,Aug-21,Administrator,,,central york,PENNSYLVANIA,Central York SD
2,"Acevedo, Elizabeth",The Poet X,Banned in Libraries,,,,Florida,Indian River County School District,Nov-21,Administrator,,,indian river,FLORIDA,INDIAN RIVER
3,"Acevedo, Elizabeth",The Poet X,Banned in Libraries and Classrooms,,,,New York,Marlboro Central School District,Feb-22,Administrator,,,marlboro central,NEW YORK,MARLBORO CENTRAL SCHOOL DISTRICT
4,"Acevedo, Elizabeth",The Poet X,Banned Pending Investigation,,,,Texas,Fredericksburg Independent School District,Mar-22,Administrator,,,fredericksburg,TEXAS,FREDERICKSBURG ISD
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1472,"Lyga, Barry",Boy Toy,,,,,Virginia,Spotsylvania County Public Schools,Oct-22,Formal Challenge,,Banned in Libraries and Classrooms,spotsylvania,VIRGINIA,SPOTSYLVANIA CO PBLC SCHS
1473,"Arnold, Elana K.",Red Hood,,,,,Virginia,Spotsylvania County Public Schools,Oct-22,Formal Challenge,,Banned in Libraries and Classrooms,spotsylvania,VIRGINIA,SPOTSYLVANIA CO PBLC SCHS
1474,"McQuiston, Casey","Red, White, and Royal Blue",,,,,Virginia,Spotsylvania County Public Schools,Oct-22,Formal Challenge,,Banned in Libraries and Classrooms,spotsylvania,VIRGINIA,SPOTSYLVANIA CO PBLC SCHS
1475,"Kobabe, Maia",Gender Queer: A Memoir,,,,,Wyoming,Natrona County Schools,Oct-22,Formal Challenge,,Banned Pending Investigation,natrona,WYOMING,Natrona County School District #1


In [40]:
# Same preview as the previous cell (this cell repeats it).
books

Unnamed: 0,Author,Title,Type of Ban,Secondary Author(s),Illustrator(s),Translator(s),State,District,Date of Challenge/Removal,Origin of Challenge,Series Name,Ban Status,district_name,STATE,matched_name
0,"Àbíké-Íyímídé, Faridah",Ace of Spades,Banned in Libraries and Classrooms,,,,Florida,Indian River County School District,Nov-21,Administrator,,,indian river,FLORIDA,INDIAN RIVER
1,"Acevedo, Elizabeth",Clap When You Land,Banned in Classrooms,,,,Pennsylvania,Central York School District,Aug-21,Administrator,,,central york,PENNSYLVANIA,Central York SD
2,"Acevedo, Elizabeth",The Poet X,Banned in Libraries,,,,Florida,Indian River County School District,Nov-21,Administrator,,,indian river,FLORIDA,INDIAN RIVER
3,"Acevedo, Elizabeth",The Poet X,Banned in Libraries and Classrooms,,,,New York,Marlboro Central School District,Feb-22,Administrator,,,marlboro central,NEW YORK,MARLBORO CENTRAL SCHOOL DISTRICT
4,"Acevedo, Elizabeth",The Poet X,Banned Pending Investigation,,,,Texas,Fredericksburg Independent School District,Mar-22,Administrator,,,fredericksburg,TEXAS,FREDERICKSBURG ISD
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1472,"Lyga, Barry",Boy Toy,,,,,Virginia,Spotsylvania County Public Schools,Oct-22,Formal Challenge,,Banned in Libraries and Classrooms,spotsylvania,VIRGINIA,SPOTSYLVANIA CO PBLC SCHS
1473,"Arnold, Elana K.",Red Hood,,,,,Virginia,Spotsylvania County Public Schools,Oct-22,Formal Challenge,,Banned in Libraries and Classrooms,spotsylvania,VIRGINIA,SPOTSYLVANIA CO PBLC SCHS
1474,"McQuiston, Casey","Red, White, and Royal Blue",,,,,Virginia,Spotsylvania County Public Schools,Oct-22,Formal Challenge,,Banned in Libraries and Classrooms,spotsylvania,VIRGINIA,SPOTSYLVANIA CO PBLC SCHS
1475,"Kobabe, Maia",Gender Queer: A Memoir,,,,,Wyoming,Natrona County Schools,Oct-22,Formal Challenge,,Banned Pending Investigation,natrona,WYOMING,Natrona County School District #1


In [47]:
# Only keep specific cols of a df so that we can start fresh with a LEAID
cols_to_keep = ["LEA_STATE_NAME", "LEAID", "LEA_NAME", "LEA_ENR"]
lea = lea[cols_to_keep]
merged = lea.copy()

# Count ban records per (state, LEA name) pair. State and name have to be
# matched together: testing them with two independent isin() calls, or mapping
# counts by name alone, lets a same-named LEA in a different state pick up
# another district's bans.
ban_counts = (
    books.dropna(subset=["STATE", "matched_name"])
    .groupby(["STATE", "matched_name"])
    .size()
    .to_dict()
)
lea_keys = zip(merged["LEA_STATE_NAME"], merged["LEA_NAME"])
number_bans = pd.Series(
    [ban_counts.get(key, np.nan) for key in lea_keys],
    index=merged.index,
    dtype="float64",
)

# A district is flagged exactly when at least one ban record maps to it.
merged["banned_books"] = number_bans.notna()
merged["number_bans"] = number_bans

# Restrict the working table to Texas and show the districts with bans.
merged = merged.loc[merged["LEA_STATE_NAME"] == "TEXAS"]
merged.loc[merged["number_bans"] > 0]


Unnamed: 0,LEA_STATE_NAME,LEAID,LEA_NAME,LEA_ENR,banned_books,number_bans
15281,TEXAS,4809860,BELTON ISD,11526,True,10.0
15287,TEXAS,4810230,BIRDVILLE ISD,23665,True,14.0
15353,TEXAS,4812780,CANUTILLO ISD,6201,True,1.0
15391,TEXAS,4814280,CLEAR CREEK ISD,41874,True,7.0
15413,TEXAS,4815000,CONROE ISD,61544,True,10.0
15442,TEXAS,4816110,CYPRESS-FAIRBANKS ISD,116359,True,16.0
15460,TEXAS,4816740,DENTON ISD,29341,True,1.0
15531,TEXAS,4819650,FORT BEND ISD,75184,True,1.0
15537,TEXAS,4819840,FREDERICKSBURG ISD,3272,True,42.0
15540,TEXAS,4820010,FRISCO ISD,58424,True,322.0


In [42]:
### TL;DR: I didn't realize there was already an LEA_ENR column. Verified that both ways of calculating enrollment give the same result, so the manual calculation is kept below only as a disabled string literal.
'''
# Add enrollment to our table
enrollment = pd.read_csv("Enrollment.csv")
enrollment["Total_enrollment"] = enrollment["TOT_ENR_M"] + enrollment["TOT_ENR_F"]
cols = ["LEAID", "Total_enrollment"]
enrollment = enrollment[cols]
enrollment = enrollment.groupby('LEAID')["Total_enrollment"].sum().reset_index() # Sum enrollment across all schools in a district

# Merge with our merged df
# Cast our LEAID to a numeric or else the merge doesn't work correctly
merged['LEAID'] = pd.to_numeric(merged['LEAID'], errors='coerce')
enrollment['LEAID'] = pd.to_numeric(enrollment['LEAID'], errors='coerce')
merged = pd.merge(merged, enrollment, on='LEAID', how='left')

merged
'''

'\n# Add enrollment to our table\nenrollment = pd.read_csv("Enrollment.csv")\nenrollment["Total_enrollment"] = enrollment["TOT_ENR_M"] + enrollment["TOT_ENR_F"]\ncols = ["LEAID", "Total_enrollment"]\nenrollment = enrollment[cols]\nenrollment = enrollment.groupby(\'LEAID\')["Total_enrollment"].sum().reset_index() # Sum enrollment across all schools in a district\n\n# Merge with our merged df\n# Cast our LEAID to a numeric or else the merge doesn\'t work correctly\nmerged[\'LEAID\'] = pd.to_numeric(merged[\'LEAID\'], errors=\'coerce\')\nenrollment[\'LEAID\'] = pd.to_numeric(enrollment[\'LEAID\'], errors=\'coerce\')\nmerged = pd.merge(merged, enrollment, on=\'LEAID\', how=\'left\')\n\nmerged\n'

In [43]:
# Calculate the teacher-to-student ratio (FTE teachers per enrolled student) per district
teacher_student_ratio = pd.read_csv("School_Support.csv", encoding='latin-1')
cols = ["LEAID", "SCH_FTETEACH_TOT"]

# .copy() so the column rewrite below targets an independent frame.
teacher_student_ratio = teacher_student_ratio[cols].copy()
# A teacher count cannot be negative, so negative entries are treated as
# missing instead of being added into the district total.
# NOTE(review): presumably these are the same -9 style placeholders visible in
# the expenditure and retention tables — confirm against the data dictionary.
fte_teachers = pd.to_numeric(teacher_student_ratio["SCH_FTETEACH_TOT"], errors="coerce")
teacher_student_ratio["SCH_FTETEACH_TOT"] = fte_teachers.where(fte_teachers >= 0)
# Sum FTE teachers across all schools in a district. min_count=1 leaves a
# district with no usable school rows as NaN rather than reporting 0 teachers.
teacher_student_ratio = teacher_student_ratio.groupby('LEAID')["SCH_FTETEACH_TOT"].sum(min_count=1).reset_index()
# Cast our LEAID to numeric
merged['LEAID'] = pd.to_numeric(merged['LEAID'], errors='coerce')
teacher_student_ratio['LEAID'] = pd.to_numeric(teacher_student_ratio['LEAID'], errors='coerce')
merged = pd.merge(merged, teacher_student_ratio, on='LEAID', how='left')

# Overwrite the teacher total with teachers per student. Non-positive
# enrollment yields NaN instead of an infinite or negative ratio.
valid_enrollment = merged["LEA_ENR"].where(merged["LEA_ENR"] > 0)
merged["SCH_FTETEACH_TOT"] = merged["SCH_FTETEACH_TOT"] / valid_enrollment

In [44]:
# Look at school expenditures: load the per-school table and preview it.
moolah = pd.read_csv("School Expenditures.csv", encoding='latin-1')
moolah

Unnamed: 0,LEA_STATE,LEA_STATE_NAME,LEAID,LEA_NAME,SCHID,SCH_NAME,COMBOKEY,JJ,SCH_SAL_TOTPERS_WOFED,SCH_FTE_TEACH_WOFED,...,SCH_SAL_ADM_WOFED,SCH_SAL_TEACH_WFED,SCH_FTE_AID_WFED,SCH_SAL_AID_WFED,SCH_FTE_SUP_WFED,SCH_SAL_SUP_WFED,SCH_FTE_ADM_WFED,SCH_SAL_ADM_WFED,SCH_SAL_TOTPERS_WFED,SCH_NPE_WFED
0,AL,ALABAMA,0100002,Alabama Youth Services,1705,Wallace Sch - Mt Meigs Campus,010000201705,Yes,1908676.0,25.00,...,81533.0,1827143.00,0.0,0.0,0.00,0.00,1.0,81533.0,1908676.00,102500.00
1,AL,ALABAMA,0100002,Alabama Youth Services,1706,McNeel Sch - Vacca Campus,010000201706,Yes,1563570.0,16.00,...,82000.0,1223463.00,0.0,0.0,0.00,0.00,0.0,82000.0,1305463.00,0.00
2,AL,ALABAMA,0100002,Alabama Youth Services,1876,Alabama Youth Services,010000201876,No,3998276.0,60.00,...,81533.0,3400850.00,0.0,0.0,0.00,0.00,1.0,81533.0,5122976.00,0.00
3,AL,ALABAMA,0100002,Alabama Youth Services,99995,AUTAUGA CAMPUS,010000299995,Yes,305220.0,3.00,...,81553.0,305220.00,0.0,0.0,0.00,0.00,1.0,82000.0,386551.00,18700.00
4,AL,ALABAMA,0100005,Albertville City,870,Albertville Middle School,010000500870,No,2394774.5,38.29,...,257464.0,1944222.84,3.0,66178.0,13.89,514793.08,3.0,257464.0,2782657.92,1059055.88
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97627,PR,PUERTO RICO,7200030,PUERTO RICO DEPARTMENT OF EDUCATION,2079,SEVERO E. COLBERG RAMIREZ,720003002079,No,0.0,0.00,...,-9.0,1373160.00,0.0,0.0,0.00,0.00,0.0,0.0,0.00,0.00
97628,PR,PUERTO RICO,7200030,PUERTO RICO DEPARTMENT OF EDUCATION,2082,INTERMEDIA JUAN SERRALLES,720003002082,No,0.0,0.00,...,-9.0,676944.00,0.0,0.0,0.00,0.00,0.0,0.0,0.00,0.00
97629,PR,PUERTO RICO,7200030,PUERTO RICO DEPARTMENT OF EDUCATION,2083,MARTIN GONZALEZ,720003002083,No,0.0,0.00,...,-9.0,393570.00,0.0,0.0,0.00,0.00,0.0,0.0,0.00,0.00
97630,PR,PUERTO RICO,7200030,PUERTO RICO DEPARTMENT OF EDUCATION,2084,NEMESIO R. CANALES I,720003002084,No,0.0,0.00,...,-9.0,638940.00,0.0,0.0,0.00,0.00,0.0,0.0,0.00,0.00


In [45]:
# Load the retention table and keep the Texas rows only; the preview shows it
# is quite sparse.
retention = pd.read_csv("Retention.csv", encoding='latin-1').query('LEA_STATE == "TX"')
retention

Unnamed: 0,LEA_STATE,LEA_STATE_NAME,LEAID,LEA_NAME,SCHID,SCH_NAME,COMBOKEY,JJ,SCH_RET_KG_IND,SCH_RET_G01_IND,...,SCH_RET_G12_TR_M,SCH_RET_G12_TR_F,TOT_RET_G12_M,TOT_RET_G12_F,SCH_RET_G12_LEP_M,SCH_RET_G12_LEP_F,SCH_RET_G12_IDEA_M,SCH_RET_G12_IDEA_F,SCH_RET_G12_504_M,SCH_RET_G12_504_F
78831,TX,TEXAS,4800001,CROSBYTON CISD,1145,CROSBYTON EL,480000101145,No,Yes,Yes,...,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9
78832,TX,TEXAS,4800001,CROSBYTON CISD,1146,CROSBYTON SECONDARY,480000101146,No,-9,-9,...,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9
78833,TX,TEXAS,4800001,CROSBYTON CISD,3621,SP ED CO-OP,480000103621,No,-9,-9,...,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9
78834,TX,TEXAS,4800002,SPUR ISD,4732,SPUR SCHOOL,480000204732,No,No,No,...,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9
78835,TX,TEXAS,4800003,ROCKSPRINGS ISD,4219,ROCKSPRINGS K-12,480000304219,No,No,Yes,...,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87584,TX,TEXAS,4846740,ZAVALLA ISD,5353,ZAVALLA EL,484674005353,No,Yes,Yes,...,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9
87585,TX,TEXAS,4846740,ZAVALLA ISD,5354,ZAVALLA H S,484674005354,No,-9,-9,...,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9
87586,TX,TEXAS,4846770,ZEPHYR ISD,5356,ZEPHYR SCHOOL,484677005356,No,Yes,No,...,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9
87587,TX,TEXAS,4899130,BENAVIDES ISD,5627,BENAVIDES SECONDARY,489913005627,No,-9,-9,...,0,0,1,0,0,0,0,0,0,0


In [46]:
# Load the county information sheet (it is joined onto the district table in a later cell).
county_info = pd.read_excel("county_info.xlsx")

KeyboardInterrupt: 

In [None]:
# Add in county information for all LEAIDs
cols = ["LEAID", "NMCNTY"]
# .copy() so the LEAID cast below writes to an independent frame rather than
# to a slice of the loaded sheet.
county_info = county_info[cols].copy()
county_info['LEAID'] = pd.to_numeric(county_info['LEAID'], errors='coerce')


def _most_common_county(names):
    """Return the most frequent value in ``names``, or NaN when none is present."""
    modes = names.mode()
    # mode() skips missing values, so a group holding only NaN gives an empty
    # result; indexing it with .iloc[0] would raise IndexError.
    return modes.iloc[0] if len(modes) else np.nan


# Get the most common county for each school district
county_info = county_info.groupby('LEAID')['NMCNTY'].agg(_most_common_county)

merged = pd.merge(merged, county_info, on='LEAID', how='left')

In [None]:
# Discard district rows that have any missing value (author's note: only 9 instances of NA).
merged = merged.dropna()

# Calculate the political leaning per county from the two TPI text columns.
tpi = pd.read_csv("TPI.csv")
# Both patterns capture a letter prefix and an integer percentage; they differ
# only in the names of the capture groups, which become column names.
pattern1 = r'(?P<Prefix1>[A-Za-z]+)-(?P<Percentage1>\d+)%'
pattern2 = r'(?P<Prefix2>[A-Za-z]+)-(?P<Percentage2>\d+)%'

df_extracted1 = tpi['Current TPI (20-22)'].str.extract(pattern1)
df_extracted2 = tpi['Previous TPI (18-20)'].str.extract(pattern2)

# Place the captured columns next to the original TPI columns.
df_result = pd.concat([tpi, df_extracted1, df_extracted2], axis=1)

for period in ("1", "2"):
    prefix_col = "Prefix" + period
    pct_col = "Percentage" + period
    # Prefix "R" becomes +1.0; every other value becomes -1.0.
    is_r = df_result[prefix_col] == "R"
    df_result[prefix_col] = np.where(is_r, 1, -1).astype(float)
    # Percentages were captured as text; cast them for arithmetic.
    df_result[pct_col] = df_result[pct_col].astype(float)

# Signed fraction for each period, then the mean of the two periods.
signed_current = df_result["Prefix1"] * df_result["Percentage1"] * 0.01
signed_previous = df_result["Prefix2"] * df_result["Percentage2"] * 0.01
df_result["overall_TPI"] = (signed_current + signed_previous) / 2

In [None]:
# Preview the district table ordered by county name (the result is displayed, not stored).
merged.sort_values(by="NMCNTY")

Unnamed: 0,LEA_STATE_NAME,LEAID,LEA_NAME,LEA_ENR,banned_books,SCH_FTETEACH_TOT,NMCNTY
334,TEXAS,4813200,CAYUGA ISD,576,False,0.085938,Anderson County
1039,TEXAS,4840500,SLOCUM ISD,423,False,0.131560,Anderson County
877,TEXAS,4834050,PALESTINE ISD,3419,False,0.075168,Anderson County
835,TEXAS,4832250,NECHES ISD,360,False,0.093889,Anderson County
1113,TEXAS,4843290,WESTWOOD ISD,1564,False,0.095358,Anderson County
...,...,...,...,...,...,...,...
846,TEXAS,4832700,NEWCASTLE ISD,231,False,0.082251,Young County
549,TEXAS,4821360,GRAHAM ISD,2373,False,0.069954,Young County
1198,TEXAS,4846710,ZAPATA COUNTY ISD,3521,False,0.062482,Zapata County
688,TEXAS,4826250,LA PRYOR ISD,468,False,0.091880,Zavala County


In [None]:
# Join the county lean scores onto the district table via a shared,
# lower-cased county key stored as text.
district_county = merged["NMCNTY"].str.lower().str.replace('county', '', case=False).str.strip()
merged["NMCNTY"] = district_county.astype(str)

# Reduce the TPI frame to the cleaned county name, its score, and the join key.
tpi_county = df_result["County Name"].str.lower().str.strip()
df_result = pd.DataFrame({
    "County Name": tpi_county,
    "overall_TPI": df_result["overall_TPI"],
    "NMCNTY": tpi_county.astype(str),
})

merged = pd.merge(merged, df_result, how='left', on='NMCNTY')

In [None]:
# Preview only the fully populated rows; the result is not assigned, so merged itself is unchanged.
merged.dropna()

Unnamed: 0,LEA_STATE_NAME,LEAID,LEA_NAME,LEA_ENR,banned_books,SCH_FTETEACH_TOT,NMCNTY,County Name,overall_TPI
0,TEXAS,4800001,CROSBYTON CISD,365,False,0.101370,crosby,crosby,0.740
1,TEXAS,4800002,SPUR ISD,261,False,0.107280,dickens,dickens,0.865
2,TEXAS,4800003,ROCKSPRINGS ISD,313,False,0.076677,edwards,edwards,0.850
3,TEXAS,4800004,POR VIDA ACADEMY,273,False,0.091575,bexar,bexar,-0.570
4,TEXAS,4800005,BLACKWELL CISD,155,False,0.129032,mitchell,mitchell,0.855
...,...,...,...,...,...,...,...,...,...
1188,TEXAS,4846680,YSLETA ISD,41198,False,0.066469,el paso,el paso,-0.685
1189,TEXAS,4846710,ZAPATA COUNTY ISD,3521,False,0.062482,zapata,zapata,-0.585
1190,TEXAS,4846740,ZAVALLA ISD,368,False,0.092391,angelina,angelina,0.750
1191,TEXAS,4846770,ZEPHYR ISD,202,False,0.103960,brown,brown,0.875


In [None]:
# merged.to_csv("Final_DF.csv")

In [None]:
def parse_slash(text):
    """Return the part of ``text`` before its first '/', without surrounding whitespace.

    When ``text`` contains no '/', the whole string is returned stripped.
    """
    head, _, _ = text.partition('/')
    return head.strip()

In [None]:
wealth = pd.read_csv("School District Wealth.csv")

# Turn the income text into integers by removing the "$" and "," characters.
# regex=False: both are meant literally, and "$" is an anchor when read as a
# regular expression, so the result must not depend on the pandas default.
income = wealth["Median Household Income"]
for symbol in ("$", ","):
    income = income.str.replace(symbol, "", regex=False).str.strip()
wealth["Median Household Income"] = income.astype(int)

# Build an upper-case matching key: the text before the "/" with the
# district-type phrase removed.
wealth['dist_key'] = wealth['School District'].apply(parse_slash)
for district_type in ("Independent School District", "Municipal School District", "Common School District"):
    wealth["dist_key"] = wealth["dist_key"].str.replace(district_type, "", regex=False).str.strip()
wealth["dist_key"] = wealth["dist_key"].str.upper()
wealth

Unnamed: 0,Median Household Income,School District,dist_key
0,182609,Highland Park Independent School District / 34...,HIGHLAND PARK
1,169401,"Carroll Independent School District / 29,265",CARROLL
2,131250,"London Independent School District / 1,660",LONDON
3,127991,"Eanes Independent School District / 36,986",EANES
4,117541,"Coppell Independent School District / 46,020",COPPELL
...,...,...,...
1023,22595,"San Elizario Independent School District / 14,557",SAN ELIZARIO
1024,22308,"Calvert Independent School District / 1,604",CALVERT
1025,22176,"Brooks County Independent School District / 7,200",BROOKS COUNTY
1026,22042,"Fort Hancock Independent School District / 2,084",FORT HANCOCK


In [None]:
def find_closest_match(row, column_to_match, threshold=80):
    """Return the entry of ``wealth[column_to_match]`` that best fuzzy-matches ``row['LEA_NAME']``.

    Parameters
    ----------
    row : mapping with an 'LEA_NAME' entry (a DataFrame row when used with ``apply(axis=1)``).
    column_to_match : name of the ``wealth`` column holding the candidate strings.
    threshold : minimum score a candidate needs to be accepted. Defaults to 80,
        the value that used to be hard-coded.

    Returns
    -------
    The best-scoring candidate string, or None when the name is missing or blank
    or when no candidate reaches ``threshold``.
    """
    name = row['LEA_NAME']
    # A missing (NaN/None) or blank name cannot be matched; skip the scorer.
    if not isinstance(name, str) or not name.strip():
        return None

    choices = wealth[column_to_match]
    result = process.extractOne(name, choices)

    # result[0] is the matched string, result[1] its score.
    if result is not None and result[1] >= threshold:
        return result[0]
    return None

# Scores every district name against every candidate, so this cell is slow.
merged['Closest_Match'] = merged.apply(find_closest_match, column_to_match='dist_key', axis=1)


KeyboardInterrupt: 

In [None]:
# Preview the district table with the fuzzy-matched Closest_Match column.
merged

Unnamed: 0,LEA_STATE_NAME,LEAID,LEA_NAME,LEA_ENR,banned_books,SCH_FTETEACH_TOT,NMCNTY,County Name,overall_TPI,Closest_Match
0,TEXAS,4800001,CROSBYTON CISD,365,False,0.101370,crosby,crosby,0.740,CROSBY
1,TEXAS,4800002,SPUR ISD,261,False,0.107280,dickens,dickens,0.865,SPUR
2,TEXAS,4800003,ROCKSPRINGS ISD,313,False,0.076677,edwards,edwards,0.850,ROCKSPRINGS
3,TEXAS,4800004,POR VIDA ACADEMY,273,False,0.091575,bexar,bexar,-0.570,ACADEMY
4,TEXAS,4800005,BLACKWELL CISD,155,False,0.129032,mitchell,mitchell,0.855,BLACKWELL CONSOLIDATED
...,...,...,...,...,...,...,...,...,...,...
1188,TEXAS,4846680,YSLETA ISD,41198,False,0.066469,el paso,el paso,-0.685,YSLETA
1189,TEXAS,4846710,ZAPATA COUNTY ISD,3521,False,0.062482,zapata,zapata,-0.585,ZAPATA COUNTY
1190,TEXAS,4846740,ZAVALLA ISD,368,False,0.092391,angelina,angelina,0.750,ZAVALLA
1191,TEXAS,4846770,ZEPHYR ISD,202,False,0.103960,brown,brown,0.875,ZEPHYR


In [None]:
# Attach the wealth record selected by the fuzzy match. Districts without a
# match keep NaN in every wealth column.
# NOTE(review): if dist_key is not unique in wealth, this left join repeats the
# matching district rows — confirm, e.g. with wealth["dist_key"].is_unique.
merged_df = pd.merge(merged, wealth, how='left', left_on='Closest_Match', right_on='dist_key')

# Show the districts that ended up without an income figure.
merged_df.loc[merged_df["Median Household Income"].isna()]

Unnamed: 0,LEA_STATE_NAME,LEAID,LEA_NAME,LEA_ENR,banned_books,SCH_FTETEACH_TOT,NMCNTY,County Name,overall_TPI,Closest_Match,Median Household Income,School District,dist_key
17,TEXAS,4800022,RAUL YZAGUIRRE SCHOOLS FOR SUCCESS,1287,False,0.056488,harris,harris,-0.54,,,,
21,TEXAS,4800030,UPLIFT EDUCATION,17352,False,0.064762,dallas,dallas,-0.63,,,,
29,TEXAS,4800044,WAYSIDE SCHOOLS,2115,False,0.068558,travis,travis,-0.705,,,,
40,TEXAS,4800065,HORIZON MONTESSORI PUBLIC SCHOOLS,1237,False,0.089733,hidalgo,hidalgo,-0.625,,,,
54,TEXAS,4800089,LUMIN EDUCATION,299,False,0.053512,dallas,dallas,-0.63,,,,
64,TEXAS,4800113,AMIGOS POR VIDA-FRIENDS FOR LIFE PUB CHTR SCH,513,False,0.053294,harris,harris,-0.54,,,,
83,TEXAS,4800179,JUBILEE ACADEMIES,5649,False,0.067446,bexar,bexar,-0.57,,,,
97,TEXAS,4800209,YES PREP PUBLIC SCHOOLS INC,10860,False,0.062155,harris,harris,-0.54,,,,
99,TEXAS,4800211,IDEA PUBLIC SCHOOLS,35595,False,0.054727,hidalgo,hidalgo,-0.625,,,,
121,TEXAS,4800262,MEYERPARK ELEMENTARY,248,False,0.052419,harris,harris,-0.54,,,,


In [None]:
# Write the final district-level table to disk. to_csv also writes the row index
# by default, so the file gains an unnamed first column.
merged_df.to_csv("Final_DF.csv")