In [93]:
import re
import unicodedata
import pandas as pd
from unidecode import unidecode

In [94]:
# Load the three input tables. Paths are relative, so the CSV files must sit
# in the kernel's working directory.
# df_raw: school records; its columns are renamed positionally further down.
df_raw = pd.read_csv("escolas.csv")
# df_material: presumably teaching-material data (name suggests so) — not used
# in the cells visible here; TODO confirm where it is consumed.
df_material = pd.read_csv("material_didatico.csv")
# df_sub: lookup table whose "nome" column is later used as the join key
# against the schools' "bairro" column.
df_sub = pd.read_csv("subprefeituras.csv")

In [95]:

def short_to_full_tag(original_str: str) -> str:
    """Expand a leading street-type abbreviation to its full word.

    The first whitespace-delimited token is looked up in a fixed mapping
    ("R." -> "Rua", "Av." -> "Avenida", "Pça." -> "Praça",
    "Estr." -> "Estrada"). The lookup is case-sensitive.

    Args:
        original_str: Address text, e.g. ``"R. Ebano, 187"``.

    Returns:
        The address with the abbreviation expanded, or ``original_str``
        unchanged when the first token is not a known abbreviation or when
        the string has fewer than two tokens (nothing follows the tag).
    """
    address_mapping = {
        "R.": "Rua",
        "Av.": "Avenida",
        "Pça.": "Praça",
        "Estr.": "Estrada"
    }

    parts = original_str.split(maxsplit=1)
    # Empty or single-token input cannot be unpacked into (tag, rest);
    # return it untouched instead of raising ValueError.
    if len(parts) < 2:
        return original_str

    short_tag, address = parts
    if short_tag in address_mapping:
        return f"{address_mapping[short_tag]} {address}"
    return original_str
    
def fix_names(name: str) -> str:
    """Expand a leading school-type abbreviation in a school name.

    Recognised prefixes are "E.M.", "CIEP", "EM", "e.m." and "E.M"; they are
    tried in that order and only the first match is expanded.

    Args:
        name: School name as found in the source data.

    Returns:
        The name with its leading abbreviation replaced by the full form, or
        ``name`` unchanged when it does not start with a known abbreviation.
    """
    replacements = {
        "E.M.": "ESCOLA MUNICIPAL",
        "CIEP": "CENTRO INTEGRADO DE EDUCAÇÃO PÚBLICA",
        "EM": "ESCOLA MUNICIPAL",
        "e.m.": "ESCOLA MUNICIPAL",
        "E.M": "ESCOLA MUNICIPAL"
    }

    for abbreviation, replacement in replacements.items():
        if not name.startswith(abbreviation):
            continue
        remainder = name[len(abbreviation):]
        # An abbreviation that ends in a letter/digit must be a whole word:
        # without this check "EMILIO ..." would match "EM" and be mangled
        # into "ESCOLA MUNICIPALILIO ...".
        if abbreviation[-1].isalnum() and remainder[:1].isalnum():
            continue
        return replacement + remainder

    return name

def type_school(name: str) -> str:
    """Classify a school by the type phrase contained in its name.

    Markers are checked in order and the first one found anywhere in
    ``name`` decides the result; matching is case-sensitive.

    Args:
        name: School name, expected to carry the expanded type phrase.

    Returns:
        "EM", "CIEP" or "COLÉGIO" for a recognised marker, otherwise
        "Desconhecido".
    """
    # (substring to look for, label to return) — order defines priority.
    markers = (
        ("ESCOLA MUNICIPAL", "EM"),
        ("CENTRO INTEGRADO DE EDUCAÇÃO PÚBLICA", "CIEP"),
        ("COLÉGIO", "COLÉGIO"),
    )
    return next(
        (label for marker, label in markers if marker in name),
        "Desconhecido",
    )

In [96]:
# Empty accumulators for the address clean-up: normalised full addresses,
# street names, and street numbers (three independent lists).
normalized_address_list, address_names, address_numbers = [], [], []

In [97]:
# Rename the columns positionally; this requires df_raw to have exactly six columns.
df_raw.columns = ["uuid", "escolas_postos", "bairro", "endereco", "lat", "lon"]
# Snapshot of the address values as they are before the clean-up below.
address_list = df_raw["endereco"].values.tolist()
# Expand street-type abbreviations ("R." -> "Rua", ...) and upper-case.
df_raw["endereco"] = df_raw["endereco"].apply(short_to_full_tag).str.upper()

# Strip accents (NFKD decomposition, then drop every non-ASCII code point) and
# remove commas. The list is rebuilt from scratch on every run: appending to a
# list created in another cell made a second run of this cell fail with a
# length mismatch on assignment.
normalized_address_list = [
    unicodedata.normalize("NFKD", address)
    .encode("ASCII", errors="ignore")
    .decode("ASCII")
    .replace(",", "")
    for address in df_raw["endereco"].values.tolist()
]
df_raw["endereco"] = normalized_address_list

# A "number token" is a run of digits or the literal "S/No" (presumably
# "S/Nº" in the raw data, whose ordinal sign NFKD turns into "o" — TODO confirm).
number_pattern = re.compile(r"\d+|S/No")

# NOTE(review): every digit run is removed from the street name and the FIRST
# one becomes the number, so a street such as "RUA 24 DE MAIO 931" yields
# logradouro "RUA  DE MAIO" and numero_end "24". Confirm against the data
# whether such addresses occur before relying on these two columns.
address_names = []
address_numbers = []
for address in normalized_address_list:
    number_match = number_pattern.search(address)
    if number_match is None:
        # No number token at all is recorded the same way as an explicit "S/No".
        address_number = "S/N"
    else:
        address_number = number_match.group().replace("S/No", "S/N")

    address_names.append(number_pattern.sub("", address).strip())
    address_numbers.append(address_number.strip())

df_raw["logradouro"] = address_names
df_raw["numero_end"] = address_numbers
df_raw.head()

Unnamed: 0,uuid,escolas_postos,bairro,endereco,lat,lon,logradouro,numero_end
0,178,CENTRO INTEGRADO DE EDUCAÇÃO PÚBLICA HENFIL,CAJU,RUA CARLOS SEIDL S/No,-22880888,-43225326,RUA CARLOS SEIDL,S/N
1,634,EM ALICE DO AMARAL PEIXOTO,BENFICA,RUA EBANO 187,-22889574,-43236202,RUA EBANO,187
2,483,EM CELESTINO SILVA,CENTRO,RUA DO LAVRADIO 56,-22909293,-43183579,RUA DO LAVRADIO,56
3,476,ESCOLA MUNICIPAL FLORIANO PEIXOTO,SÃO CRISTÓVÃO,PRACA ARGENTINA 20,-22897629,-43227456,PRACA ARGENTINA,20
4,132,EM PEREIRA PASSOS,RIO COMPRIDO,PRACA CONDESSA PAULO DE FRONTIN 45,-22924412,-43208579,PRACA CONDESSA PAULO DE FRONTIN,45


In [98]:
# Expand the abbreviated prefix of each school name first; the school type is
# then derived from the expanded name, so the order of these two lines matters.
df_raw['escolas_postos'] = df_raw['escolas_postos'].map(fix_names)
df_raw['tipo_escola'] = df_raw['escolas_postos'].map(type_school)

In [99]:
# Bring both join keys to the same form — upper-case, transliterated to ASCII —
# so that the later bairro/nome merge is not defeated by case or accents.
df_sub["nome"] = df_sub["nome"].str.upper().map(unidecode)
df_raw["bairro"] = df_raw["bairro"].str.upper().map(unidecode)

In [100]:
# Left-join the lookup table onto the schools by neighbourhood name, keeping
# every school row, then discard the now-redundant right-hand key column.
df_raw_sub = df_raw.merge(
    df_sub,
    how='left',
    left_on='bairro',
    right_on='nome',
).drop(columns=['nome'])

In [101]:
# Display the first 40 rows of the merged frame to spot-check the join result.
df_raw_sub.head(40)

Unnamed: 0,uuid,escolas_postos,bairro,endereco,lat,lon,logradouro,numero_end,tipo_escola,subprefeitura
0,178,CENTRO INTEGRADO DE EDUCAÇÃO PÚBLICA HENFIL,CAJU,RUA CARLOS SEIDL S/No,-22880888,-43225326,RUA CARLOS SEIDL,S/N,CIEP,Centro
1,634,ESCOLA MUNICIPAL ALICE DO AMARAL PEIXOTO,BENFICA,RUA EBANO 187,-22889574,-43236202,RUA EBANO,187,EM,Centro
2,483,ESCOLA MUNICIPAL CELESTINO SILVA,CENTRO,RUA DO LAVRADIO 56,-22909293,-43183579,RUA DO LAVRADIO,56,EM,Centro
3,476,ESCOLA MUNICIPAL FLORIANO PEIXOTO,SAO CRISTOVAO,PRACA ARGENTINA 20,-22897629,-43227456,PRACA ARGENTINA,20,EM,Centro
4,132,ESCOLA MUNICIPAL PEREIRA PASSOS,RIO COMPRIDO,PRACA CONDESSA PAULO DE FRONTIN 45,-22924412,-43208579,PRACA CONDESSA PAULO DE FRONTIN,45,EM,Centro
5,17,ESCOLA MUNICIPAL PRESIDENTE JOSE LINHARES,IPANEMA,RUA BARAO DA TORRE 90,-22983332,-43199036,RUA BARAO DA TORRE,90,EM,Zona Sul
6,55,ESCOLA MUNICIPAL SANTA CATARINA,SANTA TERESA,RUA EDUARDO SANTOS 38,-22915535,-43191588,RUA EDUARDO SANTOS,38,EM,Centro
7,490,ESCOLA MUNICIPAL TIA CIATA,CENTRO,AVENIDA PRESIDENTE VARGAS S/No,-22907123,-43195068,AVENIDA PRESIDENTE VARGAS,S/N,EM,Centro
8,600,ESCOLA MUNICIPAL URUGUAI,BENFICA,RUA ANA NERI 192,-22898488,-43237756,RUA ANA NERI,192,EM,Centro
9,89,ESCOLA MUNICIPAL MARIO CLAUDIO,RIO COMPRIDO,RUA HADDOCK LOBO 148,-22917233,-43211044,RUA HADDOCK LOBO,148,EM,Centro
