In [1]:
import re
import unicodedata
import pandas as pd

In [35]:
df_raw = pd.read_csv("escolas.csv")
df_material = pd.read_csv("material_didatico.csv")
df_sub = pd.read_csv("subprefeituras.csv")

In [37]:

def short_to_full_tag(original_str: str) -> str:
    address_mapping = {
        "R.": "Rua",
        "Av.": "Avenida",
        "Pça.": "Praça",
        "Estr.": "Estrada"
    }
    
    short_tag, address = original_str.split(maxsplit=1)
    if short_tag in address_mapping.keys():
        return f"{address_mapping[short_tag]} {address}"
    else:
        return original_str
    
def fix_names(name: str) -> str:
    replacements = {
        "E.M.": "ESCOLA MUNICIPAL",
        "CIEP": "CENTRO INTEGRADO DE EDUCAÇÃO PÚBLICA",
        "EM": "ESCOLA MUNICIPAL",
        "e.m.": "ESCOLA MUNICIPAL",
        "E.M": "ESCOLA MUNICIPAL"
    }

    for abbreviation, replacement in replacements.items():
        if name.startswith(abbreviation):
            return name.replace(abbreviation, replacement, 1)

    return name

def type_school(name: str) -> str:
    if "ESCOLA MUNICIPAL" in name:
        return "EM"
    elif "CENTRO INTEGRADO DE EDUCAÇÃO PÚBLICA" in name:
        return "CIEP"
    elif "COLÉGIO" in name:
        return "COLÉGIO"
    else:
        return "Desconhecido"

In [38]:
normalized_address_list = []
address_names = []
address_numbers = []

In [39]:
df_raw.columns = ["uuid", "escolas_postos", "bairro", "endereco", "lat", "lon"]
address_list = df_raw["endereco"].values.tolist()
df_raw["endereco"] = df_raw["endereco"].apply(short_to_full_tag).str.upper()

for address in df_raw["endereco"].values.tolist():
    normalized_address = unicodedata.normalize("NFKD", address).encode("ASCII", errors="ignore").decode("ASCII")
    
    normalized_address_list.append(normalized_address.replace(",", ""))
    
df_raw["endereco"] = normalized_address_list
df_raw.head()

for i, address in enumerate(df_raw["endereco"].values.tolist()):
    address_name = re.sub(r"\d+|S/No", "", address)
    try:
        address_number = re.search(r"\d+|S/No", address).group().replace("S/No", "S/N")
    except AttributeError:
        address_number = "S/N"

    address_names.append(address_name.strip())
    address_numbers.append(address_number.strip())
df_raw["logradouro"] = address_names
df_raw["numero_end"] = address_numbers
df_raw.head()

Unnamed: 0,uuid,escolas_postos,bairro,endereco,lat,lon,logradouro,numero_end
0,178,CENTRO INTEGRADO DE EDUCAÇÃO PÚBLICA HENFIL,CAJU,RUA CARLOS SEIDL S/No,-22880888,-43225326,RUA CARLOS SEIDL,S/N
1,634,EM ALICE DO AMARAL PEIXOTO,BENFICA,RUA EBANO 187,-22889574,-43236202,RUA EBANO,187
2,483,EM CELESTINO SILVA,CENTRO,RUA DO LAVRADIO 56,-22909293,-43183579,RUA DO LAVRADIO,56
3,476,ESCOLA MUNICIPAL FLORIANO PEIXOTO,SÃO CRISTÓVÃO,PRACA ARGENTINA 20,-22897629,-43227456,PRACA ARGENTINA,20
4,132,EM PEREIRA PASSOS,RIO COMPRIDO,PRACA CONDESSA PAULO DE FRONTIN 45,-22924412,-43208579,PRACA CONDESSA PAULO DE FRONTIN,45


In [41]:
df_raw['escolas_postos'] = df_raw['escolas_postos'].apply(fix_names)


In [43]:
df_raw

Unnamed: 0,uuid,escolas_postos,bairro,endereco,lat,lon,logradouro,numero_end
0,178,CENTRO INTEGRADO DE EDUCAÇÃO PÚBLICA HENFIL,CAJU,RUA CARLOS SEIDL S/No,-22880888,-43225326,RUA CARLOS SEIDL,S/N
1,634,ESCOLA MUNICIPAL ALICE DO AMARAL PEIXOTO,BENFICA,RUA EBANO 187,-22889574,-43236202,RUA EBANO,187
2,483,ESCOLA MUNICIPAL CELESTINO SILVA,CENTRO,RUA DO LAVRADIO 56,-22909293,-43183579,RUA DO LAVRADIO,56
3,476,ESCOLA MUNICIPAL FLORIANO PEIXOTO,SÃO CRISTÓVÃO,PRACA ARGENTINA 20,-22897629,-43227456,PRACA ARGENTINA,20
4,132,ESCOLA MUNICIPAL PEREIRA PASSOS,RIO COMPRIDO,PRACA CONDESSA PAULO DE FRONTIN 45,-22924412,-43208579,PRACA CONDESSA PAULO DE FRONTIN,45
...,...,...,...,...,...,...,...,...
147,474,ESCOLA MUNICIPAL ENGENHEIRO GASTÃO RANGEL,GUARATIBA,ESTRADA DO MAGARCA 9.183,-2298046,-43643545,ESTRADA DO MAGARCA .,9
148,301,ESCOLA MUNICIPAL JONATAS SERRANO,GUARATIBA,ESTRADA DO MATO ALTO S/No,-22953163,-43577409,ESTRADA DO MATO ALTO,S/N
149,215,ESCOLA MUNICIPAL NARCISA AMALIA,ILHA DE GUARATIBA,ESTRADA TEODORETO DE CAMARGO S/N.o,-23009084,-43537582,ESTRADA TEODORETO DE CAMARGO S/N.o,S/N
150,606,ESCOLA MUNICIPAL PROFESSOR CASTILHO,ILHA DE GUARATIBA,CAMINHO DA MATRIZ 4406,-22994124,-43593683,CAMINHO DA MATRIZ,4406
