### Jupyter Notebook - Pré-processamento
#### Feito por Lucas D. F. Rodrigues.

1. Normalização textual (_e.g._ remoção de acentuação)
2. Transformação textual: _uppercase_
3. Substituição textual (_e.g._ siglas)
4. Separação do endereço em logradouro e número

In [95]:
import re
import unicodedata
import pandas as pd

In [20]:
# Load the raw table from the CSV file.
df_raw = pd.read_csv("escolas.csv")

# Overwrite whatever column labels were read from the file with the
# names used by the rest of the notebook.
df_raw.columns = [
    "uuid",
    "escolas_postos",
    "bairro",
    "endereco",
    "lat",
    "lon",
]

# Last expression of the cell: shows the first rows as the cell output.
df_raw.head()

Unnamed: 0,uuid,escolas_postos,bairro,endereco,lat,lon
0,178,CENTRO INTEGRADO DE EDUCAÇÃO PÚBLICA HENFIL,CAJU,Rua Carlos Seidl S/Nº,-22880888,-43225326
1,634,EM ALICE DO AMARAL PEIXOTO,BENFICA,Rua Ébano 187,-22889574,-43236202
2,483,EM CELESTINO SILVA,CENTRO,"R. do Lavradio, 56",-22909293,-43183579
3,476,ESCOLA MUNICIPAL FLORIANO PEIXOTO,SÃO CRISTÓVÃO,Praça Argentina 20,-22897629,-43227456
4,132,EM PEREIRA PASSOS,RIO COMPRIDO,Praça Condessa Paulo de Frontin 45,-22924412,-43208579


In [21]:
# Plain Python list with the values of the "endereco" column.
# NOTE(review): not referenced again in the visible cells -- confirm it is
# still needed.
address_list = df_raw["endereco"].tolist()

In [22]:
def short_to_full_tag(original_str: str) -> str:
    """Expand an abbreviated street-type prefix into the full word.

    The first whitespace-delimited token of ``original_str`` is looked up
    in a fixed abbreviation table ("R.", "Av.", "Pça.", "Estr."). The
    lookup is exact and case-sensitive.

    Args:
        original_str: Address text, e.g. ``"R. do Lavradio, 56"``.

    Returns:
        The address with the prefix expanded (``"Rua do Lavradio, 56"``),
        or ``original_str`` unchanged when the first token is not a known
        abbreviation or when there is nothing after the first token
        (empty, blank, or single-word input).
    """
    address_mapping = {
        "R.": "Rua",
        "Av.": "Avenida",
        "Pça.": "Praça",
        "Estr.": "Estrada",
    }

    # Splitting on the first run of whitespace yields fewer than two parts
    # for empty, blank, or single-word input; unpacking that directly into
    # two names would raise ValueError, so check the length first.
    parts = original_str.split(maxsplit=1)
    if len(parts) < 2:
        return original_str

    short_tag, address = parts
    full_tag = address_mapping.get(short_tag)
    if full_tag is None:
        return original_str
    return f"{full_tag} {address}"

In [23]:
# Step 1: expand abbreviated street-type prefixes, one address at a time.
expanded_addresses = df_raw["endereco"].apply(short_to_full_tag)

# Step 2: upper-case the expanded text and store it back in the same column.
df_raw["endereco"] = expanded_addresses.str.upper()

df_raw.head()

Unnamed: 0,uuid,escolas_postos,bairro,endereco,lat,lon
0,178,CENTRO INTEGRADO DE EDUCAÇÃO PÚBLICA HENFIL,CAJU,RUA CARLOS SEIDL S/Nº,-22880888,-43225326
1,634,EM ALICE DO AMARAL PEIXOTO,BENFICA,RUA ÉBANO 187,-22889574,-43236202
2,483,EM CELESTINO SILVA,CENTRO,"RUA DO LAVRADIO, 56",-22909293,-43183579
3,476,ESCOLA MUNICIPAL FLORIANO PEIXOTO,SÃO CRISTÓVÃO,PRAÇA ARGENTINA 20,-22897629,-43227456
4,132,EM PEREIRA PASSOS,RIO COMPRIDO,PRAÇA CONDESSA PAULO DE FRONTIN 45,-22924412,-43208579


In [65]:
# For every address: decompose characters with NFKD, drop anything that is
# not representable in ASCII (errors="ignore" discards those characters
# silently), and finally remove commas.
normalized_address_list = [
    unicodedata.normalize("NFKD", raw_address)
    .encode("ASCII", errors="ignore")
    .decode("ASCII")
    .replace(",", "")
    for raw_address in df_raw["endereco"].values.tolist()
]

df_raw["endereco"] = normalized_address_list
df_raw.head()

Unnamed: 0,uuid,escolas_postos,bairro,endereco,lat,lon
0,178,CENTRO INTEGRADO DE EDUCAÇÃO PÚBLICA HENFIL,CAJU,RUA CARLOS SEIDL S/No,-22880888,-43225326
1,634,EM ALICE DO AMARAL PEIXOTO,BENFICA,RUA EBANO 187,-22889574,-43236202
2,483,EM CELESTINO SILVA,CENTRO,RUA DO LAVRADIO 56,-22909293,-43183579
3,476,ESCOLA MUNICIPAL FLORIANO PEIXOTO,SÃO CRISTÓVÃO,PRACA ARGENTINA 20,-22897629,-43227456
4,132,EM PEREIRA PASSOS,RIO COMPRIDO,PRACA CONDESSA PAULO DE FRONTIN 45,-22924412,-43208579


In [92]:
# Matches either a run of digits or the literal "S/No" marker.
number_pattern = re.compile(r"\d+|S/No")

address_names = []
address_numbers = []

for full_address in df_raw["endereco"].values.tolist():
    # The number is the FIRST match in the address; "S/No" is rewritten to
    # "S/N", which is also the fallback when nothing matches at all.
    first_hit = number_pattern.search(full_address)
    if first_hit is None:
        number_part = "S/N"
    else:
        number_part = first_hit.group().replace("S/No", "S/N")

    # The street name is the address with EVERY match removed.
    # NOTE(review): digits that belong to the street name itself are
    # stripped too, and only the first match is kept as the number --
    # confirm this is acceptable for the full dataset.
    address_names.append(number_pattern.sub("", full_address).strip())
    address_numbers.append(number_part.strip())

In [94]:
# Attach the two lists built from "endereco" as new columns, in this order:
# street name first, then the number.
for new_column, column_values in (
    ("logradouro", address_names),
    ("numero_end", address_numbers),
):
    df_raw[new_column] = column_values

df_raw.head()

Unnamed: 0,uuid,escolas_postos,bairro,endereco,lat,lon,logradouro,numero_end
0,178,CENTRO INTEGRADO DE EDUCAÇÃO PÚBLICA HENFIL,CAJU,RUA CARLOS SEIDL S/No,-22880888,-43225326,RUA CARLOS SEIDL,S/N
1,634,EM ALICE DO AMARAL PEIXOTO,BENFICA,RUA EBANO 187,-22889574,-43236202,RUA EBANO,187
2,483,EM CELESTINO SILVA,CENTRO,RUA DO LAVRADIO 56,-22909293,-43183579,RUA DO LAVRADIO,56
3,476,ESCOLA MUNICIPAL FLORIANO PEIXOTO,SÃO CRISTÓVÃO,PRACA ARGENTINA 20,-22897629,-43227456,PRACA ARGENTINA,20
4,132,EM PEREIRA PASSOS,RIO COMPRIDO,PRACA CONDESSA PAULO DE FRONTIN 45,-22924412,-43208579,PRACA CONDESSA PAULO DE FRONTIN,45
