# Projeto

## 1. Configurações de Ambiente

In [1]:
import sys
import os

# Make the parent of the current working directory importable.
# NOTE(review): this assumes the notebook is launched from a folder that sits
# directly under the project root -- confirm before running from elsewhere.
project_root = os.path.abspath(
    os.path.join(os.getcwd(), '..')
)
if sys.path.count(project_root) == 0:
    # Front of the list, so it is searched before every other entry.
    sys.path.insert(0, project_root)

In [None]:
from __future__ import annotations

# 1. Standard Library
import functools
import os
from pathlib import Path
from string import ascii_uppercase
from typing import Any, Iterable, Mapping, Sequence, Tuple
import unicodedata

# 2. Third-party
import basedosdados as bd
from dotenv import load_dotenv
import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns

# 3. Local Application Modules
from config import GCLOUD_PROJECT_ID
from src.roda.utils.helpers import get_municipality_codes_from_names
from src.roda.pipelines.data_loading import load_geolocated_layers
from src.roda.utils.geo import override_geometry
from roda.utils.cnae import add_cnae_section_letter

In [4]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
plt.style.use('Solarize_Light2')

pd.options.display.float_format = '{:,.2f}'.format

In [None]:
# Study-specific configuration: this part still has to be set by hand,
# according to the goal of each study.
# Presumably (state abbreviation, municipality name) pairs -- confirm against
# the signature of get_municipality_codes_from_names.
MUNICIPALITY_LIST = [("RJ", "Rio de Janeiro"), ("RJ", "Niterói")]
MUNICIPALITY_CODES = get_municipality_codes_from_names(MUNICIPALITY_LIST)

# Passed as `year` and `cnes_month` to load_geolocated_layers.
YEAR = 2023
CNES_MONTH = 10

# Moved into the configuration section.
# Column of the schools layer that is parsed to build the education-stage flags.
COL = "etapas_modalidades_oferecidas" 

[3304557, 3303302]


## 2. Carregamento e Preparação dos Dados

In [None]:
# Load every geolocated layer for the configured municipalities and period.
camadas_geo = load_geolocated_layers(
    municipalities=MUNICIPALITY_LIST, year=YEAR,
    cnes_month=CNES_MONTH, billing_project_id=GCLOUD_PROJECT_ID,
)

# Pull the three layers used in this notebook out of the returned mapping.
rais_gdf, cnes_gdf, schools_gdf = (
    camadas_geo["rais"],
    camadas_geo["cnes"],
    camadas_geo["schools"],
)

# Quick visual check of the RAIS layer.
rais_gdf.plot()

In [None]:
def strip_accents(s: str) -> str:
    """
    Strip accents from a string via Unicode normalization.

    The text is decomposed with NFD, which separates base characters from
    their combining marks, and every character whose Unicode category is
    "Mn" (nonspacing mark) is then discarded.

    Parameters
    ----------
    s : str
        Input text, possibly containing accented characters. Missing values
        (anything `pd.isna` reports as null, such as None or NaN) are accepted.

    Returns
    -------
    str
        The text without accents, or "" when the input is missing.

    Examples
    --------
    >>> strip_accents("ação")
    'acao'

    >>> strip_accents("café")
    'cafe'

    >>> strip_accents(None)
    ''
    """
    # Missing values collapse to an empty string instead of raising.
    if pd.isna(s):
        return ""

    decomposed = unicodedata.normalize("NFD", s)
    base_chars = [ch for ch in decomposed if unicodedata.category(ch) != "Mn"]
    return "".join(base_chars)

In [None]:
# Normalised copy of the stages column: nulls become "", accents are removed
# and the text is lower-cased, so the patterns below match plain ASCII words.
norm = schools_gdf[COL].astype("string").fillna("")
norm = norm.map(strip_accents).str.lower()

# Flag column -> whole-word pattern searched for in the normalised text.
# Note that the `eja` flag is keyed on the word "adultos".
_stage_patterns = {
    "infantil": r"\binfantil\b",
    "fundamental": r"\bfundamental\b",
    "medio": r"\bmedio\b",
    "profissional": r"\bprofissional\b",
    "eja": r"\badultos\b",
}

# One int8 0/1 indicator column per stage, added in the mapping's order.
schools_gdf = schools_gdf.assign(
    **{
        flag: norm.str.contains(pattern).astype("int8")
        for flag, pattern in _stage_patterns.items()
    }
)

## 3. Correção de Geometrias do CNES

In [None]:
# `override_geometry` with `id_column` pre-bound to the CNES establishment id
# column, so call sites do not have to repeat it.
override_cnes_geometry = functools.partial(override_geometry, id_column="id_estabelecimento_cnes")

In [None]:
# Dictionary of (lat, lon) pairs taken from Google Maps.
# NOTE(review): the dictionary below is disabled -- it sits inside a string
# literal, so `overrides` is never defined when this cell runs.
# NOTE(review): the key '9101039' appears twice; in a dict literal the last
# occurrence wins, so the first coordinate for that id would be discarded if
# this code were re-enabled.
"""
overrides = {
    '5042488': (-22.93496303213362, -43.10111846368994),
    '0113891': (-22.82297090020438, -42.97796431846613),
    '0012521': (-22.880157833214476, -43.07872347303554),
    '3784916': (-22.771568234552003, -42.91998438196854),
    '0012599': (-22.881199876918767, -43.078428218305724),
    '2297590': (-22.81838059154707, -43.01194919623843),
    '2291525': (-22.8261384043605, -43.04752398500165),
    '9101039': (-22.84514982733307, -42.95904980987253),
    '7884680': (-22.93934643214391, -43.0586731769422),
    '0105317': (-22.94305755496102, -43.061599214063634),
    '4156390': (-22.7452793538039, -42.83392041918691),
    '9101039': (-22.827399501549497, -43.09059726136488),
}
"""

# Applies all overrides in a single call.
# NOTE(review): also disabled (string literal), so `cnes_gdf` is left as loaded.
# `drop_ids` names '9101039', the same id that is duplicated above -- confirm
# whether that establishment should be relocated or dropped.
"""
cnes_gdf = override_cnes_geometry(
    cnes_gdf,
    overrides,
    drop_ids=["9101039"]
)
"""

## 4. Seção CNAE na RAIS

In [None]:
# Pass the RAIS layer through `add_cnae_section_letter`, reading the code from
# "cnae_2" and naming the section column "grupo_cnae".
rais_gdf = add_cnae_section_letter(
    rais_gdf,
    cnae_col="cnae_2",
    section_col="grupo_cnae",
)