In [1]:
#default_exp query
%load_ext autoreload
%autoreload 2

In [2]:
#hide
import sys
from pathlib import Path

# Prepend the parent of the current working directory (the project directory,
# per the original note) to `sys.path` so it is searched first on import.
sys.path.insert(0, str(Path().cwd().parent))

# Queries

> Este módulo executa as queries SQL / MongoDB necessárias para baixar os dados do STEL, RADCOM e MOSAICO.

In [21]:
#export
from decimal import Decimal, getcontext
from typing import Union
from urllib.request import urlretrieve
from numpy import save
import xmltodict
from zipfile import ZipFile


import pandas as pd
import pyodbc
from rich.console import Console
from pyarrow import ArrowInvalid
from unidecode import unidecode
from fastcore.xtras import Path
from fastcore.foundation import L
from fastcore.utils import listify
from fastcore.test import test_eq


from anateldb.constants import *
from anateldb.format import parse_bw, dict2cols, format_types
from anateldb.merge import clean_mosaico

# Set the precision of the current decimal context to 5 significant digits.
# This runs at import time and applies to every later `Decimal` operation in
# this thread, not only to code defined in this module.
getcontext().prec = 5

## Conexão com o banco de dados
A função a seguir é um `wrapper` simples que utiliza o `pyodbc` para se conectar ao banco de dados base da Anatel e retornar o objeto da conexão.

In [4]:
#export
def connect_db():
    """Open a `pyodbc` connection to the SITARWEB database on server ANATELBDRO01.

    Returns the connection object itself (not a cursor); callers obtain a
    cursor from it with `.cursor()`. The `timeout` argument is taken from the
    `TIMEOUT` name, which is not defined in this notebook (presumably it comes
    from `anateldb.constants`).
    """
    connection_string = "".join(
        (
            "Driver={ODBC Driver 17 for SQL Server};",
            "Server=ANATELBDRO01;",
            "Database=SITARWEB;",
            "Trusted_Connection=yes;",
            "MultipleActiveResultSets=True;",
        )
    )
    return pyodbc.connect(connection_string, timeout=TIMEOUT)

In [5]:
#slow
def test_connection():
    """Smoke-test the database connection.

    Opens a connection with `connect_db`, runs each query in the tuple below
    and checks that the first fetched row is a `pyodbc.Row`. The connection is
    always closed afterwards, even when a query or the assertion fails.
    """
    conn = connect_db()
    try:
        cursor = conn.cursor()
        for query in (RADCOM,):
            cursor.execute(query)
            test_eq(type(cursor.fetchone()), pyodbc.Row)
    finally:
        # Release the ODBC handle instead of leaving it to garbage collection.
        conn.close()

In [None]:
test_connection()

In [6]:
#exporti
def _parse_estações(row: dict)->dict:
    """Given a row in a MongoDB ( a dict of dicts ), it travels some keys and return a subset dict"""
    
    d = {k.replace('@', '').lower():row[k] for k in ("@SiglaServico", "@id", "@state",
        "@entidade",
        "@fistel",
        "@cnpj",
        "@municipio",
        "@uf")}
    entidade = row.get('entidade', {})
    d.update({k.replace('@', '').lower():entidade[k] for k in ('@num_servico', '@habilitacao_DataValFreq')})
    administrativo = row.get('administrativo', {})
    d['numero_estacao'] = administrativo.get('@numero_estacao')
    estacao = row.get('estacao_principal', {})
    d.update({k.replace('@', '').lower():estacao[k] for k in ('@latitude', '@longitude')})
    return d

In [7]:
#exporti
def _read_estações(path: Union[str, Path]) -> pd.DataFrame:
    """Load the stations archive at `path` into a dataframe.

    Reads `estacao_rd.xml` from the zip file, flattens each `row` with
    `_parse_estações`, keeps only rows whose `state` matches the status
    pattern below, selects the `COL_ESTACOES` columns, renames them to
    `NEW_ESTACOES` and replaces empty strings with `pd.NA`.

    NOTE(review): in the status pattern `-C7` is the only alternative not
    anchored with `$` — confirm whether that is intentional.
    """
    with ZipFile(path) as archive, archive.open('estacao_rd.xml') as xml_file:
        parsed = xmltodict.parse(xml_file.read())

    # Both structural checks share the same message.
    bad_format = "The xml file inside estacoes.zip is not in the expected format"
    assert 'estacao_rd' in parsed, bad_format
    assert 'row' in parsed['estacao_rd'], bad_format

    records = L(parsed['estacao_rd']['row']).map(_parse_estações)
    stations = pd.DataFrame(records)
    wanted_status = stations.state.str.contains("-C1$|-C2$|-C3$|-C4$|-C7|-C98$")
    stations = stations[wanted_status].reset_index(drop=True)
    stations = stations.loc[:, COL_ESTACOES]
    stations.columns = NEW_ESTACOES
    for column in stations.columns:
        is_blank = stations[column] == ""
        stations.loc[is_blank, column] = pd.NA
    return stations

In [8]:
#exporti
def _parse_pb(row: dict)->dict:
    """Return a new dict with every key of `row` normalised and every value kept as is.

    Each key is passed through `unidecode`, lower-cased and stripped of '@'.
    No key is dropped; if two keys normalise to the same string the later one wins.
    """
    normalised = {}
    for key, value in row.items():
        normalised[unidecode(key).lower().replace("@", "")] = value
    return normalised

In [23]:
#exporti
def _read_plano_basico(path: Union[str, Path]) -> pd.DataFrame:
    """Load the channel-plan archive at `path` into a dataframe.

    Four xml members are parsed from the zip file; their `row` entries are
    normalised with `_parse_pb` and stacked in the order TVFM plan, TVFM
    secondaries, AM plan, AM secondaries. The result is restricted to rows
    with `pais == "BRA"` and to the `COL_PB` columns (renamed to `NEW_PB`),
    filtered by the status pattern below, has decimal commas in `Frequência`
    turned into dots, and has empty strings replaced with `pd.NA`.
    """
    read_order = (
        'plano_basicoTVFM.xml',
        'plano_basicoAM.xml',
        'secundariosTVFM.xml',
        'secundariosAM.xml',
    )
    parsed = {}
    with ZipFile(path) as archive:
        for member in read_order:
            with archive.open(member) as xml_file:
                parsed[member] = xmltodict.parse(xml_file.read())

    # The stacking order differs from the reading order above.
    stack_order = (
        'plano_basicoTVFM.xml',
        'secundariosTVFM.xml',
        'plano_basicoAM.xml',
        'secundariosAM.xml',
    )
    rows = L()
    for member in stack_order:
        base = parsed[member]
        assert 'plano_basico' in base, "The xml files inside canais.zip is not in the expected format"
        assert 'row' in base['plano_basico'], "The xml file inside canais.zip is not in the expected format"
        rows.extend(L(base['plano_basico']['row']).map(_parse_pb))

    channels = pd.DataFrame(rows)
    in_brazil = channels.pais == "BRA"
    channels = channels.loc[in_brazil, COL_PB].reset_index(drop=True)
    channels.columns = NEW_PB
    wanted_status = channels.Status.str.contains("-C1$|-C2$|-C3$|-C4$|-C7|-C98$")
    channels = channels[wanted_status].reset_index(drop=True)
    channels.loc[:, 'Frequência'] = channels.Frequência.str.replace(',', '.')
    for column in channels.columns:
        is_blank = channels[column] == ''
        channels.loc[is_blank, column] = pd.NA
    return channels

## Atualização das bases de dados
As bases de dados são atualizadas através das funções a seguir. O único argumento passado a todas elas é a pasta na qual os arquivos locais processados serão salvos. Os nomes dos arquivos são padronizados e não podem ser editados, de modo que as funções de leitura e processamento precisam receber somente a pasta na qual esses arquivos foram salvos.

In [10]:
#export
def save_df(df: pd.DataFrame, folder: Union[str, Path], stem: str) -> pd.DataFrame:
    """Format `df` with `format_types` and persist it under `folder`.

    Formats are tried in order and the first one that succeeds wins:

    1. `{stem}.parquet.gzip` (gzip-compressed parquet)
    2. `{stem}.fth` (feather), if parquet raised `ArrowInvalid`
    3. `{stem}.xlsx` (sheet "DataBase"), if feather raised `ArrowInvalid`

    Returns the formatted dataframe.
    Raises `ValueError` (chained to the original error) when the Excel
    fallback fails as well. Errors other than `ArrowInvalid` raised by the
    parquet or feather writers propagate unchanged.
    """
    def _discard(partial: Path) -> None:
        # A failed writer may or may not have created the file; unlinking a
        # missing file would raise and abort the fallback chain.
        if partial.exists():
            partial.unlink()

    df = format_types(df, stem)

    file = Path(f"{folder}/{stem}.parquet.gzip")
    try:
        df.to_parquet(file, compression="gzip")
        return df
    except ArrowInvalid:
        _discard(file)

    file = Path(f"{folder}/{stem}.fth")
    try:
        df.to_feather(file)
        return df
    except ArrowInvalid:
        _discard(file)

    file = Path(f"{folder}/{stem}.xlsx")
    try:
        with pd.ExcelWriter(file) as wb:
            df.to_excel(
                wb, sheet_name="DataBase", engine="openpyxl", index=False
            )
    except Exception as e:
        raise ValueError(f"Could not save {stem} to {file}") from e
    return df


def update_radcom(folder: Union[str, Path]) -> Union[pd.DataFrame, None]:
    """Refresh the local table produced by the `RADCOM` query.

    Connects with `connect_db`, reads the query result into a dataframe and
    saves it under `folder` with stem "radcom" via `save_df`.

    Returns the saved dataframe, or `None` when a `pyodbc.OperationalError`
    is raised (a message is logged to the console in that case). The database
    connection is closed on every path.
    """
    console = Console()
    conn = None
    with console.status(
        "[cyan]Lendo o Banco de Dados de Radcom...", spinner="earth"
    ) as status:
        try:
            conn = connect_db()
            df = pd.read_sql_query(RADCOM, conn)
            return save_df(df, folder, "radcom")
        except pyodbc.OperationalError:
            status.console.log(
                "Não foi possível abrir uma conexão com o SQL Server. Esta conexão somente funciona da rede cabeada!"
            )
        finally:
            # `conn` stays None when `connect_db` itself failed.
            if conn is not None:
                conn.close()
    return None


def update_stel(folder: Union[str, Path]) -> Union[pd.DataFrame, None]:
    """Refresh the local table produced by the `STEL` query.

    Connects with `connect_db`, reads the query result into a dataframe and
    saves it under `folder` with stem "stel" via `save_df`.

    Returns the saved dataframe, or `None` when a `pyodbc.OperationalError`
    is raised (a message is logged to the console in that case). Previously
    the failure path ended in `return df` with `df` never assigned, which
    raised `UnboundLocalError`. The database connection is closed on every path.
    """
    console = Console()
    conn = None
    with console.status(
        "[red]Lendo o Banco de Dados do STEL. Processo Lento, aguarde...",
        spinner="bouncingBall",
    ) as status:
        try:
            conn = connect_db()
            df = pd.read_sql_query(STEL, conn)
            return save_df(df, folder, "stel")
        except pyodbc.OperationalError:
            status.console.log(
                "Não foi possível abrir uma conexão com o SQL Server. Esta conexão somente funciona da rede cabeada!"
            )
        finally:
            # `conn` stays None when `connect_db` itself failed.
            if conn is not None:
                conn.close()
    return None


def update_mosaico(folder: Union[str, Path]) -> pd.DataFrame:
    """Refresh the local Mosaico table.

    Downloads the zipped xml archives from the `ESTACOES` and `PLANO_BASICO`
    URLs into `folder` (as `estações.zip` and `canais.zip`), parses them,
    left-joins stations with the channel plan on "Id", runs `clean_mosaico`
    and saves the result with stem "mosaico" via `save_df`.

    Returns the saved dataframe.
    """
    console = Console()
    with console.status(
        "[blue]Baixando e consolidando os dados do Mosaico...", spinner="clock"
    ):
        stations, _ = urlretrieve(ESTACOES, f"{folder}/estações.zip")
        pb, _ = urlretrieve(PLANO_BASICO, f"{folder}/canais.zip")
        estações = _read_estações(stations)
        plano_basico = _read_plano_basico(pb)
        df = estações.merge(plano_basico, on="Id", how="left")
        # Dataframe first, folder second: the order used by the notebook cell
        # that runs this same pipeline successfully.
        df = clean_mosaico(df, folder)
        return save_df(df, folder, "mosaico")


def update_base(folder: Union[str, Path]) -> pd.DataFrame:
    """Refresh the three source tables, combine them and save the consolidated base.

    Calls `update_stel`, `update_radcom` and `update_mosaico` (in that order),
    keeps the `TELECOM`, `SRD` and `RADIODIFUSAO` columns respectively, fills
    in the source-specific constant columns, concatenates everything, sorts by
    frequency and coordinates, drops exact duplicate rows, derives "BW(kHz)"
    from "Largura_Emissão" with `parse_bw` and saves the result under `folder`
    with stem "base".

    Returns the saved dataframe.
    Raises `ConnectionError` when the STEL or RADCOM update returns `None`
    (i.e. the SQL Server could not be reached), instead of failing later with
    an opaque `AttributeError` on `None`.
    """
    stel = update_stel(folder)
    if stel is None:
        raise ConnectionError("Could not update the STEL table: no database connection")
    radcom = update_radcom(folder)
    if radcom is None:
        raise ConnectionError("Could not update the RADCOM table: no database connection")
    mosaico = update_mosaico(folder)

    # Work on explicit copies so the column assignments below never write
    # through to (or warn about) the frames returned by the update functions.
    stel = stel.loc[:, TELECOM].copy()
    radcom = radcom.loc[:, SRD].copy()
    mosaico = mosaico.loc[:, RADIODIFUSAO].copy()

    radcom["Num_Serviço"] = "231"
    radcom["Status"] = "RADCOM"
    radcom["Classe_Emissão"] = pd.NA
    radcom["Largura_Emissão"] = BW_MAP["231"]
    radcom["Entidade"] = radcom.Entidade.str.strip()
    radcom["Validade_RF"] = pd.NA
    radcom["Fonte"] = "SRD"

    stel["Status"] = "L"
    stel["Entidade"] = stel.Entidade.str.strip()
    stel["Fonte"] = "STEL"

    mosaico["Fonte"] = "MOS"
    mosaico["Classe_Emissão"] = pd.NA
    mosaico["Largura_Emissão"] = mosaico.Num_Serviço.map(BW_MAP)

    rd = (
        pd.concat([mosaico, radcom, stel])
        .sort_values(["Frequência", "Latitude", "Longitude"])
        .reset_index(drop=True)
    )
    rd = rd.drop_duplicates(keep="first").reset_index(drop=True)
    rd["BW(kHz)"] = rd.Largura_Emissão.apply(parse_bw)
    return save_df(rd, folder, "base")

In [11]:
# Folder with the local data files used by the cells below: `dados/tabular`
# under the parent of the current working directory.
folder = Path.cwd().parent / 'dados' / 'tabular'

In [12]:
es = _read_estações(folder / 'estações.zip')

In [13]:
es.head()

Unnamed: 0,Serviço,Num_Serviço,Status,Entidade,Fistel,UF,Id,Número_Estação,Latitude_Transmissor,Longitude_Transmissor,CNPJ,Validade_RF
0,TV,248,TV-C1,X-MEDIAGROUP S.A.,50410887137,AC,57dbaad053c60,,,,3211814000163,
1,TV,248,TV-C4,TELEVISAO OESTE BAIANO LTDA,6030116240,BA,57dbaad0dc4e3,322647029.0,-12.101388888889,-44.993611111111,16395923000120,2023-12-31
2,TV,248,TV-C2,TELEVISAO SANTA CRUZ LTDA,6020355110,BA,57dbaad0eb54a,322623553.0,-14.779444444444,-39.262222222222,13476833000175,2023-12-31
3,TV,248,TV-C4,TV CABRALIA LTDA,6020354903,BA,57dbaad0ef8af,322623537.0,-14.78167,-39.26167,13494265000135,2023-12-31
4,TV,248,TV-C7,FUNDACAO FUNDESUL,50011828080,BA,57dbaad1077a6,637062230.0,-16.353055555555,-39.386111111111,4188244000109,2017-08-20


In [14]:
es.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29260 entries, 0 to 29259
Data columns (total 12 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   Serviço                29260 non-null  object
 1   Num_Serviço            29260 non-null  object
 2   Status                 29260 non-null  object
 3   Entidade               29242 non-null  object
 4   Fistel                 29260 non-null  object
 5   UF                     29258 non-null  object
 6   Id                     29260 non-null  object
 7   Número_Estação         23117 non-null  object
 8   Latitude_Transmissor   23112 non-null  object
 9   Longitude_Transmissor  23082 non-null  object
 10  CNPJ                   29260 non-null  object
 11  Validade_RF            26721 non-null  object
dtypes: object(12)
memory usage: 2.7+ MB


In [15]:
pb = _read_plano_basico(folder / 'canais.zip')

In [16]:
pb.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47455 entries, 0 to 47454
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Id                 47455 non-null  object
 1   Município          47447 non-null  object
 2   Frequência         47455 non-null  object
 3   Classe             47435 non-null  object
 4   Serviço            47455 non-null  object
 5   Entidade           47409 non-null  object
 6   Latitude_Estação   47451 non-null  object
 7   Longitude_Estação  47451 non-null  object
 8   UF                 44764 non-null  object
 9   Status             47455 non-null  object
 10  CNPJ               47449 non-null  object
 11  Fistel             47449 non-null  object
dtypes: object(12)
memory usage: 4.3+ MB


In [17]:
pb.head()

Unnamed: 0,Id,Município,Frequência,Classe,Serviço,Entidade,Latitude_Estação,Longitude_Estação,UF,Status,CNPJ,Fistel
0,57dbaad053c60,Mâncio Lima,539,C,TV,X-MEDIAGROUP S.A.,-76141666666667,-72895833333333,AC,TV-C1,3211814000163,50410887137
1,57dbaad0dc4e3,Barreiras,79,A,TV,TELEVISAO OESTE BAIANO LTDA,-12102222222222,-44994444444444,BA,TV-C4,16395923000120,6030116240
2,57dbaad0eb54a,Itabuna,69,A,TV,TELEVISAO SANTA CRUZ LTDA,-14780555555555,-39261944444444,BA,TV-C2,13476833000175,6020355110
3,57dbaad0ef8af,Itabuna,177,B,TV,TV CABRALIA LTDA,-1478,-39260833333333,BA,TV-C4,13494265000135,6020354903
4,57dbaad1077a6,Porto Seguro,515,C,TV,FUNDACAO FUNDESUL,-16353055555555,-39386111111111,BA,TV-C7,4188244000109,50011828080


In [18]:
from urllib.request import Request, urlopen
from urllib.error import URLError
# Manual download of the stations archive into the current working directory.
# NOTE(review): this cell reads `ESTACAO`, whereas `update_mosaico` downloads
# from `ESTACOES` — confirm which constant is intended.
req = Request(ESTACAO)
try:
    response = urlopen(req)
except URLError as e:
    # HTTPError subclasses URLError and exposes both `code` and `reason`, so
    # `code` has to be tested first or the HTTP branch is never reached.
    if hasattr(e, 'code'):
        print('The server couldn\'t fulfill the request.')
        print('Error code: ', e.code)
    else:
        print('We failed to reach a server.')
        print('Reason: ', e.reason)
else:
    # Close the HTTP response once its body has been written to disk.
    with response:
        Path.cwd().joinpath('estações.zip').write_bytes(response.read())

We failed to reach a server.
Reason:  [WinError 10060] Uma tentativa de conexão falhou porque o componente conectado não respondeu
corretamente após um período de tempo ou a conexão estabelecida falhou
porque o host conectado não respondeu


In [22]:
# Rebuild the "mosaico" table from the frames already loaded above (`es`, `pb`)
# without downloading anything: left-join stations with the channel plan on "Id".
df = es.merge(pb, on='Id', how='left')
# `clean_mosaico` is called here with the dataframe first and the folder second.
df = clean_mosaico(df, folder)
# `save_df` returns the dataframe after `format_types` has been applied to it.
df = save_df(df, folder, "mosaico")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Frequência"] = df.Frequência.str.replace(",", ".")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Frequência"] = df.Frequência.astype("float")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[df.Serviço == "OM", "Frequência"] = df.loc[
A value is trying to be set on a copy of a slice f

In [32]:
import whylogs as why
print(why.__version__)
# Profile the saved Mosaico dataframe with whylogs and display the per-column
# summary (counts, types, cardinality, distribution, ...) as a pandas table.
results = why.log(df)
mosaico_view = results.view()
mosaico_view.to_pandas()

1.0.2


Unnamed: 0_level_0,counts/n,counts/null,types/integral,types/fractional,types/boolean,types/string,types/object,cardinality/est,cardinality/upper_1,cardinality/lower_1,...,distribution/q_05,distribution/q_10,distribution/q_25,distribution/median,distribution/q_75,distribution/q_90,distribution/q_95,distribution/q_99,frequent_items/frequent_strings,type
column,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Num_Serviço,29260,0,0,0,0,29260,0,8.0,8.0004,8.0,...,,,,,,,,,"[FrequentItem(value='801', est=13790, upper=13...",SummaryType.COLUMN
Classe,29260,17,0,0,0,29243,0,13.0,13.000649,13.0,...,,,,,,,,,"[FrequentItem(value='C', est=20887, upper=2088...",SummaryType.COLUMN
Latitude,29260,0,0,29260,0,0,0,19444.85448,19699.37697,19196.523396,...,-28.32806,-26.87056,-22.545,-18.46558,-9.166667,-4.943889,-3.09961,-0.38194,,SummaryType.COLUMN
Coordenadas_do_Município,29260,0,0,0,29260,0,0,,,,...,,,,,,,,,,SummaryType.COLUMN
UF,29260,2,0,0,0,29258,0,28.000002,28.0014,28.0,...,,,,,,,,,"[FrequentItem(value='MG', est=5370, upper=5370...",SummaryType.COLUMN
Município,29260,3,0,0,0,29257,0,3929.896192,3981.33638,3879.707317,...,,,,,,,,,"[FrequentItem(value='Brasília', est=94, upper=...",SummaryType.COLUMN
Serviço,29260,0,0,0,0,29260,0,8.0,8.0004,8.0,...,,,,,,,,,"[FrequentItem(value='RTVD', est=13789, upper=1...",SummaryType.COLUMN
Fistel,29260,0,0,0,0,29260,0,29162.053649,29543.768953,28789.623792,...,,,,,,,,,[],SummaryType.COLUMN
CNPJ,29260,0,0,0,0,29260,0,6014.874947,6093.606365,5938.058718,...,,,,,,,,,"[FrequentItem(value='00530352000159', est=1752...",SummaryType.COLUMN
Entidade,29260,18,0,0,0,29242,0,5960.202958,6038.218749,5884.084946,...,,,,,,,,,"[FrequentItem(value='CAMARA DOS DEPUTADOS', es...",SummaryType.COLUMN


In [33]:
# NOTE(review): `read_radcom` is not defined or imported in the cells shown, and
# this cell's recorded output is a NameError — TODO import it from its module
# or drop the cell so the notebook survives Restart & Run All.
radcom = read_radcom(folder)

NameError: name 'read_radcom' is not defined

In [None]:
from nbdev.export import notebook2script; notebook2script()