In [87]:
import sys
import io
import traceback
import csv
from pathlib import Path
import pandas as pd
import numpy as np 
from openpyxl import load_workbook 
import config as cfg
import eda_tools as tls

load config

In [88]:
# configuration load
log_path = Path(Path.cwd().parent /  r"config/config.json")
if not log_path.exists(): 
    print(f"Arquivo de configura√ß√£o n√£o encontrado !\n{log_path}")
    sys.exit(1)
log = cfg.config_log(log_path)
cfg.load_config() 
pd.set_option('future.no_silent_downcasting', True)

load sheets

In [89]:
# sheets dfs

df_tables = pd.read_excel(cfg.eda_sheet_full_path,sheet_name='tables')
df_fields = pd.read_excel(cfg.eda_sheet_full_path,sheet_name='fields', header=1)
df_fields = df_fields.astype('object')
df_fields.set_index(["table", "field"], inplace=True)

# format headers
df_tables.columns = df_tables.columns.str.strip().str.lower()
df_fields.columns = df_fields.columns.str.strip().str.lower()

# open sheet
try:
    wb = load_workbook(cfg.eda_sheet_full_path)
    tables_sheet = wb["tables"]
    fields_sheet = wb["fields"]
except FileNotFoundError:
    print("Erro ao abrir planilhas")
    exit(1)

collect describes

In [90]:
# Estatisticas de contagem por tipo de conteudo do campo 
from numpy import dtype


df_stats = pd.DataFrame(columns=['table','field','stat','value'])
stats_collected = []
for index, table in df_tables.iterrows():
    # load_data
    table_name = table['table']
    print(f"-----{table_name}-----")
    data_path = Path(cfg.data_file_path / table['file'])
    df_dados = pd.read_csv(data_path,encoding=tls.encode(data_path),quotechar=None,quoting=3,keep_default_na=True,sep=cfg.csv_sep,engine='python',dtype={'damesano': 'str'})
    df_dados.columns = df_dados.columns.str.strip().str.lower()
    table_count = len(df_dados)

    # Counts by content type 
    df_types = df_dados.map(tls.classify_content)
    df_types.apply(pd.Series.value_counts)

    for field in df_types.columns:
        counts = df_types[field].value_counts(dropna=False)
        for stat, value in counts.items():
            stats_collected.append({"table": table_name, "field": field, "stat": stat, "value": int(value)})

    # describe stats
    df_fields_table = df_fields.reset_index()
    df_fields_table = df_fields_table[df_fields_table['table'] == table_name]
    for idx, fld in df_fields_table.iterrows(): 
        field_name = fld['field']
        if fld['type'] == "str":
            field_series = df_dados[field_name]             
            field_series = field_series.astype('object')            
        else: 
           field_series = pd.to_numeric(df_dados[field_name], errors='coerce')
        # count 
        stats_collected.append({"table": table_name, "field": field_name, "stat": "count" , "value": table_count})
        # min
        stats_collected.append({"table": table_name, "field": field_name, "stat": "min" , "value": field_series.min()})
        # max 
        stats_collected.append({"table": table_name, "field": field_name, "stat": 'max', "value": field_series.max()})
        # mean 
        try: 
            stats_collected.append({"table": table_name, "field": field_name, "stat": "mean", "value": field_series.mean()})        
        except: 
            stats_collected.append({"table": table_name, "field": field_name, "stat": "mean", "value": "no number"})
        # std
        try:        
            stats_collected.append({"table": table_name, "field": field_name, "stat":"std", "value": field_series.std()}) 
        except: 
            stats_collected.append({"table": table_name, "field": field_name, "stat":"std", "value": "no number"})    
        # nunique    
        stats_collected.append({"table": table_name, "field": field_name, "stat":"unique", "value": field_series.nunique()})
        # top
        try: 
            stats_collected.append({"table": table_name, "field": field_name, "stat":"top", "value": field_series.mode().iloc[0]})
        except: 
            stats_collected.append({"table": table_name, "field": field_name, "stat":"top", "value": "no value"})
        # freq
        try: 
            stats_collected.append({"table": table_name, "field": field_name, "stat":"freq", "value": field_series.value_counts().iloc[0]})
        except: 
           stats_collected.append({"table": table_name, "field": field_name, "stat":"freq", "value": "no value"})
        # q1, q2, q3
        try: 
            q_values = field_series.quantile([0.25, 0.5, 0.75])
            stats_collected.append({"table": table_name, "field": field_name, "stat":"q1(25%)", "value": q_values.iloc[0]})
            stats_collected.append({"table": table_name, "field": field_name, "stat":"q2(50%)", "value": q_values.iloc[1]})
            stats_collected.append({"table": table_name, "field": field_name, "stat":"q3(75%)", "value": q_values.iloc[2]})
        except: 
            stats_collected.append({"table": table_name, "field": field_name, "stat":"q1(25%)", "value": "no value"})
            stats_collected.append({"table": table_name, "field": field_name, "stat":"q2(50%)", "value": "no value"})
            stats_collected.append({"table": table_name, "field": field_name, "stat":"q3(75%)", "value": "no value"})
        # format

        try: 
            if not pd.isna(fld['regex']): 
                if fld['type'] == 'int': 
                    field_series_num = pd.to_numeric(field_series, errors='coerce')
                    field_series = field_series_num.round(0).astype('Int64')
                regex_pattern = fld['regex']
                for valor in field_series: 
                    print(valor) 
                bool_series = field_series.astype(str).str.strip().str.match(regex_pattern, na=False)
                qty_formats = bool_series.sum()
                stats_collected.append({"table": table_name, "field": field_name, "stat":"valid_format", "value": qty_formats})                
            else: 
                stats_collected.append({"table": table_name, "field": field_name, "stat":"valid_format", "value": "no format"})
        except Exception as e:
            exc_type, exc_obj, exc_tb = sys.exc_info() 
            print("üõë Erro format!")
            print(f"üìù Tipo de Erro: {type(e).__name__}")
            linha_do_erro = exc_tb.tb_lineno            
            print(f"üëâ Linha do C√≥digo que Gerou o Erro: {linha_do_erro}")
            print("\n--- Traceback Completo ---")

            stats_collected.append({"table": table_name, "field": field_name, "stat":"valid_format", "value": "err"})
        # list
        try: 
            if not pd.isna(fld['list']): 
                values_lst = fld['list'].split(';')
                field_series_str = field_series.astype(str)
                bool_series = field_series_str.isin(values_lst)
                stats_collected.append({"table": table_name, "field": field_name, "stat":"valid_on_list", "value": bool_series.sum()})                
            else: 
                stats_collected.append({"table": table_name, "field": field_name, "stat":"valid_on_list", "value": "no list"})                
        except Exception as e:
            exc_type, exc_obj, exc_tb = sys.exc_info() 
            print("üõë Erro list!")
            print(f"üìù Tipo de Erro: {type(e).__name__}")
            linha_do_erro = exc_tb.tb_lineno            
            print(f"üëâ Linha do C√≥digo que Gerou o Erro: {linha_do_erro}")
            print("\n--- Traceback Completo ---")

            stats_collected.append({"table": table_name, "field": field_name, "stat":"valid_on_list", "value": "err"})
        # range 
        try: 
            if fld['type'] == "int" or fld['type'] == "float":
                if not pd.isna(fld['range']): 
                    
                    ranges_list = fld['range']
                    range_lst = ranges_list.split(';') 
                    min_limit_str, max_limit_str = range_lst[0], range_lst[1]
                    min_limit = float(min_limit_str)
                    max_limit = float(max_limit_str)                    
                    field_series_num = pd.to_numeric(field_series, errors='coerce')
                    bool_series = field_series_num.between(min_limit, max_limit, inclusive='both')
                    qty_range = bool_series.sum()
                    stats_collected.append({"table": table_name, "field": field_name, "stat":"valid_on_range", "value": qty_range})                
                else: 
                    stats_collected.append({"table": table_name, "field": field_name, "stat":"valid_on_range", "value": "no range"})
            else: 
                stats_collected.append({"table": table_name, "field": field_name, "stat":"valid_on_range", "value": "no number"})
        except Exception as e: 
            exc_type, exc_obj, exc_tb = sys.exc_info()
            print("üõë Erro range!")
            print(f"üìù Tipo de Erro: {type(e).__name__}")
            linha_do_erro = exc_tb.tb_lineno            
            print(f"üëâ Linha do C√≥digo que Gerou o Erro: {linha_do_erro}")
            print("\n--- Traceback Completo ---")
            traceback.print_exc()
  
            stats_collected.append({"table": table_name, "field": field_name, "stat":"valid_on_range", "value": "err"})    
    # stats consolidation     
    df_stats = pd.concat([df_stats,pd.DataFrame(stats_collected)])   
df_stats = df_stats.astype('object')

df_stats_pivot = (df_stats.pivot_table(index=["table","field"], columns="stat", values="value", fill_value=0,aggfunc='first').reset_index())
df_stats_pivot_para_update = df_stats_pivot.set_index(["table", "field"])
df_fields.update(df_stats_pivot_para_update)
col_list = ['nulls', 'blanks', 'int', 'float', 'str', 'date']
df_fields[col_list] = df_fields[col_list].fillna(0)

-----ses_cias-----
-----ses_ramos-----
-----ses_seguros-----
200009
200009
200009
200009
200009
200009
200009
200009
<NA>
200009
100009
200009
200009
200009
200009
200009
200009
200009
200009
200009
200009
200009
200009
200009
200009
200009
200009
200009
200009
200009
200009
200009
200009
200009
200009
200009
200009
200009
200009
200009
200009
200009
200009
200009
200009
199501
199501
199501
199501
110001
199501
199501
199501
199501
199501
199501
199501
199501
199501
199501
199501
199501
199501
199501
199501
199501
199501
199501
199501
199501
199501
199501
199501
199501
199501
199501
199501
199501
199501
199501
199501
199501
199501
199501
199501
199501
199501
199501
199501
199501
199501
199501
199501
199501
199501
199501
199501
199501
199501
199501


In [91]:
df_fld = df_fields.reset_index()
col_series = df_fld.columns 

for idx_row, fld_row in df_fld.iterrows(): 
    line = int(idx_row) + 3
    for col_name in col_series: 
        col = col_series.get_loc(col_name) + 1

        fields_sheet.cell(row=line, column=col).value = fld_row[col_name]
        # comment_txt = Comment("Teste", "eda_to_excel")
        # fields_sheet.cell(row=line, column=col).comment = comment_txt

wb.save(cfg.eda_sheet_full_path)