In [2]:
import pandas as pd
import pandas_gbq as pd_gbq
import jinja2
import os

import time

# function to get current date and time
def get_current_time():
    from datetime import datetime
    return "[" + datetime.now().strftime("%Y-%m-%d %H:%M:%S") + "]"

def read_gbq_(query):
  project_id = 'sfwthr2a4shdyuogrt3jjtygj160rs' # ri-nonprod
  print(f'{get_current_time()} Getting dataset from BQ...')
  return pd_gbq.read_gbq(query, progress_bar_type='tqdm',
                         use_bqstorage_api=True,
      project_id=project_id)

def read_gbq_from_template(template_query, dict_query):
  query = template_query
  if dict_query:
      from jinja2 import Template
      # Reads a query from a template and returns the query with the variables replaced
      # template_query: query as string, may use jinja2 templating
      # dict_query: dictionary of query parameters, to render from the template with jinja2
      query = Template(template_query).render(dict_query)
  return read_gbq_(query)



In [4]:
query = """
select * from  `dataplatform-prd.master_contact.aux_tam_final_nomes` 
where 1=1 {{add_filter}}
"""


dates = [ 
 '2024-06-30', #0
 '2024-05-31', #1 
 '2024-04-30', #2
 '2024-03-31',
 '2024-02-29',
 '2024-01-31',
 '2023-12-31',
 '2023-11-30',
 '2023-10-31',
 '2023-09-30']

date = dates[0]
anomes = date[:7].replace('-', '')

In [5]:
import pandas as pd
def read_nomes_agrupados(anomes):
    df = pd.read_parquet(f'data/agrupamento_nomes_{anomes}.parquet')
    df = df[['nome_master', 'inicio', 'resultado_names']]
    return df
    

In [6]:
df = read_gbq_from_template(query, {'add_filter': f'AND reference_month = "{date}"'})#.merge(read_nomes_agrupados(anomes), how='left', on=['nome_master', 'inicio'])

[2024-09-02 21:33:25] Getting dataset from BQ...
Downloading: 100%|[32m██████████[0m|


In [7]:
df

Unnamed: 0,reference_month,merchant_market_hierarchy_id,subs_asterisk,nome_master_com_espaco,nome_master,merchant_tax_id,cpf,cpf_brasil,cnpj,numero_inicio,id_ton,cod_muni,inicio
0,2024-06-30,507496538,Outros_Pags,VERAMARTINS,VERAMARTINS,,,,,,,4115200,VERAMA
1,2024-06-30,852891984,Outros_Pags,ELISIANEFABIANA,ELISIANEFABIANA,,,,,,,4316907,ELISIA
2,2024-06-30,836749092,Outros_Pags,WEUDESGONCALVES,WEUDESGONCALVES,,,,,,,2100055,WEUDES
3,2024-06-30,551492703,Outros_Pags,MARIAHELENADA,MARIAHELENADA,,,,,,,3515707,MARIAH
4,2024-06-30,706579219,Outros_Pags,SILVANAOLIVEIRA,SILVANAOLIVEIRA,,,,,,,3515004,SILVAN
...,...,...,...,...,...,...,...,...,...,...,...,...,...
19812642,2024-06-30,487198462,Outros_Pags,MISTURAMODAS,MISTURAMODAS,19002729000125,,,19002729000125,,,4318705,MISTUR
19812643,2024-06-30,748423592,Outros_Pags,BODYFITACADEMIA,BODYFITACADEMIA,40872032000100,,,40872032000100,,,3506003,BODYFI
19812644,2024-06-30,775999329,Outros_Pags,LUZIANACANDIDADE,LUZIANACANDIDADE,,,,,,,5215504,LUZIAN
19812645,2024-06-30,608041316,Outros_Pags,MICHELEBATISTADA,MICHELEBATISTADA,,,,,,,2307650,MICHEL


## df = df.merge(read_nomes_agrupados(anomes), how='left', on=['nome_master', 'inicio'])

In [8]:
from group_tam_id import (
    assign_group_ids,
    load_to_gbq,
    execute_with_context,
    init_group_id, 
    deal_merged_places, 
    deal_merged_docs,
    deal_unmerged_places, 
    #choose_prefered_document, 
    #create_agrupamento_inspecao, 
    #grouped_subs_asterisk, 
    #final_ajustes
     )

In [9]:
import pandas as pd
from warnings import filterwarnings
filterwarnings('ignore')
from itertools import cycle
import numpy as np
#%%
def is_cnpj_valido(cnpj: str) -> bool:
    LENGTH_CNPJ = 14
    if len(cnpj) != LENGTH_CNPJ:
        return False

    if cnpj in (c * LENGTH_CNPJ for c in "1234567890"):
        return False

    cnpj_r = cnpj[::-1]
    for i in range(2, 0, -1):
        cnpj_enum = zip(cycle(range(2, 10)), cnpj_r[i:])
        dv = sum(map(lambda x: int(x[1]) * x[0], cnpj_enum)) * 10 % 11
        if cnpj_r[i - 1:i] != str(dv % 10):
            return False

    return True


def is_cpf_valido(cpf: str) -> bool:
    TAMANHO_CPF = 11
    if len(cpf) != TAMANHO_CPF:
        return False

    if cpf in (c * TAMANHO_CPF for c in "1234567890"):
        return False

    cpf_reverso = cpf[::-1]
    for i in range(2, 0, -1):
        cpf_enumerado = enumerate(cpf_reverso[i:], start=2)
        dv_calculado = sum(map(lambda x: int(x[1]) * x[0], cpf_enumerado)) * 10 % 11
        if cpf_reverso[i - 1:i] != str(dv_calculado % 10):
            return False

    return True


def fix_cnpjs_errados(data, doc_col='cpf_cnpj'):
    '''
    A master faz um padding 14 digitos para
    muitos CPFs. Precisamos corrigir isso.
    Além disso, durante o merge com documento raiz
    alguns desses casos foram identificados com TPV
    e tbm precisam ser corrigidos.
    Nota: O método de validação de CNPJs valida muitos CPFs
    como CNPJ. Por isso fazemos o processo inverso.
    '''
    data['cpf_em_potencial'] = False
    data['cpf_em_potencial'] = np.where(
        (~data[doc_col].isna())\
            & (data[doc_col].str.startswith('000')) \
                & (data[doc_col].str.len()==14),
        True,
        False
    )
    data.loc[(data['cpf_em_potencial']),'cpf_valido'] = \
        data.loc[(data['cpf_em_potencial']),doc_col].apply(lambda x : is_cpf_valido(x[3:]))
    data['cpf_valido'] = data['cpf_valido'].fillna(False)
    
    data.loc[(data['cpf_valido']), doc_col] = \
        data.loc[(data['cpf_valido']), doc_col].apply(lambda x : x[3:])
    

    data.drop(
        columns=['cpf_em_potencial','cpf_valido'],
        inplace=True
    )

    
    return data

In [10]:
anomes = '202404'
df = pd.read_parquet(f'data/agrupamento_nomes_{anomes}.parquet')

import pandas as pd
import os
from tqdm.notebook import tqdm

def process_in_chunks(df, LEVEL_GROUP, output_dir="temp_dir"):
    from tqdm import tqdm

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    
    # Conta o número de grupos
    num_groups = df.groupby(LEVEL_GROUP).ngroups

    # Cria a barra de progresso
    progress_bar = tqdm(total=num_groups, desc="Processando grupos", unit="grupo")

    for group_values, group_df in df.groupby(LEVEL_GROUP):
        # Gera o nome do arquivo baseado nos valores do grupo
        filename = f"{output_dir}/processed_group_{group_values}.parquet"
        
        # Verifica se o arquivo já existe
        if os.path.exists(filename):
            progress_bar.update(1)  # Atualiza a barra de progresso
            continue  # Pula o processamento deste grupo

        # Processa e salva o resultado se o arquivo ainda não existir
        processed_df = group_df
        processed_df.to_parquet(filename, index=False)
        
        # Atualiza a barra de progresso
        progress_bar.update(1)
    
    progress_bar.close()

    # Carrega e concatena os resultados processados
    result_files = os.listdir(output_dir)
    result = pd.concat([pd.read_parquet(f"{output_dir}/{file}") for file in result_files])
    
    return result

In [10]:
process_in_chunks(df, ['cod_muni'], output_dir=f"data/temp_dir_{anomes}")

Processando grupos: 100%|██████████| 5571/5571 [00:48<00:00, 115.81grupo/s]


Unnamed: 0,cod_muni,nome_master,inicio,resultado_names,agrupamento_nome_1
0,2100055,AAACARVALHOGRA,AAACAR,[AAACARVALHOGRA],AAACARVALHOGRA2100055
1,2100055,AADESOUSANETOL,AADESO,[AADESOUSANETOL],AADESOUSANETOL2100055
2,2100055,AAVANTEENGENHARIA,AAVANT,[AAVANTEENGENHARIA],AAVANTEENGENHARIA2100055
3,2100055,ABACAXI,ABACAX,[ABACAXI],ABACAXI2100055
4,2100055,ABACONSTRU,ABACON,[ABACONSTRU],ABACONSTRU2100055
...,...,...,...,...,...
11922,3505500,ZHAOYINGLUE,ZHAOYI,[ZHAOYINGLUE],ZHAOYINGLUE3505500
11923,3505500,ZIINZIINCOMERCIOD,ZIINZI,[ZIINZIINCOMERCIOD],ZIINZIINCOMERCIOD3505500
11924,3505500,ZIURIMPORTS,ZIURIM,[ZIURIMPORTS],ZIURIMPORTS3505500
11925,3505500,ZOIOTATTOOACADEMY,ZOIOTA,[ZOIOTATTOOACADEMY],ZOIOTATTOOACADEMY3505500


In [12]:
input_date = '202404'
directory = f'temp_dir_{input_date}'
files = [x for x in os.listdir(directory) if x.endswith('.parquet')]
for file in files:
    os.remove(f'{directory}/{file}')
    print(f'File {file} removed from {directory}')

File processed_group_2100055.parquet removed from temp_dir_202404
File processed_group_4117271.parquet removed from temp_dir_202404
File processed_group_2210979.parquet removed from temp_dir_202404
File processed_group_2609006.parquet removed from temp_dir_202404
File processed_group_4217550.parquet removed from temp_dir_202404
File processed_group_3538303.parquet removed from temp_dir_202404
File processed_group_2202455.parquet removed from temp_dir_202404
File processed_group_3306206.parquet removed from temp_dir_202404
File processed_group_4205001.parquet removed from temp_dir_202404
File processed_group_2401107.parquet removed from temp_dir_202404
File processed_group_4103800.parquet removed from temp_dir_202404
File processed_group_3103306.parquet removed from temp_dir_202404
File processed_group_4125209.parquet removed from temp_dir_202404
File processed_group_2929404.parquet removed from temp_dir_202404
File processed_group_3109907.parquet removed from temp_dir_202404
File proce