In [93]:
from group_tam_id import (
    assign_group_ids,
    open_ton, load_to_gbq, create_doc_final, open_init_data,
    execute_with_context,
    init_group_id, 
    deal_merged_places, 
    deal_unmerged_places, 
    choose_prefered_document, 
    create_agrupamento_inspecao, 
    grouped_subs_asterisk, 
    final_ajustes)

In [94]:
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import os

import time

from projectutils import read_data 
import sys


In [95]:
def execute(func, df, display_flag=True, verbose=True, LEVEL_GROUP=None, use_level=False, **kwargs):
    """
    Run one processing step on a DataFrame, time it, and print group diagnostics.

    Parameters:
    - func: The processing function to execute. It should accept a DataFrame and return a DataFrame.
    - df: The DataFrame to process.
    - display_flag: Boolean flag indicating whether to display the DataFrame after processing.
    - verbose: Boolean flag indicating whether to print the start/end messages and the timing.
    - LEVEL_GROUP: Value forwarded to ``func`` as the ``LEVEL_GROUP`` keyword when ``use_level`` is True.
    - use_level: Boolean flag indicating whether ``LEVEL_GROUP`` is forwarded to ``func``.
    - **kwargs: Additional keyword arguments to pass to the function.

    Returns:
    - The DataFrame returned by ``func``.

    Notes:
    - The group diagnostics below are emitted whenever the result has the relevant
      columns, independently of ``verbose``.
    - ``display`` is the IPython display hook available inside the notebook.
    """
    # Not every callable has __name__ (e.g. functools.partial), so fall back to its repr.
    func_name = getattr(func, '__name__', repr(func))

    # perf_counter is monotonic, so the measured duration is not affected by clock adjustments.
    start_time = time.perf_counter()

    if verbose:
        print(f'_______________\nExecuting function {func_name}...')

    if use_level:
        df = func(df, LEVEL_GROUP=LEVEL_GROUP, **kwargs)
    else:
        df = func(df, **kwargs)

    execution_time = time.perf_counter() - start_time

    if verbose:
        print(f'Function {func_name} executed successfully in {execution_time:.2f} seconds.')

    if 'group_id' in df.columns:
        # Longest group_id string: a quick signal that ids are being concatenated too much.
        group_id_lengths = df['group_id'].str.len()
        print(f'Result Max group id size: {group_id_lengths.max()}')

        # Largest number of distinct merchant_market_hierarchy_id values inside a single group_id.
        display(df.groupby('group_id').agg({
            'merchant_market_hierarchy_id': 'nunique'
        }).max())

    if 'group_id_index' in df.columns:
        sizes = df.groupby('group_id_index').ngroups
        print(f'qtd names group id: {sizes}')

    if display_flag:
        display(df)
    return df


def execute_with_context(LEVEL_GROUP, display_flag=False, verbose=False):
    """
    Build a runner that applies pipeline steps through `execute` with fixed settings.

    Parameters:
    - LEVEL_GROUP: Value forwarded to every step whose signature declares a `LEVEL_GROUP` parameter.
    - display_flag: Forwarded to `execute` for every step.
    - verbose: Forwarded to `execute` for every step.

    Returns:
    - A function `execute_context(func, df, **kwargs)` that runs `func` on `df` and returns its result.
    """
    import inspect

    def execute_context(func, df, **kwargs):
        # Only steps that explicitly declare LEVEL_GROUP receive it.
        accepts_level = 'LEVEL_GROUP' in inspect.signature(func).parameters
        if not accepts_level:
            return execute(func, df, display_flag=display_flag, verbose=verbose, use_level=False, **kwargs)
        return execute(func, df, display_flag=display_flag, verbose=verbose, LEVEL_GROUP=LEVEL_GROUP,
                       use_level=True, **kwargs)

    return execute_context

In [96]:
# Load the main dataset from the local parquet file.
df = pd.read_parquet('data/main_data.parquet')




In [97]:
# Count rows per (nome_master, cod_muni, nome_muni, uf) combination.
s = df[['nome_master', 'cod_muni', 'nome_muni', 'uf']].value_counts()

In [98]:
# Spot-check 20 random combinations that occur between 11 and 19 times.
# NOTE(review): no random_state is passed, so the sample differs on every run.
s[(s>10) & (s<20)].sample(20)

nome_master          cod_muni  nome_muni             uf
DIVINOPOLIS          3122306   Divinópolis           MG    11
MARIAEDUARDA         4314902   Porto Alegre          RS    16
RESTAURANTE          4104808   Cascavel              PR    13
AUTOPOSTOCINCOESTRE  5300108   Brasília              DF    15
CASASBAHIA           3509502   Campinas              SP    11
DROGASIL             3548500   Santos                SP    13
ALINEMODAS           3550308   São Paulo             SP    14
DROGASIL             1721000   Palmas                TO    14
DISTRIBUIDORA        2933307   Vitória da Conquista  BA    13
ODONTOCOMPANY        4106902   Curitiba              PR    18
WALLACE              3550308   São Paulo             SP    13
ODONTOLOGIA          5208707   Goiânia               GO    13
ANACLAUDIA           1302603   Manaus                AM    16
MERCADO              2910800   Feira de Santana      BA    11
JOSEFERNANDES        3550308   São Paulo             SP    14
TALITADASILVA 

In [71]:
# Rebind `df` to one intermediary partition; this replaces the frame loaded from data/main_data.parquet.
df = pd.read_parquet('data/intermediary/part_18.parquet')

In [72]:
df

Unnamed: 0,subs_asterisk,nome_master,nome_muni,uf,cpf,cpf_brasil,qtd_cpfs,qtd_cnpjs,cnpj,numero_inicio,cod_muni,merchant_tax_id,mmhid_merge,merchant_market_hierarchy_id,id_ton,nome_ton,inicio,resultado_names,agrupamento_nome_1,_merge
0,SumUp,REYTECH,São Paulo,SP,,,,1,55288394000117,,3550308,,,,,,REYTEC,[REYTECH],REYTECH3550308,both
1,SumUp,APPSERVIDORE,São Paulo,SP,,,,,,,3550308,,,,,,APPSER,[APPSERVIDORE],APPSERVIDORE3550308,both
2,SumUp,JOSEWILSONP,São Paulo,SP,17782026830,,26,10,20701303000149,,3550308,,,,,,JOSEWI,[JOSEWILSONP],"JOSEWILSONP, JOSEWILSONPEGADO3550308",both
3,SumUp,MARIAARTS,São Paulo,SP,,,,,,,3550308,,,,,,MARIAA,[MARIAARTS],MARIAARTS3550308,both
4,SumUp,CANILBUTDOGG,São Paulo,SP,,,,,,,3550308,,,,,,CANILB,[CANILBUTDOGG],CANILBUTDOGG3550308,both
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1334900,Ton,CLINICAODONTOLOGIADENISEFERES,São Paulo,SP,,,,,03670613000124,,3550308,,,,1818529.0,CLINICA ODONTOLOGIA DENISE FERES,CLINIC,[CLINICAODONTOLOGIADENISEFERES],"CLINICAODONTOLOGIA, CLINICAODONTOLOGIADENISEFE...",both
1334901,Ton,POINTDOCOYOTE,São Paulo,SP,16154481870,,,,,,3550308,,,,1818530.0,POINT DO COYOTE,POINTD,[POINTDOCOYOTE],POINTDOCOYOTE3550308,both
1334902,Ton,ANGELICADAMOTALIMA,São Paulo,SP,44972567821,,,,,,3550308,,,,1818554.0,ANGELICA DA MOTA LIMA,ANGELI,[ANGELICADAMOTALIMA],ANGELICADAMOTALIMA3550308,both
1334903,Ton,BRUNACECCHETTIESTETICA,São Paulo,SP,39824084827,,,,,,3550308,,,,1818555.0,BRUNA CECCHETTI ESTETICA,BRUNAC,[BRUNACECCHETTIESTETICA],BRUNACECCHETTIESTETICA3550308,both


In [32]:
def main_data_treat_muni(df, file_name):
    """
    Run the municipality-level grouping pipeline on one DataFrame and report the total time.

    Only the `init_group_id` step is currently enabled; the remaining steps are kept
    below as disabled calls, in the order they were written.

    Parameters:
    - df: The DataFrame to process.
    - file_name: Label identifying the input; used only in the progress messages.

    Returns:
    - The DataFrame returned by the last enabled step.
    """
    # Fixed run settings: no intermediate display, verbose timing, grouping by municipality code.
    display_flag = False
    verbose = True
    LEVEL_GROUP = ['cod_muni']

    print(f'Starting main data treat for file {file_name}, with size {len(df)}')

    start = time.time()
    execute_ctx = execute_with_context(LEVEL_GROUP, display_flag=display_flag, verbose=verbose)

    df = execute_ctx(init_group_id, df)
    # Later pipeline steps, currently disabled:
    #df = execute_ctx(deal_unmerged_places, df)
    #df = execute_ctx(deal_merged_places, df)

    #df = execute_ctx(choose_prefered_document, df)
    #df = execute_ctx(create_agrupamento_inspecao, df)
    #df = execute_ctx(grouped_subs_asterisk, df)
    #df = execute_ctx(final_ajustes, df)
    #df = execute_ctx(create_doc_final, df)

    print(f'Main data treat Execution time file {file_name}: {time.time() - start:.2f} seconds')
    return df

In [33]:
# Run the pipeline on the loaded partition; 'teste SP' is only a label for the log lines.
df = main_data_treat_muni(df, 'teste SP')

Starting main data treat for file teste SP, with size 1334905
_______________
Executing function init_group_id...
Function init_group_id executed successfully in 15.47 seconds.
Result Max group id size: 115


merchant_market_hierarchy_id    472
dtype: int64

Main data treat Execution time file teste SP: 20.90 seconds


In [34]:


def deal_unmerged_places(df, LEVEL_GROUP):
    """
    Give every row a place key and derive two group indices from it.

    Works on a copy of `df` and adds these columns:
    - new_id: positional row number (0..n-1).
    - mmhid_places: place key built from `mmhid_merge` (as a string), replaced by
      'Ton' + id_ton for rows whose subs_asterisk is 'Ton', and by 'created' + new_id
      for rows still without a key; the LEVEL_GROUP values are appended after ' - '.
    - group_id_idx1 / tam_group1: index assigned from `group_id` alone, and the
      number of non-null nome_master values in that index.
    - group_id_idx2 / tam_group2: index assigned from `group_id` and `mmhid_places`,
      and the number of non-null nome_master values in that index.

    Parameters:
    - df: Input DataFrame; needs mmhid_merge, subs_asterisk, id_ton, group_id,
      nome_master and the LEVEL_GROUP columns.
    - LEVEL_GROUP: List of column names appended to the place key (joined with '|'
      when there is more than one; nothing is appended when the list is empty).

    Returns:
    - The augmented copy of `df`.

    NOTE(review): how `assign_group_ids` combines several columns is defined in
    `group_tam_id` and is not visible here — confirm before relying on idx2 semantics.
    """
    out = df.copy()

    # Stringify known place ids while keeping missing ones as NaN (the source column may be a nullable int).
    out['mmhid_places'] = out['mmhid_merge'].apply(lambda v: np.nan if pd.isna(v) else str(v))
    out['new_id'] = np.arange(out.shape[0])

    is_ton = out['subs_asterisk'] == 'Ton'
    out.loc[is_ton, 'mmhid_places'] = 'Ton' + out['id_ton'].astype(str)

    # Rows without any place id get a synthetic, row-unique key.
    no_place = out['mmhid_places'].isnull()
    out.loc[no_place, 'mmhid_places'] = 'created' + out['new_id'].astype(str)

    n_levels = len(LEVEL_GROUP)
    if n_levels > 1:
        level_key = out[LEVEL_GROUP].astype(str).apply(lambda row: '|'.join(row), axis=1)
        out['mmhid_places'] = out['mmhid_places'] + ' - ' + level_key
    elif n_levels == 1:
        out['mmhid_places'] = out['mmhid_places'] + ' - ' + out[LEVEL_GROUP[0]].astype(str)

    out = assign_group_ids(out, ['group_id'], final_col='group_id_idx1')
    out['tam_group1'] = out.groupby('group_id_idx1')['nome_master'].transform('count')

    out = assign_group_ids(out, ['group_id', 'mmhid_places'], final_col='group_id_idx2')
    out['tam_group2'] = out.groupby('group_id_idx2')['nome_master'].transform('count')

    return out

# Apply the place-key step to the current frame, using the municipality code as the level.
df = deal_unmerged_places(df, LEVEL_GROUP=['cod_muni'])
#df4['len_resultado'] = df4.groupby('group_id_index')['nome_master'].transform('count')
#df4[df4['len_resultado']>1000]['agrupamento_nome_1'].unique()

In [35]:
def deal_merged_places(df, LEVEL_GROUP):
    """
    Split `group_id_idx2` groups that contain more than one distinct place id.

    A place id is the `merchant_market_hierarchy_id` of rows whose `subs_asterisk`
    is in the module-level `OUTROS` list. For every (LEVEL_GROUP, group_id_idx2)
    group holding more than one distinct place id:
    - OUTROS rows are keyed by their own place id;
    - all other rows are keyed by the first place id found in the group.
    Groups with at most one place id keep their `group_id_idx2` key. The keys are
    then re-indexed with `assign_group_ids` into `group_id_idx3`.

    Parameters:
    - df: Input DataFrame; needs subs_asterisk, merchant_market_hierarchy_id,
      group_id_idx2, nome_master and the LEVEL_GROUP columns.
    - LEVEL_GROUP: List of column names that, with group_id_idx2, identify a group.

    Returns:
    - A new DataFrame with `group_id_idx3` and `tam_group3` (non-null nome_master
      count per group_id_idx3) added. The helper column `mmhid_places` is dropped,
      including any pre-existing column of that name.

    Notes:
    - Depends on the global `OUTROS`, which must be defined before the call.
    - The merge resets the index of the returned frame.
    """
    df = df.copy()
    group_keys = list(LEVEL_GROUP) + ['group_id_idx2']

    # Place id is only meaningful for OUTROS rows; everything else is left missing.
    # `.where` avoids the index-aligned assignment, which fails on a duplicated index.
    df['mmhid_places'] = df['merchant_market_hierarchy_id'].where(df['subs_asterisk'].isin(OUTROS))

    # Per-group lookup: first place id and number of distinct place ids.
    # Built from the needed columns only, instead of copying the whole frame.
    place_stats = (
        df.loc[df['mmhid_places'].notnull(), group_keys + ['mmhid_places']]
        .drop_duplicates()
        .groupby(group_keys)
        .agg(
            unique_mmhid_places=('mmhid_places', 'first'),
            nunique_mmhid_places=('mmhid_places', 'nunique')
        )
        .reset_index()
    )

    df = df.merge(place_stats, on=group_keys, how='left')

    # Masks are computed after the merge because the merge resets the index.
    is_outros = df['subs_asterisk'].isin(OUTROS)
    has_many_places = df['nunique_mmhid_places'] > 1

    # Cast to object first: composite string keys are written into this column below,
    # and writing strings into a numeric column through .loc is a deprecated upcast.
    df['group_id_idx3'] = df['group_id_idx2'].astype(object)
    base_key = df['group_id_idx3'].astype(str)

    df.loc[has_many_places & is_outros, 'group_id_idx3'] = (
        base_key + ' - ' + df['mmhid_places'].astype(str)
    )
    df.loc[has_many_places & ~is_outros, 'group_id_idx3'] = (
        base_key + ' - ' + df['unique_mmhid_places'].astype(str)
    )

    df = assign_group_ids(df, ['group_id_idx3'], final_col='group_id_idx3')

    df = df.drop(columns=['mmhid_places', 'unique_mmhid_places', 'nunique_mmhid_places'])
    df['tam_group3'] = df.groupby('group_id_idx3')['nome_master'].transform('count')

    return df


In [36]:
# `deal_merged_places` reads OUTROS as a module-level global, so it has to be defined before the call.
OUTROS = ['Outros', 'Outros_Pags', 'Outros_SumUp']
df = deal_merged_places(df, LEVEL_GROUP=['cod_muni'])

In [37]:
df.columns

Index(['subs_asterisk', 'nome_master', 'nome_muni', 'uf', 'cpf', 'cpf_brasil',
       'qtd_cpfs', 'qtd_cnpjs', 'cnpj', 'numero_inicio', 'cod_muni',
       'merchant_tax_id', 'mmhid_merge', 'merchant_market_hierarchy_id',
       'id_ton', 'nome_ton', 'inicio', 'grouped_names', 'agrupamento_nome_1',
       '_merge', 'group_id', 'new_id', 'group_id_idx1', 'tam_group1',
       'group_id_idx2', 'tam_group2', 'group_id_idx3', 'tam_group3'],
      dtype='object')

In [67]:
nome = 'RESTAURANTEYAMASU'

def show_dataframe(df, nome, gid):
    """
    Show every row that shares a group with the first row named `nome`.

    Parameters:
    - df: DataFrame containing `nome_master`, the `gid` column and the display columns.
    - nome: Value of `nome_master` to look up.
    - gid: Name of the grouping column whose value defines the group to show.

    Returns:
    - The rows whose `gid` equals that of the first `nome` match, restricted to
      subs_asterisk, nome_master, merchant_tax_id, mmhid_merge and agrupamento_nome_1.

    Raises:
    - IndexError: if no row has `nome_master == nome`.
    """
    matches = df.loc[df['nome_master'] == nome, gid]
    if matches.empty:
        raise IndexError(f'nome_master {nome!r} not found in DataFrame')

    grupo = matches.values[0]
    return df.loc[
        df[gid] == grupo,
        ['subs_asterisk', 'nome_master', 'merchant_tax_id', 'mmhid_merge', 'agrupamento_nome_1'],
    ]

show_dataframe(df, nome=nome, gid = 'group_id_idx1')#[['mmhid_merge', 'merchant_market_hierarchy_id']].value_counts()

Unnamed: 0,subs_asterisk,nome_master,merchant_tax_id,mmhid_merge,agrupamento_nome_1
1855,Ifood,RESTAURANTEYAMASU,,,"RESTAURANTEYAMASU, RESTAURANTEYAMASUSHI3550308"
257737,Outros,RESTAURANTEYAMASUSHI,29790794000116.0,505114550.0,"RESTAURANTEYAMASU, RESTAURANTEYAMASUSHI3550308"


In [66]:
df.columns

Index(['subs_asterisk', 'nome_master', 'nome_muni', 'uf', 'cpf', 'cpf_brasil',
       'qtd_cpfs', 'qtd_cnpjs', 'cnpj', 'numero_inicio', 'cod_muni',
       'merchant_tax_id', 'mmhid_merge', 'merchant_market_hierarchy_id',
       'id_ton', 'nome_ton', 'inicio', 'grouped_names', 'agrupamento_nome_1',
       '_merge', 'group_id', 'new_id', 'group_id_idx1', 'tam_group1',
       'group_id_idx2', 'tam_group2', 'group_id_idx3', 'tam_group3'],
      dtype='object')

In [68]:
show_dataframe(df, nome='RESTAURANTEYAMASUSHI', gid = 'mmhid_merge')

Unnamed: 0,subs_asterisk,nome_master,merchant_tax_id,mmhid_merge,agrupamento_nome_1
8364,Outros,RESTAURANTE,29790794000116,505114550,"RESTAURANTE, RESTAURANTE041946898603550308"
257737,Outros,RESTAURANTEYAMASUSHI,29790794000116,505114550,"RESTAURANTEYAMASU, RESTAURANTEYAMASUSHI3550308"


In [69]:
# Rows grouped with 'RESTAURANTE' under group_id_idx1, keeping only those that carry a mmhid_merge.
# NOTE(review): this rebinds `s`, which an earlier cell used for the value_counts Series.
s = show_dataframe(df, nome='RESTAURANTE', gid = 'group_id_idx1')
s = s[s['mmhid_merge'].notnull()]
s
#[['mmhid_merge', 'merchant_market_hierarchy_id']].value_counts()

Unnamed: 0,subs_asterisk,nome_master,merchant_tax_id,mmhid_merge,agrupamento_nome_1
8339,Outros,RESTAURANTE,36589509000169,647431069,"RESTAURANTE, RESTAURANTE041946898603550308"
8340,Outros,RESTAURANTE,45663819000681,815781288,"RESTAURANTE, RESTAURANTE041946898603550308"
8341,Outros,RESTAURANTE,,866201937,"RESTAURANTE, RESTAURANTE041946898603550308"
8342,Outros,RESTAURANTE,,799200892,"RESTAURANTE, RESTAURANTE041946898603550308"
8343,Outros,RESTAURANTE,,763948377,"RESTAURANTE, RESTAURANTE041946898603550308"
...,...,...,...,...,...
8518,Outros_Pags,RESTAURANTE,41327703000106,715575740,"RESTAURANTE, RESTAURANTE041946898603550308"
8519,Outros_Pags,RESTAURANTE,46738754000144,565315603,"RESTAURANTE, RESTAURANTE041946898603550308"
8520,Outros_Pags,RESTAURANTE,28200757000148,847270011,"RESTAURANTE, RESTAURANTE041946898603550308"
8521,Outros_Pags,RESTAURANTE,47687834000180,539157007,"RESTAURANTE, RESTAURANTE041946898603550308"


In [57]:
show_dataframe(df, nome=nome, gid = 'group_id_idx2')#[['mmhid_merge', 'merchant_market_hierarchy_id']].value_counts()

Unnamed: 0,subs_asterisk,nome_master,nome_muni,uf,cpf,cpf_brasil,qtd_cpfs,qtd_cnpjs,cnpj,numero_inicio,...,agrupamento_nome_1,_merge,group_id,new_id,group_id_idx1,tam_group1,group_id_idx2,tam_group2,group_id_idx3,tam_group3
1855,Ifood,RESTAURANTEYAMASU,São Paulo,SP,,,,1,29790794000116,,...,"RESTAURANTEYAMASU, RESTAURANTEYAMASUSHI3550308",both,RESTAURANTEYAMASUSHI|São Paulo SP,1855,583,2,864830,1720,1605,128
2026,Ifood,CARLOS,São Paulo,SP,51223249808,,49359,11723,53140292000124,52561332,...,"CARLOS23136047800, CARLOS3550308",both,CARLOS|São Paulo SP,2026,677,72,864830,1720,1605,128
2027,Outros,CARLOS,São Paulo,SP,51223249808,,49359,11723,53140292000124,,...,"CARLOS23136047800, CARLOS3550308",both,CARLOS|São Paulo SP,2027,677,72,864830,1720,1605,128
2028,Outros,CARLOS,São Paulo,SP,51223249808,,49359,11723,53140292000124,,...,"CARLOS23136047800, CARLOS3550308",both,CARLOS|São Paulo SP,2028,677,72,864830,1720,1754,1
2029,Outros,CARLOS,São Paulo,SP,51223249808,,49359,11723,53140292000124,,...,"CARLOS23136047800, CARLOS3550308",both,CARLOS|São Paulo SP,2029,677,72,864830,1720,1755,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1305786,Ton,LANCHONETERESTAURANTEMAEEFILHO,São Paulo,SP,11127752855,,,,,,...,"LANCHONETERESTAURANTEMAEEFILHO, LANCHONETEREST...",both,LANCHONETERESTAURANTEMAEEFILHO|São Paulo SP,1305786,11481,15,864830,1720,1605,128
1306322,Ton,EMPORIOCOLINAS,São Paulo,SP,08896698804,,,,,,...,"EMPORIOCOLINAS, EMPORIOCOLINA3550308",both,EMPORIOCOLINAS|São Paulo SP,1306322,769177,2,864830,1720,1605,128
1319152,Ton,HAYAIRESTAURANTE,São Paulo,SP,,,,,10991077000124,,...,HAYAIRESTAURANTE3550308,both,HAYAIRESTAURANTE|São Paulo SP,1319152,955801,1,864830,1720,1605,128
1321180,Ton,SBTECHINFORMATICA,São Paulo,SP,33149421845,,,,,,...,SBTECHINFORMATICA3550308,both,SBTECHINFORMATICA|São Paulo SP,1321180,957525,1,864830,1720,1605,128


In [56]:
show_dataframe(df, nome=nome, gid = 'group_id_idx3')#[['mmhid_merge', 'merchant_market_hierarchy_id']].value_counts()

Unnamed: 0,subs_asterisk,nome_master,nome_muni,uf,cpf,cpf_brasil,qtd_cpfs,qtd_cnpjs,cnpj,numero_inicio,...,agrupamento_nome_1,_merge,group_id,new_id,group_id_idx1,tam_group1,group_id_idx2,tam_group2,group_id_idx3,tam_group3
1855,Ifood,RESTAURANTEYAMASU,São Paulo,SP,,,,1,29790794000116,,...,"RESTAURANTEYAMASU, RESTAURANTEYAMASUSHI3550308",both,RESTAURANTEYAMASUSHI|São Paulo SP,1855,583,2,864830,1720,1605,128
2026,Ifood,CARLOS,São Paulo,SP,51223249808,,49359,11723,53140292000124,52561332,...,"CARLOS23136047800, CARLOS3550308",both,CARLOS|São Paulo SP,2026,677,72,864830,1720,1605,128
2027,Outros,CARLOS,São Paulo,SP,51223249808,,49359,11723,53140292000124,,...,"CARLOS23136047800, CARLOS3550308",both,CARLOS|São Paulo SP,2027,677,72,864830,1720,1605,128
2033,PagSeguro,CARLOS,São Paulo,SP,51223249808,,49359,11723,53140292000124,,...,"CARLOS23136047800, CARLOS3550308",both,CARLOS|São Paulo SP,2033,677,72,864830,1720,1605,128
2079,MercadoPago_subPagarme,CARLOS,São Paulo,SP,13416663870,,1,0,,,...,"CARLOS23136047800, CARLOS3550308",both,CARLOS|São Paulo SP,2079,677,72,864830,1720,1605,128
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1305786,Ton,LANCHONETERESTAURANTEMAEEFILHO,São Paulo,SP,11127752855,,,,,,...,"LANCHONETERESTAURANTEMAEEFILHO, LANCHONETEREST...",both,LANCHONETERESTAURANTEMAEEFILHO|São Paulo SP,1305786,11481,15,864830,1720,1605,128
1306322,Ton,EMPORIOCOLINAS,São Paulo,SP,08896698804,,,,,,...,"EMPORIOCOLINAS, EMPORIOCOLINA3550308",both,EMPORIOCOLINAS|São Paulo SP,1306322,769177,2,864830,1720,1605,128
1319152,Ton,HAYAIRESTAURANTE,São Paulo,SP,,,,,10991077000124,,...,HAYAIRESTAURANTE3550308,both,HAYAIRESTAURANTE|São Paulo SP,1319152,955801,1,864830,1720,1605,128
1321180,Ton,SBTECHINFORMATICA,São Paulo,SP,33149421845,,,,,,...,SBTECHINFORMATICA3550308,both,SBTECHINFORMATICA|São Paulo SP,1321180,957525,1,864830,1720,1605,128


In [83]:
# NOTE(review): `df4` is not assigned in any visible cell (it only appears in commented-out lines),
# so this cell depends on kernel state from a cell that is not shown here.
df4[df4['group_id']=='RESTAURANTEYAMASUSHI|São Paulo SP'][['nome_master', 'group_id', 'agrupamento_nome_1', 'mmhid_places', 'group_id_index']]

Unnamed: 0,nome_master,group_id,agrupamento_nome_1,mmhid_places,group_id_index
1855,RESTAURANTEYAMASU,RESTAURANTEYAMASUSHI|São Paulo SP,"RESTAURANTEYAMASU, RESTAURANTEYAMASUSHI3550308",created1855,864830
257737,RESTAURANTEYAMASUSHI,RESTAURANTEYAMASUSHI|São Paulo SP,"RESTAURANTEYAMASU, RESTAURANTEYAMASUSHI3550308",505114550,864830


In [85]:
df4[df4['mmhid_places']=='505114550'][['nome_master', 'group_id', 'agrupamento_nome_1', 'mmhid_places', 'group_id_index']]

Unnamed: 0,nome_master,group_id,agrupamento_nome_1,mmhid_places,group_id_index
8364,RESTAURANTE,RESTAURANTE|São Paulo SP,"RESTAURANTE, RESTAURANTE041946898603550308",505114550,864830
257737,RESTAURANTEYAMASUSHI,RESTAURANTEYAMASUSHI|São Paulo SP,"RESTAURANTEYAMASU, RESTAURANTEYAMASUSHI3550308",505114550,864830


In [86]:
df4[df4['group_id']=='RESTAURANTE|São Paulo SP'][['nome_master', 'group_id', 'agrupamento_nome_1', 'mmhid_places', 'group_id_index']]

Unnamed: 0,nome_master,group_id,agrupamento_nome_1,mmhid_places,group_id_index
8339,RESTAURANTE,RESTAURANTE|São Paulo SP,"RESTAURANTE, RESTAURANTE041946898603550308",647431069,864830
8340,RESTAURANTE,RESTAURANTE|São Paulo SP,"RESTAURANTE, RESTAURANTE041946898603550308",815781288,864830
8341,RESTAURANTE,RESTAURANTE|São Paulo SP,"RESTAURANTE, RESTAURANTE041946898603550308",866201937,864830
8342,RESTAURANTE,RESTAURANTE|São Paulo SP,"RESTAURANTE, RESTAURANTE041946898603550308",799200892,864830
8343,RESTAURANTE,RESTAURANTE|São Paulo SP,"RESTAURANTE, RESTAURANTE041946898603550308",763948377,864830
...,...,...,...,...,...
8518,RESTAURANTE,RESTAURANTE|São Paulo SP,"RESTAURANTE, RESTAURANTE041946898603550308",715575740,864830
8519,RESTAURANTE,RESTAURANTE|São Paulo SP,"RESTAURANTE, RESTAURANTE041946898603550308",565315603,864830
8520,RESTAURANTE,RESTAURANTE|São Paulo SP,"RESTAURANTE, RESTAURANTE041946898603550308",847270011,864830
8521,RESTAURANTE,RESTAURANTE|São Paulo SP,"RESTAURANTE, RESTAURANTE041946898603550308",539157007,864830


In [68]:
# len_resultado = number of non-null nome_master values sharing the same group_id.
# NOTE(review): `df3` is not assigned in any visible cell; this relies on earlier kernel state.
df3['len_resultado'] = df3.groupby('group_id')['nome_master'].transform('count')

In [69]:
df3

Unnamed: 0,subs_asterisk,nome_master,nome_muni,uf,cpf,cpf_brasil,qtd_cpfs,qtd_cnpjs,cnpj,numero_inicio,...,mmhid_merge,merchant_market_hierarchy_id,nome_ton,inicio,grouped_names,agrupamento_nome_1,_merge,group_id_index,group_id,len_resultado
0,SumUp,REYTECH,São Paulo,SP,,,,1,55288394000117,,...,,,,REYTEC,[REYTECH],REYTECH3550308,both,1,REYTECH|São Paulo SP,1
1,SumUp,APPSERVIDORE,São Paulo,SP,,,,,,,...,,,,APPSER,[APPSERVIDORE],APPSERVIDORE3550308,both,2,APPSERVIDORE|São Paulo SP,1
2,SumUp,JOSEWILSONP,São Paulo,SP,17782026830,,26,10,20701303000149,,...,,,,JOSEWI,[JOSEWILSONP],"JOSEWILSONP, JOSEWILSONPEGADO3550308",both,3,JOSEWILSONP|São Paulo SP,1
3,SumUp,MARIAARTS,São Paulo,SP,,,,,,,...,,,,MARIAA,[MARIAARTS],MARIAARTS3550308,both,4,MARIAARTS|São Paulo SP,1
4,SumUp,CANILBUTDOGG,São Paulo,SP,,,,,,,...,,,,CANILB,[CANILBUTDOGG],CANILBUTDOGG3550308,both,5,CANILBUTDOGG|São Paulo SP,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1334900,Ton,CLINICAODONTOLOGIADENISEFERES,São Paulo,SP,,,,,03670613000124,,...,,,CLINICA ODONTOLOGIA DENISE FERES,CLINIC,[CLINICAODONTOLOGIADENISEFERES],"CLINICAODONTOLOGIA, CLINICAODONTOLOGIADENISEFE...",both,878144,CLINICAODONTOLOGICADENISEFERESLTDA|São Paulo S...,2
1334901,Ton,POINTDOCOYOTE,São Paulo,SP,16154481870,,,,,,...,,,POINT DO COYOTE,POINTD,[POINTDOCOYOTE],POINTDOCOYOTE3550308,both,878145,"VANDERLEIBATISTABARROS|São Paulo SP, POINTDOCO...",2
1334902,Ton,ANGELICADAMOTALIMA,São Paulo,SP,44972567821,,,,,,...,,,ANGELICA DA MOTA LIMA,ANGELI,[ANGELICADAMOTALIMA],ANGELICADAMOTALIMA3550308,both,878146,"ANGLICADAMOTALIMA|São Paulo SP, ANGELICADAMOTA...",2
1334903,Ton,BRUNACECCHETTIESTETICA,São Paulo,SP,39824084827,,,,,,...,,,BRUNA CECCHETTI ESTETICA,BRUNAC,[BRUNACECCHETTIESTETICA],BRUNACECCHETTIESTETICA3550308,both,594277,"BRUNACECCHETTIMENDESDASILVA|São Paulo SP, BRUN...",3


In [70]:
# Name clusters (agrupamento_nome_1) present in rows whose len_resultado exceeds 1000.
df3[df3['len_resultado']>1000]['agrupamento_nome_1'].unique()

array(['RESTAURANTEYAMASU, RESTAURANTEYAMASUSHI3550308',
       'CARLOS23136047800, CARLOS3550308',
       'RESTAURANTED, RESTAURANTEDAANA, RESTAURANTEDA, RESTAURANTEDAA3550308',
       'RESTAURANTEFRIGID, RESTAURANTEFRIGIDEIRA, RESTAURANTEFRIGIDEIR3550308',
       'SABORNORDESTINO, SABORNORDESTI, SABORNORDESTINODALU, SABORNORDESTIN3550308',
       'RCDESOUSA, RCDESOUSAALIMENTO, RCDESOUSAALIME3550308',
       'RJGOMESLANCHON, RJGOMESLANCHONETEE3550308',
       'RESTAURANTE, RESTAURANTE041946898603550308', 'PERAEACAI3550308',
       'BARDALOIRA10152151818, BARDALOIRA3550308',
       'MARIORESTAURANTE, MARIORESTAUR3550308',
       'RESTAURANTEEPIZZAR, RESTAURANTEEPIZZARIAKINJOSLTDA, RESTAURANTEEPIZZARI, RESTAURANTEEPIZZ, RESTAURANTEEPIZZA, RESTAURANTEEPIZZARIA3550308',
       'LANCHSANTAVITORIA3550308',
       'LANCHONETEEACAIDOR, LANCHONETEEACA, LANCHONETEEACAI, LANCHONETEE, LANCHONETEEA3550308',
       'RESTAURANTEGAL, RESTAURANTEGA, RESTAURANTEGALERI, RESTAURANTEGALERIA3550308',
     

In [56]:
df3


Unnamed: 0,subs_asterisk,nome_master,nome_muni,uf,cpf,cpf_brasil,qtd_cpfs,qtd_cnpjs,cnpj,numero_inicio,...,merchant_tax_id,mmhid_merge,merchant_market_hierarchy_id,nome_ton,inicio,grouped_names,agrupamento_nome_1,_merge,group_id_index,group_id
0,SumUp,REYTECH,São Paulo,SP,,,,1,55288394000117,,...,,,,,REYTEC,[REYTECH],REYTECH3550308,both,1,REYTECH|São Paulo SP
1,SumUp,APPSERVIDORE,São Paulo,SP,,,,,,,...,,,,,APPSER,[APPSERVIDORE],APPSERVIDORE3550308,both,2,APPSERVIDORE|São Paulo SP
2,SumUp,JOSEWILSONP,São Paulo,SP,17782026830,,26,10,20701303000149,,...,,,,,JOSEWI,[JOSEWILSONP],"JOSEWILSONP, JOSEWILSONPEGADO3550308",both,3,JOSEWILSONP|São Paulo SP
3,SumUp,MARIAARTS,São Paulo,SP,,,,,,,...,,,,,MARIAA,[MARIAARTS],MARIAARTS3550308,both,4,MARIAARTS|São Paulo SP
4,SumUp,CANILBUTDOGG,São Paulo,SP,,,,,,,...,,,,,CANILB,[CANILBUTDOGG],CANILBUTDOGG3550308,both,5,CANILBUTDOGG|São Paulo SP
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1334900,Ton,CLINICAODONTOLOGIADENISEFERES,São Paulo,SP,,,,,03670613000124,,...,,,,CLINICA ODONTOLOGIA DENISE FERES,CLINIC,[CLINICAODONTOLOGIADENISEFERES],"CLINICAODONTOLOGIA, CLINICAODONTOLOGIADENISEFE...",both,878144,CLINICAODONTOLOGICADENISEFERESLTDA|São Paulo S...
1334901,Ton,POINTDOCOYOTE,São Paulo,SP,16154481870,,,,,,...,,,,POINT DO COYOTE,POINTD,[POINTDOCOYOTE],POINTDOCOYOTE3550308,both,878145,"VANDERLEIBATISTABARROS|São Paulo SP, POINTDOCO..."
1334902,Ton,ANGELICADAMOTALIMA,São Paulo,SP,44972567821,,,,,,...,,,,ANGELICA DA MOTA LIMA,ANGELI,[ANGELICADAMOTALIMA],ANGELICADAMOTALIMA3550308,both,878146,"ANGLICADAMOTALIMA|São Paulo SP, ANGELICADAMOTA..."
1334903,Ton,BRUNACECCHETTIESTETICA,São Paulo,SP,39824084827,,,,,,...,,,,BRUNA CECCHETTI ESTETICA,BRUNAC,[BRUNACECCHETTIESTETICA],BRUNACECCHETTIESTETICA3550308,both,594277,"BRUNACECCHETTIMENDESDASILVA|São Paulo SP, BRUN..."


In [35]:
# Load the grouped-names partition `part_3550308` and count, for each row, how many
# non-null nome_master values share its agrupamento_nome_1.
df2 = pd.read_parquet('data/nomes_agrupados/part_3550308.parquet')
df2['len_resultado'] = df2.groupby(['agrupamento_nome_1'])['nome_master'].transform('count')
df2

Unnamed: 0,nome_master,inicio,cod_muni,resultado_names,agrupamento_nome_1,len_resultado
0,,,3550308,[],,1
1,AAAA,AAAA,3550308,[AAAA],AAAA,1
2,AAAAAAAAAAAA,AAAAAA,3550308,[AAAAAAAAAAAA],AAAAAAAAAAAA,1
3,AAAAAAAAAAOKINAWA,AAAAAA,3550308,[AAAAAAAAAAOKINAWA],AAAAAAAAAAOKINAWA,1
4,AAAAAADOMONGE,AAAAAA,3550308,[AAAAAADOMONGE],AAAAAADOMONGE,1
...,...,...,...,...,...,...
1095648,ZZDISTRIBUIDORADEGLP,ZZDIST,3550308,[ZZDISTRIBUIDORADEGLP],ZZDISTRIBUIDORADEGLP,1
1095649,ZZJOIAS,ZZJOIA,3550308,[ZZJOIAS],ZZJOIAS,1
1095650,ZZMATPARA,ZZMATP,3550308,[ZZMATPARA],ZZMATPARA,1
1095651,ZZNSHOESCALCADOSE,ZZNSHO,3550308,[ZZNSHOESCALCADOSE],ZZNSHOESCALCADOSE,1


In [37]:
df2.sort_values('len_resultado')

Unnamed: 0,nome_master,inicio,cod_muni,resultado_names,agrupamento_nome_1,len_resultado
0,,,3550308,[],,1
663321,MARCELORAYMUNDODEOL,MARCEL,3550308,[MARCELORAYMUNDODEOL],MARCELORAYMUNDODEOL,1
663322,MARCELOREBEQUI,MARCEL,3550308,[MARCELOREBEQUI],MARCELOREBEQUI,1
663323,MARCELOREGIANI,MARCEL,3550308,[MARCELOREGIANI],MARCELOREGIANI,1
663326,MARCELOREGISFERREIRA,MARCEL,3550308,[MARCELOREGISFERREIRA],MARCELOREGISFERREIRA,1
...,...,...,...,...,...,...
700533,MATERIALDE,MATERI,3550308,"[MATERIALDEC, MATERIALDECONSTRUCA, MATERIALDE,...","MATERIALDEC, MATERIALDECONSTRUCAO01488093598, ...",10
700534,MATERIALDEC,MATERI,3550308,"[MATERIALDEC, MATERIALDECONSTRUCA, MATERIALDE,...","MATERIALDEC, MATERIALDECONSTRUCAO01488093598, ...",10
700535,MATERIALDECO,MATERI,3550308,"[MATERIALDEC, MATERIALDECONSTRUCA, MATERIALDE,...","MATERIALDEC, MATERIALDECONSTRUCAO01488093598, ...",10
700537,MATERIALDECONS,MATERI,3550308,"[MATERIALDEC, MATERIALDECONSTRUCA, MATERIALDE,...","MATERIALDEC, MATERIALDECONSTRUCAO01488093598, ...",10


In [34]:
df2[df2['agrupamento_nome_1']=='CLAUDIAAPARECIDATANI']

Unnamed: 0,nome_master,inicio,cod_muni,resultado_names,agrupamento_nome_1,len_resultado
204942,CLAUDIAAPARECIDATANI,CLAUDI,3550308,[CLAUDIAAPARECIDATANI],CLAUDIAAPARECIDATANI,486


In [38]:
df

Unnamed: 0,subs_asterisk,nome_master,nome_muni,uf,cpf,cpf_brasil,qtd_cpfs,qtd_cnpjs,cnpj,numero_inicio,cod_muni,merchant_tax_id,mmhid_merge,merchant_market_hierarchy_id,id_ton,nome_ton,inicio,resultado_names,agrupamento_nome_1,_merge
0,SumUp,REYTECH,São Paulo,SP,,,,1,55288394000117,,3550308,,,,,,REYTEC,[REYTECH],REYTECH3550308,both
1,SumUp,APPSERVIDORE,São Paulo,SP,,,,,,,3550308,,,,,,APPSER,[APPSERVIDORE],APPSERVIDORE3550308,both
2,SumUp,JOSEWILSONP,São Paulo,SP,17782026830,,26,10,20701303000149,,3550308,,,,,,JOSEWI,[JOSEWILSONP],"JOSEWILSONP, JOSEWILSONPEGADO3550308",both
3,SumUp,MARIAARTS,São Paulo,SP,,,,,,,3550308,,,,,,MARIAA,[MARIAARTS],MARIAARTS3550308,both
4,SumUp,CANILBUTDOGG,São Paulo,SP,,,,,,,3550308,,,,,,CANILB,[CANILBUTDOGG],CANILBUTDOGG3550308,both
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1334900,Ton,CLINICAODONTOLOGIADENISEFERES,São Paulo,SP,,,,,03670613000124,,3550308,,,,1818529.0,CLINICA ODONTOLOGIA DENISE FERES,CLINIC,[CLINICAODONTOLOGIADENISEFERES],"CLINICAODONTOLOGIA, CLINICAODONTOLOGIADENISEFE...",both
1334901,Ton,POINTDOCOYOTE,São Paulo,SP,16154481870,,,,,,3550308,,,,1818530.0,POINT DO COYOTE,POINTD,[POINTDOCOYOTE],POINTDOCOYOTE3550308,both
1334902,Ton,ANGELICADAMOTALIMA,São Paulo,SP,44972567821,,,,,,3550308,,,,1818554.0,ANGELICA DA MOTA LIMA,ANGELI,[ANGELICADAMOTALIMA],ANGELICADAMOTALIMA3550308,both
1334903,Ton,BRUNACECCHETTIESTETICA,São Paulo,SP,39824084827,,,,,,3550308,,,,1818555.0,BRUNA CECCHETTI ESTETICA,BRUNAC,[BRUNACECCHETTIESTETICA],BRUNACECCHETTIESTETICA3550308,both


In [39]:
# Same count on the full frame: non-null nome_master values per agrupamento_nome_1.
# This adds a column to `df` in place.
df['len_resultado'] = df.groupby(['agrupamento_nome_1'])['nome_master'].transform('count')

In [40]:
df.sort_values('len_resultado')

Unnamed: 0,subs_asterisk,nome_master,nome_muni,uf,cpf,cpf_brasil,qtd_cpfs,qtd_cnpjs,cnpj,numero_inicio,...,merchant_tax_id,mmhid_merge,merchant_market_hierarchy_id,id_ton,nome_ton,inicio,resultado_names,agrupamento_nome_1,_merge,len_resultado
0,SumUp,REYTECH,São Paulo,SP,,,,1,55288394000117,,...,,,,,,REYTEC,[REYTECH],REYTECH3550308,both,1
831357,Outros_Pags,ABXGKS,São Paulo,SP,,,,,,,...,,,811272556,,,ABXGKS,[ABXGKS],ABXGKS3550308,both,1
831358,Outros_Pags,CRISTIANBARBOSADE,São Paulo,SP,,,,,,,...,39451476000184,706622354,706622354,,,CRISTI,[CRISTIANBARBOSADE],CRISTIANBARBOSADE3550308,both,1
831359,Outros_Pags,JACBOUTIQUE,São Paulo,SP,,,,,,,...,49990855000104,796744485,796744485,,,JACBOU,[JACBOUTIQUE],JACBOUTIQUE3550308,both,1
831360,Outros_Pags,ROSIMARDEJESUSDOS,São Paulo,SP,32804434826,,1,,,,...,,774904701,774904701,,,ROSIMA,[ROSIMARDEJESUSDOS],ROSIMARDEJESUSDOS3550308,both,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
204950,Outros_Pags,MARIAAPARECIDA,São Paulo,SP,32646176848,,14261,2771,22971232000120,,...,,856914648,856914648,,,MARIAA,"[MARIAAPARECID, MARIAAPARECIDA, MARIAAPARE, MA...","MARIAAPARECID, MARIAAPARECIDA, MARIAAPARE, MAR...",both,486
205171,Outros_Pags,MARIAAPARECIDA,São Paulo,SP,32646176848,,14261,2771,22971232000120,,...,,820486249,820486249,,,MARIAA,"[MARIAAPARECID, MARIAAPARECIDA, MARIAAPARE, MA...","MARIAAPARECID, MARIAAPARECIDA, MARIAAPARE, MAR...",both,486
205170,Outros_Pags,MARIAAPARECIDA,São Paulo,SP,32646176848,,14261,2771,22971232000120,,...,08561701000101,606337622,606337622,,,MARIAA,"[MARIAAPARECID, MARIAAPARECIDA, MARIAAPARE, MA...","MARIAAPARECID, MARIAAPARECIDA, MARIAAPARE, MAR...",both,486
205175,Outros_Pags,MARIAAPARECIDA,São Paulo,SP,32646176848,,14261,2771,22971232000120,,...,,644081404,644081404,,,MARIAA,"[MARIAAPARECID, MARIAAPARECIDA, MARIAAPARE, MA...","MARIAAPARECID, MARIAAPARECIDA, MARIAAPARE, MAR...",both,486


In [41]:
df2[df2['nome_master']=='MARIAAPARECIDA']

Unnamed: 0,nome_master,inicio,cod_muni,resultado_names,agrupamento_nome_1,len_resultado
674829,MARIAAPARECIDA,MARIAA,3550308,"[MARIAAPARECID, MARIAAPARECIDA, MARIAAPARE, MA...","MARIAAPARECID, MARIAAPARECIDA, MARIAAPARE, MAR...",5
