In [1]:
import pandas as pd
# Absolute location of the master customer workbook on the author's machine.
path = r'C:\Users\SSN0609\Stanley Black & Decker\Latin America - Regional Marketing - Marketing Analytics\Data\Processed-Dataflow\Master_Customers\Master_Customers.xlsx'

# Load the 'customers' sheet into a DataFrame using the openpyxl engine.
df = pd.read_excel(
    path,
    sheet_name='customers',
    engine='openpyxl',
)

In [6]:
import pandas as pd
import numpy as np
import re
import networkx as nx
# 1. CORRECCIÓN DE IMPORTACIÓN: Usar el módulo distance.JaroWinkler
from rapidfuzz.distance import JaroWinkler as jaro_winkler 

# --- 1. Preprocesamiento (sin cambios) ---
def clean_text_strict(text):
    """Normalize a raw value into a lowercase, punctuation-free comparison key.

    Values that ``pd.isna`` reports as missing become the empty string. Any
    other value is converted with ``str``, lowercased, stripped of every
    character that is neither a word character nor whitespace, and has each
    run of whitespace collapsed to a single space with both ends trimmed.
    """
    if not pd.isna(text):
        lowered = str(text).lower()
        # Remove punctuation/symbols; word characters (incl. "_") and whitespace stay.
        without_symbols = re.sub(r'[^\w\s]', '', lowered)
        collapsed = re.sub(r'\s+', ' ', without_symbols)
        return collapsed.strip()
    return ""


# --- 2. Función de Jaro-Winkler Optimizada (con llamada corregida) ---

def jaro_winkler_clustering(unique_names, threshold=95, batch_size=2000):
    """Find every pair of names whose Jaro-Winkler similarity meets a threshold.

    Each name is normalized with ``clean_text_strict`` and every unordered pair
    is scored once with ``jaro_winkler.normalized_similarity``. The comparison
    is exhaustive; batching only groups the work so that a progress line can be
    printed after each batch.

    Args:
        unique_names: Sequence of raw names. Positions in this sequence are the
            indices used in the returned pairs.
        threshold: Minimum similarity, on a 0-100 scale, for a pair to count as
            a match.
        batch_size: Number of names per progress batch; must be positive.

    Returns:
        A list of ``(k, j)`` index tuples with ``k < j``, one per matching pair.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    # A negative step would make the batch loop run zero times and silently
    # report "no matches"; reject it up front instead.
    if batch_size <= 0:
        raise ValueError("batch_size debe ser un entero positivo.")

    total = len(unique_names)
    clean_names = [clean_text_strict(name) for name in unique_names]
    matches = []

    for start in range(0, total, batch_size):
        end = min(start + batch_size, total)

        for j in range(start, total):
            name_j = clean_names[j]
            # A name that normalizes to "" (e.g. punctuation only) has nothing
            # to compare, so it is never linked to another name.
            if not name_j:
                continue

            # Only k < j forms a new pair, so stop at j rather than scanning
            # the rest of the batch and discarding those iterations.
            for k in range(start, min(end, j)):
                name_k = clean_names[k]
                if not name_k:
                    continue

                # The similarity is multiplied by 100 so it can be compared
                # against the 0-100 threshold.
                sim = jaro_winkler.normalized_similarity(name_k, name_j)
                if (sim * 100) >= threshold:
                    matches.append((k, j))

        print(f"Procesado batch {start} a {end}. Matches encontrados: {len(matches)}")

    return matches

# --- 3. Función Principal de Estandarización (sin cambios funcionales) ---

def standardize_names_jarowinkler(df, col_name='Sold-To Customer Name', similarity_threshold=95, batch_size=1000):
    """Add a ``'Canonical Name'`` column that unifies near-duplicate names.

    Steps:
      1. Collect the distinct non-null values of ``df[col_name]``.
      2. Find similar pairs with ``jaro_winkler_clustering``.
      3. Build a graph whose nodes are name indices and whose edges are the
         matched pairs.
      4. For every connected component with more than one member, pick the
         longest name as the representative of the whole component.
      5. Map each row to its representative; rows whose name belongs to no
         multi-member component keep their original value.

    Note: ``df`` is modified in place (the ``'Canonical Name'`` column is
    assigned on the frame that was passed in) and that same object is returned.

    Args:
        df: DataFrame containing ``col_name``.
        col_name: Column holding the names to standardize.
        similarity_threshold: Jaro-Winkler threshold on a 0-100 scale; must lie
            between 90 and 100 inclusive.
        batch_size: Batch size forwarded to ``jaro_winkler_clustering``.

    Returns:
        The input ``df`` with the ``'Canonical Name'`` column added.

    Raises:
        ValueError: If ``similarity_threshold`` is outside ``[90, 100]``.
    """
    # Validate before doing any work so a bad threshold fails immediately.
    if not 90 <= similarity_threshold <= 100:
        raise ValueError("El umbral Jaro-Winkler debe estar entre 90 y 100.")

    print("1. Obteniendo nombres únicos...")
    unique_names = df[col_name].dropna().unique()

    print(f"2. Calculando similitudes Jaro-Winkler por lotes (Umbral: {similarity_threshold})...")
    matched_pairs = jaro_winkler_clustering(
        unique_names,
        threshold=similarity_threshold,
        batch_size=batch_size,
    )

    print(f"3. Construyendo grafo con {len(matched_pairs)} conexiones...")
    G = nx.Graph()
    G.add_nodes_from(range(len(unique_names)))
    G.add_edges_from(matched_pairs)

    canonical_map = {}
    print("4. Resolviendo clusters y seleccionando nombres canónicos...")

    for component in nx.connected_components(G):
        # Singletons have nothing to merge; they fall through to fillna below.
        if len(component) <= 1:
            continue

        names_in_group = [unique_names[i] for i in component]
        # Measure length on the string form: the column may hold non-string
        # values (e.g. numeric cells), for which a bare len() raises TypeError.
        representative = max(names_in_group, key=lambda name: len(str(name)))

        for name in names_in_group:
            canonical_map[name] = representative

    print("5. Mapeando al dataframe original...")

    # Names without a cluster map to NaN and are back-filled with the original.
    df['Canonical Name'] = df[col_name].map(canonical_map).fillna(df[col_name])

    return df

In [7]:
# Run the standardization on the customer-name column with a 97 (0-100 scale)
# similarity threshold.
df_result = standardize_names_jarowinkler(
    df,
    col_name='Sold-To Customer Name',
    similarity_threshold=97,
)


1. Obteniendo nombres únicos...
2. Calculando similitudes Jaro-Winkler por lotes (Umbral: 97)...
Procesado batch 0 a 1000. Matches encontrados: 221
Procesado batch 1000 a 2000. Matches encontrados: 361
Procesado batch 2000 a 3000. Matches encontrados: 517
Procesado batch 3000 a 4000. Matches encontrados: 702
Procesado batch 4000 a 5000. Matches encontrados: 869
Procesado batch 5000 a 6000. Matches encontrados: 1093
Procesado batch 6000 a 7000. Matches encontrados: 1205
Procesado batch 7000 a 8000. Matches encontrados: 1334
Procesado batch 8000 a 9000. Matches encontrados: 1470
Procesado batch 9000 a 10000. Matches encontrados: 1595
Procesado batch 10000 a 11000. Matches encontrados: 1725
Procesado batch 11000 a 12000. Matches encontrados: 1865
Procesado batch 12000 a 13000. Matches encontrados: 2034
Procesado batch 13000 a 14000. Matches encontrados: 2174
Procesado batch 14000 a 15000. Matches encontrados: 2347
Procesado batch 15000 a 16000. Matches encontrados: 2495
Procesado batch 16

In [8]:
# Export the frame returned by standardize_names_jarowinkler rather than
# relying on `df` having been modified as a side effect of that call.
# index=False omits the row index; 'utf-8-sig' prefixes the file with a BOM.
output_path = r'C:\Users\SSN0609\Stanley Black & Decker\Latin America - Regional Marketing - Marketing Analytics\Data\Processed-Dataflow\Master_Customers\Master_Customers_name.csv'
df_result.to_csv(output_path, index=False, encoding='utf-8-sig')