# 🔄 Complete Business ID and Optical Name Pipeline (Fixed)

This notebook performs complete data processing in the following order:

## 📋 Processing Order:
1. **Data Loading**: `email_customer_matched_full.csv`, `processed_customer_data.csv`
2. **Business ID Assignment by from_address**: Same from_address gets same business_id
3. **Missing Business ID Handling**: Fill any remaining NaN business_id values with unused business_id values from customer_data
4. **Optical Name Generation**: Generate consistent random optical names by business_id
5. **Mapping Application**: Add optical_name column to both DataFrames
6. **Verification**: Check that every business_id and optical_name is filled in

## 🎯 Goals:
- Same from_address always gets same business_id
- No NaN business_id in any row
- No NaN optical_name in any row
- Same business_id always gets same optical_name
- Prioritize business_id from customer_data

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import random
from tqdm import tqdm
import warnings
# Silence every warning for the rest of the session. This keeps the notebook
# output compact, but it also hides pandas/numpy deprecation and runtime
# warnings raised by the later cells.
warnings.filterwarnings('ignore')

# Confirmation banner: message line followed by a 60-character rule.
print("Libraries loaded successfully!", "=" * 60, sep="\n")

In [None]:
# Step 1: Data Loading
#
# Reads the two input CSV files (paths are relative to the current working
# directory) into `email_df` and `customer_df`, then reports their shapes and
# column names.
print("Step 1: Data Loading")
print("-" * 30)

email_csv_path = '../data/email_customer_matched_full.csv'
customer_csv_path = '../data/processed_customer_data.csv'

# Email data
email_df = pd.read_csv(email_csv_path)
print(f"✅ Email data loaded: {email_df.shape}")

# Customer data
customer_df = pd.read_csv(customer_csv_path)
print(f"✅ Customer data loaded: {customer_df.shape}")

# Column overview for both frames
print("\n📊 Data Information:")
for frame_label, frame in (("Email", email_df), ("Customer", customer_df)):
    print(f"{frame_label} DataFrame columns: {list(frame.columns)}")

In [None]:
# Step 2: from_address column verification and analysis
#
# Picks the column of `email_df` that holds the sender address and stores its
# name in `from_address_col`, then collects the unique non-NaN addresses
# (`unique_from_addresses`) and the per-address email counts
# (`from_address_counts`).
print("Step 2: from_address column verification and analysis")
print("-" * 30)

# Columns whose name mentions "from" or "sender" (case-insensitive)
from_address_cols = [
    col for col in email_df.columns
    if any(keyword in col.lower() for keyword in ('from', 'sender'))
]
print(f"from_address related columns: {from_address_cols}")

# Selection priority: exact well-known names first, then the first loosely
# matching column, and as a last resort ask the user interactively.
preferred_names = ('from_address', 'from', 'sender_email')
from_address_col = next(
    (name for name in preferred_names if name in email_df.columns),
    None,
)
if from_address_col is None:
    if from_address_cols:
        from_address_col = from_address_cols[0]
    else:
        print("❌ Cannot find from_address related column.")
        print("Available columns:")
        for position, col in enumerate(email_df.columns, start=1):
            print(f"  {position}. {col}")
        from_address_col = input("Enter column name to use as from_address: ")

print(f"\n✅ from_address column to use: {from_address_col}")

# from_address analysis (NaN addresses are excluded from the unique list)
from_address_series = email_df[from_address_col]
unique_from_addresses = from_address_series.dropna().unique()
print(f"Unique from_address count: {len(unique_from_addresses):,}")
print(f"NaN from_address count: {from_address_series.isna().sum():,}")

# Email count by from_address
from_address_counts = from_address_series.value_counts()
print("\nEmail count by from_address (top 10):")
print(from_address_counts.head(10))

In [None]:
# Step 3: Business ID status analysis
#
# Reports, for the email and the customer DataFrame, the total number of rows,
# how many rows have a NaN business_id, and how many distinct business_id
# values exist. Nothing is modified here.
print("Step 3: Business ID status analysis")
print("-" * 30)

# Email DataFrame business_id analysis
email_business_ids = email_df['business_id']
email_total = len(email_df)
email_na_count = email_business_ids.isna().sum()
email_unique = email_business_ids.nunique()

email_report = [
    "📧 Email DataFrame:",
    f"   Total rows: {email_total:,}",
    f"   NaN business_id: {email_na_count:,} ({email_na_count/email_total*100:.1f}%)",
    f"   Valid business_id: {email_total - email_na_count:,}",
    f"   Unique business_id: {email_unique}",
]
print("\n".join(email_report))

# Customer DataFrame business_id analysis
customer_business_ids = customer_df['business_id']
customer_total = len(customer_df)
customer_na_count = customer_business_ids.isna().sum()
customer_unique = customer_business_ids.nunique()

customer_report = [
    "\n👥 Customer DataFrame:",
    f"   Total rows: {customer_total:,}",
    f"   NaN business_id: {customer_na_count:,} ({customer_na_count/customer_total*100:.1f}%)",
    f"   Valid business_id: {customer_total - customer_na_count:,}",
    f"   Unique business_id: {customer_unique}",
]
print("\n".join(customer_report))

In [None]:
# Step 4: Business ID assignment by from_address
#
# Builds `from_address_to_business_id`: one business_id per unique
# from_address, drawn at random without replacement from the business_ids that
# exist in customer_df but are not yet used in email_df. When the pool is
# exhausted it is refilled, so business_ids can repeat across from_addresses
# once there are more addresses than pool entries.
#
# After the loop `available_list` holds the business_ids of the current pool
# round that have not been handed out.
print("Step 4: Business ID assignment by from_address")
print("-" * 30)

# Available business_ids from Customer DataFrame
available_business_ids = customer_df['business_id'].dropna().unique()
print(f"Unique business_id in Customer DataFrame: {len(available_business_ids):,}")

# Already used business_ids in Email DataFrame
used_business_ids = email_df['business_id'].dropna().unique()
print(f"Already used business_id in Email DataFrame: {len(used_business_ids):,}")

# Available business_id pool (in Customer but not used in Email)
available_pool = set(available_business_ids) - set(used_business_ids)
print(f"Available business_id pool: {len(available_pool):,}")

# Assign business_id by from_address
print(f"\nAssigning business_id by from_address...")
print(f"Unique from_address to process: {len(unique_from_addresses):,}")

# Fail early with a clear message instead of an opaque error from the random
# draw when there is nothing to draw from.
if len(unique_from_addresses) > 0 and not available_pool:
    raise ValueError(
        "Available business_id pool is empty: customer_df has no business_id "
        "that is not already used in email_df, so no business_id can be "
        "assigned by from_address."
    )

# Set seed for reproducible results
np.random.seed(42)
random.seed(42)

# A set has no stable iteration order across interpreter runs (string hashes
# are randomized per process), so seeding alone does not make the draw
# reproducible. Sorting first gives the shuffle a deterministic starting order;
# key=str keeps the sort usable even if the ids are of mixed types.
ordered_pool = sorted(available_pool, key=str)

# Shuffle once and pop from the end: each draw is O(1), whereas picking a
# random element and removing it from a list costs O(n) per draw.
available_list = list(ordered_pool)
random.shuffle(available_list)

# Create from_address to business_id mapping
from_address_to_business_id = {}

print("Assigning business_id by from_address...")
for from_address in tqdm(unique_from_addresses, desc="Processing from_address"):
    # Refill pool if empty (allow duplicates)
    pool_was_refilled = not available_list
    if pool_was_refilled:
        available_list = list(ordered_pool)
        random.shuffle(available_list)

    from_address_to_business_id[from_address] = available_list.pop()

    if pool_was_refilled:
        print(f"  Pool refilled: {len(available_list)} items")

print(f"✅ Business ID assignment completed for {len(from_address_to_business_id)} from_addresses")

# Check assignment results
print(f"\nSample assigned from_address:")
sample_items = list(from_address_to_business_id.items())[:5]
for from_address, business_id in sample_items:
    print(f"  {from_address} -> {business_id}")

In [None]:
# Step 5: Apply from_address business_id to Email DataFrame
#
# Produces `email_df_with_from_business`, a copy of `email_df` whose
# business_id column is derived from the from_address mapping built in Step 4.
print("Step 5: Apply from_address business_id to Email DataFrame")
print("-" * 30)

# Work on a copy so the loaded email_df itself is not modified
email_df_with_from_business = email_df.copy()

print("Applying from_address business_id...")
# NOTE(review): this replaces the entire business_id column, so business_id
# values that were already present in email_df are overwritten, and rows whose
# from_address has no entry in the mapping (e.g. NaN) presumably end up NaN.
# Confirm that overwriting existing ids is intended.
mapped_business_ids = email_df_with_from_business[from_address_col].map(from_address_to_business_id)
email_df_with_from_business['business_id'] = mapped_business_ids

# Check application results
total_count = len(email_df_with_from_business)
applied_count = email_df_with_from_business['business_id'].notna().sum()
print("from_address business_id application results:")
print(f"   Application success: {applied_count:,}/{total_count:,} ({applied_count/total_count*100:.1f}%)")