In [10]:
import pandas as pd

def extract_email_domain(email_list):
    """Return the domain of the first usable company email in a comma-separated list.

    Entries are scanned in order. An entry is skipped when it is blank, has no
    '@', has nothing after the '@', or belongs to one of the excluded free-mail
    providers (compared case-insensitively).

    Args:
        email_list: Comma-separated email addresses, or a missing value.

    Returns:
        The text after the last '@' of the first accepted entry, in its
        original case, or None when no entry qualifies or the input is not a
        string (NaN/None included).
    """
    # NaN, None and any other non-string cell cannot be split; treat them as "no emails".
    if not isinstance(email_list, str):
        return None

    excluded_domains = {'gmail.com', 'live.com', 'icloud.com', 'outlook.com'}
    for email in email_list.split(','):
        _, at_sign, domain = email.strip().rpartition('@')
        # Without an '@' (or with nothing after it) there is no domain to report.
        if not at_sign or not domain:
            continue
        if domain.lower() not in excluded_domains:
            return domain
    return None


# Read the CSV file into a DataFrame
file_path = 'buyers_enriched.csv'  # Update with your file path
buyers_df = pd.read_csv(file_path)

# Ensure the necessary columns exist
required_columns = {'Domain', 'domain_found', 'List of Buyer Emails'}
if required_columns.issubset(buyers_df.columns):
    # True where the two domain columns agree (rows with a missing value compare as False)
    buyers_df['is_domain_match'] = buyers_df['Domain'] == buyers_df['domain_found']

    # Domain of the first non-free-mail buyer email, or missing when there is none
    buyers_df['email_domain'] = buyers_df['List of Buyer Emails'].apply(extract_email_domain)

    # Does the email-derived domain agree with either of the two domain columns?
    buyers_df['email_domain_match'] = buyers_df['email_domain'] == buyers_df['Domain']
    buyers_df['email_domain_found_match'] = buyers_df['email_domain'] == buyers_df['domain_found']

    # A row is considered valid when the two domain columns agree or any email domain was extracted.
    # To be more restrictive, replace `buyers_df['email_domain'].notna()` with
    # `buyers_df['email_domain_match'] | buyers_df['email_domain_found_match']`.
    buyers_df['is_valid_domain'] = buyers_df['is_domain_match'] | buyers_df['email_domain'].notna()

    # Prefer the email-derived domain; otherwise fall back to 'Domain', but only on rows
    # where it matched 'domain_found'. A column-wise `where` replaces the row-wise `apply`,
    # which is slower and can return a DataFrame instead of a Series when the frame has no rows.
    buyers_df['final_domain'] = buyers_df['email_domain'].combine_first(
        buyers_df['Domain'].where(buyers_df['is_domain_match'])
    )

    # Drop rows for which no final domain could be determined, then save the result
    output_file = 'buyers_enriched_with_domain_match.csv'
    buyers_df = buyers_df.dropna(subset=["final_domain"])
    buyers_df.to_csv(output_file, index=False)
    print(f"Updated DataFrame saved to {output_file}")
else:
    print("Error: Required columns 'Domain', 'domain_found', and 'List of Buyer Emails' are not present in the CSV file.")


Updated DataFrame saved to buyers_enriched_with_domain_match.csv


In [5]:
import pandas as pd

# Function to remove duplicates within a cell
def clean_emails(cell):
    """Normalise one comma-separated email cell.

    Missing values are handed back untouched. Otherwise each entry is stripped
    of surrounding whitespace, blank entries and duplicates are discarded, and
    the remaining entries are returned sorted and joined with ','.
    """
    if pd.isna(cell):
        return cell

    distinct = set()
    for piece in cell.split(","):
        address = piece.strip()
        if address:  # ignore blanks left by stray or trailing commas
            distinct.add(address)

    return ",".join(sorted(distinct))

# Load the domain-matched buyers file and the B2B SaaS buyers list
buyers_df = pd.read_csv('buyers_enriched_with_domain_match.csv')
buyers_b2b_saas_df = pd.read_csv('b2b_saas_buyers.csv')

# De-duplicate the comma-separated addresses inside each email cell
buyers_df["List of Buyer Emails"] = buyers_df["List of Buyer Emails"].apply(clean_emails)
buyers_df["List of Vendor Emails"] = buyers_df["List of Vendor Emails"].apply(clean_emails)

# Inner join: keep only rows whose 'final_domain' equals a 'domain' in the B2B SaaS list
buyers_df = pd.merge(buyers_df, buyers_b2b_saas_df, left_on='final_domain', right_on='domain', how="inner")

# NOTE: this overwrites the file read at the top of this cell, so re-running the cell
# operates on already-merged data. index=False keeps the row index out of the file;
# otherwise it comes back as an extra unnamed column on the next read.
buyers_df.to_csv('buyers_enriched_with_domain_match.csv', index=False)