# Email Automation Data Analysis

This notebook provides a comprehensive analysis of customer data, sales data, inventory data, and email communications for the email automation project.

## Overview
- **Customer Data**: Account information and customer details
- **Sales Data**: Transaction records and sales history
- **Inventory Data**: Product catalog and stock information
- **Email Data**: Communication records from Zoho

## Data Sources
- `cc (1).csv`: Customer contact information
- `fi.CSV`: Inventory/financial data
- `s_by_c.CSV`: Sales by customer data
- `zoho_emails.json`: Email communication data

## 1. Import Libraries and Utilities

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import os

# Add src directory to path for importing utilities
sys.path.append('../src')

# Import utility functions
from data_utils import (
    load_data_files,
    load_email_data,
    process_account_data,
    analyze_business_data,
    process_email_dates,
    analyze_customer_matching,
    clean_customer_name
)

# Pandas display settings: no cap on the number of columns, at most 50 rows,
# and the display width left unset (None).
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.max_rows', 50),
    ('display.width', None),
):
    pd.set_option(option_name, option_value)

# Confirm the setup ran and show the directory relative paths resolve against.
print("Libraries imported successfully!")
print(f"Current working directory: {os.getcwd()}")

## 2. Load Data Files

In [None]:
# Read the CSV sources in a single call. The three frames are used in the rest
# of the notebook as customer (df), inventory (df2) and sales (df3) data.
df, df2, df3 = load_data_files()

# Completion banner.
banner = "=" * 50
print("\n" + banner, "DATA LOADING COMPLETED", banner, sep="\n")

## 3. Data Overview and Exploration

In [None]:
# For each loaded dataset, print its title, shape and column names, then show
# the first five rows. A separator line is printed between datasets (not
# before the first one).
overview_sections = (
    ("=== CUSTOMER DATA (df) ===", df),
    ("=== INVENTORY DATA (df2) ===", df2),
    ("=== SALES DATA (df3) ===", df3),
)

for position, (title, frame) in enumerate(overview_sections):
    if position:
        print("\n" + "=" * 50)
    print(title)
    print(f"Shape: {frame.shape}")
    print(f"Columns: {list(frame.columns)}")
    print("\nFirst 5 rows:")
    display(frame.head())

## 4. Customer Data Processing

In [None]:
# Add a 'customer_name_clean' column to the customer and sales frames by
# passing each raw name through clean_customer_name.
print("Processing customer account data...")

# Each pair is (frame to update, column holding the raw customer name).
for frame, name_column in ((df, 'Customer'), (df3, 'Name')):
    frame['customer_name_clean'] = frame[name_column].apply(clean_customer_name)



## 5. Remove Unnecessary Columns

In [None]:
# Drop the 'Category' and 'Preferred Vendor' columns from the inventory data.
# errors='ignore' keeps this cell re-runnable: on a second execution the
# columns are already gone and a plain drop would raise KeyError.
df2 = df2.drop(columns=['Category', 'Preferred Vendor'], errors='ignore')

## 6. Email Data Processing

In [None]:
# Read the email records, then run them through the date/time processing step.
print("Loading email data...")
email_df = load_email_data()

print("\nProcessing email dates and times...")
email_df = process_email_dates(email_df)

# Report the dimensions and column names of the processed frame.
print(f"\nEmail data shape after processing: {email_df.shape}")
print(f"\nEmail data columns: {list(email_df.columns)}")

# Completion banner.
banner = "=" * 50
print("\n" + banner, "EMAIL DATA PROCESSING COMPLETED", banner, sep="\n")

## 7. Inventory Column Categorization

In [None]:
# Split the inventory data into category columns using the project helper.
from split_inventory_categories import split_inventory_categories

# NOTE(review): the helper is called without arguments, yet its result rebinds
# df and df3 as well as the inventory frame. If it reloads the source files,
# the 'customer_name_clean' columns added to df/df3 earlier are discarded and
# the columns dropped from df2 come back -- confirm this is intended.
df, df2_processed, df3 = split_inventory_categories()

# Dimensions and column names of the processed inventory frame.
print("\n=== PROCESSED INVENTORY DATA ===")
print(f"Shape: {df2_processed.shape}")
print(f"Columns: {list(df2_processed.columns)}")

# Preview: the item, every 'Category_Level_*' column, and the product code.
print("\n--- Sample Split Data ---")
category_levels = [
    name for name in df2_processed.columns if name.startswith('Category_Level_')
]
sample_cols = ['Item', *category_levels, 'Product_Code']
display(df2_processed[sample_cols].head(10))

# From here on, df2 refers to the processed inventory frame.
df2 = df2_processed

# Completion banner.
banner = "=" * 50
print("\n" + banner, "INVENTORY CATEGORY SPLITTING COMPLETED", banner, sep="\n")

## 8. Customer Matching Analysis

In [None]:
# Run the customer accounts through the account-processing step, then analyze
# the processed result.
df_processed = process_account_data(df)
business_groups = analyze_business_data(df_processed)

# Write the processed accounts to CSV (without the index column).
accounts_csv = '../data/processed_accounts.csv'
df_processed.to_csv(accounts_csv, index=False)
print(f"\nProcessed data saved to '{accounts_csv}'")

# Completion banner.
banner = "=" * 50
print("\n" + banner, "CUSTOMER DATA PROCESSING COMPLETED", banner, sep="\n")

In [None]:
# Run the customer-matching analysis on the processed accounts and the sales
# records.
print("Analyzing customer matching between datasets...")

matching_results = analyze_customer_matching(df_processed, df3)

# Completion banner.
banner = "=" * 50
print("\n" + banner, "CUSTOMER MATCHING ANALYSIS COMPLETED", banner, sep="\n")

## 9. Data Quality Assessment

In [None]:
def _report_missing(heading, frame, key_columns):
    """Print a heading, the record count and per-column null counts of a frame."""
    print(heading)
    print(f"Total records: {len(frame)}")
    print("Missing values in key columns:")
    print(frame[key_columns].isnull().sum())


# Report record counts and missing values in the key columns of each dataset.
print("=== DATA QUALITY ASSESSMENT ===")

_report_missing(
    "\n--- Customer Data Quality ---",
    df_processed,
    ['Customer', 'Account No.', 'Main Email', 'Main Phone'],
)

_report_missing(
    "\n--- Sales Data Quality ---",
    df3,
    ['Name', 'Date', 'Amount', 'Item'],
)

_report_missing(
    "\n--- Email Data Quality ---",
    email_df,
    ['fromAddress', 'toAddress', 'subject', 'sentDate'],
)

# Completion banner.
banner = "=" * 50
print("\n" + banner, "DATA QUALITY ASSESSMENT COMPLETED", banner, sep="\n")

## 10. Summary Statistics

In [None]:
# Print headline counts for every dataset.
print("=== SUMMARY STATISTICS ===")

# Number of rows in each frame.
print("\n--- Dataset Overview ---")
record_counts = (
    ("Customer Records", df_processed),
    ("Sales Records", df3),
    ("Email Records", email_df),
    ("Inventory Records", df2),
)
for label, frame in record_counts:
    print(f"{label}: {len(frame):,}")

# Number of distinct values in selected columns; the column is looked up
# inside the loop so each line is printed before the next lookup happens.
print("\n--- Business Information ---")
distinct_counts = (
    ("Unique Businesses", df_processed, 'business_id'),
    ("Unique Customer Names", df_processed, 'customer_name_clean'),
    ("Unique Email Senders", email_df, 'fromAddress'),
    ("Unique Email Recipients", email_df, 'toAddress'),
)
for label, frame, column in distinct_counts:
    print(f"{label}: {frame[column].nunique():,}")

# Frequency of each value in the 'account_type' column.
print("\n--- Account Types ---")
account_type_summary = df_processed['account_type'].value_counts()
for account_type, count in account_type_summary.items():
    print(f"{account_type}: {count:,}")

# Smallest and largest 'sentDate' values, plus the total number of emails.
print("\n--- Email Communication ---")
sent_dates = email_df['sentDate']
print(f"Date Range: {sent_dates.min()} to {sent_dates.max()}")
print(f"Total Email Volume: {len(email_df):,}")

# Completion banner.
banner = "=" * 50
print("\n" + banner, "SUMMARY STATISTICS COMPLETED", banner, sep="\n")

## 11. Data Export and Next Steps

In [None]:
# Export the processed frames to CSV (index column omitted) for further analysis.
print("Exporting processed data...")

# Processed customer data.
df_processed.to_csv('../data/processed_customer_data.csv', index=False)
print("✓ Processed customer data exported")

# Processed email data.
email_df.to_csv('../data/processed_email_data.csv', index=False)
print("✓ Processed email data exported")

# Sales data with clean customer names. The column is (re)created here when
# the current df3 does not carry it.
if 'customer_name_clean' not in df3.columns:
    df3['customer_name_clean'] = df3['Name'].apply(clean_customer_name)
df3.to_csv('../data/processed_sales_data.csv', index=False)
print("✓ Processed sales data exported")

# Inventory data with sub-categories. The file name follows the
# processed_<dataset>_data.csv pattern of the other exports, without the
# stray space that used to sit before "_data".
df2.to_csv('../data/processed_inventory_data.csv', index=False)
print("✓ Processed inventory data exported")


print("\n=== EXPORT COMPLETED ===")
print("All processed data has been exported to the 'data' folder.")
print("\nNext steps:")
print("1. Review the processed data files")
print("2. Perform additional analysis as needed")
print("3. Create visualizations and reports")
print("4. Implement email automation logic")

print("\n" + "="*50)
print("DATA ANALYSIS COMPLETED")
print("="*50)