# Measuring Completeness

**Activity Overview**: Evaluate data completeness by checking missing data rates and handling partially available records.

## Title: Customer Profiles

**Task**: Calculate the missing data rate for customer profiles.

**Steps**:
1. List all required fields for a complete customer profile (e.g., name, address, email, phone number).
2. Analyze the dataset to count how many profiles have at least one required field missing.
3. Calculate the percentage of missing data fields across all profiles (missing required-field values divided by the total number of required-field values).

In [None]:
# Write your code below

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns # For nicer plots

# --- Task: Measuring Completeness - Customer Profiles ---


def summarize_completeness(df, fields):
    """Compute missing-data statistics for the ``fields`` columns of ``df``.

    A value counts as missing when pandas' ``isnull`` reports it as null.

    Args:
        df: DataFrame with one row per customer profile.
        fields: Iterable of column names that make up a complete profile.

    Returns:
        A dict with the following keys:
            ``row_has_missing``: boolean Series, True where a row has at
                least one null among ``fields``.
            ``total_profiles``: number of rows in ``df``.
            ``incomplete_profiles``: number of rows with at least one null.
            ``total_cells``: rows x fields.
            ``missing_cells``: number of null cells among ``fields``.
            ``missing_cells_pct``: ``missing_cells / total_cells * 100``,
                or None when there are no cells.
            ``incomplete_profiles_pct``: ``incomplete_profiles /
                total_profiles * 100``, or None when there are no rows.
            ``missing_pct_per_field``: Series indexed by field name with the
                percentage of null values in each field.

    Raises:
        KeyError: If any name in ``fields`` is not a column of ``df``.
    """
    null_mask = df[list(fields)].isnull()
    row_has_missing = null_mask.any(axis=1)

    total_profiles = len(df)
    total_cells = null_mask.size
    missing_cells = int(null_mask.sum().sum())
    incomplete_profiles = int(row_has_missing.sum())

    return {
        'row_has_missing': row_has_missing,
        'total_profiles': total_profiles,
        'incomplete_profiles': incomplete_profiles,
        'total_cells': total_cells,
        'missing_cells': missing_cells,
        # Percentages are undefined (None) when the denominator is zero.
        'missing_cells_pct': (missing_cells / total_cells) * 100 if total_cells > 0 else None,
        'incomplete_profiles_pct': (incomplete_profiles / total_profiles) * 100 if total_profiles > 0 else None,
        'missing_pct_per_field': null_mask.sum() / total_profiles * 100,
    }


print("--- Task: Measuring Completeness - Customer Profiles ---")

# Load the customer data dataset. Both a missing file and a zero-byte file are
# reported and skipped instead of surfacing as a traceback.
load_failed = False
try:
    customer_df = pd.read_csv('customer_data.csv')
except FileNotFoundError:
    print("Error: Make sure 'customer_data.csv' is in the same directory.")
    load_failed = True
except pd.errors.EmptyDataError:
    print("Error: 'customer_data.csv' is empty (no columns to parse).")
    load_failed = True

if load_failed:
    customer_df = pd.DataFrame()  # Create empty DataFrame to avoid errors later

# 1. Define required fields for a complete customer profile based on our sample data
# (As per task description, this would include 'name', 'address', 'email', 'phone number'.
# We'll use the fields available in our customer_data.csv sample: 'name', 'age', 'email', 'city')
required_fields = ['name', 'age', 'email', 'city']

# Selecting a column that does not exist raises KeyError, so check up front.
absent_fields = [field for field in required_fields if field not in customer_df.columns]

if load_failed:
    print("Skipping Task: Measuring Completeness - Customer Profiles due to file loading error.")
elif customer_df.empty:
    print("Skipping Task: Measuring Completeness - Customer Profiles because 'customer_data.csv' contains no data rows.")
elif absent_fields:
    print(f"Skipping Task: Measuring Completeness - Customer Profiles because required columns are absent from the data: {absent_fields}")
else:
    print(f"\nAnalyzing completeness for customer profile fields: {required_fields}")

    completeness = summarize_completeness(customer_df, required_fields)

    # 2. Analyze the dataset to count how many profiles have missing fields
    # A profile has 'missing fields' if any of the required_fields are null for that row.
    customer_df['has_missing_required_field'] = completeness['row_has_missing']

    profiles_with_missing_fields_count = completeness['incomplete_profiles']
    total_profiles = completeness['total_profiles']

    print(f"Total customer profiles: {total_profiles}")
    print(f"Number of profiles with at least one missing required field: {profiles_with_missing_fields_count}")

    # 3. Calculate the percentage of missing data fields across all profiles.
    # This can be interpreted in two ways:
    # a) Overall percentage of cells that are missing in the required fields subset
    # b) Percentage of profiles that are 'incomplete' (i.e., have at least one missing required field)
    # The task wording suggests (a); both are reported for clarity.

    # --- Interpretation A: Overall percentage of missing cells within specified fields ---
    total_cells_in_required_fields = completeness['total_cells']
    total_missing_cells_in_required_fields = completeness['missing_cells']

    if total_cells_in_required_fields > 0:
        overall_missing_percentage_fields = completeness['missing_cells_pct']
        print(f"\nOverall percentage of missing data across specified required fields: {overall_missing_percentage_fields:.2f}%")
    else:
        print("\nNo cells in required fields to evaluate completeness.")

    # --- Interpretation B: Percentage of incomplete profiles ---
    if total_profiles > 0:
        percentage_incomplete_profiles = completeness['incomplete_profiles_pct']
        print(f"Percentage of customer profiles that are incomplete (have at least one missing required field): {percentage_incomplete_profiles:.2f}%")
    else:
        print("No profiles to calculate incomplete percentage.")

    # --- Visualization of missing data rates per field ---
    # This helps understand which specific fields contribute most to incompleteness.
    missing_percentage_per_field = completeness['missing_pct_per_field']

    plt.figure(figsize=(10, 6))
    # NOTE(review): newer seaborn releases appear to warn when `palette` is passed
    # without `hue` - confirm against the installed seaborn version.
    sns.barplot(x=missing_percentage_per_field.index, y=missing_percentage_per_field.values, palette='coolwarm')
    plt.title('Percentage of Missing Data per Required Customer Profile Field')
    plt.xlabel('Required Field')
    plt.ylabel('Percentage Missing (%)')
    plt.ylim(0, 100)  # Ensure y-axis goes from 0 to 100
    for index, value in enumerate(missing_percentage_per_field.values):
        plt.text(index, value + 2, f"{value:.2f}%", ha='center')  # Add percentage labels on bars
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.tight_layout()
    plt.show()