In [None]:
# Part 1: Load a Dataset & Check Missing Values

# Task 1: Customer Dataset
# - Load a custom CSV file named customer_data.csv .
# - Find any missing values in specific columns like 'Email' and 'Phone'.






# Part 2: Identify Duplicates & Inconsistencies

# Task 2: Duplicate Emails in Customer Dataset
# - Identify duplicate emails which might indicate duplicate customer records.







# Part 3: Generate a Data Quality Report

# Task 3: Customer Dataset Report
# - Summarize the data quality with missing values, duplicates, and inconsistencies for customer_data.csv .







In [1]:
import pandas as pd

def analyze_customer_data(file_path="customer_data.csv"):
    """
    Analyzes customer data from a CSV file, checks for missing values in
    'Email' and 'Phone', identifies duplicate emails, and generates a
    data quality report.

    Args:
        file_path (str, optional): Path to the customer data CSV file.
            Defaults to "customer_data.csv". If the file is not found,
            it will use placeholder data.

    Returns:
        pandas.DataFrame: Returns the original customer dataframe.
            Prints a data quality report to the console. Returns None on error.
    """
    try:
        # Task 1: Load the dataset using Pandas
        df = pd.read_csv(file_path)
        print(f"Loaded customer data from: {file_path}")
    except FileNotFoundError:
        print(f"Error: File not found at {file_path}. Using placeholder data.")
        # Create placeholder customer data.
        df = pd.DataFrame({
            'CustomerID': [1, 2, 3, 4, 5, 6],
            'Name': ['Alice Smith', 'Bob Johnson', 'Charlie Brown', 'Diana Miller', 'Ethan Davis', 'Alice Smith'],
            'Email': ['alice.smith@example.com', 'bob.johnson@example.com', 'charlie.brown@example.com',
                      'diana.miller@example.com', 'ethan.davis@example.com', 'alice.smith@example.com'],
            'Phone': ['123-456-7890', '987-654-3210', '555-123-4567', None, '111-222-3333', '123-456-7890'],
            'Address': ['123 Main St', '456 Oak Ave', '789 Pine Ln', '101 Elm St', '202 Cedar Rd', '123 Main St'],
            'Age': [25, 30, 22, 35, 28, 25],
            'Gender': ['Female', 'Male', 'Male', 'Female', 'Male', 'Female']
        })

    # Check if the DataFrame is empty
    if df.empty:
        print("Error: The DataFrame is empty.  Please check the data source.")
        return None

    # --- Data Quality Analysis for Customer Dataset ---
    print("\n--- Data Quality Report for Customer Dataset ---")

    # Task 1: Find missing values in specific columns
    print("\nMissing Values in 'Email' and 'Phone' columns:")
    print(df[['Email', 'Phone']].isnull().sum())

    # Task 2: Identify duplicate emails
    print("\nDuplicate Emails:")
    duplicate_emails = df[df['Email'].duplicated(keep=False)]['Email'] # Corrected line
    if duplicate_emails.empty:
        print("No duplicate emails found.")
    else:
        print(duplicate_emails.to_string(index=False))

    # Task 3: Generate a data quality summary
    print("\nData Quality Summary:")
    print(df.info())  # Get an overview of the data

    # Print descriptive statistics for numerical columns
    print("\nDescriptive Statistics for numerical columns:")
    numeric_df = df.select_dtypes(include=pd.np.number)
    print(numeric_df.describe())

    # Print the first few rows of the dataframe
    print("\nFirst 5 rows of the customer dataframe:")
    print(df.head().to_markdown(index=False, numalign="left", stralign="left"))
    return df

if __name__ == "__main__":
    # You can specify the file path if it's not the default "customer_data.csv"
    customer_df = analyze_customer_data() # Uses default
    # customer_df = analyze_customer_data("path/to/your/customer_data.csv")
    if customer_df is not None:
        print("\nAnalysis Complete.")


Error: File not found at customer_data.csv. Using placeholder data.

--- Data Quality Report for Customer Dataset ---

Missing Values in 'Email' and 'Phone' columns:
Email    0
Phone    1
dtype: int64

Duplicate Emails:
alice.smith@example.com
alice.smith@example.com

Data Quality Summary:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   CustomerID  6 non-null      int64 
 1   Name        6 non-null      object
 2   Email       6 non-null      object
 3   Phone       5 non-null      object
 4   Address     6 non-null      object
 5   Age         6 non-null      int64 
 6   Gender      6 non-null      object
dtypes: int64(2), object(5)
memory usage: 464.0+ bytes
None

Descriptive Statistics for numerical columns:


AttributeError: module 'pandas' has no attribute 'np'