In [4]:
# Common Data Errors Examples

# 1. Missing Data:
# Task 1: Review a dataset where some customer emails are missing. Identify how
# many records are incomplete.
# Task 2: Examine a sales dataset with missing transaction dates and determine the
# percentage of missing data.
# Task 3: Identify missing department information in an employee registry.






# 2. Duplicate Data:
# Task 1: Analyze a customer dataset with duplicate entries and count the number of
# duplicates.
# Task 2: Review supplier data and identify any repeated supplier names.
# Task 3: Examine a product inventory list for duplicates in product IDs.






# 3. Inconsistent Formatting:
# Task 1: Spot inconsistencies in date formats (e.g., DD/MM/YYYY vs. MM/DD/YYYY)
# in a dataset.
# Task 2: Identify phone numbers with varying formats in a contact list.
# Task 3: Review address data for discrepancies in state abbreviations (e.g., CA vs.
# Calif.).





# 4. Data Drift:
# Task 1: Compare monthly revenues over six months to identify data drift.
# Task 2: Analyze user engagement metrics from a web application over different
# quarters.
# Task 3: Review a stock price dataset to detect any anomalies over a year.





In [5]:
import pandas as pd
import numpy as np

print("--- Analyzing Common Data Errors ---")

# ---------------------------------------------------------------------------
# Small hand-written datasets; each one carries the flaw that a later
# section looks for.
# ---------------------------------------------------------------------------

# Missing data: the Email column mixes None and NaN as "no value" markers.
customer_data = pd.DataFrame(
    {
        'CustomerID': [1, 2, 3, 4, 5],
        'Email': ['abc@example.com', None, 'def@sample.org', np.nan, 'ghi@work.net'],
    }
)

# Missing data: two of the five transactions have no date.
sales_data = pd.DataFrame(
    {
        'TransactionID': [101, 102, 103, 104, 105],
        'TransactionDate': ['2025-01-15', None, '2025-02-20', None, '2025-03-10'],
    }
)

# Missing data: two employees have no department recorded.
employee_registry = pd.DataFrame(
    {
        'EmployeeID': [1, 2, 3, 4, 5],
        'Department': ['HR', 'Sales', None, 'Technology', np.nan],
    }
)

# Duplicates: supplier 2 / 'Beta Inc' is listed twice.
supplier_data = pd.DataFrame(
    {
        'SupplierID': [1, 2, 3, 4, 2],
        'SupplierName': ['Alpha Corp', 'Beta Inc', 'Gamma Ltd', 'Delta Group', 'Beta Inc'],
    }
)

# Duplicates: product ID 'P100' is listed twice.
product_inventory = pd.DataFrame(
    {
        'ProductID': ['P100', 'P101', 'P102', 'P100', 'P103'],
        'ProductName': ['Laptop', 'Mouse', 'Keyboard', 'Laptop', 'Monitor'],
    }
)

# Formatting: every phone number uses a different layout.
contact_list = pd.DataFrame(
    {
        'Name': ['Alice', 'Bob', 'Charlie', 'David'],
        'Phone': ['123-456-7890', '987 654 3210', '(555) 123-4567', '1234567890'],
    }
)

# Formatting: the same state is spelled three different ways.
address_data = pd.DataFrame(
    {
        'City': ['Los Angeles', 'Sacramento', 'San Francisco', 'Oakland'],
        'State': ['CA', 'Calif.', 'CA', 'California'],
    }
)

# Drift: six months of revenue figures.
revenue_data = pd.DataFrame(
    {
        'Month': ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun'],
        'Revenue': [100, 110, 105, 150, 90, 160],
    }
)

# Drift: two user-count observations per quarter.
engagement_data = pd.DataFrame(
    {
        'Quarter': ['Q1', 'Q1', 'Q2', 'Q2', 'Q3', 'Q3'],
        'Users': [1000, 1050, 1200, 1150, 950, 1000],
    }
)

# Drift: quarterly price snapshots, with dates parsed to datetime up front.
stock_prices = pd.DataFrame(
    {
        'Date': pd.to_datetime(
            ['2024-01-01', '2024-04-01', '2024-07-01', '2024-10-01', '2025-01-01']
        ),
        'Price': [150, 155, 160, 140, 170],
    }
)

print("\n--- 1. Missing Data ---")

# Task 1: Count the customer records that have no email on file.
# The mask is computed once and reused for both the count and the listing.
email_is_missing = customer_data['Email'].isna()
incomplete_emails_count = email_is_missing.sum()
print(f"\nTask 1: Number of records with missing customer emails: {incomplete_emails_count}")
print("Incomplete records:")
print(customer_data.loc[email_is_missing])

# Task 2: Express the transactions without a date as a share of all rows.
date_is_missing = sales_data['TransactionDate'].isna()
missing_dates_count = date_is_missing.sum()
total_records_sales = sales_data.shape[0]
percentage_missing_dates = (missing_dates_count / total_records_sales) * 100
print(f"\nTask 2: Percentage of missing transaction dates: {percentage_missing_dates:.2f}%")
print("Records with missing transaction dates:")
print(sales_data.loc[date_is_missing])

# Task 3: List the employees whose department is not recorded.
department_is_missing = employee_registry['Department'].isna()
missing_departments = employee_registry.loc[department_is_missing]
print("\nTask 3: Records with missing department information:")
print(missing_departments)

print("\n--- 2. Duplicate Data ---")

# Task 1: Count fully identical customer rows.
# The default duplicated() marks only the later repeats, which is what the
# count uses; keep=False marks every member of a duplicate group, which is
# what the listing shows.
later_repeats = customer_data.duplicated()
duplicates_customer = later_repeats.sum()
print(f"\nTask 1: Number of duplicate rows in customer data: {duplicates_customer}")
print("Duplicate rows:")
all_group_members = customer_data.duplicated(keep=False)
print(customer_data.loc[all_group_members])

# Task 2: Show every supplier row whose name appears more than once.
name_is_repeated = supplier_data.duplicated(subset=['SupplierName'], keep=False)
duplicate_suppliers = supplier_data.loc[name_is_repeated]
print("\nTask 2: Repeated supplier names:")
print(duplicate_suppliers)

# Task 3: Show every inventory row whose product ID appears more than once.
product_id_is_repeated = product_inventory.duplicated(subset=['ProductID'], keep=False)
duplicate_product_ids = product_inventory.loc[product_id_is_repeated]
print("\nTask 3: Products with duplicate IDs:")
print(duplicate_product_ids)

print("\n--- 3. Inconsistent Formatting ---")

# Task 1: Spot inconsistencies in date formats (e.g., DD/MM/YYYY vs. MM/DD/YYYY) in a dataset.
# Nothing is inspected here; the task is answered with two lines of guidance,
# emitted through a single print call.
print(
    "\nTask 1: Identifying inconsistent date formats requires careful parsing and standardization.",
    "Assuming a column with mixed date formats would need a function to try different parsers.",
    sep="\n",
)

import re
# Task 2: Identify phone numbers with varying formats in a contact list.
# Accepted layouts, compiled once instead of on every call. The patterns carry
# no ^/$ anchors because they are applied with fullmatch() below.
_ACCEPTED_PHONE_PATTERNS = tuple(
    re.compile(layout)
    for layout in (
        r'\d{3}-\d{3}-\d{4}',      # 123-456-7890
        r'\d{10}',                 # 1234567890
        r'\(\d{3}\) \d{3}-\d{4}',  # (555) 123-4567
    )
)


def find_inconsistent_phone_formats(phone_number):
    """Report whether *phone_number* is written in an accepted layout.

    Despite the name, the return value is True for a well-formed number and
    False for one that should be flagged; the caller stores the result in a
    ``consistent_format`` column and filters on the False rows.

    Args:
        phone_number: The value to check. Anything that is not a ``str``
            (for example None or NaN from a missing cell) is treated as not
            matching instead of raising ``TypeError``.

    Returns:
        bool: True if the entire string matches ``ddd-ddd-dddd``,
        ``dddddddddd`` or ``(ddd) ddd-dddd``; False otherwise.
    """
    if not isinstance(phone_number, str):
        return False
    # fullmatch() requires the whole string to match. match() with a trailing
    # "$" would also accept a number followed by a newline.
    return any(
        pattern.fullmatch(phone_number) is not None
        for pattern in _ACCEPTED_PHONE_PATTERNS
    )

# Task 2 (continued): tag each contact with the checker's verdict, then keep
# the rows where the verdict is False.
contact_list['consistent_format'] = contact_list['Phone'].apply(find_inconsistent_phone_formats)
format_not_recognised = ~contact_list['consistent_format']
inconsistent_phones = contact_list.loc[format_not_recognised]
print("\nTask 2: Phone numbers with varying formats:")
print(inconsistent_phones)

# Task 3: Flag state values that are not in the reference list of abbreviations.
state_abbreviations = ['CA', 'NY', 'TX'] # Add more as needed
state_is_known = address_data['State'].isin(state_abbreviations)
inconsistent_states = address_data.loc[~state_is_known]
print(f"\nTask 3: Discrepancies in state abbreviations (compared to: {state_abbreviations}):")
print(inconsistent_states)

print("\n--- 4. Data Drift ---")

# Task 1: Month-over-month revenue change, as a percentage.
# The first month has no predecessor, so its entry is NaN.
monthly_revenue = revenue_data['Revenue']
revenue_change = monthly_revenue.pct_change() * 100
print("\nTask 1: Monthly revenue change (%):")
print(revenue_change)

# Task 2: Mean and standard deviation of user counts within each quarter.
users_by_quarter = engagement_data.groupby('Quarter')['Users']
engagement_grouped = users_by_quarter.agg(['mean', 'std'])
print("\nTask 2: User engagement metrics per quarter:")
print(engagement_grouped)

# Task 3: Difference between each price and the previous observation, stored
# as a new column so it prints next to the dates.
price_delta = stock_prices['Price'].diff()
stock_prices['Price Change'] = price_delta
print("\nTask 3: Stock price changes:")
print(stock_prices)


--- Analyzing Common Data Errors ---

--- 1. Missing Data ---

Task 1: Number of records with missing customer emails: 2
Incomplete records:
   CustomerID Email
1           2  None
3           4   NaN

Task 2: Percentage of missing transaction dates: 40.00%
Records with missing transaction dates:
   TransactionID TransactionDate
1            102            None
3            104            None

Task 3: Records with missing department information:
   EmployeeID Department
2           3       None
4           5        NaN

--- 2. Duplicate Data ---

Task 1: Number of duplicate rows in customer data: 0
Duplicate rows:
Empty DataFrame
Columns: [CustomerID, Email]
Index: []

Task 2: Repeated supplier names:
   SupplierID SupplierName
1           2     Beta Inc
4           2     Beta Inc

Task 3: Products with duplicate IDs:
  ProductID ProductName
0      P100      Laptop
3      P100      Laptop

--- 3. Inconsistent Formatting ---

Task 1: Identifying inconsistent date formats requires caref