## Compare Data Completeness Over Time

**Description**: Analyze the trend of missing data in `"sales_data.csv"` over several months stored in a "date" column. Visualize missing data rates by month.

In [None]:
# Write your code from here

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# --- Task: Compare Data Completeness Over Time ---
print("--- Task: Compare Data Completeness Over Time ---")

# Load the sales data
try:
    sales_df = pd.read_csv('company_data.csv')
except FileNotFoundError as e:
    print(f"Error loading file: {e}. Please ensure 'sales_data.csv' is in the same directory.")
    sales_df = pd.DataFrame() # Create empty DataFrame to prevent errors

if not sales_df.empty:
    # 1. Ensure 'date' column is in datetime format
    sales_df['date'] = pd.to_datetime(sales_df['date'])

    # 2. Extract Year and Month for grouping
    sales_df['year_month'] = sales_df['date'].dt.to_period('M')

    # 3. Calculate missing values per row for each month
    # We'll calculate the missing percentage for all columns except 'order_id' and 'date'
    # as these are usually expected to be present.
    # Adjust `columns_to_check` based on which columns you care about for completeness.
    columns_to_check = ['product_id', 'quantity', 'price', 'customer_id', 'region']

    # Initialize a list to store missing data rates per month
    missing_rates_by_month = []

    # Iterate through each unique month-year period
    for ym_period, group_df in sales_df.groupby('year_month'):
        total_rows_in_month = len(group_df)
        if total_rows_in_month == 0: # Handle empty groups if any
            continue

        # Calculate missing counts for the relevant columns in this month's data
        missing_counts = group_df[columns_to_check].isnull().sum()

        # Calculate percentage of missing values for each relevant column
        # and then average those percentages, or sum total missing cells
        total_missing_cells_in_month = missing_counts.sum()
        total_data_cells_in_month = total_rows_in_month * len(columns_to_check)

        if total_data_cells_in_month > 0:
            overall_missing_percentage_for_month = (total_missing_cells_in_month / total_data_cells_in_month) * 100
        else:
            overall_missing_percentage_for_month = 0 # No data cells to check

        missing_rates_by_month.append({
            'year_month': str(ym_period), # Convert Period object to string for plotting
            'missing_percentage': overall_missing_percentage_for_month
        })

    # Convert the list of dictionaries to a DataFrame for easier plotting
    missing_trends_df = pd.DataFrame(missing_rates_by_month)
    missing_trends_df['year_month'] = pd.to_datetime(missing_trends_df['year_month']) # Convert back to datetime for plotting

    print("\nMonthly Missing Data Trends:")
    print(missing_trends_df.round(2))

    # 4. Visualize missing data rates by month
    plt.figure(figsize=(12, 6))
    sns.lineplot(data=missing_trends_df, x='year_month', y='missing_percentage', marker='o')
    plt.title('Trend of Overall Missing Data Percentage by Month')
    plt.xlabel('Month')
    plt.ylabel('Missing Data Percentage (%)')
    plt.grid(True)
    plt.xticks(rotation=45) # Rotate x-axis labels for better readability
    plt.tight_layout() #

--- Task: Compare Data Completeness Over Time ---


KeyError: 'date'