## Defining Data Quality SLAs
### Data Completeness
**Description**: Set an SLA that ensures that 95% of data fields in your dataset are filled (non-null values). Practice by checking a dataset of your choice and calculate its completeness.

In [None]:
# write your code from here

In [1]:
import pandas as pd

def check_data_completeness(df, threshold=0.95):
    completeness = df.notnull().mean()  # fraction of non-null per column
    overall_completeness = completeness.mean()  # average across all columns
    print(f"Overall data completeness: {overall_completeness:.2%}")
    if overall_completeness >= threshold:
        print("SLA Passed: Data completeness meets the threshold.")
    else:
        print("SLA Failed: Data completeness below threshold.")
    return overall_completeness >= threshold

# Example dataset
data = {
    'A': [1, 2, None, 4, 5],
    'B': [5, None, 7, 8, 9],
    'C': ['x', 'y', 'z', None, 'w']
}
df = pd.DataFrame(data)

check_data_completeness(df)


Overall data completeness: 80.00%
SLA Failed: Data completeness below threshold.


False

### Data Timeliness:
**Description**: Establish an SLA that specifies that data should be integrated and processed within 24 hours of acquisition. Monitor the data pipeline for timeliness.

In [None]:
# write your code from here

In [2]:
from datetime import datetime, timedelta

def check_data_timeliness(acquisition_times, processing_times, max_delay_hours=24):
    """
    acquisition_times: pd.Series of datetime when data was acquired
    processing_times: pd.Series of datetime when data was processed
    """
    delays = processing_times - acquisition_times
    delays_in_hours = delays.dt.total_seconds() / 3600
    timely = delays_in_hours <= max_delay_hours
    timely_pct = timely.mean()
    print(f"Timely processing: {timely_pct:.2%} within {max_delay_hours} hours")
    if timely_pct >= 1.0:  # SLA: 100% processed within 24 hrs
        print("SLA Passed: All data processed within SLA window.")
    else:
        print("SLA Failed: Some data processed beyond SLA window.")
    return timely_pct >= 1.0

# Example data
acquisition_times = pd.to_datetime([
    "2025-05-25 08:00:00",
    "2025-05-25 09:30:00",
    "2025-05-25 11:00:00"
])
processing_times = pd.to_datetime([
    "2025-05-26 07:00:00",  # within 24 hrs
    "2025-05-26 10:00:00",  # within 24 hrs
    "2025-05-27 12:00:00"   # > 24 hrs, late
])

check_data_timeliness(acquisition_times, processing_times)


AttributeError: 'TimedeltaIndex' object has no attribute 'dt'

### Data Consistency:
**Description**: Define an SLA for maintaining consistency across various related datasets. Implement a check to ensure that 99% of data entries are consistent.

In [None]:
# write your code from here

In [3]:
def check_data_consistency(df1, df2, key_columns, threshold=0.99):
    """
    Check consistency between two datasets on key columns.
    Consistency = fraction of matching entries on keys.
    """
    # Merge datasets on key columns (inner join)
    merged = pd.merge(df1, df2, on=key_columns, how='inner', suffixes=('_1', '_2'))

    # Check matching rows on all columns except keys (assumes same columns)
    data_cols = [c for c in df1.columns if c not in key_columns]
    consistent_rows = (merged[[col + '_1' for col in data_cols]] ==
                       merged[[col + '_2' for col in data_cols]].values).all(axis=1)
    consistency_ratio = consistent_rows.mean()

    print(f"Data consistency: {consistency_ratio:.2%}")
    if consistency_ratio >= threshold:
        print("SLA Passed: Data consistency meets threshold.")
    else:
        print("SLA Failed: Data consistency below threshold.")
    return consistency_ratio >= threshold

# Example datasets
df1 = pd.DataFrame({
    'id': [1, 2, 3, 4],
    'value': [10, 20, 30, 40]
})

df2 = pd.DataFrame({
    'id': [1, 2, 3, 4],
    'value': [10, 20, 35, 40]  # one mismatch on id=3
})

check_data_consistency(df1, df2, key_columns=['id'])


Data consistency: 75.00%
SLA Failed: Data consistency below threshold.


False