In [None]:
# Data Drift Impact on Model
# Question: Use a simple linear regression model to demonstrate how data drift affects model predictions.

# 1. Train a model on the original data:
# 2. Evaluate on the drifted data:
# 3. Compare errors:




In [None]:
# Monitoring Data Distribution Changes
# Question: Use Python to monitor distribution changes in features to detect potential data drift.

# 1. Calculate feature statistics (mean and standard deviation) for both original and drifted data:
# 2. Compare statistics:
# 3. Set thresholds to detect significant drift:




In [None]:
# Automating Data Quality Checks with Python
# Question: Automate a basic data validation process using Python to ensure the dataset's
# structural integrity.

# 1. Define validation checks:
# 2. Apply validation:




In [None]:
# Introducing Great Expectations for Data Validation
# Question: Use Great Expectations to set up data validation checks for a dataset.

# 1. Install Great Expectations:
# 2. Create a new expectations suite:
# 3. Load data and generate expectations:




In [None]:
# Automating Constraint Checks with Python
# Question: Automate primary key and foreign key constraint checks using Python to ensure dataset compliance.


# 1. Assuming datasets exist with primary and foreign key relationships in pandas dataframes employees_df and departments_df :




In [None]:
# Advanced Data Drift Detection using Statistical Tests
# Question: Implement Kolmogorov-Smirnov test using Python to detect data drift at a more sophisticated level.

# 1. Use SciPy to perform KS test:




In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from scipy.stats import kstest
import great_expectations as ge

# --- Data Drift Impact on Model ---
print("\n--- Data Drift Impact on Model ---")

# 1. Train a model on the original data:
original_data = pd.DataFrame({'feature': np.random.rand(100), 'target': 2 * np.random.rand(100) + np.random.normal(0, 0.1, 100)})
X_original = original_data[['feature']]
y_original = original_data['target']
model = LinearRegression()
model.fit(X_original, y_original)

# 2. Evaluate on the drifted data:
drifted_data = pd.DataFrame({'feature': np.random.rand(100) + 0.5, 'target': 2 * (np.random.rand(100) + 0.5) + np.random.normal(0, 0.2, 100)})
X_drifted = drifted_data[['feature']]
y_drifted = drifted_data['target']
predictions_drifted = model.predict(X_drifted)
mse_drifted = mean_squared_error(y_drifted, predictions_drifted)
print(f"Mean Squared Error on drifted data: {mse_drifted:.4f}")

# 3. Evaluate on the original data for comparison:
predictions_original = model.predict(X_original)
mse_original = mean_squared_error(y_original, predictions_original)
print(f"Mean Squared Error on original data: {mse_original:.4f}")

print("Observation: The error is likely higher on the drifted data, demonstrating the impact of data distribution changes on model performance.")

# --- Monitoring Data Distribution Changes ---
print("\n--- Monitoring Data Distribution Changes ---")

# 1. Calculate feature statistics (mean and standard deviation) for both original and drifted data:
original_mean = original_data['feature'].mean()
original_std = original_data['feature'].std()
drifted_mean = drifted_data['feature'].mean()
drifted_std = drifted_data['feature'].std()

print(f"Original data - Mean: {original_mean:.4f}, Standard Deviation: {original_std:.4f}")
print(f"Drifted data - Mean: {drifted_mean:.4f}, Standard Deviation: {drifted_std:.4f}")

# 2. Compare statistics:
mean_diff = abs(original_mean - drifted_mean)
std_diff = abs(original_std - drifted_std)
print(f"Difference in means: {mean_diff:.4f}")
print(f"Difference in standard deviations: {std_diff:.4f}")

# 3. Set thresholds to detect significant drift:
mean_threshold = 0.2
std_threshold = 0.1

if mean_diff > mean_threshold:
    print(f"Potential data drift detected based on mean difference (>{mean_threshold}).")
else:
    print(f"No significant drift detected based on mean difference (>{mean_threshold}).")

if std_diff > std_threshold:
    print(f"Potential data drift detected based on standard deviation difference (>{std_threshold}).")
else:
    print(f"No significant drift detected based on standard deviation difference (>{std_threshold}).")

# --- Automating Data Quality Checks with Python ---
print("\n--- Automating Data Quality Checks with Python ---")

# Sample data
data_quality = pd.DataFrame({'ID': [1, 2, 3, 4, 5],
                            'value': [10, 20, None, 40, 50],
                            'category': ['A', 'B', 'A', 'C', 'B']})

# 1. Define validation checks:
def check_missing_values(df, column):
    if df[column].isnull().sum() > 0:
        print(f"Validation failed: Column '{column}' has missing values.")
        return False
    return True

def check_data_type(df, column, expected_type):
    if df[column].dtype != expected_type:
        print(f"Validation failed: Column '{column}' has incorrect data type (expected {expected_type}, got {df[column].dtype}).")
        return False
    return True

def check_categorical_values(df, column, allowed_values):
    invalid_values = df[column][~df[column].isin(allowed_values)].unique()
    if len(invalid_values) > 0:
        print(f"Validation failed: Column '{column}' has invalid values: {invalid_values}.")
        return False
    return True

# 2. Apply validation:
print("Running data quality checks:")
is_valid = True
is_valid &= check_missing_values(data_quality, 'value')
is_valid &= check_data_type(data_quality, 'ID', 'int64')
is_valid &= check_categorical_values(data_quality, 'category', ['A', 'B', 'C'])

if is_valid:
    print("Data quality checks passed.")

# --- Introducing Great Expectations for Data Validation ---
print("\n--- Introducing Great Expectations for Data Validation ---")

# 1. Install Great Expectations:
print("Assuming Great Expectations is already installed (`pip install great_expectations`).")

# 2. Create a new expectations suite:
print("To create a new expectations suite, you would typically run:")
print("`great_expectations init` and then `great_expectations suite new` in your terminal.")
print("This interactive process helps you define your data source and create a suite.")

# For demonstration purposes, let's simulate loading data and adding an expectation programmatically:
context = ge.get_context()
datasource_name = "my_pandas_datasource"
data_asset_name = "my_data_asset"
expectation_suite_name = "my_expectation_suite"

if datasource_name not in context.list_datasources():
    context.add_pandas(name=datasource_name, batch_kwargs_generators=None)

batch_kwargs = {"dataset": data_quality}

if expectation_suite_name not in context.list_expectation_suite_names():
    suite = context.create_expectation_suite(expectation_suite_name=expectation_suite_name)
    validator = context.get_validator(batch_kwargs=batch_kwargs, expectation_suite_name=expectation_suite_name, datasource_name=datasource_name, data_asset_name=data_asset_name)
    validator.expect_column_to_not_have_missing_values("value")
    validator.expect_column_values_to_be_in_set("category", ["A", "B", "C"])
    validator.save_expectation_suite()
    print(f"Expectation suite '{expectation_suite_name}' created and saved.")
else:
    print(f"Expectation suite '{expectation_suite_name}' already exists.")
    validator = context.get_validator(batch_kwargs=batch_kwargs, expectation_suite_name=expectation_suite_name, datasource_name=datasource_name, data_asset_name=data_asset_name)

# 3. Load data and generate expectations:
print("\nRunning expectations:")
validation_result = validator.validate()
print(validation_result)
if not validation_result["success"]:
    print("Great Expectations found issues with the data.")
else:
    print("Great Expectations validation passed.")

# --- Automating Constraint Checks with Python ---
print("\n--- Automating Constraint Checks with Python ---")

# 1. Assuming datasets exist with primary and foreign key relationships in pandas dataframes employees_df and departments_df :
employees_data = {'employee_id': [101, 102, 103, 104, 105],
                  'name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
                  'department_id': [1, 2, 1, 3, 2]}
employees_df = pd.DataFrame(employees_data)

departments_data = {'department_id': [1, 2, 3],
                    'department_name': ['HR', 'Engineering', 'Sales']}
departments_df = pd.DataFrame(departments_data)

# Define primary key and foreign key columns
employee_pk_col = 'employee_id'
department_pk_col = 'department_id'
employee_fk_col = 'department_id'

# 2. Apply constraint checks:

def check_primary_key_uniqueness(df, pk_col):
    if df[pk_col].duplicated().any():
        print(f"Constraint check failed: Primary key column '{pk_col}' has duplicate values.")
        return False
    print(f"Constraint check passed: Primary key column '{pk_col}' is unique.")
    return True

def check_foreign_key_references(fk_df, fk_col, pk_df, pk_col):
    invalid_fk_values = fk_df[fk_col][~fk_df[fk_col].isin(pk_df[pk_col])].unique()
    if len(invalid_fk_values) > 0:
        print(f"Constraint check failed: Foreign key column '{fk_col}' has values that do not exist in primary key column '{pk_col}': {invalid_fk_values}.")
        return False
    print(f"Constraint check passed: Foreign key column '{fk_col}' references valid values in primary key column '{pk_col}'.")
    return True

print("Running constraint checks:")
check_primary_key_uniqueness(employees_df, employee_pk_col)
check_primary_key_uniqueness(departments_df, department_pk_col)
check_foreign_key_references(employees_df, employee_fk_col, departments_df, department_pk_col)

# --- Advanced Data Drift Detection using Statistical Tests ---
print("\n--- Advanced Data Drift Detection using Statistical Tests ---")

# 1. Use SciPy to perform KS test:
original_feature = original_data['feature'].values
drifted_feature = drifted_data['feature'].values

ks_statistic, p_value = kstest(original_feature, drifted_feature)

print(f"Kolmogorov-Smirnov Statistic: {ks_statistic:.4f}")
print(f"P-value: {p_value:.4f}")

alpha = 0.05
if p_value < alpha:
    print(f"Reject null hypothesis. Significant data drift detected (p < {alpha}).")
else:
    print(f"Fail to reject null hypothesis. No significant data drift detected (p >= {alpha}).")


--- Data Drift Impact on Model ---
Mean Squared Error on drifted data: 1.0812
Mean Squared Error on original data: 0.3052
Observation: The error is likely higher on the drifted data, demonstrating the impact of data distribution changes on model performance.

--- Monitoring Data Distribution Changes ---
Original data - Mean: 0.4706, Standard Deviation: 0.3158
Drifted data - Mean: 0.9739, Standard Deviation: 0.2845
Difference in means: 0.5034
Difference in standard deviations: 0.0313
Potential data drift detected based on mean difference (>0.2).
No significant drift detected based on standard deviation difference (>0.1).

--- Automating Data Quality Checks with Python ---
Running data quality checks:
Validation failed: Column 'value' has missing values.

--- Introducing Great Expectations for Data Validation ---
Assuming Great Expectations is already installed (`pip install great_expectations`).
To create a new expectations suite, you would typically run:
`great_expectations init` and 

AttributeError: 'EphemeralDataContext' object has no attribute 'add_pandas'