In [1]:
import pandas as pd
import numpy as np
from scipy.stats import ks_2samp, chisquare

# Raw column values for the two samples being compared: the data the model was
# trained on, and a later batch submitted for prediction.
data_train = {
    'age': [25, 32, 47, 51, 62, 33, 40, 59, 48, 52],
    'income': [
        50000, 60000, 75000, 80000, 120000,
        65000, 70000, 115000, 76000, 85000,
    ],
    'gender': ['M', 'F', 'F', 'M', 'F', 'M', 'F', 'M', 'F', 'M'],
}
data_batch = {
    'age': [29, 36, 50, 55, 64, 34, 42, 57, 49, 53],
    'income': [
        52000, 61000, 74000, 82000, 125000,
        64000, 72000, 110000, 78000, 86000,
    ],
    'gender': ['M', 'F', 'F', 'M', 'F', 'M', 'F', 'M', 'F', 'M'],
}

# One DataFrame per sample; the dict keys become the column names.
df_train = pd.DataFrame(data=data_train)  # Training DataFrame
df_batch = pd.DataFrame(data=data_batch)  # Batch Prediction DataFrame

# Check Schema Compatibility
def check_schema_compatibility(df1, df2):
    """Report whether two DataFrames have the same per-column dtypes.

    The comparison is made on the ``dtypes`` Series of each frame using
    ``Series.equals``. That Series is indexed by column name, so frames
    whose column labels differ are reported as incompatible as well.

    Args:
        df1: First DataFrame.
        df2: Second DataFrame.

    Returns:
        bool: ``True`` when the two ``dtypes`` Series are equal.
    """
    left_schema, right_schema = df1.dtypes, df2.dtypes
    return left_schema.equals(right_schema)

# Compare the dtype layout of the training frame against the batch frame
# and report the outcome.
schema_compatible = check_schema_compatibility(df1=df_train, df2=df_batch)
print(f'Schema Compatible: {schema_compatible}')

# Continuous Columns Validation
def validate_continuous_columns(df1, df2, columns):
    """Run a two-sample Kolmogorov-Smirnov test on each listed column.

    Args:
        df1: First DataFrame.
        df2: Second DataFrame.
        columns: Iterable of column names present in both frames.

    Returns:
        dict: Maps each column name to a dict with the keys ``'statistic'``
        and ``'p_value'`` as returned by ``scipy.stats.ks_2samp``.
    """
    def _ks_summary(name):
        # ks_2samp's result unpacks to (statistic, p-value).
        ks_stat, ks_p = ks_2samp(df1[name], df2[name])
        return {'statistic': ks_stat, 'p_value': ks_p}

    return {name: _ks_summary(name) for name in columns}

# Numeric features to compare between the training and batch frames.
continuous_columns = ['age', 'income']
continuous_validation = validate_continuous_columns(
    df1=df_train,
    df2=df_batch,
    columns=continuous_columns,
)
# Header line followed by the per-column results on the next line.
print('Continuous Columns Validation:', continuous_validation, sep='\n')

# Categorical Columns Validation
def validate_categorical_columns(df1, df2, columns):
    """Compare category frequencies of two frames with a chi-square test.

    For each column, the category counts of ``df1`` are used as the observed
    frequencies and the category counts of ``df2``, rescaled to the same
    total as ``df1``, as the expected frequencies passed to
    ``scipy.stats.chisquare``. Rescaling makes the test compare category
    proportions, so the two frames may have different numbers of rows.
    When both frames already have the same number of counted values the
    scale factor is 1 and the expected counts are the raw ``df2`` counts.

    Args:
        df1: DataFrame supplying the observed category counts.
        df2: DataFrame supplying the expected category proportions.
        columns: Iterable of column names present in both frames.

    Returns:
        dict: Maps each column name to a dict with the keys ``'statistic'``
        and ``'p_value'``.

    Raises:
        ValueError: If a column has no countable values in either frame
            (``value_counts`` skips missing values), since proportions
            cannot be formed from an empty sample.
    """
    results = {}
    for column in columns:
        observed = df1[column].value_counts()
        reference = df2[column].value_counts()

        # Align both count vectors on the same categories. Index.union gives
        # a reproducible category order, unlike iterating over a plain set.
        categories = observed.index.union(reference.index)
        observed = observed.reindex(categories, fill_value=0)
        reference = reference.reindex(categories, fill_value=0)

        observed_total = observed.sum()
        reference_total = reference.sum()
        if observed_total == 0 or reference_total == 0:
            raise ValueError(
                f"Column {column!r} has no values to count in at least one "
                "of the DataFrames"
            )

        # chisquare expects the observed and expected totals to agree, so
        # scale the df2 counts to the df1 total before testing.
        expected = reference * (observed_total / reference_total)

        # NOTE(review): a category that never occurs in df2 still has an
        # expected count of 0 here, which means a division by zero inside
        # the chi-square statistic -- confirm how callers want that handled.
        stat, p_value = chisquare(observed, expected)
        results[column] = {'statistic': stat, 'p_value': p_value}
    return results

# Categorical features to compare between the training and batch frames.
categorical_columns = ['gender']
categorical_validation = validate_categorical_columns(
    df1=df_train,
    df2=df_batch,
    columns=categorical_columns,
)
# Header line followed by the per-column results on the next line.
print('Categorical Columns Validation:', categorical_validation, sep='\n')


Schema Compatible: True
Continuous Columns Validation:
{'age': {'statistic': 0.2, 'p_value': 0.9944575548290717}, 'income': {'statistic': 0.1, 'p_value': 1.0}}
Categorical Columns Validation:
{'gender': {'statistic': 0.0, 'p_value': 1.0}}


# Interpret Results
# Interpret the p-values from the statistical tests:

# For continuous columns, a high p-value (typically > 0.05) in the KS test
# means there is no significant evidence that the two distributions differ.
# For categorical columns, a high p-value in the Chi-square test suggests that
# the category distributions are similar.