In [3]:
from sklearn.tree import DecisionTreeRegressor
import numpy as np

# Original Data
X = np.array([[0.5, 1.2], [0.7, 1.4], [0.2, 0.9], [0.3, 1.1]])
y = np.array([10, 15, 8, 12])

# Train CART model
cart_model = DecisionTreeRegressor()
cart_model.fit(X, y)

# Function to generate synthetic data
def generate_synthetic_data(model, n_samples, X):
    n_features = X.shape[1]
    feature_min = X.min(axis=0)
    feature_max = X.max(axis=0)

    synthetic_data = []
    for _ in range(n_samples):
        sample = np.zeros(n_features)
        node_indicator = model.decision_path([sample]).indices

        for node_id in node_indicator:
            feature = model.tree_.feature[node_id]
            if feature == -2:  # leaf node
                continue
            threshold = model.tree_.threshold[node_id]

            # Ensure the generated value is within the range of the original feature
            if np.random.rand() > 0.5:
                new_value = min(threshold + np.random.rand() * (feature_max[feature] - threshold), feature_max[feature])
            else:
                new_value = max(threshold - np.random.rand() * (threshold - feature_min[feature]), feature_min[feature])

            sample[feature] = new_value

        synthetic_data.append(sample)
    return np.array(synthetic_data)

# Generate 5 synthetic samples
synthetic_X = generate_synthetic_data(cart_model, 5, X)
print(synthetic_X)


[[0.51220501 0.        ]
 [0.22533911 0.        ]
 [0.42074022 0.        ]
 [0.20319691 0.        ]
 [0.55169377 0.        ]]


In [2]:
from scipy.stats import ks_2samp, chi2_contingency

def compare_distributions(real_data, synthetic_data):
    results = {}
    for column in real_data.columns:
        if real_data[column].dtype == 'object':
            # Categorical feature
            real_counts = real_data[column].value_counts()
            synthetic_counts = synthetic_data[column].value_counts()
            chi2, p, _, _ = chi2_contingency([real_counts, synthetic_counts])
            results[column] = p
        else:
            # Numerical feature
            ks_stat, p = ks_2samp(real_data[column], synthetic_data[column])
            results[column] = p
    return results

p_values = compare_distributions(X, synthetic_X)
print(p_values)


AttributeError: 'numpy.ndarray' object has no attribute 'columns'